In [5]:
# Notebook configuration
# Path of the input CSV; expected columns: id (int), text (string).
csv_file = './examples/cv.csv'
# When True, intermediate results are printed while the notebook runs.
print_debug = True

In [6]:
import pandas as pd

# Read the CSV, drop rows containing missing values, and lower-case the text column
print('Load and clean data')
df = (
    pd.read_csv(csv_file)
    .dropna()
    .assign(text=lambda frame: frame['text'].str.lower())
)
if print_debug:
    print(df.head(4))

Load and clean data
    id                                               text
0  1.0  "job title: software engineer\ncompany: xyz te...
1  2.0  "job title: data scientist\ncompany: abc data\...
4  3.0  "resume\nname: john doe\ncontact: john.doe@exa...
5  4.0  "resume\nname: jane smith\ncontact: jane.smith...


In [10]:
# Data preprocessing. It is not needed for this project because the Hugging Face pre-trained model already contains a tokenizer.
# If there are more specific requirements on the text extraction, a custom tokenizer could be built to meet them.
# However, this tokenizer does not handle symbols that are included in skill names such as C++ or .NET.

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource requested by this cell in a single call.
nltk.download([
    "punkt",
    "wordnet",
    "stopwords"
])

# Work on a deep copy so the cleaned `df` stays untouched.
tokenised_df = df.copy(deep=True)

# Tokenization
tokenised_df['tokens'] = tokenised_df['text'].apply(word_tokenize)
if print_debug:
    print("Tokenized data")
    print(tokenised_df.head(4))

# Remove stop words, punctuation, and non-alphabetic characters.
# `str.isalpha()` rejects any token containing a non-letter, so tokens with
# digits or symbols are dropped as well.
stop_words = set(stopwords.words('english'))
tokenised_df['tokens'] = tokenised_df['tokens'].apply(
    lambda tokens: [word for word in tokens if word.isalpha() and word not in stop_words]
)
if print_debug:
    print("Cleaned tokenized data")
    print(tokenised_df.head(4))

# Lemmatization ("wordnet" was already requested by the download call above,
# so it is not downloaded a second time here).
lemmatizer = WordNetLemmatizer()
tokenised_df['tokens'] = tokenised_df['tokens'].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)
if print_debug:
    print("Lemmatized data")
    print(tokenised_df.head(4))

# Persist the tokenised frame without the index column.
tokenised_df.to_csv('./output/cv_tokenised.csv', index=False)


Tokenized data
    id                                               text
0  1.0  [``, job, title, :, software, engineer\ncompan...
1  2.0  [``, job, title, :, data, scientist\ncompany, ...
4  3.0  [``, resume\nname, :, john, doe\ncontact, :, j...
5  4.0  [``, resume\nname, :, jane, smith\ncontact, :,...
Cleaned tokenized data
    id                                               text
0  1.0  [job, title, software, xyz, san, francisco, de...
1  2.0  [job, title, data, abc, new, york, description...
4  3.0  [john, utilize, skills, experience, software, ...
5  4.0  [jane, seeking, challenging, role, data, scien...
Lemmatized data
    id                                               text
0  1.0  [job, title, software, xyz, san, francisco, de...
1  2.0  [job, title, data, abc, new, york, description...
4  3.0  [john, utilize, skill, experience, software, e...
5  4.0  [jane, seeking, challenging, role, data, scien...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ttmet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ttmet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ttmet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ttmet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# Pre-trained Hugging Face pipelines, one per skill category:
#   soft skills -> https://huggingface.co/jjzha/jobbert_skill_extraction
#   hard skills -> https://huggingface.co/jjzha/jobbert_knowledge_extraction
token_soft_skill_classifier = pipeline(
    aggregation_strategy='first',
    model='jjzha/jobbert_skill_extraction',
)
token_hard_skill_classifier = pipeline(
    aggregation_strategy='first',
    model='jjzha/jobbert_knowledge_extraction',
)

def aggregate_skill_span(results):
    """Aggregate consecutive classified entities into one.

    An entity is merged into the current span when its "start" is at most one
    position past the current span's "end". The merged span keeps every field
    of its first entity except "word" (joined with single spaces) and "end"
    (taken from the last merged entity).

    Args:
        results: list of dicts, each with at least the keys "word", "start"
            and "end". The entities are assumed to be given in text order.

    Returns:
        A new list of dicts with merged spans. Spans whose word is a single
        non-alphabetic character, or whose word is entirely numeric, are
        dropped. The input list and its dicts are left unmodified. An empty
        input yields an empty list.
    """
    if not results:
        return []

    merged = []
    # Copy each span we start so that extending it never alters the caller's dicts.
    current = dict(results[0])

    for result in results[1:]:
        if result["start"] <= current["end"] + 1:
            current["word"] += " " + result["word"]
            current["end"] = result["end"]
        else:
            merged.append(current)
            current = dict(result)

    merged.append(current)

    # Drop invalid skills: single non-alphabetic characters (e.g. punctuation
    # or other wrongly classified symbols) and words that are all numeric.
    return [
        span for span in merged
        if (len(span["word"]) > 1 or span["word"].isalpha())
        and not span["word"].isnumeric()
    ]

def extract_skill_entities(text):
    """Run both skill classifiers on `text` and return their entities.

    Each entity that carries a truthy "entity_group" has that key removed and
    an "entity" key set to "Soft Skill" or "Hard Skill" instead. Non-empty
    results are then passed through `aggregate_skill_span`.

    Returns:
        A `(soft_skills, hard_skills)` tuple of entity lists.
    """
    labelled = []
    for classifier, label in (
        (token_soft_skill_classifier, "Soft Skill"),
        (token_hard_skill_classifier, "Hard Skill"),
    ):
        entities = classifier(text)
        for entity in entities:
            if entity.get("entity_group"):
                entity["entity"] = label
                del entity["entity_group"]
        labelled.append(entities)

    soft_entities, hard_entities = labelled

    # Only aggregate when a classifier actually produced entities
    if len(soft_entities) > 0:
        soft_entities = aggregate_skill_span(soft_entities)
    if len(hard_entities) > 0:
        hard_entities = aggregate_skill_span(hard_entities)

    return soft_entities, hard_entities

# Copy dataframe and create placeholder for soft and hard skills
out_df = df.copy(deep=True)
out_df['soft_skills'] = pd.Series(dtype='string')
out_df['hard_skills'] = pd.Series(dtype='string')


def _confident_skill_words(entities, threshold):
    """Return the distinct "word" values of entities whose "score" exceeds `threshold`."""
    return {entity['word'] for entity in entities if entity['score'] > threshold}


def _format_skill_set(skills):
    """Render a set of skills as a set literal with its members in sorted order.

    The text has the same shape as `str(set)` (e.g. "{'a', 'b'}"), but the
    member order no longer depends on set iteration order, so the value written
    to the CSV is the same on every run.
    """
    return '{' + ', '.join(repr(skill) for skill in sorted(skills)) + '}'


# Loop through each row
score_thres = 0.5  # entities scoring at or below this value are discarded
for i, row in out_df.iterrows():
    output_soft_skills, output_hard_skills = extract_skill_entities(row['text'])

    # Extract soft skills; the cell is left as missing when nothing qualifies
    soft_skills = _confident_skill_words(output_soft_skills, score_thres)
    if soft_skills:
        out_df.at[i, 'soft_skills'] = _format_skill_set(soft_skills)

    # Extract hard skills; the cell is left as missing when nothing qualifies
    hard_skills = _confident_skill_words(output_hard_skills, score_thres)
    if hard_skills:
        out_df.at[i, 'hard_skills'] = _format_skill_set(hard_skills)

    if print_debug:
        # NOTE: `i` is the DataFrame index label of the row, not the value of
        # the `id` column.
        print(f'CV ID: {i}')
        print('\tSoft Skills')
        print(soft_skills)
        print('\tHard Skills')
        print(hard_skills)

out_df.to_csv('./output/cv_classified.csv', index=False)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


CV ID: 0
	Soft Skills
{'organizational and leadership skills', 'defining system functionality', 'work independently', 'gathering user requirements', 'design and develop software solutions', 'writing code', 'problem - solving aptitude', 'analytical mind', 'passionate'}
	Hard Skills
{'software', 'system monitoring tools', 'project management', 'ruby on rails', 'software development', 'selected programming languages', '. net programming languages', 'nosql databases', 'java', 'scripting', 'c + +', 'relational databases', 'mysql', 'jscript. net', 'java / j2ee platform', 'automated testing frameworks', 'new relic'}
CV ID: 1
	Soft Skills
{'develop analysis and reporting capabilities', 'business acumen', 'conducting full lifecycle analysis', 'analyze large amounts of raw information', 'find patterns', 'problem - solving aptitude', 'communication and presentation skills', ', activities and design', 'math skills', 'analytical mind', 'identify improvements', 'monitor performance and quality contr