# Read Dataset

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('raw_resume.csv')
data.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           2484 non-null   int64 
 1   Resume_str   2484 non-null   object
 2   Resume_html  2484 non-null   object
 3   Category     2484 non-null   object
dtypes: int64(1), object(3)
memory usage: 77.8+ KB


In [4]:
LENGTH = len(data)
LENGTH

2484

In [5]:
data.iloc[0]['Resume_str']

"         HR ADMINISTRATOR/MARKETING ASSOCIATE\n\nHR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commitment to customer service.         Highlights         Focused on customer satisfaction  Team management  Marketing savvy  Conflict resolution techniques     Training and development  Skilled multi-tasker  Client relations specialist           Accomplishments      Missouri DOT Supervisor Training Certification  Certified by IHG in Customer Loyalty and Marketing by Segment   Hilton Worldwide General Manager Training Certification  Accomplished Trainer for cross server hospitality systems such as    Hilton OnQ  ,   Micros    Opera PMS   , Fidelio    OPERA    Reservation System (ORS) ,   Holidex    Completed courses and seminars in customer service, sales strategies, inventory control, loss pr

# Preprocessing

### Drop unnessessary columns

In [6]:
data = data.drop(["Resume_html"], axis=1)
data.head()

Unnamed: 0,ID,Resume_str,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,33176873,HR DIRECTOR Summary Over 2...,HR
3,27018550,HR SPECIALIST Summary Dedica...,HR
4,17812897,HR MANAGER Skill Highlights ...,HR


### Skills processing

In [7]:
# %pip install spacy
# %pip install gensim
# %pip install PyLDAvis
# %pip install wordcloud
# %pip install plotly
# %pip install nltk

In [8]:
#spacy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy
import pyLDAvis.gensim_models
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#Data loading/ Data manipulation
import pandas as pd
import numpy as np
import jsonlines

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])

from tqdm import tqdm
tqdm.pandas()

#warning
import warnings 
warnings.filterwarnings('ignore')

  np.bool8: (False, True),
[nltk_data] Downloading package stopwords to /home/peichi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/peichi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# %%python -m spacy download en_core_web_lg

In [10]:
nlp = spacy.load("en_core_web_lg")
skill_pattern_path = "jz_skill_patterns.jsonl"

In [11]:
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [12]:
def get_skills(text):
    doc = nlp(text)
    myset = []
    subset = []
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            subset.append(ent.text)
    myset.append(subset)
    return subset


def unique_skills(x):
    return list(set(x))

In [13]:
clean = []
for i in tqdm(range(LENGTH)):
    review = re.sub(
        '(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"',
        " ",
        data["Resume_str"].iloc[i],
    )
    review = review.lower()
    review = review.split()
    lm = WordNetLemmatizer()
    review = [
        lm.lemmatize(word)
        for word in review
        if not word in set(stopwords.words("english"))
    ]
    review = " ".join(review)
    clean.append(review)

100%|██████████| 2484/2484 [09:39<00:00,  4.29it/s]


In [None]:
data["Clean_Resume"] = clean
data["skills"] = data["Clean_Resume"].str.lower().progress_apply(get_skills)
data["skills"] = data["skills"].apply(unique_skills)
data.head()

 78%|███████▊  | 1939/2484 [02:55<00:37, 14.50it/s]

### Grouping job titles

In [None]:
data['Category'].value_counts()

In [None]:
# PRIMARY INTEREST : TECHNOLOGY, BUSINESS
# BPO: BUSINESS PROCESS OUTSOURCING
business_field = ["BPO","HR","PUBLIC-RELATIONS","CONSULTANT","BANKING","SALES","ACCOUNTANT","FINANCE","BUSINESS-DEVELOPMENT",]
technology_field = ["AUTOMOBILE","AVIATION","ENGINEERING","INFORMATION-TECHNOLOGY"]
# SECONDARY INTEREST : ARTS, HUMANITIES FASHION
secondary_interest = ["AGRICULTURE","DIGITAL-MEDIA","APPAREL","TEACHER","ARTS","DESIGNER","CONSTRUCTION","HEALTHCARE","FITNESS","CHEF","ADVOCATE"]

In [None]:
group = []
for i in range(LENGTH):
    if data.iloc[i]['Category'] in business_field:
        group.append('business')
    elif data.iloc[i]['Category'] in technology_field:
        group.append('technology')
    else:
        group.append('others')
data['group'] = group
data.head()

In [None]:
data['group'].value_counts()

### Save CSV

In [None]:
data.to_csv("resume_data.csv")

# Analysis

### Job distribution

In [None]:
fig = px.histogram(
    data, x="Category", title="Distribution of Jobs Categories"
).update_xaxes(categoryorder="total descending")
fig.show()

### Skills distribution

In [None]:
Job_cat = data["Category"].unique()
Job_cat = np.append(Job_cat, "ALL")

In [None]:
Job_Category = "ALL"

In [None]:
Total_skills = []
if Job_Category != "ALL":
    fltr = data[data["Category"] == Job_Category]["skills"]
    for x in fltr:
        for i in x:
            Total_skills.append(i)
else:
    fltr = data["skills"]
    for x in fltr:
        for i in x:
            Total_skills.append(i)

fig = px.histogram(
    x=Total_skills,
    labels={"x": "Skills"},
    title=f"{Job_Category} Distribution of Skills",
).update_xaxes(categoryorder="total descending")
fig.show()

In [None]:
from collections import Counter
Counter(Total_skills).most_common(10)

### Most used words

In [None]:
text = ""
if Job_Category == "ALL":
    for i in data["Clean_Resume"].values:
        text += i + " "
else:
    for i in data[data["Category"] == Job_Category]["Clean_Resume"].values:
        text += i + " "

plt.figure(figsize=(8, 8))

x, y = np.ogrid[:300, :300]

mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)

wc = WordCloud(
    width=800,
    height=800,
    background_color="white",
    min_font_size=6,
    repeat=True,
    mask=mask,
)
wc.generate(text)

plt.axis("off")
plt.imshow(wc, interpolation="bilinear")
plt.title(f"Most Used Words in {Job_Category} Resume", fontsize=20)

In [None]:
list(wc.words_.keys())[:15]

### Entity Recognition

In [None]:
sent = nlp(data["Resume_str"].iloc[0])
displacy.render(sent, style="ent", jupyter=True)

### Dependency parsing

In [None]:
displacy.render(sent[0:10], style="dep", jupyter=True, options={"distance": 90})

### Analyze document length

tokenize length: https://www.kaggle.com/code/alexkarev/resume-classification

In [None]:
resume_lengths = []
for i in range(LENGTH):
    resume_lengths.append(len(data.iloc[i]['Clean_Resume']))
fig = plt.figure(figsize =(10, 7))
plt.boxplot(pd.DataFrame(resume_lengths))
plt.ylabel("Resume length")
plt.xticks([1], [""])
plt.show()