In [24]:
#spacy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy
import pyLDAvis.gensim_models
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#Data loading/ Data manipulation
import pandas as pd
import numpy as np
import jsonlines

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])

#warning
import warnings 
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajchhabria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rajchhabria/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# Load the resume table from a CSV file located in the working directory.
df = pd.read_csv('Resume.csv')
# Preview the first rows.
df.head()

Unnamed: 0,Category,link,id,Resume,Raw_html
0,hr,https://www.livecareer.com/resume-search/r/hr-...,203361909970992332506290823189098544432,HR EMPLOYEE SERVICE REPRESENTATIVE ...,"<div class=""fontsize fontface vmargins hmargin..."
1,hr,https://www.livecareer.com/resume-search/r/bri...,176452925245576845188344162105053478091,"BRIDGES OF AMERICA, HR SPECIALIST II ...","<div class=""fontsize fontface vmargins hmargin..."
2,hr,https://www.livecareer.com/resume-search/r/reg...,224895221270146291614712443228903825828,REGIONAL HR MANAGER Summary ...,"<div class=""fontsize fontface vmargins hmargin..."
3,hr,https://www.livecareer.com/resume-search/r/hr-...,120168133438024183006274626332357739684,HR GENERALIST Professional Prof...,"<div class=""fontsize fontface vmargins hmargin..."
4,hr,https://www.livecareer.com/resume-search/r/hr-...,37082950160311880080159288707042371082,HR COORDINATOR Summary To o...,"<div class=""fontsize fontface vmargins hmargin..."


In [26]:
# (rows, columns) of the loaded table.
df.shape

(277, 5)

In [27]:
# Keep only the columns used downstream and put the identifier first.
# .copy() makes the result an independent frame: new columns are assigned to it
# later, and without the copy pandas treats those writes as assignments on a
# slice of the originally loaded table.
df = df[["id", "Category", "Resume", "Raw_html"]].copy()
df.head()

Unnamed: 0,id,Category,Resume,Raw_html
0,203361909970992332506290823189098544432,hr,HR EMPLOYEE SERVICE REPRESENTATIVE ...,"<div class=""fontsize fontface vmargins hmargin..."
1,176452925245576845188344162105053478091,hr,"BRIDGES OF AMERICA, HR SPECIALIST II ...","<div class=""fontsize fontface vmargins hmargin..."
2,224895221270146291614712443228903825828,hr,REGIONAL HR MANAGER Summary ...,"<div class=""fontsize fontface vmargins hmargin..."
3,120168133438024183006274626332357739684,hr,HR GENERALIST Professional Prof...,"<div class=""fontsize fontface vmargins hmargin..."
4,37082950160311880080159288707042371082,hr,HR COORDINATOR Summary To o...,"<div class=""fontsize fontface vmargins hmargin..."


In [28]:
# Load the spaCy pipeline used for all entity extraction in this notebook.
nlp = spacy.load("en_core_web_lg")
# Pattern file that is loaded into the entity ruler (path relative to the working directory).
skill_pattern_path = "jz_skill_patterns.jsonl"

In [29]:
# !python -m spacy download en_core_web_sm

In [30]:
# !python -m spacy download en_core_web_lg

In [31]:
# Attach an entity ruler to the pipeline and load the skill patterns into it.
# add_pipe() raises if a component with the same name is already registered, so
# reuse the existing ruler when this cell is executed again in the same kernel.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
# Show the component order; the ruler is appended at the end of the pipeline.
nlp.pipe_names


['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [32]:
def get_skills(text):
    """Return the text of every entity labelled ``SKILL`` found in ``text``.

    The text is processed with the module-level ``nlp`` pipeline and the
    entities in ``doc.ents`` are filtered by their label.

    Args:
        text: The text to analyse.

    Returns:
        list[str]: The matched skill strings, in the order ``doc.ents`` yields
        them. Repeated mentions are kept; use ``unique_skills`` to
        de-duplicate the result.
    """
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "SKILL"]


def unique_skills(x):
    """Return the distinct items of ``x`` as a list, in first-seen order.

    A plain ``list(set(x))`` yields an arbitrary order that can differ between
    interpreter sessions for strings; keying a dict keeps the result
    deterministic while still removing duplicates.

    Args:
        x: Iterable of hashable items (e.g. skill strings).

    Returns:
        list: Each distinct item once, ordered by first occurrence.
    """
    return list(dict.fromkeys(x))

In [33]:
# Characters/tokens replaced by a space before tokenising: @handles, any
# character that is not a letter, digit, space or tab, URLs, a leading "rt",
# and http...-up-to-a-quote fragments. Written as a raw string so the regex
# escapes (\w, \/, \S, \t) reach the regex engine unchanged, and compiled once
# because it is applied to every resume.
_noise_pattern = re.compile(
    r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"'
)
# Built once: rebuilding the stop-word set for every word dominates the runtime.
stop_words = set(stopwords.words("english"))
lm = WordNetLemmatizer()


def _clean_resume_text(text):
    """Lower-case ``text``, drop noise and stop words, and lemmatize the rest."""
    words = _noise_pattern.sub(" ", text).lower().split()
    return " ".join(lm.lemmatize(word) for word in words if word not in stop_words)


# One cleaned string per resume, in the same row order as df.
clean = [_clean_resume_text(resume_text) for resume_text in df["Resume"]]

In [34]:
# Attach the cleaned text to the frame, then extract and de-duplicate the
# skills of each resume in a single pass over the lower-cased text.
df["Clean_Resume"] = clean
df["skills"] = (
    df["Clean_Resume"]
    .str.lower()
    .apply(lambda resume_text: unique_skills(get_skills(resume_text)))
)
df.head()

Unnamed: 0,id,Category,Resume,Raw_html,Clean_Resume,skills
0,203361909970992332506290823189098544432,hr,HR EMPLOYEE SERVICE REPRESENTATIVE ...,"<div class=""fontsize fontface vmargins hmargin...",hr employee service representative summary exc...,"[specification, material, security, relay]"
1,176452925245576845188344162105053478091,hr,"BRIDGES OF AMERICA, HR SPECIALIST II ...","<div class=""fontsize fontface vmargins hmargin...",bridge america hr specialist ii hr benefit spe...,"[sorting, schedule, support]"
2,224895221270146291614712443228903825828,hr,REGIONAL HR MANAGER Summary ...,"<div class=""fontsize fontface vmargins hmargin...",regional hr manager summary holistic hr profes...,"[business, electrical engineering]"
3,120168133438024183006274626332357739684,hr,HR GENERALIST Professional Prof...,"<div class=""fontsize fontface vmargins hmargin...",hr generalist professional profile responsible...,"[business administration, security, workflow, ..."
4,37082950160311880080159288707042371082,hr,HR COORDINATOR Summary To o...,"<div class=""fontsize fontface vmargins hmargin...",hr coordinator summary obtain human resource g...,"[business administration, certificate, monitor..."


In [35]:
# Show the last rows of the frame, including the derived columns.
df.tail()

Unnamed: 0,id,Category,Resume,Raw_html,Clean_Resume,skills
272,326696588254006082178195394864812018153,cloud architect,AWS SOLUTIONS ARCHITECT/SECURITY SPEC...,"<div class=""MPR skn-smp1 fontsize fontface vma...",aws solution architect security specialist pro...,"[python, box, certificate, monitoring, operati..."
273,323758996602989822681552635853293565365,cloud architect,PRINCIPAL SOFTWARE ARCHITECT ...,"<div class=""fontsize fontface vmargins hmargin...",principal software architect professional summ...,"[azure, release, certificate, software, code c..."
274,56625186222409135371580665289356755558,cloud architect,"DIRECTOR SOFTWARE APPLICATION, CLOUD ...","<div class=""fontsize fontface vmargins hmargin...",director software application cloud architectu...,"[release, python, computer science, monitoring..."
275,21583661162788348076515365723517256627,cloud architect,CLOUD ENGINEER Professional S...,"<div class=""MPR skn-cbg2 fontsize fontface vma...",cloud engineer professional summary experience...,"[release, computer science, python, certificat..."
276,112950596602995969298265417192508380139,cloud architect,"DIRECTOR OF PRODUCT MANAGEMENT, DATA ...","<div class=""fontsize fontface vmargins hmargin...",director product management data analytics pro...,"[python, computer science, data visualization,..."


In [36]:
# Spot-check five random rows; the fixed seed makes the same rows appear on
# every run so the displayed sample is reproducible.
df.sample(5, random_state=42)

Unnamed: 0,id,Category,Resume,Raw_html,Clean_Resume,skills
173,241126540342435531536621380877908399896,engineering,ENGINEERING LAB TECHNICIAN Summ...,"<div class=""fontsize fontface vmargins hmargin...",engineering lab technician summary obtain posi...,"[release, visual studio, certificate, linux, s..."
81,54012533145753307756024505833530805832,agriculture,FEMA VACCINE DISTRIBUTION Wor...,"<div class=""MPR skn-cbg1 fontsize fontface vma...",fema vaccine distribution work history fema va...,"[schedule, software, data analysis, marketing]"
17,320824777988444720890104466482797072130,designer,COSTUME DESIGNER Expe...,"<div class=""fontsize fontface vmargins hmargin...",costume designer experience october 2015 decem...,[design]
68,293265751432027233478933846774234798433,healthcare,HOME HEALTHCARE TECHNICIAN Summ...,"<div class=""fontsize fontface vmargins hmargin...",home healthcare technician summary certified n...,[]
228,308182583074127215470328780307112076953,aviation,AVIATION PARTS SALESMAN Caree...,"<div class=""fontsize fontface vmargins hmargin...",aviation part salesman career focus earned pri...,"[material, box, software, aeronautics, testing..."


In [37]:
# Bar chart of how many resumes fall into each category, most frequent first.
fig = px.histogram(data_frame=df, x="Category", title="Distribution of Jobs Categories")
fig.update_xaxes(categoryorder="total descending")
fig.show()

In [40]:
# Run the pipeline on one raw (uncleaned) resume and highlight its entities inline.
sent = nlp(df["Resume"].iloc[3])
displacy.render(sent, style="ent", jupyter=True)

In [41]:
# Dependency-parse view of the first ten tokens of the document parsed above;
# "distance" controls the horizontal spacing between words in the rendering.
displacy.render(sent[0:10], style="dep", jupyter=True, options={"distance": 90})

In [42]:
# Register every job category as a phrase pattern so category names found in a
# resume are tagged with the "Job-Category" label.
# Missing categories are dropped (NaN is not a valid pattern), and all patterns
# are handed to the ruler in one call instead of one call per category.
patterns = df["Category"].dropna().unique()
ruler.add_patterns(
    [{"label": "Job-Category", "pattern": category} for category in patterns]
)


In [43]:
# Highlight colour per entity label. The key order is also the order of the
# labels passed to displaCy below.
colors = dict(
    [
        ("Job-Category", "linear-gradient(90deg, #aa9cfc, #fc9ce7)"),
        ("SKILL", "linear-gradient(90deg, #9BE15D, #00E3AE)"),
        ("ORG", "#ffd966"),
        ("PERSON", "#e06666"),
        ("GPE", "#9fc5e8"),
        ("DATE", "#c27ba0"),
        ("ORDINAL", "#674ea7"),
        ("PRODUCT", "#f9cb9c"),
    ]
)
# Render only the labels that have a colour assigned.
options = {"ents": list(colors), "colors": colors}

# Annotate one raw resume and display it with the custom labels and colours.
sent = nlp(df["Resume"].iloc[5])
displacy.render(sent, jupyter=True, options=options, style="ent")

In [44]:
# Resume text to score, typed or pasted by the user. input() blocks until text
# is entered, so this cell cannot run unattended.
input_resume = input("Enter Resume Text")

Enter Resume TextAbid Ali Awan Data Scientist I am a certified data scientist professional, who loves building machine learning models and blogs about the latest AI technologies. I am currently testing AI Products at PEC-PITC, which later gets approved for human trials. abidaliawan@tutamail.com +923456855126 Islamabad, Pakistan abidaliawan.me WORK EXPERIENCE Data Scientist Pakistan Innovation and Testing Center - PEC 04/2021 - Present, Islamabad, Pakistan Redesigned data of engineers that were mostly scattered and unavailable. Designed dashboard and data analysis report to help higher management make better decisions. Accessibility of key information has created a new culture of making data-driven decisions. Contact: Ali Raza Asif - darkslayerraza10@gmail.com Data Scientist Freelancing/Kaggle 11/2020 - Present, Islamabad, Pakistan Engineered a healthcare system. Used machine learning to detect some of the common decisions. The project has paved the way for others to use new techniques 

In [45]:
# Annotate the entered resume and render its entities using the label/colour
# options defined in the earlier visualisation cell.
sent2 = nlp(input_resume)
displacy.render(sent2, style="ent", jupyter=True, options=options)

In [46]:
# Required skills entered by the user as one comma-separated string.
input_skills = input("Enter skills: ")

Enter skills: Data Science,Data Analysis,Database,SQL,Machine Learning,tableau


In [47]:
# Normalise the requested skills: lower-case, trim the whitespace around each
# comma-separated entry (so "sql, python" matches "python") and drop empty
# entries such as the one produced by a trailing comma.
req_skills = [
    skill.strip() for skill in input_skills.lower().split(",") if skill.strip()
]
# Skills detected in the entered resume, de-duplicated.
resume_skills = unique_skills(get_skills(input_resume.lower()))

# Count how many requested skills were detected; a set makes each lookup O(1).
found_skills = set(resume_skills)
score = sum(1 for skill in req_skills if skill in found_skills)
req_skills_len = len(req_skills)
# Percentage of requested skills present; 0.0 when no skill was requested,
# which avoids dividing by zero.
match = round(score / req_skills_len * 100, 1) if req_skills_len else 0.0

print(f"The current Resume is {match}% matched to your requirements")

The current Resume is 66.7% matched to your requirements
