In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re
import spacy
import csv

In [2]:
# old database
# filename = "./UpdatedResumeDataSet.csv"
# df = pd.read_csv(filename)

In [3]:
def bs(target, arr):
    """Binary-search ``arr`` for ``target``; used by remNonSkills.

    ``arr`` must already be sorted in ascending order. On unsorted input the
    halving steps can discard the half that actually holds ``target``, so a
    present element may be reported as missing.

    Returns True when some element equals ``target``, otherwise False
    (including for an empty ``arr``).
    """
    left, right = 0, len(arr) - 1
    while left <= right:
        middle = (left + right) // 2
        candidate = arr[middle]
        if candidate == target:
            return True
        if target < candidate:
            # target can only be in the lower half
            right = middle - 1
        else:
            # target can only be in the upper half
            left = middle + 1
    return False

def remStopWords(docParam):
    """Return a new, lower-cased doc built from ``docParam`` without stop words.

    The text of ``docParam`` is split on whitespace and every piece whose
    lower-cased form is in the pipeline's default stop-word set is dropped.
    Because the split is on whitespace only, punctuation attached to a word
    stays part of that piece. The surviving pieces are joined with single
    spaces, lower-cased, and parsed again with the module-level ``nlp``.
    """
    stop_set = nlp.Defaults.stop_words
    kept = [piece for piece in docParam.text.split() if piece.lower() not in stop_set]
    return nlp(' '.join(kept).lower())

def chunkSplit(doc):
    """Split a parsed resume into noun-phrase pieces and head words.

    Parameters
    ----------
    doc
        Parsed document exposing ``noun_chunks``; every chunk must provide
        ``text`` and ``root.head.text``.

    Returns
    -------
    clean_chunks : list of str
        The distinct noun-chunk texts, in first-seen order, each further
        split on ", " and " － ". Separators and empty pieces are not
        included.
    individual_words : set of str
        The ``root.head`` text of every noun chunk.
    """
    # dict keys de-duplicate while keeping first-seen order; a set of strings
    # iterates in an order that can differ from one interpreter run to the next.
    chunk_texts = {}
    individual_words = set()
    for chunk in doc.noun_chunks:
        chunk_texts[chunk.text] = None
        individual_words.add(chunk.root.head.text)

    # Non-capturing group: with a capturing group, re.split also returns the
    # separators themselves as items of the result.
    separator = re.compile(r"(?:, | － )")
    clean_chunks = []
    for text in chunk_texts:
        clean_chunks.extend(piece for piece in separator.split(text) if piece)
    return clean_chunks, individual_words

def remNonSkills(df, docParam, chunksParam, individual_words, i, skills):
    """Write the skill terms of one resume back into ``df``.

    Parameters
    ----------
    df
        DataFrame whose ``"Resume_str"`` cell at row label ``i`` is overwritten.
    docParam
        Unused; kept so existing callers keep working.
    chunksParam
        List of candidate phrases. It is read but not modified.
    individual_words
        Extra candidate words, considered after ``chunksParam``.
    i
        Row label passed to ``df.loc``.
    skills
        Known skills. They are looked up with ``bs``, a binary search, so
        this sequence must be sorted in ascending order.

    Returns
    -------
    The same ``df`` object, with ``df.loc[i, "Resume_str"]`` set to the
    retained candidates joined by single spaces.
    """
    # Build a fresh list instead of extending chunksParam, so the caller's
    # list is left as it was passed in.
    candidates = list(chunksParam) + list(individual_words)

    # One pass: keep exactly the candidates that the skills search finds.
    kept = [word for word in candidates if bs(word, skills)]

    # write new resume to dataframe
    df.loc[i, "Resume_str"] = ' '.join(kept)
    return df

def cleanResume(df, i, skills):
    """Reduce the resume at ``df.loc[i, "Resume_str"]`` to known skills, in place.

    Steps: parse the stored text with the module-level ``nlp``, drop stop
    words (remStopWords), collect noun-chunk pieces and head words
    (chunkSplit), then keep only the terms found in ``skills`` and write
    them back to the same cell (remNonSkills). Returns None.
    """
    parsed = remStopWords(nlp(df.loc[i, "Resume_str"]))
    noun_pieces, head_words = chunkSplit(parsed)
    remNonSkills(df, parsed, noun_pieces, head_words, i, skills)

In [4]:
# Load the spaCy pipeline once; remStopWords and cleanResume read this
# module-level `nlp`, so this cell must run before any resume is cleaned.
nlp = spacy.load('en_core_web_md')

In [5]:
# Load the resume dataset. Its "Resume_str" column holds the resume text
# and is overwritten in place by the cleaning loop further down.
df = pd.read_csv("./resume.csv")

In [6]:
# Preview the resume text before cleaning.
df['Resume_str']

0                HR ADMINISTRATOR/MARKETING ASSOCIATE\...
1                HR SPECIALIST, US HR OPERATIONS      ...
2                HR DIRECTOR       Summary      Over 2...
3                HR SPECIALIST       Summary    Dedica...
4                HR MANAGER         Skill Highlights  ...
                              ...                        
2479             RANK: SGT/E-5 NON- COMMISSIONED OFFIC...
2480             GOVERNMENT RELATIONS, COMMUNICATIONS ...
2481             GEEK SQUAD AGENT         Professional...
2482             PROGRAM DIRECTOR / OFFICE MANAGER    ...
2483             STOREKEEPER II       Professional Sum...
Name: Resume_str, Length: 2484, dtype: object

In [7]:
filename = "./linkedinskill"
# Text mode decodes UTF-8, and the `with` block closes the file even if
# reading fails part-way.
with open(filename, encoding="utf-8") as f:
    # Strip only the line terminator (LF or CRLF) so no real character of a
    # skill name is cut off, and skip blank lines.
    skills = [line.rstrip("\r\n").lower() for line in f if line.strip()]
# bs() is a binary search and is only correct on ascending input; lower-casing
# can disturb whatever order the file had, so sort explicitly.
skills.sort()
# skills

In [8]:
# df.loc[0, "Resume_str"]

In [9]:
from IPython.display import clear_output
# Clean every resume in place. `i` is passed to df.loc as a row label inside
# cleanResume, so this relies on df having the default 0..n-1 index.
# NOTE(review): cleanResume overwrites "Resume_str", so re-running this cell
# cleans the already-cleaned text; reload df before running it again.
for i in range(len(df)):
    cleanResume(df,i, skills)
    # Replace the previous counter instead of printing one line per row.
    clear_output(wait=True)
    print(i)

2483


In [13]:
# Inspect the column after cleaning: each entry is now the retained terms
# joined by single spaces.
df["Resume_str"]

0       paperwork documentation employee relations gov...
1       graphics lotus notes microsoft office presenta...
2       workers compensation organizational management...
3       microsoft outlook excel filing filing routing ...
4       strategy art benefits administration staff dev...
                              ...                        
2479    hazardous materials federal regulations drawin...
2480    graphics airports word program management visi...
2481    multiple disciplines shipping specifications u...
2482    word outlook forms fundraising travel arrangem...
2483    purchase orders fleet services autocad dispatc...
Name: Resume_str, Length: 2484, dtype: object

In [12]:
from time import time

# Time the bag-of-words encoding of the cleaned resumes.
t0 = time()

content = df["Resume_str"]

# Learn the vocabulary and build the document-term count matrix in one pass.
# fit() followed by transform() yields the same matrix but processes every
# document twice.
vectorizer = CountVectorizer(min_df=1)
vector = vectorizer.fit_transform(content)
duration = time() - t0

print("Vocabulary: ", len(vectorizer.vocabulary_))
print(f"done in {duration:.3f} s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")
print("Encoded Document is:")
print(vector)
# NOTE(review): the last expression displays the full term -> column-index
# mapping; consider previewing only a slice to keep the output short.
vectorizer.vocabulary_


Vocabulary:  3708
done in 0.151 s
Found 3708 unique terms
Encoded Document is:
  (0, 135)	1
  (0, 820)	1
  (0, 931)	1
  (0, 1278)	1
  (0, 1344)	1
  (0, 1383)	1
  (0, 1624)	1
  (0, 1724)	1
  (0, 1751)	1
  (0, 1759)	1
  (0, 1998)	1
  (0, 2125)	1
  (0, 2166)	1
  (0, 2187)	2
  (0, 2376)	1
  (0, 2380)	2
  (0, 2407)	1
  (0, 2481)	1
  (0, 2497)	1
  (0, 2584)	1
  (0, 2625)	3
  (0, 2661)	1
  (0, 2715)	3
  (0, 2972)	1
  (0, 3100)	3
  :	:
  (2483, 267)	1
  (2483, 624)	1
  (2483, 792)	2
  (2483, 1026)	1
  (2483, 1095)	1
  (2483, 1138)	1
  (2483, 1180)	1
  (2483, 1819)	1
  (2483, 1998)	1
  (2483, 2044)	1
  (2483, 2097)	1
  (2483, 2181)	1
  (2483, 2228)	1
  (2483, 2356)	1
  (2483, 2376)	1
  (2483, 2495)	1
  (2483, 2497)	2
  (2483, 2670)	1
  (2483, 2805)	1
  (2483, 2890)	1
  (2483, 2922)	1
  (2483, 3070)	1
  (2483, 3269)	1
  (2483, 3630)	1
  (2483, 3648)	3


{'paperwork': 2125,
 'documentation': 820,
 'employee': 931,
 'relations': 2625,
 'government': 1278,
 'layout': 1624,
 'office': 1998,
 'payroll': 2166,
 'performance': 2187,
 'reviews': 2715,
 'presentations': 2376,
 'public': 2481,
 'purchasing': 2497,
 'reporting': 2661,
 'statistics': 3100,
 'productivity': 2407,
 'health': 1344,
 'analytical': 135,
 'skills': 2972,
 'records': 2584,
 'holidex': 1383,
 'time': 3330,
 'management': 1751,
 'loss': 1724,
 'prevention': 2380,
 'training': 3386,
 'mar': 1759,
 'graphics': 1293,
 'lotus': 1726,
 'notes': 1974,
 'microsoft': 1819,
 'publications': 2482,
 'web': 3600,
 'design': 711,
 'articles': 214,
 'asset': 236,
 'events': 1020,
 'union': 3489,
 'launches': 1617,
 'photoshop': 2237,
 'workers': 3652,
 'compensation': 549,
 'organizational': 2050,
 'staff': 3076,
 'databases': 655,
 'quattro': 2523,
 'pro': 2397,
 'strategic': 3131,
 'planning': 2276,
 'publisher': 2484,
 'regulatory': 2616,
 'compliance': 552,
 'harvard': 1334,
 'exce