In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re
import spacy
import csv
from time import time
from IPython.display import clear_output

In [16]:
# define binary search to used in remNonSkills
def bs(target, arr):
    lo = 0
    hi = len(arr)-1
    while lo <= hi:
        mid = (lo + hi)//2
        if arr[mid] == target:
            return True
        elif target < arr[mid]:
            hi = mid-1
        else:
            lo = mid+1
    return False 

#define function to remove stop words from a type doc resume and returns a new doc
def remStopWords(docParam):
    # remove stopwords
    stopwords = nlp.Defaults.stop_words
    text = docParam.text
    lst=[]
    for token in text.split():
        if token.lower() not in stopwords:    #checking whether the word is not 
            lst.append(token)                    #present in the stopword list.
    return nlp(' '.join(lst).lower())    

# define a function that splits a type doc resume with stop words removed into base noun phrases 
def chunkSplit(doc):
    chunks = set()
    individual_words = set()
    for chunk in doc.noun_chunks:
        chunks.add(chunk.text)
        individual_words.add(chunk.root.head.text)
        
    clean_chunks = []  
    for chunk in chunks: 
        clean_chunks.extend(re.split(r"(, | － )", chunk))
    return clean_chunks, individual_words

# define a function that takes a cleaned array and removes any words that are not in the skills set
# returns a dataframe
def remNonSkills(df, docParam, chunksParam, individual_words, i, skills, column): 
    ind = 0
    removeWords = []
    chunksParam.extend(list(individual_words))
    for word in chunksParam:
        if(not bs(word, skills)):
            removeWords.append(word)

    #write new resume to dataframe
    resume = ' '.join([_word for _word in chunksParam if _word not in removeWords])
    df.loc[i, column] = resume
    return df

# Combine all cleaning functions into one function call
def cleanResume(df,i, skills, column):
    doc = nlp(df.loc[i, column])
    doc = remStopWords(doc)
    clean_chunks,individual_words=chunkSplit(doc)
    remNonSkills(df,doc,clean_chunks,individual_words,i,skills,column)

# Clean jobs data set and combine job description and required skills
def refineData(df):
    #drop 2 columns
    df = df.drop(columns = ['company_review'])
    df = df.drop(columns =['salary_offered'])
    #merge two columns
    df["Description + Skills"] = df['job_description'].astype(str) +": Skill List: "+ df['required_skills']
    return df

In [20]:
nlp = spacy.load('en_core_web_md')

In [24]:
df = pd.read_csv("./resume.csv")

In [18]:
# read linkedin keywords
filename = "./linkedinskill"
f = open(filename, 'rb')
Lines = f.readlines()
skills = []
for line in Lines:
    line = line[:-1].decode("utf-8")
    skills.append(line[:-1].lower())
# skills

In [26]:
# Clean resumes based on Linkedin keywords
t0 = time()
for i in range(len(df)):
    cleanResume(df,i, skills, 'Resume_str')
    clear_output(wait=True)
    print(f"processing resume index: {i}")
duration = time() - t0
clear_output(wait=True)
print(f"done in {duration:.3f} s")

done in 552.879 s


In [48]:
# Count Vectorize cleaned resumes
t0 = time()
content = df["Resume_str"]

# Create a Vectorizer Object
vectorizer = CountVectorizer(min_df=1)
vectorizer.fit(content)
vector = vectorizer.transform(content)
vector_features = vectorizer.get_feature_names_out()
vector_arr = vector.toarray()
count_vect_df = pd.DataFrame(vector.todense(), columns=vector_features)
# df = pd.concat([df, count_vect_df.reset_index(drop=True)])
duration = time() - t0


print("Vocabulary: ", len(vectorizer.vocabulary_))
print(f"done in {duration:.3f} s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")
print("Encoded Document is:")
print(vector)
vectorizer.vocabulary_

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [49]:
#save vocabulary to text file
import json
with open('ResumeCountVecVocab.txt', 'w') as convert_file:
     convert_file.write(json.dumps(vectorizer.vocabulary_))

AttributeError: 'CountVectorizer' object has no attribute 'vocabulary_'

In [37]:
count_vect_df

Unnamed: 0,10,1120s,12c,133,1x,2008,2012,21,25,264,...,zebra,zemax,zendesk,zenworks,zeta,zoho,zoning,zoom,zumba,zynx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2482,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
count_vect_df.describe()

Unnamed: 0,10,1120s,12c,133,1x,2008,2012,21,25,264,...,zebra,zemax,zendesk,zenworks,zeta,zoho,zoning,zoom,zumba,zynx
count,2484.0,2484.0,2484.0,2484.0,2484.0,2484.0,2484.0,2484.0,2484.0,2484.0,...,2484.0,2484.0,2484.0,2484.0,2484.0,2484.0,2484.0,2484.0,2484.0,2484.0
mean,0.004026,0.000403,0.000403,0.000403,0.001208,0.001208,0.000805,0.000805,0.000403,0.004026,...,0.000805,0.000805,0.002818,0.000805,0.000403,0.002013,0.000805,0.000403,0.002415,0.000403
std,0.089658,0.020064,0.020064,0.020064,0.044858,0.034738,0.02837,0.02837,0.020064,0.085048,...,0.040129,0.040129,0.072303,0.02837,0.020064,0.060171,0.02837,0.020064,0.063416,0.020064
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,...,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0


In [12]:
# read job listing dataset
jobs = pd.read_csv("./JobsData.csv")
jobs = refineData(jobs)

In [21]:
# clean job listing dataset based on Linkedin keywords
t0 = time()
for i in range(len(jobs)):
    cleanResume(jobs,i, skills, "Description + Skills")
    clear_output(wait=True)
    print(f"processing job posting index: {i}")
duration = time() - t0
clear_output(wait=True)
print(f"done in {duration:.3f} s")

done in 694.163 s


In [42]:
jobs["Description + Skills"][1000:2000]

1000                 
1001    design design
1002            rdbms
1003          project
1004                 
            ...      
1995                 
1996                 
1997                 
1998                 
1999                 
Name: Description + Skills, Length: 1000, dtype: object

In [44]:
# Count Vectorize cleaned jobs
t0 = time()
content = jobs["Description + Skills"]

# Create a Vectorizer Object
vectorizerJobs = CountVectorizer(min_df=1)
vectorizerJobs.fit(content)
vector = vectorizerJobs.transform(content)
vector_features = vectorizerJobs.get_feature_names_out()
vector_arr = vector.toarray()
count_vect_jobs = pd.DataFrame(vector.todense(), columns=vector_features)
duration = time() - t0

print("Vocabulary: ", len(vectorizerJobs.vocabulary_))
print(f"done in {duration:.3f} s")
print(f"Found {len(vectorizerJobs.get_feature_names_out())} unique terms")
print("Encoded Document is:")
print(vector)
vectorizerJobs.vocabulary_

Vocabulary:  1179
done in 0.229 s
Found 1179 unique terms
Encoded Document is:
  (0, 834)	1
  (8, 1147)	1
  (9, 860)	1
  (11, 1087)	1
  (14, 278)	1
  (14, 582)	2
  (18, 1087)	1
  (22, 636)	1
  (22, 950)	1
  (23, 1088)	1
  (25, 227)	1
  (26, 343)	1
  (26, 1030)	1
  (26, 1065)	1
  (30, 1087)	1
  (30, 1088)	1
  (32, 491)	1
  (33, 490)	1
  (34, 24)	1
  (34, 62)	1
  (34, 248)	1
  (36, 666)	1
  (38, 745)	1
  (38, 787)	1
  (39, 1106)	1
  :	:
  (32687, 315)	1
  (32687, 323)	2
  (32687, 874)	1
  (32691, 889)	1
  (32695, 227)	1
  (32695, 243)	1
  (32695, 248)	1
  (32695, 745)	1
  (32695, 1135)	1
  (32701, 834)	1
  (32704, 42)	1
  (32705, 845)	1
  (32707, 845)	1
  (32710, 322)	1
  (32710, 1158)	1
  (32721, 1155)	2
  (32731, 1155)	2
  (32732, 1026)	1
  (32733, 824)	1
  (32733, 964)	1
  (32734, 430)	1
  (32734, 860)	1
  (32735, 824)	1
  (32735, 860)	1
  (32737, 824)	1


{'resolve': 834,
 'windows': 1147,
 'sap': 860,
 'troubleshooting': 1087,
 'microsoft': 582,
 'dynamics': 278,
 'open': 636,
 'source': 950,
 'tuning': 1088,
 'debugging': 227,
 'fault': 343,
 'tolerant': 1065,
 'systems': 1030,
 'jenkins': 491,
 'java': 490,
 'agile': 24,
 'application': 62,
 'development': 248,
 'pages': 666,
 'product': 745,
 'quality': 787,
 'unix': 1106,
 'linux': 543,
 'hiring': 442,
 'databases': 223,
 'mongodb': 598,
 'cs': 205,
 'research': 831,
 'design': 243,
 'microservices': 581,
 'hana': 430,
 'architecture': 72,
 'technical': 1041,
 'writing': 1163,
 'leadership': 525,
 'python': 779,
 'spark': 954,
 'reporting': 824,
 'fusion': 402,
 'crm': 203,
 'technology': 1047,
 'crts': 204,
 'automation': 95,
 'spring': 966,
 'mvc': 607,
 'boot': 130,
 'looker': 555,
 'telecom': 1050,
 'electronics': 300,
 'vue': 1131,
 'government': 417,
 'proof': 757,
 'mysql': 608,
 'test': 1054,
 'cases': 146,
 'patterns': 674,
 'rdbms': 795,
 'analytical': 45,
 'skills': 930,

In [46]:
#save vocabulary to text file
import json
with open('JobsCountVecVocab.txt', 'w') as convert_file:
     convert_file.write(json.dumps(vectorizerJobs.vocabulary_))