## Job Description to Resume Comparator - FreqDist

This program compares the words found in a job description to the words in a resume. The current version compares all words and gives a naive percentage match.

In [2]:
from nltk import sent_tokenize, word_tokenize, pos_tag

from nltk.corpus import stopwords
import pandas as pd
from nltk import FreqDist
import codecs
from nltk.corpus import stopwords

# NLTK's default english stopwords
default_stopwords = stopwords.words('english')

#File Locations

document_folder = '../data/'
resume_file = document_folder + 'resume.txt'
job_description_file = document_folder + 'job_description.txt'
custom_stopwords_file = document_folder + 'custom_stopwords.txt'

custom_stopwords = codecs.open(custom_stopwords_file, 'r', 'utf-8').read().splitlines()
all_stopwords = set(default_stopwords + custom_stopwords)

def process_text(text,stopwords):
    tokens = word_tokenize(text)
    words = [t for t in tokens if t.isalpha()]
    words = [w for w in words if len(w)>1]
    words = [w for w in words if not w.isnumeric()]
    words = [w for w in words if w not in all_stopwords]
    words = [w.lower() for w in words]
    return FreqDist(words)


f_resume=open(resume_file,'r',)
f_desc = open(job_description_file,'r')

raw_resume =f_resume.read()
raw_desc = f_desc.read()

resume_words = process_text(raw_resume,all_stopwords)
job_words = process_text(raw_desc,all_stopwords)

df_desc = pd.DataFrame.from_dict(job_words,orient='index')
df_desc.columns = ['Frequency']
df_desc.index.name = 'Term'


df_resume = pd.DataFrame.from_dict(resume_words, orient='index')
df_resume.columns = ['Frequency']
df_resume.index.name = 'Term'


df = pd.merge(df_desc,df_resume,how='left',left_index=True,right_index=True).fillna(0)

df_matches = pd.merge(df_desc,df_resume,how='inner',left_index=True,right_index=True)
df.sort_values(by='Frequency_x',ascending=False,inplace=True)
# df.sort_values(by='Frequency_y',inplace=True,na_position='first')

# df_missing = df[df['Frequency_y']==0]
df_missing = df[df['Frequency_y']==0]
df_missing.columns = ['In Job Description','In Resume']


print ('You resume matches at ',"{0:.0%}".format(df_matches.size/df_desc.size))

import pandasql as ps

q1 = """select * from 
        (SELECT df_desc.Term,df_desc.Frequency,df_resume.Frequency
        from df_desc
        left join df_resume on (lower(df_desc.Term) = lower(df_resume.Term)
        and df_resume.Term is null)
        order by 2 desc
        )
        """

print(ps.sqldf(q1, locals()))

You resume matches at  59%
              Term  Frequency Frequency:1
0             data         21        None
1            alexa         12        None
2         services          5        None
3         learning          4        None
4         business          4        None
5       delivering          3        None
6              sql          3        None
7           python          3        None
8          analyst          2        None
9        analytics          2        None
10        customer          2        None
11        insights          2        None
12            team          2        None
13            echo          2        None
14            deep          2        None
15          engine          2        None
16   communication          2        None
17           large          2        None
18             the          2        None
19         machine          2        None
20      scientists          2        None
21      processing          2        None
22     

In [3]:
resume_set = set(resume_words)

## Next Steps: Improve Comparisons

1. Exclude low information parts of speach like prepositions, conjunctions.
2. Develop a list of skills.
3. Break comparisons by parts of speech. (Nouns, verbs, adjectives).
4. Look for key bigrams.
5. Enumerate and compare sentence subjects



## Next Steps: File Import of different formats