In [1]:
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from io import StringIO

In [49]:
import pandas as pd
import numpy as np
import os
import regex as re
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import random
from random import shuffle
from collections import OrderedDict
import operator

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# import spacy
# nlp = spacy.load('en_core_web_sm')

In [5]:
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file into a single string using pdfminer.

    Parameters
    ----------
    path : str
        Filesystem path of the PDF to read (opened in binary mode).

    Returns
    -------
    str
        Everything the ``TextConverter`` wrote into its in-memory buffer
        while the pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        # `with` releases the file handle even when pdfminer raises on a
        # malformed or protected PDF.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page-number set, maxpages=0 and an empty password are the
            # values this notebook has always passed.
            # NOTE(review): presumably "all pages, no password" - confirm
            # against the pdfminer documentation.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the `finally` clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [6]:
# Build {key: extracted text} for every file found in the Resumes directory.
root = 'Resumes/'
# Skip the '.DS_Store' entry when it exists; list.remove() raised ValueError
# whenever the directory did not contain that file.
resumes = [name for name in os.listdir('Resumes') if name != '.DS_Store']
resume_dictionary = {}
for resume in resumes:
    # The key is the file name up to (not including) the first underscore.
    # NOTE: when the name has no underscore, find() returns -1 and the slice
    # drops only the last character (e.g. 'Name.pdf' -> 'Name.pd'). Later
    # cells append '.pd' to the reference name, so this is kept unchanged.
    index = resume.find('_')
    resume_dictionary[resume[:index]] = convert_pdf_to_txt(root + resume)

In [7]:
# df_resumes = pd.read_csv("Data/UpdatedResumeDataSet.csv")
# df_resumes = df_resumes.dropna()
# df_resumes = df_resumes.drop_duplicates(subset=['Resume'])
# for i in range(0, df_resumes.shape[0]):
#     resume_dictionary[df_resumes['Name'].values[i]] = df_resumes['Resume'].values[i]

#### Cleaning Resume

In [8]:
# Shared NLTK resources for the text-cleaning helpers.
default_stemmer = PorterStemmer()
# Stop list: each single lowercase ASCII letter first, then NLTK's English
# stop words (swap in any other word list here if desired).
default_stopwords = [*string.ascii_lowercase, *stopwords.words('english')]

In [9]:
def misc_cleaning(text):
    """Apply an ordered series of regex clean-ups to raw resume text.

    The steps run in this order: newline and extraction-artefact removal,
    URL and @-handle removal, character filtering, contraction expansion,
    punctuation spacing, a handful of token normalisations, and finally
    whitespace collapsing. All literal patterns are lowercase, so callers
    that want them to fire should lowercase the text first.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text with runs of whitespace collapsed to single spaces.
    """
    # --- line breaks and PDF-extraction artefacts ---------------------------
    text = re.sub('\n', ' ', text)
    text = re.sub('√¢¬Ä¬¢', ' ', text)
    text = re.sub('√¢¬ù¬ñ', ' ', text)
    text = re.sub('○␣', ' ', text)
    text = re.sub(" rt ", " ", text)

    # --- URLs and @-handles --------------------------------------------------
    # URLs must be stripped BEFORE ':', '/' and '.' are rewritten below;
    # previously this ran last and therefore could never match anything.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r"@\S+", "", text)

    text = re.sub(' y ', '', text) # gets rid of random y accent stuff scattered through the text
    text = re.sub('yyy', 'y', text)
    text = text.replace("("," ").replace(")"," ")

    # Replace every character outside the class with a space.
    # NOTE: '+-=' inside the class is a RANGE ('+' through '='), so besides
    # '+', '-' and '=' it also keeps ',', '.', '/', digits, ':', ';' and '<'.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)

    # --- contractions --------------------------------------------------------
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    # --- punctuation: drop or pad with spaces --------------------------------
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)

    # --- token normalisations ------------------------------------------------
    # Digits followed by 'k' become the digits followed by '000' (5k -> 5000).
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # NOTE(review): '\0' looks like an escape for the NUL character rather
    # than the digit zero, so this probably never matches - confirm intent.
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)

    # --- whitespace ----------------------------------------------------------
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(' +', ' ', text)
    return text

def tokenize_text(text):
    """Split ``text`` into sentences, then return all word tokens in order."""
    tokens = []
    for sentence in sent_tokenize(text):
        tokens.extend(word_tokenize(sentence))
    return tokens

def clean_text(text, remove_punctuation = False, stem_text = False, 
               remove_stopwords = False, remove_num = False):
    """Lowercase and clean ``text``, with optional extra normalisation steps.

    Parameters
    ----------
    text : str
        Raw text to clean.
    remove_punctuation : bool, optional
        Replace every character in ``string.punctuation`` with a space.
    stem_text : bool, optional
        Stem each token with ``default_stemmer``.
    remove_stopwords : bool, optional
        Drop every token found in ``default_stopwords``.
    remove_num : bool, optional
        Drop whitespace-separated tokens for which ``str.isnumeric()`` is true.

    Returns
    -------
    str
        The cleaned text, padded with one leading and one trailing space.
    """
    # Pad so the space-delimited patterns in misc_cleaning can also match the
    # first and last word.
    text = " " + text + " "
    text = text.lower()
    text = misc_cleaning(text)

    if remove_punctuation:
        text = "".join(" " if ch in string.punctuation else ch for ch in text).strip()

    if stem_text:
        text = " ".join(default_stemmer.stem(token) for token in tokenize_text(text))

    # Removes stop words such as "a", "the", etc.
    if remove_stopwords:
        # A set gives constant-time lookups; scanning the list once per token
        # was needlessly slow on long documents.
        stop_words = set(default_stopwords)
        text = " ".join(w for w in tokenize_text(text) if w not in stop_words)

    # Removes tokens made up entirely of numeric characters.
    if remove_num:
        text = " ".join(token for token in text.split() if not token.isnumeric())

    return " " + text + " "


def _reduce_redundancy(text):
    """
    Takes in text that has been cleaned by the _base_clean and uses set to reduce the repeating words
    giving only a single word that is needed.
    """
    words = text.split(' ')
    return " ".join(list(set(words)))


def _get_target_words(text):
    """
    Takes in text and uses Spacy Tags on it, to extract the relevant Noun, Proper Noun words that contain words related to tech and JD. 
    """
    target = []
    doc = nlp(text)
    for token in doc:
        if token.tag_ in ['NN', 'NNP']:
            target.append(token.text)
    return " ".join(target)

In [10]:
# Replace each raw resume with its cleaned, de-duplicated form. Only the
# values of existing keys are reassigned, so the dictionary size never
# changes while it is being walked.
for person, resume in list(resume_dictionary.items()):
    clean_resume = clean_text(
        resume,
        remove_punctuation=True,
        stem_text=False,
        remove_stopwords=True,
        remove_num=True,
    )
    resume_dictionary[person] = _reduce_redundancy(clean_resume)

#### Similarity Score Utility Functions

In [40]:
def calculate_jaccard(word_tokens1, word_tokens2):
    """Compute the Jaccard similarity of two token collections.

    The score is ``|A & B| / |A | B|`` over the distinct tokens of each input.

    Parameters
    ----------
    word_tokens1, word_tokens2 : iterable of hashable
        Token collections to compare; duplicates are ignored.

    Returns
    -------
    float
        Similarity between 0 and 1. Two empty inputs give 0.0 rather than
        raising ZeroDivisionError.
    """
    tokens1, tokens2 = set(word_tokens1), set(word_tokens2)
    union = tokens1 | tokens2
    if not union:
        # Nothing to compare: define the similarity as 0 instead of dividing by 0.
        return 0.0
    # Set intersection replaces the per-token scan of the second list.
    return len(tokens1 & tokens2) / len(union)


def jaccard_similarity(resume_dictionary):
    """Score every resume against the first one using Jaccard similarity.

    Parameters
    ----------
    resume_dictionary : dict
        Maps a name to that person's resume text. The FIRST entry is the
        reference resume; all later entries are compared against it.

    Returns
    -------
    tuple
        ``(score_dictionary, highest_individual)``: a dict mapping each
        non-reference name to its score, and the name with the highest score
        (``None`` if no score is above 0 or there is nothing to compare).
    """
    # Build the key list once; it used to be rebuilt on every loop iteration.
    names = list(resume_dictionary.keys())
    if not names:
        # Empty input used to fail with IndexError on the reference lookup.
        return {}, None

    base_tokens = resume_dictionary[names[0]].split(' ')
    score_dictionary = {}
    highest_individual = None
    highest_score = 0
    for name in names[1:]:
        score = calculate_jaccard(base_tokens, resume_dictionary[name].split(' '))
        score_dictionary[name] = score
        # Strict '>' keeps the earliest name on ties.
        if score > highest_score:
            highest_score = score
            highest_individual = name
    return score_dictionary, highest_individual


def process_tfidf_similarity(resume_dictionary):
    """Score every resume against the first one using TF-IDF cosine similarity.

    Parameters
    ----------
    resume_dictionary : dict
        Maps a name to that person's resume text. The FIRST entry is the
        reference resume; all later entries are compared against it.

    Returns
    -------
    tuple
        ``(score_dictionary, highest_individual, embeddings, vectorizer)``:
        a dict mapping each non-reference name to its cosine similarity, the
        name with the highest score (``None`` if no score is above 0), the
        TF-IDF matrix for all documents (row 0 is the reference), and the
        fitted ``TfidfVectorizer``.
    """
    # Build the key list once; it used to be rebuilt on every loop iteration.
    names = list(resume_dictionary.keys())
    documents = [resume_dictionary[name] for name in names]

    vectorizer = TfidfVectorizer()
    # All documents are fitted together so they share one vocabulary.
    embeddings = vectorizer.fit_transform(documents)

    score_dictionary = {}
    highest_individual = None
    highest_score = 0
    # With only the reference resume there is nothing to compare against;
    # skip the similarity call, which rejects an empty second matrix.
    if len(names) > 1:
        cosine_similarities = cosine_similarity(embeddings[0:1], embeddings[1:]).flatten()
        for name, score in zip(names[1:], cosine_similarities):
            score_dictionary[name] = score
            # Strict '>' keeps the earliest name on ties.
            if score > highest_score:
                highest_score = score
                highest_individual = name
    return score_dictionary, highest_individual, embeddings, vectorizer

In [41]:
# Move the reference person's resume to the front of the dictionary, since
# the similarity functions treat the first entry as the one to compare against.
individual = 'SurajRajendran'
# The dictionary key carries a '.pd' suffix, so add it to the bare name.
individual = f'{individual}.pd'
other_individuals = [*resume_dictionary]
other_individuals.remove(individual)
key_order = [individual, *other_individuals]
resume_dictionary = dict((k, resume_dictionary[k]) for k in key_order)

In [42]:
# Jaccard matching: score every other resume against the first (reference) one
jaccard_score_dictionary, jacc_matched_individual = jaccard_similarity(resume_dictionary)

In [43]:
# TF-IDF matching: cosine similarity of every other resume to the first
# (reference) one; the matrix and fitted vectorizer are kept for inspection
tfidf_score_dictionary, tf_matched_individual, embeddings, vectorizer = process_tfidf_similarity(resume_dictionary)

In [50]:
# Map each vocabulary term to its TF-IDF weight in the reference resume
# (row 0 of the matrix), ordered from highest to lowest weight.
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Densify only the reference row instead of the whole document-term matrix.
reference_weights = embeddings[0].toarray().ravel()
word_importances = dict(zip(feature_names, reference_weights))
word_importances = dict(sorted(word_importances.items(), key=operator.itemgetter(1), reverse=True))

In [51]:
# Show the term -> weight mapping for the reference resume, highest weight first
word_importances

{'2nd': 0.05097974530770921,
 '3dconvnet': 0.05097974530770921,
 'aami': 0.05097974530770921,
 'accolades': 0.05097974530770921,
 'accounted': 0.05097974530770921,
 'aces': 0.05097974530770921,
 'actions': 0.05097974530770921,
 'activated': 0.05097974530770921,
 'acute': 0.05097974530770921,
 'adapted': 0.05097974530770921,
 'added': 0.05097974530770921,
 'adversarial': 0.05097974530770921,
 'agents': 0.05097974530770921,
 'aki': 0.05097974530770921,
 'al': 0.05097974530770921,
 'alexa': 0.05097974530770921,
 'allowing': 0.05097974530770921,
 'allows': 0.05097974530770921,
 'assistants': 0.05097974530770921,
 'attack': 0.05097974530770921,
 'attributes': 0.05097974530770921,
 'autocorrect': 0.05097974530770921,
 'aws': 0.05097974530770921,
 'bash': 0.05097974530770921,
 'batch': 0.05097974530770921,
 'biased': 0.05097974530770921,
 'care': 0.05097974530770921,
 'cci': 0.05097974530770921,
 'centers': 0.05097974530770921,
 'certain': 0.05097974530770921,
 'certifications': 0.05097974530

In [31]:
# Report the best match found by each similarity metric.
print("Matches from Similarity Metrics:")
for metric_name, matched in (("Jaccard", jacc_matched_individual),
                             ("TF-IDF", tf_matched_individual)):
    # [:-3] drops the last three characters of the key, matching the '.pd'
    # suffix appended to the reference name earlier in the notebook.
    # Either matcher yields None when no resume scored above 0; slicing None
    # would raise TypeError, so report that case explicitly.
    label = matched[:-3] if matched is not None else "none found"
    print(f"Most Similar ({metric_name}): {label}")

Matches from Similarity Metrics:
Most Similar (Jaccard): JoshParadise
Most Similar (TF-IDF): JoshParadise
