In [1]:

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import nltk
import re

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cocsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from pyresparser import ResumeParser
import docx2txt



# Main Code

In [4]:
ui_resume = "../resumes/UIUX_Resume1.pdf"
ds_resume = "../resumes/data-scientist-1559725114.pdf"

In [5]:
ui_job_d = docx2txt.process("./job_description_ui.docx")
ds_job_d = docx2txt.process("./job_description_data_science.docx")

In [6]:
from pyresparser import ResumeParser
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import TextConverter
import io

In [7]:
def pdf_reader(file):
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(file, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
            print(page)
        text = fake_file_handle.getvalue()

    # close open handles
    converter.close()
    fake_file_handle.close()
    return text

In [8]:
def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

## For UI Developer

In [9]:
# Reading resume
resume_data_ui = pdf_reader(ui_resume)
resume_data_ui = cleanResume(resume_data_ui)

<PDFPage: Resources={'ColorSpace': {'CS0': <PDFObjRef:11>}, 'ExtGState': {'GS0': <PDFObjRef:24>, 'GS1': <PDFObjRef:25>}, 'Font': {'C2_0': <PDFObjRef:14>, 'T1_0': <PDFObjRef:15>, 'TT0': <PDFObjRef:16>}, 'ProcSet': [/'PDF', /'Text']}, MediaBox=[0.0, 0.0, 612.0, 792.0]>


In [10]:
print(resume_data_ui)

art director graphic designer experience brokaw art director 1 18 present Collaborate to develop unexpected strategy based creative campaigns from start to finish A ountable for a variety of print digital collateral and social content Direct photo and video shoots to support campaigns Drink a lot of the world s worst coffee and Great Lakes beer contact hello 216 212 2327 behance net allisonbeer education marcus thomas art direction intern 9 17 12 17 Created print and digital assets to support existing campaigns Designed app identity for one of the agency s largest clients Assisted on photo and video shoots in in house production studio Pet many dogs miami university 2017 B F A in Graphic Design Minors in Interactive Media and Art History Honors Summa Cum Laude Stamps Leadership Scholar President s List punk talks web designer 5 17 3 19 Maintained punktalks org Worked with developer to design and launch custom Wordpress site Created graphics for social and print distribution Advocated t

In [11]:
text_ui = [resume_data_ui, ui_job_d]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
cv = CountVectorizer()
count_matrix_ui = cv.fit_transform(text_ui)

#Print the similarity scores
print("\nSimilarity Scores:")
print(cosine_similarity(count_matrix_ui))


Similarity Scores:
[[1.         0.57137089]
 [0.57137089 1.        ]]


In [14]:
matchPercentage_ui = cosine_similarity(count_matrix_ui)[0][1] * 100
matchPercentage_ui = round(matchPercentage_ui, 2) # round to two decimal
print("Your resume matches about "+ str(matchPercentage_ui)+ "% of the job description.")

Your resume matches about 57.14% of the job description.


## For Data scientist

In [15]:
resume_data_ds = pdf_reader(ds_resume)
resume_data_ds = cleanResume(resume_data_ds)
resume_data_ds

<PDFPage: Resources={'Font': <PDFObjRef:40>, 'ProcSet': [/'PDF', /'Text']}, MediaBox=[0, 0, 612, 792]>
<PDFPage: Resources={'Font': <PDFObjRef:40>, 'ProcSet': [/'PDF', /'Text']}, MediaBox=[0, 0, 612, 792]>


'Data Scientist ROBE SMITH Phone 123 456 78 99 Email info Website www qwikresume com LinkedIn linkedin com qwikresume Address 1737 Marshville Road Alabama Objective Data Scientist with PhD in Physics and 1 industrial experience Two years of working experience in Data Analysis team of LIGO Scientific Collaboration 3M Special Breakthrough Prize winner of 2016 Over ten years of su essful research experience in both theoretical and computational physics Strong problem solving and analytical skills Advanced programming proficiency Certified in Data Analysis and Machine Learning Skills Data Mining Data Analysis Machine Learning Python R MATLAB Sphinx LaTeX Mathematica Maple GIT CVS HTCondor Work Experience Data Scientist ABC Corporation May 1994 May 2005 Assisted in determining client needs deliverable design estimates and feasibility for analytical projects concerning a custom study for a manufacturer who is using the results to support a litigation claim Served as an internal resource for 

In [16]:
text_ds = [resume_data_ds, ds_job_d]

In [17]:
cv = CountVectorizer()
count_matrix_ds = cv.fit_transform(text_ds)

#Print the similarity scores
print("\nSimilarity Scores:")
print(cosine_similarity(count_matrix_ds))


Similarity Scores:
[[1.         0.67008087]
 [0.67008087 1.        ]]


In [18]:
matchPercentage_ds = cosine_similarity(count_matrix_ds)[0][1] * 100
matchPercentage_ds = round(matchPercentage_ds, 2) # round to two decimal
print("Your resume matches about "+ str(matchPercentage_ds)+ "% of the job description.")

Your resume matches about 67.01% of the job description.


## UI Developer applies to Data Scientist Role

In [37]:
data = ResumeParser(ui_resume).get_extracted_data()
string = ""

string += data['name'] + " "
string += " ".join(data['skills'])
string += " ".join(data['experience'])

print(string)

resume_data_ud = cleanResume(string)
print(resume_data_ud)


# skills = []
# skills.append(' '.join(word for word in resume))
# org_name_clean = skills

# resume_data_ui = pdf_reader(ui_resume)
# resume_data_ui = cleanResume(resume_data_ui)



art director Photoshop Design Brand Website Prototyping Video Adobe Html Wordpress Cloud Illustrator Css Graphic design Distribution Indesign Strategy Content Interactivebrokaw  •  art director  •  1/18–present Collaborate to develop unexpected, strategy-based creative campaigns from start to finish. Accountable for a variety of print & digital collateral and social content. Direct photo and video shoots to support campaigns. Drink a lot of the world’s worst coffee and Great Lakes beer. contact hello@allisonbeer.com 216.212.2327 behance.net/allisonbeer
art director Photoshop Design Brand Website Prototyping Video Adobe Html Wordpress Cloud Illustrator Css Graphic design Distribution Indesign Strategy Content Interactivebrokaw art director 1 18 present Collaborate to develop unexpected strategy based creative campaigns from start to finish A ountable for a variety of print digital collateral and social content Direct photo and video shoots to support campaigns Drink a lot of the world s

In [41]:
text_ud = [resume_data_ud, ds_job_d]

In [42]:
# Convert a collection of text documents to a matrix of token counts.
cv = CountVectorizer()
# Learn the vocabulary dictionary and return document-term matrix.
# This is equivalent to fit followed by transform, but more efficiently implemented.
count_matrix_ud = cv.fit_transform(text_ud)

#Print the similarity scores
print("\nSimilarity Scores:")
print(cosine_similarity(count_matrix_ud))


Similarity Scores:
[[1.        0.2706388]
 [0.2706388 1.       ]]


In [43]:
matchPercentage_ud = cosine_similarity(count_matrix_ud)[0][1] * 100
matchPercentage_ud = round(matchPercentage_ud, 2) # round to two decimal
print("Your resume matches about "+ str(matchPercentage_ud)+ "% of the job description.")

Your resume matches about 27.06% of the job description.
