In [1]:
from docx import Document
import os

In [2]:
import requests as requests
from bs4 import BeautifulSoup

In [None]:
import pandas as pd

In [8]:
import spacy
import textacy

In [4]:
# Page handed to scrape() below, which collects every anchor on it whose href
# contains "job-description".
source_link = "https://resources.workable.com/job-descriptions/"

### Scraping resources.workable for position data

In [50]:
# Module-level accumulators filled in by scrape():
#   jobs      -- job name -> {'url': ..., 'text': ...}
#   url_links -- every matching job-description URL, in page order
jobs = {}
url_links = []

def scrape(source_link):
    """Collect job-description links from the page at ``source_link``.

    Every anchor whose ``href`` contains "job-description" and whose visible
    text does not contain "job description" is recorded in the module-level
    ``jobs`` dict and its URL is appended to ``url_links``.

    Args:
        source_link: URL of the page to fetch.

    Returns:
        None. Results are written to the ``jobs`` and ``url_links`` globals.
    """
    response = requests.get(source_link, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find_all('a'):
        href = link.get('href')

        # Anchors without an href yield None; skip them instead of letting the
        # substring test below raise TypeError.
        if not href or "job-description" not in href:
            continue
        if "job description" in link.text:
            continue

        # The last character of the link text is dropped before stripping.
        # NOTE(review): presumably a trailing separator in the page markup -- confirm.
        name = link.text[:-1].strip()
        jobs[name] = {'url': href, 'text': ""}
        url_links.append(href)
                


In [51]:
# Fill the module-level `jobs` dict and `url_links` list (performs an HTTP request).
scrape(source_link)

In [52]:
url_links[0:5]

['https://resources.workable.com/category/job-descriptions/accounting-job-descriptions/',
 'https://resources.workable.com/budget-manager-job-description',
 'https://resources.workable.com/corporate-accountant-job-description',
 'https://resources.workable.com/collection-specialist-job-description',
 'https://resources.workable.com/billing-analyst-job-description']

### Scrape the individual positions

In [140]:
def parse_job(url):
    """Download one job-description page and pull out its text sections.

    Args:
        url: Address of the job-description page to fetch.

    Returns:
        ``[text, responsibilities, requirements]`` (three strings), or ``None``
        when the page has no ``<h1>`` or fewer ``<ul>`` elements than the
        hard-coded positions below require. Callers that index the result
        therefore get a ``TypeError`` for pages that cannot be parsed.
    """
    # Positions of the interesting elements on the page.
    # NOTE(review): these offsets are hard-coded; presumably they match the
    # site's layout at the time of scraping -- confirm before re-scraping.
    resp_index, req_index = 5, 6
    body_slice = slice(4, -9)

    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, 'html.parser')

    headings = soup.find_all('h1')
    uls = soup.find_all('ul')

    # Previously the list lookups happened outside the try block, so a page
    # with too few <ul> elements raised an uncaught IndexError.
    if not headings or len(uls) <= req_index:
        return None

    paragraphs = [paragraph.text for paragraph in soup.find_all('p')]

    # The title is followed directly by the body paragraphs, one per line.
    return_text = headings[0].text + "".join(
        para + "\n" for para in paragraphs[body_slice])
    # Iterating a list element walks its direct children; each child's text
    # becomes one line.
    return_resp = "".join(child.text + "\n" for child in uls[resp_index])
    return_req = "".join(child.text + "\n" for child in uls[req_index])

    return [return_text, return_resp, return_req]


### Parse job data into dictionary

In [141]:
for job in jobs:
    # Fetch and parse each page exactly once; the three sections all come from
    # the same parse_job() result.
    parsed = parse_job(jobs[job]['url'])

    # parse_job() returns None for pages it cannot parse; leave those entries
    # as they are (only 'url' and the empty 'text' set by scrape()).
    if parsed is None:
        continue

    (jobs[job]['text'],
     jobs[job]['responsibilities'],
     jobs[job]['requirements']) = parsed

## Reload the dataframe

In [5]:
# Build one row per job: the outer dict keys become the index and the inner
# keys ('url', 'text', ...) become columns.
# Depends on `jobs` from the scraping cells above; on a fresh kernel that has
# not run them this raises NameError.
jobdf = pd.DataFrame.from_dict(jobs, orient='index')

NameError: name 'jobs' is not defined

In [145]:
# Persist the scraped table to disk; a later cell reads "jobs.csv" back in.
jobdf.to_csv("jobs.csv")

In [148]:
# Spot-check one scraped entry, one line per list item.
# NOTE(review): despite the variable name, this holds the job's 'requirements'
# text, not its 'responsibilities'.
responsibilities = jobs['iOS Developer']['requirements'].split("\n")

In [149]:
for r in responsibilities:
    print(r)

BS/MS degree in Computer Science, Engineering or a related subject
Proven working experience in software development
Working experience in iOS development
Have published one or more iOS apps in the app store
A deep familiarity with Objective-C and Cocoa Touch
Experience working with iOS frameworks such as Core Data, Core Animation, Core Graphics and Core Text
Experience with third-party libraries and APIs
Working knowledge of the general mobile landscape, architectures, trends, and emerging technologies
Solid understanding of the full mobile development life cycle



In [6]:
# Reload the scraped jobs from disk so the analysis below does not need the
# in-memory `jobs` dict.
jobsdf = pd.read_csv("jobs.csv")

In [102]:
jobsdf.head(15)

Unnamed: 0,title,responsibilities,requirements,url,text
0,.Net Developer,Participate in requirements analysis\nCollabor...,Proven experience as a .NET Developer or Appli...,https://resources.workable.com/net-developer-j...,".Net Developer job descriptionIn this role, yo..."
1,Account Coordinator,"Prepare, file and retrieve sales-related docum...",Proven work experience as an Account Coordinat...,https://resources.workable.com/account-coordin...,"Account Coordinator job descriptionUltimately,..."
2,Account Director,Plan budgets and activities for account manage...,Proven experience as an account director or si...,https://resources.workable.com/account-directo...,Account Director job descriptionAn effective A...
3,Account Executive,Create detailed business plans to facilitate t...,"Proven experience as an Account Executive, or ...",https://resources.workable.com/account-executi...,Account Executive job descriptionThe ideal can...
4,Account Manager,Serve as the lead point of contact for all cus...,"Proven work experience as an Account Manager, ...",https://resources.workable.com/account-manager...,Account Manager job descriptionWe are looking ...
5,Account Representative,Be the main point of contact of assigned custo...,Proven experience as an Account Representative...,https://resources.workable.com/account-represe...,Account Representative job descriptionWe expec...
6,Account Supervisor,Lead account executives and account representa...,Proven experience as an Account Supervisor\nSo...,https://resources.workable.com/account-supervi...,Account Supervisor job descriptionIn this role...
7,Accountant,Manage all accounting operations based on acco...,Proven working experience as a cost accountant...,https://resources.workable.com/accountant-job-...,Accountant job description
9,Accounting Clerk,Provide accounting and clerical support to the...,"Proven accounting experience, preferably as an...",https://resources.workable.com/accounting-cler...,Accounting Clerk job descriptionAccounting Cle...
10,Accounting Manager,Manage and oversee the daily operations of the...,month and end-year process\naccounts payable/r...,https://resources.workable.com/accounting-mana...,Accounting Manager job description


In [14]:
# Relabel all five columns.
# NOTE(review): labels are assigned by position, not matched by name, so this
# assumes the CSV columns already come in exactly this order -- confirm
# against jobs.csv.
jobsdf.columns = ['title', 'responsibilities', 'requirements', 'url', 'text']

In [103]:
# Drop rows with any missing value. Rebinding the name instead of mutating in
# place keeps the cell safe to re-run and avoids pandas' `inplace` pattern.
jobsdf = jobsdf.dropna()

In [104]:
def textacy_process(text):
    """Normalise ``text`` with textacy.

    The value is first coerced to ``str`` and then passed to
    ``textacy.preprocess_text`` with ``lowercase`` and ``no_punct`` enabled.
    """
    as_string = str(text)
    options = {'lowercase': True, 'no_punct': True}
    return textacy.preprocess_text(as_string, **options)

In [118]:
# Preview the preprocessed requirements.
# NOTE(review): the result is only displayed, never assigned, so `jobsdf`
# still holds the raw requirements text after this cell.
jobsdf['requirements'].apply(textacy_process)

0      proven experience as a net developer or applic...
1      proven work experience as an account coordinat...
2      proven experience as an account director or si...
3      proven experience as an account executive or i...
4      proven work experience as an account manager k...
5      proven experience as an account representative...
6      proven experience as an account supervisor\nso...
7      proven working experience as a cost accountant...
9      proven accounting experience preferably as an ...
10     month and endyear process\naccounts payablerec...
11     proven work experience as an accounting superv...
12     proven working experience as accounts payable ...
13     proven working experience as accounts receivab...
14     proven work experience as an accounts receivab...
15     proven experience as administration manager\ni...
17     proven experience as an administrative assista...
18     proven work experience as an administrative of...
19     proven experience as an 

### Start machine learning work

In [None]:
# Load spacy as NLP engine
# NOTE(review): 'en' is a shortcut-link model name; spaCy 3 removed shortcut
# links in favour of full package names (e.g. 'en_core_web_sm') -- confirm
# against the installed spaCy version.
# NOTE(review): later cells call Doc.similarity on the output of `nlp`, which
# depends on the model's word vectors -- TODO confirm which model 'en' resolves to.

nlp = spacy.load('en')

### Import a Word document from a local file path

In [119]:
# script for parsing and returning docx text

def import_document(url):
    """Return the text of a .docx document as a single string.

    Args:
        url: Passed straight to ``Document``. In this notebook it is called
            with a local file path, despite the parameter name.

    Returns:
        The text of every paragraph in document order, each followed by a
        blank line (``"\\n\\n"``). An empty document yields ``""``.
    """
    paragraphs = Document(url).paragraphs
    return "".join("{}\n\n".format(para.text) for para in paragraphs)

In [120]:
# CV to match against the scraped jobs.
# NOTE(review): absolute, user-specific path -- the notebook will not run on
# another machine without editing this.
target = "/home/chris/Downloads/chris_cv.docx"

In [121]:
# Load the CV and normalise it with the textacy_process() helper defined
# earlier, so the preprocessing options live in a single place.
cv_text = import_document(target)

proc_text = textacy_process(cv_text)

# spaCy document used as the comparison target in the matching loop.
cv = nlp(proc_text)

In [122]:
# Score every job against the CV; each entry is [similarity score, job title].
opportunity_matches = []

for _, row in jobsdf.iterrows():
    # Look the fields up by label instead of unpacking the whole row by
    # position into names that were never used.
    title = row['title']
    requirements = row['requirements']

    try:
        comparator = nlp(requirements)
    except TypeError:
        # Requirements that spaCy cannot process are skipped. Previously the
        # loop carried on and scored the title with the document left over
        # from the previous row (or failed with NameError on the first row).
        continue

    # NOTE(review): the CV text was lowercased and stripped of punctuation
    # before being parsed, whereas the requirements are parsed raw -- confirm
    # whether that asymmetry is intended.
    score = comparator.similarity(cv)
    print("{}: {}".format(title, score))
    opportunity_matches.append([score, title])

.Net Developer: 0.9333491252892031
Account Coordinator: 0.9519248977011571
Account Director: 0.9633647013840346
Account Executive: 0.9576160369087261
Account Manager: 0.957345021342607
Account Representative: 0.9518015452060837
Account Supervisor: 0.9554550488098147
Accountant: 0.9482393980756267
Accounting Clerk: 0.9493753378909255
Accounting Manager: 0.9034626708240268
Accounting Supervisor: 0.9522978430565884
Accounts Payable Clerk: 0.9561886645682836
Accounts Receivable Clerk: 0.9586737134021707
Accounts Receivable Manager: 0.9560859120961479
Administration Manager: 0.957898340727788
Administrative Assistant: 0.9587087971921768
Administrative officer: 0.9528981164908731
Administrator: 0.9546459231132632
Advertising Account Executive: 0.9580567898243872
Analytics Manager: 0.9533094416641532
Android Developer: 0.9482333319180261
Animator: 0.9418025949934352
Application Developer: 0.9544839946335405
Architect: 0.9581624382442253
Area Manager: 0.9605287035559499
Art Director: 0.9362406

IT Consultant: 0.9602419828024
IT Coordinator: 0.954613716934202
IT Director: 0.9552581017372707
IT Help Desk Technician: 0.94807935749699
IT Manager: 0.957364296920485
IT Technician: 0.9566760254310703
Illustrator: 0.9416315936430838
Inside Sales Manager: 0.9599136759151234
Inside Sales Representative: 0.9459519580345066
Instructional Designer: 0.944500250510022
Insurance Agent: 0.9380828386744123
Insurance Broker: 0.9373141835827599
Insurance Sales Representative: 0.9487058677262236
Insurance Underwriter: 0.9562586286703264
Interior Designer: 0.9365462167328553
Internal Auditor: 0.9544881522237462
Inventory Manager: 0.9592242471587757
Investment Analyst: 0.9411867591448416
Investment Banker: 0.9572533326061587
Janitor: 0.9222165174886295
Java Developer: 0.9407806260611059
Job Coach: 0.9485040326403948
Journalist: 0.9539343971836811
Junior Account Manager: 0.958315904976903
Junior Accountant: 0.9558730466456729
Junior Designer: 0.929284517505645
Key Account Manager: 0.950708214248907


Social Media Coordinator: 0.9571119700524494
Social Media Manager: 0.9478985926742154
Social Media Specialist: 0.9465879761802339
Social Media Strategist: 0.9580932168469568
Social Worker: 0.9399527525574285
Software Architect: 0.9504971198905475
Software Developer: 0.9389380617604712
Software Engineer: 0.9414251631164066
Software Security Engineer: 0.9502780305962768
Sound Engineer: 0.9340459133722732
Sourcing Manager: 0.9597143119477864
Sous Chef: 0.9561206577855788
Spa Therapist: 0.9422477904991843
Staff Assistant: 0.9601840080720238
Staff Writer: 0.947008476210683
Staffing Coordinator: 0.9519562139362729
Store Manager: 0.9454165681746313
Strategic Account Manager: 0.8776819016964541
Strategic Planner: 0.955873035202567
Supervisor: 0.9626018950520722
Supply Chain Analyst: 0.9530257887896205
System Administrator: 0.9433681890354434
System Analyst: 0.9481524280315663
System Security Engineer: 0.9448572791737447
Systems Engineer: 0.9015109350548304
Talent Acquisition Manager: 0.9655443

In [123]:
# Rank the matches best-first. Entries are [score, title] lists, so equal
# scores are ordered by title (also descending).
sorted_opportunities = list(opportunity_matches)
sorted_opportunities.sort(reverse=True)

sorted_opportunities[:15]

[[0.96755574243909115, 'HR Coordinator'],
 [0.96746681318954597, 'HR Director'],
 [0.96704397189354896, 'CFO'],
 [0.96700685572019129, 'Security Manager'],
 [0.96648105100566839, 'Recruiting Coordinator'],
 [0.9655443899095093, 'Talent Acquisition Manager'],
 [0.96544344380044622, 'Production Manager'],
 [0.96527703158541822, 'Human Resources Manager (HR Manager)'],
 [0.96517482285587131, 'Construction Project Manager'],
 [0.96495636107910576, 'Hospital Administration Manager'],
 [0.96481194568756612, 'Operations Manager'],
 [0.96468050321566068, 'Executive Administrative Assistant'],
 [0.96437791466947242, 'Medical Office Manager'],
 [0.96434687951907505, 'Project Manager'],
 [0.96434055486291093, 'Assistant Manager']]

In [124]:
sorted_opportunities[-15:]

[[0.8776819016964541, 'Senior Account Manager'],
 [0.8776819016964541, 'National Account Manager'],
 [0.8776819016964541, 'Digital Account Manager'],
 [0.87407052662553286, 'Mobile Developer'],
 [0.87223754197963743, 'Database Developer'],
 [0.85093236672701067, 'Back-end Developer'],
 [0.84245135997600917, 'Web Programmer'],
 [0.83430161527301849, 'Marketing Strategist'],
 [0.83430161527301849, 'Digital Marketing Strategist'],
 [0.83430161527301849, 'Digital Director'],
 [0.83430161527301849, 'Content Manager'],
 [0.79318550809716948, 'Medical Administrative Assistant'],
 [0.77755632552517895, 'Technical Account Manager'],
 [0.77367598696749373, 'Senior Account Executive'],
 [0.73416075863698227, 'Office Manager']]

### Need to refine matching

### Test with a second CV

In [125]:
# Second CV to match against the same job table.
# NOTE(review): absolute, user-specific path -- the notebook will not run on
# another machine without editing this.
target = "/home/chris/Downloads/kevin_test.docx"

In [126]:
# Load the second CV and normalise it with the textacy_process() helper
# defined earlier, so the preprocessing options live in a single place.
cv_text = import_document(target)

proc_text = textacy_process(cv_text)

# spaCy document used as the comparison target in the matching loop.
cv = nlp(proc_text)

In [127]:
# Score every job against the current CV; each entry is
# [similarity score, job title].
opportunity_matches = []

for _, row in jobsdf.iterrows():
    # Look the fields up by label instead of unpacking the whole row by
    # position into names that were never used.
    title = row['title']
    requirements = row['requirements']

    try:
        comparator = nlp(requirements)
    except TypeError:
        # Requirements that spaCy cannot process are skipped. Previously the
        # loop carried on and scored the title with the document left over
        # from the previous row (or failed with NameError on the first row).
        continue

    # NOTE(review): the CV text was lowercased and stripped of punctuation
    # before being parsed, whereas the requirements are parsed raw -- confirm
    # whether that asymmetry is intended.
    score = comparator.similarity(cv)

    print("{}: {}".format(title, score))
    opportunity_matches.append([score, title])

.Net Developer: 0.941447132446878
Account Coordinator: 0.9444739967891259
Account Director: 0.9526168992726773
Account Executive: 0.9535339938306272
Account Manager: 0.9617957619614915
Account Representative: 0.9550280783529382
Account Supervisor: 0.9394391821591953
Accountant: 0.9470972706528932
Accounting Clerk: 0.9589605798934688
Accounting Manager: 0.8885800530381588
Accounting Supervisor: 0.9547831429520133
Accounts Payable Clerk: 0.9599815827062378
Accounts Receivable Clerk: 0.9594346666700049
Accounts Receivable Manager: 0.962087884541183
Administration Manager: 0.9435340310096747
Administrative Assistant: 0.9608905815737847
Administrative officer: 0.9537455796358736
Administrator: 0.9572674310557143
Advertising Account Executive: 0.9677985920189759
Analytics Manager: 0.9421782749901271
Android Developer: 0.9485918178100642
Animator: 0.9372524306623485
Application Developer: 0.9612201044908767
Architect: 0.9570542897436207
Area Manager: 0.9427805700887851
Art Director: 0.9300806

IT Coordinator: 0.9575344542100664
IT Director: 0.9513405065480826
IT Help Desk Technician: 0.9459016348906157
IT Manager: 0.9494519566457491
IT Technician: 0.9471681741117535
Illustrator: 0.9472212480644562
Inside Sales Manager: 0.9476922627306461
Inside Sales Representative: 0.9526805477484909
Instructional Designer: 0.9302045139631681
Insurance Agent: 0.9529532434337811
Insurance Broker: 0.9572223348434183
Insurance Sales Representative: 0.9569294727928744
Insurance Underwriter: 0.9521940316485757
Interior Designer: 0.9256286378292706
Internal Auditor: 0.9544821015480054
Inventory Manager: 0.9519435236939146
Investment Analyst: 0.9257949007845213
Investment Banker: 0.9561896232258525
Janitor: 0.9346203022261078
Java Developer: 0.9423104961035942
Job Coach: 0.9571363790291858
Journalist: 0.9554175435066475
Junior Account Manager: 0.955418959097134
Junior Accountant: 0.9587509876546199
Junior Designer: 0.9313744492861936
Key Account Manager: 0.9427826682801017
Kindergarten Teacher: 0.

Social Media Manager: 0.9322922803238363
Social Media Specialist: 0.9275624064336563
Social Media Strategist: 0.9496676363742087
Social Worker: 0.9466277511869606
Software Architect: 0.9413746205604456
Software Developer: 0.9397222095138575
Software Engineer: 0.9400249625194896
Software Security Engineer: 0.9538980208210535
Sound Engineer: 0.9347176064722591
Sourcing Manager: 0.9552748071622843
Sous Chef: 0.9614848534971763
Spa Therapist: 0.9388633004650163
Staff Assistant: 0.9456863001170523
Staff Writer: 0.9445267369707804
Staffing Coordinator: 0.9493962803965103
Store Manager: 0.911330431994599
Strategic Account Manager: 0.926895289534404
Strategic Planner: 0.9411275963277941
Supervisor: 0.9612420131534298
Supply Chain Analyst: 0.9427894638762051
System Administrator: 0.9440484184471698
System Analyst: 0.9458733746478609
System Security Engineer: 0.9425964369441682
Systems Engineer: 0.905711593554234
Talent Acquisition Manager: 0.9678865987136787
Tax Accountant: 0.9418406288464871
T

In [131]:
# Rank the matches best-first. Entries are [score, title] lists, so equal
# scores are ordered by title (also descending).
sorted_opportunities = list(opportunity_matches)
sorted_opportunities.sort(reverse=True)

sorted_opportunities[:10]

[[0.97346069905012356, 'Fraud Investigator'],
 [0.97068486363493345, 'Telesales Representative'],
 [0.96906971043342416, 'Product Manager'],
 [0.96874722778847722, 'Telemarketer'],
 [0.9686856875795905, 'Sales Manager'],
 [0.96848914522718221, 'Office Coordinator'],
 [0.96789658001317802, 'Hotel Manager'],
 [0.96788659871367866, 'Talent Acquisition Manager'],
 [0.96779859201897589, 'Advertising Account Executive'],
 [0.96642232507251813, 'HR Generalist']]

In [132]:
sorted_opportunities[-10:]

[[0.85893617207789186, 'Back-end Developer'],
 [0.81868651100086254, 'Marketing Strategist'],
 [0.81868651100086254, 'Digital Marketing Strategist'],
 [0.81868651100086254, 'Digital Director'],
 [0.81868651100086254, 'Content Manager'],
 [0.80621010681831196, 'Web Programmer'],
 [0.76444252426609771, 'Technical Account Manager'],
 [0.76109781574982061, 'Medical Administrative Assistant'],
 [0.74101445417023404, 'Senior Account Executive'],
 [0.73380010487029801, 'Office Manager']]