In [139]:
import numpy as np
import pandas as pd

In [140]:
df = pd.read_csv('Combined_Jobs_Final.csv')
df.head(3)

Unnamed: 0,Job.ID,Provider,Status,Slug,Title,Position,Company,City,State.Name,State.Code,...,Industry,Job.Description,Requirements,Salary,Listing.Start,Listing.End,Employment.Type,Education.Required,Created.At,Updated.At
0,111,1,open,palo-alto-ca-tacolicious-server,Server @ Tacolicious,Server,Tacolicious,Palo Alto,California,CA,...,Food and Beverages,Tacolicious' first Palo Alto store just opened...,,8.0,,,Part-Time,,2013-03-12 02:08:28 UTC,2014-08-16 15:35:36 UTC
1,113,1,open,san-francisco-ca-claude-lane-kitchen-staff-chef,Kitchen Staff/Chef @ Claude Lane,Kitchen Staff/Chef,Claude Lane,San Francisco,California,CA,...,Food and Beverages,\r\n\r\nNew French Brasserie in S.F. Financia...,,0.0,,,Part-Time,,2013-04-12 08:36:36 UTC,2014-08-16 15:35:36 UTC
2,117,1,open,san-francisco-ca-machka-restaurants-corp-barte...,Bartender @ Machka Restaurants Corp.,Bartender,Machka Restaurants Corp.,San Francisco,California,CA,...,Food and Beverages,We are a popular Mediterranean wine bar and re...,,11.0,,,Part-Time,,2013-07-16 09:34:10 UTC,2014-08-16 15:35:37 UTC


In [141]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84090 entries, 0 to 84089
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Job.ID              84090 non-null  int64  
 1   Provider            84090 non-null  int64  
 2   Status              84090 non-null  object 
 3   Slug                84090 non-null  object 
 4   Title               84090 non-null  object 
 5   Position            84090 non-null  object 
 6   Company             81819 non-null  object 
 7   City                83955 non-null  object 
 8   State.Name          83919 non-null  object 
 9   State.Code          83919 non-null  object 
 10  Address             36 non-null     object 
 11  Latitude            84090 non-null  float64
 12  Longitude           84090 non-null  float64
 13  Industry            267 non-null    object 
 14  Job.Description     84034 non-null  object 
 15  Requirements        0 non-null      float64
 16  Sala

In [142]:
df = df[['Title', 'Job.Description']]

In [143]:
df.head(3)

Unnamed: 0,Title,Job.Description
0,Server @ Tacolicious,Tacolicious' first Palo Alto store just opened...
1,Kitchen Staff/Chef @ Claude Lane,\r\n\r\nNew French Brasserie in S.F. Financia...
2,Bartender @ Machka Restaurants Corp.,We are a popular Mediterranean wine bar and re...


In [144]:
df['Title'][0]

'Server @ Tacolicious'

In [145]:
df['Job.Description'][0]

"Tacolicious' first Palo Alto store just opened recently, and we are hiring! If you love tacos, you will love working at our restaurant! \r\n\r\n ● Serve food/drinks to customers in a professional manner \r\n ● Act as a cashier when needed \r\n ● Clean up the dining space \r\n ● Train the new staff \r\n"

In [146]:
df = df.sample(n=1000, random_state=42)
df.shape

(1000, 2)

In [147]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
#nltk.download('punkt')
#nltk.download('stopwords')

In [148]:
ps = PorterStemmer()

In [149]:
def cleaning(input):
    cleaned = re.sub(r'[^a-zA-Z0-9\s]','',input)
    tokens = nltk.word_tokenize(cleaned.lower())
    stemming = [ps.stem(word) for word in tokens if word not in stopwords.words('english')]
    
    return " ".join(stemming)

In [150]:
cleaned = cleaning("This is my text \n\r %$@& loving moved drove")

In [151]:
print(cleaned)

text love move drove


In [152]:
df['Title'] = df['Title'].astype(str).apply(lambda x:cleaning(x))
df['Job.Description'] = df['Job.Description'].astype(str).apply(lambda x:cleaning(x))

In [153]:
df['Job.Description'][64119]

'job summari knowledg univers ku site director site leader inspir children teacher alik learn grow passion educ excel confid teach children adult use nation recogn curriculum framework creat uniqu engag classroom experi commit make site success know meaning relationship children famili team import success fulli engag enthusiast work eager share knowledg other job respons essenti function basic expect site director cours creativ new way meet exceed expect encourag long requir essenti function also met supervis children staff record keep licens record child file lesson plan implement mainten safe welcom classroom environ build relationship commun school recruit new student program applic must strong organiz skill'

In [154]:
df['Details'] = df['Title'] + " " + df['Job.Description']

In [155]:
df.head(3)

Unnamed: 0,Title,Job.Description,Details
64119,site director knowledg univers,job summari knowledg univers ku site director ...,site director knowledg univers job summari kno...
35827,administr assist officeteam,ref id 03110118480classif secretaryadmin asstc...,administr assist officeteam ref id 03110118480...
72100,account manag chi payment system,yoursquor energet motiv hardwork look prosper ...,account manag chi payment system yoursquor ene...


In [156]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [157]:
tfif = TfidfVectorizer()
matrix = tfif.fit_transform(df['Details'])
similarity = cosine_similarity(matrix)

In [158]:
similarity

array([[1.        , 0.04042121, 0.02576217, ..., 0.05812558, 0.02553415,
        0.08439393],
       [0.04042121, 1.        , 0.02654339, ..., 0.03223153, 0.0051686 ,
        0.02355136],
       [0.02576217, 0.02654339, 1.        , ..., 0.05841122, 0.03157436,
        0.03907004],
       ...,
       [0.05812558, 0.03223153, 0.05841122, ..., 1.        , 0.06388816,
        0.1154483 ],
       [0.02553415, 0.0051686 , 0.03157436, ..., 0.06388816, 1.        ,
        0.38480582],
       [0.08439393, 0.02355136, 0.03907004, ..., 0.1154483 , 0.38480582,
        1.        ]])

In [159]:
enumerate(similarity[0])

<enumerate at 0x2739c3c6500>

In [160]:
def recommendation(title):
    placeIndex = df[df['Title'] == title].index[0]
    placeIndex = df.index.get_loc(placeIndex)
    distances = sorted(list(enumerate(similarity[placeIndex])),key=lambda x:x[1], reverse=False)[1:20]
    
    jobs = []
    for i in distances:
        jobs.append(df.iloc[i[0]].Title)
    return jobs

In [161]:
recommendation("site director knowledg univers")

['live home health aid hha immedi need shift 7am sat 7am monday home hospit medic personnel inc',
 'inventori analyst accountemp',
 'dietitian',
 'housekeep laundri assist evangel lutheran good samaritan societi',
 'physician autism center part time provid health servic',
 'shoe repair carl shoe',
 'surgic scrub tech',
 'account clerk accountemp',
 'medic offic manag psychiatr offic',
 'physic therapi assist parttim american merci home care',
 'custom servic represnt officeteam',
 'mech engin intern hargrov engin constructor',
 'vascular technician swedish health',
 'commerci collect specialist accountemp',
 'part time sale green drop lawn care',
 'gener offic clerk officeteam',
 'data entri officeteam',
 'housekeep crown plaza independ own oper',
 'secur guard abc bank']

In [162]:
import pickle
pickle.dump(df, open("df.pkl", 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))