In [1]:
import numpy as np
import pandas as pd

---

In [2]:
udf = pd.read_csv('user_df.csv')
users_set = set(udf.ApplicantID.values)
print(udf.isna().sum())
print(udf.info())
udf.head()

ApplicantID      0
Corpus         854
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15959 entries, 0 to 15958
Data columns (total 2 columns):
ApplicantID    15959 non-null int64
Corpus         15105 non-null object
dtypes: int64(1), object(1)
memory usage: 249.5+ KB
None


Unnamed: 0,ApplicantID,Corpus
0,2,writer uloop blog volunteer
1,3,prep cook server market intern
2,6,project assistant
3,8,deli clerk server cashier food prep order taker
4,11,cashier


In [3]:
udf = udf.drop(udf[udf.Corpus == ' '].index, axis=0)
udf = udf.dropna(axis=0, how='any')
print(udf.isna().sum())
print(udf.shape)

ApplicantID    0
Corpus         0
dtype: int64
(15105, 2)


In [4]:
jdf = pd.read_csv('jobs_df.csv')
jdf.Corpus = jdf.Corpus.astype('str')
print(jdf.isna().sum())
print(jdf.info())
jdf.head()

JobID     0
Corpus    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84090 entries, 0 to 84089
Data columns (total 2 columns):
JobID     84090 non-null int64
Corpus    84090 non-null object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB
None


Unnamed: 0,JobID,Corpus
0,3,customer service bayer healthcare cincinnati p...
1,28,kitchen staff/chef pacific catch san francisco...
2,30,bartender dave 's american bistro olney part-t...
3,33,server oakland part-time locate oaklandâ€™s ja...
4,35,kitchen staff skool san francisco part-time fe...


In [5]:
jobs_title = pd.read_csv('jobs_title.csv')
jobs_title.head()

Unnamed: 0,JobID,Title
0,111,Server @ Tacolicious
1,113,Kitchen Staff/Chef @ Claude Lane
2,117,Bartender @ Machka Restaurants Corp.
3,121,Server @ Teriyaki House
4,127,Kitchen Staff/Chef @ Rosa Mexicano - Sunset


---
### TF-IDF with Cosine Similarity
( Term Frequency - Inverse Document Frequency )

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
def disply_applicant(user_id, dataframe=udf):
    '''
    Selects the given users first application
    input: user id
    returns: selected applicants row
    '''
    if user_id in users_set:
        index = np.where(dataframe.ApplicantID == user_id)[0][0]
        return dataframe.iloc[[index]]
    return 'User id is not in the DB.'

In [8]:
selected_user = disply_applicant(326)
selected_user

Unnamed: 0,ApplicantID,Corpus
186,326,java developer


In [9]:
def get_recommendation(selected_user, top_recommended, scores, jobs_title=jobs_title):
    _result = pd.DataFrame()

    for index, recomend in enumerate(top_recommended):
        _result.at[index, 'ApplicantID'] = selected_user
        _result.at[index, 'JobID'] = jobs_title['JobID'][recomend]
        _result.at[index, 'title'] = jobs_title['Title'][recomend]
        _result.at[index, 'score'] =  scores[index]
    
    return _result

---

In [10]:
vectorizer = TfidfVectorizer()

tf_idf_job = vectorizer.fit_transform((jdf['Corpus']))
tf_idf_user = vectorizer.transform((selected_user['Corpus']))

In [11]:
_cosine = map(lambda x: cosine_similarity(tf_idf_user, x), tf_idf_job)
cosine_output = list(_cosine)

In [12]:
list_range = range(len(cosine_output))
_top = sorted(list_range, key=lambda i: cosine_output[i], reverse=True)[:10]
list_scores = [cosine_output[i][0][0] for i in _top]

---

---

In [13]:
def _recommended_jobs(top, user):
    '''
    Returns all the recommended jobs for the given user.
    input: top => nr of top recommended jobs | user => user id
    return: a dataframe with two columns (ApplicantID, JobID)
    '''
    result = pd.DataFrame()

    list_range = range(len(cosine_output))
    _top = sorted(list_range, key=lambda i: cosine_output[i].any(), reverse=True)[:top]
    
    for index, item in enumerate(_top):
        result.at[index, 'ApplicantID'] = user
        result.at[index, 'JobID'] = jdf['JobID'][item]
    
    return result

---

In [14]:
def _select_job(recommended_jobs):
    '''
    Returns the description (Corpus) for the given list.
    input: recommended jobs dataframe
    return: a dataframe with two columns (JobID, Corpus)
    '''
    result = pd.DataFrame()
    
    for i, recomend in enumerate(recommended_jobs['JobID']):
        index = np.where(jdf.JobID == recomend)[0][0]
        result.at[i, 'JobID'] = recomend
        result.at[i, 'Corpus'] = jdf['Corpus'][index]
    
    return result

---

In [15]:
_recommended = _recommended_jobs(10, 326)
_recommended

Unnamed: 0,ApplicantID,JobID
0,326.0,134307.0
1,326.0,140161.0
2,326.0,141755.0
3,326.0,141831.0
4,326.0,142473.0
5,326.0,144570.0
6,326.0,146640.0
7,326.0,146760.0
8,326.0,147843.0
9,326.0,148050.0


In [16]:
_select_job(_recommended)

Unnamed: 0,JobID,Corpus
0,134307.0,intern network sharepoint web development inte...
1,140161.0,senior accountant accountemps el paso seasonal...
2,141755.0,clinical supervisor part-time family center se...
3,141831.0,lead java/j2ee developer contract hire itech s...
4,142473.0,job order sr. remedy developer employ-r soluti...
5,144570.0,vice president construction gpac wichita full-...
6,146640.0,jr. java developer paladin consult inc saint l...
7,146760.0,project accountant lead developer west la acco...
8,147843.0,director sas application developer npd group p...
9,148050.0,ekg/cardiac technician per diem hca-east flori...


---

In [17]:
# we could make it more user friendly by adding some more info about what user applied for and what we recommend...
def recommender_output(top, user):
    '''
    Returns the final output for the recommender system
    input: top => nr of top recommended jobs | user => user id
    return: a dataframe with three columns (ApplicantID, JobID, Corpus)
    '''
    __recommended = _recommended_jobs(top, user)
    __selected = _select_job(__recommended)
    
    return __recommended.merge(__selected, on='JobID')

In [18]:
recommender_output(5, 326)

Unnamed: 0,ApplicantID,JobID,Corpus
0,326.0,134307.0,intern network sharepoint web development inte...
1,326.0,140161.0,senior accountant accountemps el paso seasonal...
2,326.0,141755.0,clinical supervisor part-time family center se...
3,326.0,141831.0,lead java/j2ee developer contract hire itech s...
4,326.0,142473.0,job order sr. remedy developer employ-r soluti...


In [19]:
get_recommendation(326, _top, list_scores)

Unnamed: 0,ApplicantID,JobID,title,score
0,326.0,294804.0,ADMINISTRATIVE ASSISTANT @ Cadre,0.737476
1,326.0,303228.0,Executive Administrative Assistant @ OfficeTeam,0.734409
2,326.0,270052.0,Caregiver / Home Health Aide / CNA @ Home Inst...,0.71219
3,326.0,305380.0,Board Certified Behavior Anaylst,0.598787
4,326.0,142005.0,ASSISTANT MANAGER @ Dollar Tree Stores,0.567058
5,326.0,270301.0,Per Diem Home Health Occupational Therapist As...,0.563548
6,326.0,310057.0,Payroll Clerk @ Accountemps,0.527657
7,326.0,245889.0,Full-Charge Bookkeeper @ Accountemps,0.527133
8,326.0,151044.0,Bakery Outlet Stock Clerk- Saginaw @ Lajoy Gro...,0.482844
9,326.0,146801.0,Project Coordinator @ OfficeTeam,0.48272


---
## KNN

In [20]:
from sklearn.neighbors import NearestNeighbors

In [21]:
number_neighbers = 10
model = NearestNeighbors(number_neighbers, p=2)
model.fit(tf_idf_job)
result = model.kneighbors(tf_idf_user, return_distance=True)

In [22]:
result[0][0][1:]

array([0.72882181, 0.7586961 , 0.89578222, 0.93052857, 0.93429283,
       0.97194933, 0.97248841, 1.01701082, 1.01713365])