In [1]:
import numpy as np
import pandas as pd

---

In [2]:
udf = pd.read_csv('user_df.csv')
users_set = set(udf.ApplicantID.values)
print(udf.isna().sum())
print(udf.info())
udf.head()

ApplicantID    0
Corpus         0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15959 entries, 0 to 15958
Data columns (total 2 columns):
ApplicantID    15959 non-null int64
Corpus         15959 non-null object
dtypes: int64(1), object(1)
memory usage: 249.5+ KB
None


Unnamed: 0,ApplicantID,Corpus
0,2,volunteer writer for the uloop blog
1,3,marketing intern server prep cook
2,6,project assistant
3,8,deli clerk server cashier food prep order...
4,11,cashier


In [3]:
jdf = pd.read_csv('jobs_df.csv')
jdf.Corpus = jdf.Corpus.astype('str')
print(jdf.isna().sum())
print(jdf.info())
jdf.head()

JobID     0
Corpus    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84090 entries, 0 to 84089
Data columns (total 2 columns):
JobID     84090 non-null int64
Corpus    84090 non-null object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB
None


Unnamed: 0,JobID,Corpus
0,3,custom servic cincinnati oh bayer healthcar cu...
1,28,kitchen staff chef san francisco ca pacif catc...
2,30,bartend olney md dave american bistro bartend ...
3,33,server oakland ca server oakland part time loc...
4,35,kitchen staff san francisco ca skool kitchen s...


In [18]:
jobs_title = pd.read_csv('jobs_title.csv')
jobs_title.head()

Unnamed: 0,JobID,Title
0,111,Server @ Tacolicious
1,113,Kitchen Staff/Chef @ Claude Lane
2,117,Bartender @ Machka Restaurants Corp.
3,121,Server @ Teriyaki House
4,127,Kitchen Staff/Chef @ Rosa Mexicano - Sunset


---
### TF-IDF with Cosine Similarity
( Term Frequency - Inverse Document Frequency )

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
vectorizer = TfidfVectorizer()

tf_idf_job = vectorizer.fit_transform((jdf['Corpus']))
tf_idf_job

<84090x43030 sparse matrix of type '<class 'numpy.float64'>'
	with 8101697 stored elements in Compressed Sparse Row format>

In [6]:
tf_idf_user = vectorizer.transform(udf['Corpus'])
tf_idf_user

<15959x43030 sparse matrix of type '<class 'numpy.float64'>'
	with 81239 stored elements in Compressed Sparse Row format>

In [7]:
_cosine = map(lambda x: cosine_similarity(tf_idf_user, x), tf_idf_job)

In [8]:
cosine_output = list(_cosine)

---

In [31]:
def disply_applicant(user_id, dataframe):
    '''
    Selects the given users first application
    input: user id
    returns: selected applicants row
    '''
    if user_id in users_set:
        index = np.where(dataframe.ApplicantID == user_id)[0][0]
        return dataframe.iloc[[index]]
    return 'User id is not in the DB.'

---

In [32]:
def _recommended_jobs(top, user):
    '''
    Returns all the recommended jobs for the given user.
    input: top => nr of top recommended jobs | user => user id
    return: a dataframe with two columns (ApplicantID, JobID)
    '''
    result = pd.DataFrame()

    list_range = range(len(cosine_output))
    _top = sorted(list_range, key=lambda i: cosine_output[i].all(), reverse=True)[:top]
    
    for index, item in enumerate(_top):
        result.at[index, 'ApplicantID'] = user
        result.at[index, 'JobID'] = jdf['JobID'][item]
    
    return result

---

In [33]:
def _select_job(recommended_jobs):
    '''
    Returns the description (Corpus) for the given list.
    input: recommended jobs dataframe
    return: a dataframe with two columns (JobID, Corpus)
    '''
    result = pd.DataFrame()
    
    for i, recomend in enumerate(recommended_jobs['JobID']):
        index = np.where(jdf.JobID == recomend)[0][0]
        result.at[i, 'JobID'] = recomend
        result.at[i, 'Corpus'] = jdf['Corpus'][index]
    
    return result

---

In [34]:
disply_applicant(326, udf)

Unnamed: 0,ApplicantID,Corpus
186,326,java developer


In [35]:
_recommended = _recommended_jobs(10, 326)
_recommended

Unnamed: 0,ApplicantID,JobID
0,326.0,3.0
1,326.0,28.0
2,326.0,30.0
3,326.0,33.0
4,326.0,35.0
5,326.0,45.0
6,326.0,47.0
7,326.0,49.0
8,326.0,61.0
9,326.0,64.0


In [36]:
_select_job(_recommended)

Unnamed: 0,JobID,Corpus
0,3.0,custom servic cincinnati oh bayer healthcar cu...
1,28.0,kitchen staff chef san francisco ca pacif catc...
2,30.0,bartend olney md dave american bistro bartend ...
3,33.0,server oakland ca server oakland part time loc...
4,35.0,kitchen staff san francisco ca skool kitchen s...
5,45.0,server san francisco ca liberti bar restaur se...
6,47.0,sushi chef palo alto ca yoshi cater sushi chef...
7,49.0,server san francisco ca sprig server sprig san...
8,61.0,server foster citi ca kenta ramen server kenta...
9,64.0,server san francisco ca fior italia server fio...


---

In [37]:
# we could make it more user friendly by adding some more info about what user applied for and what we recommend...
def recommender_output(top, user):
    '''
    Returns the final output for the recommender system
    input: top => nr of top recommended jobs | user => user id
    return: a dataframe with three columns (ApplicantID, JobID, Corpus)
    '''
    __recommended = _recommended_jobs(top, user)
    __selected = _select_job(__recommended)
    
    return __recommended.merge(__selected, on='JobID')

In [39]:
recommender_output(5, 326)

Unnamed: 0,ApplicantID,JobID,Corpus
0,326.0,3.0,custom servic cincinnati oh bayer healthcar cu...
1,326.0,28.0,kitchen staff chef san francisco ca pacif catc...
2,326.0,30.0,bartend olney md dave american bistro bartend ...
3,326.0,33.0,server oakland ca server oakland part time loc...
4,326.0,35.0,kitchen staff san francisco ca skool kitchen s...


---
## KNN

In [9]:
from sklearn.neighbors import NearestNeighbors

In [12]:
number_neighbers = 10
model = NearestNeighbors(number_neighbers, p=2)
model.fit(tf_idf_job)
result = model.kneighbors(tf_idf_user, return_distance=True)

In [13]:
result[0][0][1:]

array([1.15783867, 1.2027317 , 1.2206752 , 1.24204879, 1.24204879,
       1.24204879, 1.24230916, 1.24350448, 1.24371167])

In [28]:
def get_recommendation(user, recommended_jobs, scores):
    _result = pd.DataFrame()

    for index, recomend in enumerate(recommended_jobs):
        _result.at[index, 'ApplicantID'] = user
        _result.at[index, 'JobID'] = jobs_title['JobID'][recomend]
        _result.at[index, 'title'] = jobs_title['Title'][recomend]
        _result.at[index, 'score'] =  scores[index]
    
    return _result

In [29]:
get_recommendation(326, result[1][0][1:], result[0][0][1:])

Unnamed: 0,ApplicantID,JobID,title,score
0,326.0,261253.0,Fund Accountants Needed in SF @ Accountemps,1.157839
1,326.0,309520.0,Custom Framing Specialist,1.202732
2,326.0,273788.0,"Cashiers, Deli Clerks, Freight Stockers and Mo...",1.220675
3,326.0,277453.0,Program Coordinator 017-15 @ Mothers Against D...,1.242049
4,326.0,142012.0,ASSISTANT MANAGER @ Dollar Tree Stores,1.242049
5,326.0,245332.0,Outgoing Receptionist Needed Immediately @ Off...,1.242049
6,326.0,263280.0,Health and Wellness Nurse Practitioner @ Healt...,1.242309
7,326.0,283116.0,Recent Graduate Junior Accountant @ Accountemps,1.243504
8,326.0,313889.0,Customer Service Representative Needed! @ Offi...,1.243712
