In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


In [3]:
from google.colab import files
uploaded = files.upload()


Saving jobs.csv to jobs.csv


In [4]:
df = pd.read_csv('jobs.csv')
df.head(3)

Unnamed: 0,title,industry,role,type,years of experience,language,soft skill,programming language,salary,job_id
0,GP at __ hospital,Health,GP,full-time,3,French,communication,none,50000,0
1,Grammerly Part-time software engineer,IT,Software Engineer,part-time,1,Hausa,teamworking,python,30000,1
2,Google Summer Intern,IT,Software Engineer,Intern,0,Hausa,drive,java,2000,2


In [5]:
df.shape

(9, 10)

In [6]:
columns = ['title','industry','role','type','language']
df[columns].head(3)

Unnamed: 0,title,industry,role,type,language
0,GP at __ hospital,Health,GP,full-time,French
1,Grammerly Part-time software engineer,IT,Software Engineer,part-time,Hausa
2,Google Summer Intern,IT,Software Engineer,Intern,Hausa


In [7]:
df[columns].isnull().values.any()

False

In [8]:
def get_important_features(data):
  important_features = []
  for i in range(0,data.shape[0]):
    important_features.append(data['industry'][i]+' '+data['role'][i]+' '+data['type'][i]+' '+data['language'][i])
  return important_features


In [9]:
#create a colimn to hold the combined strings
df['important_features'] = get_important_features(df)

#show the data
df.head(3)

Unnamed: 0,title,industry,role,type,years of experience,language,soft skill,programming language,salary,job_id,important_features
0,GP at __ hospital,Health,GP,full-time,3,French,communication,none,50000,0,Health GP full-time French
1,Grammerly Part-time software engineer,IT,Software Engineer,part-time,1,Hausa,teamworking,python,30000,1,IT Software Engineer part-time Hausa
2,Google Summer Intern,IT,Software Engineer,Intern,0,Hausa,drive,java,2000,2,IT Software Engineer Intern Hausa


In [10]:
#convert the text to a matrix of token counts
cm = CountVectorizer().fit_transform(df['important_features'])

In [11]:
#get the cosine similarity matrix from the count matrix
cs = cosine_similarity(cm)
#print the cosine similarity matrix
print(cs)

#matrix contains values from 0 to 1 (where 1 represents 100% similarity to jobs)

[[1.         0.18257419 0.         0.6        0.36514837 0.6
  0.         0.50709255 0.36514837]
 [0.18257419 1.         0.73029674 0.18257419 0.5        0.18257419
  0.20412415 0.3086067  0.83333333]
 [0.         0.73029674 1.         0.         0.36514837 0.
  0.4472136  0.16903085 0.73029674]
 [0.6        0.18257419 0.         1.         0.36514837 0.8
  0.2236068  0.50709255 0.36514837]
 [0.36514837 0.5        0.36514837 0.36514837 1.         0.36514837
  0.20412415 0.46291005 0.66666667]
 [0.6        0.18257419 0.         0.8        0.36514837 1.
  0.4472136  0.50709255 0.36514837]
 [0.         0.20412415 0.4472136  0.2236068  0.20412415 0.4472136
  1.         0.         0.20412415]
 [0.50709255 0.3086067  0.16903085 0.50709255 0.46291005 0.50709255
  0.         1.         0.46291005]
 [0.36514837 0.83333333 0.73029674 0.36514837 0.66666667 0.36514837
  0.20412415 0.46291005 1.        ]]


In [12]:
#Get the shape of the cosine similarity matrix
cs.shape

(9, 9)

In [13]:
#get the title of the job that the applicant likes #####
title = 'Google Summer Intern'

#find the job id  
job_id = df[df.title == title]['job_id'].values[0]

In [14]:
#create a list of enumerations for the similarity score [(job_id, similarity score),(...)]
scores = list(enumerate(cs[job_id]))

In [15]:
#sort the list
#element at index 1 is the similarity score
sorted_scores = sorted(scores,key = lambda x: x[1],reverse = True)
scoted_scores = sorted_scores[1:]

In [16]:
#Print the sorted scores
print(sorted_scores)

[(2, 0.9999999999999999), (1, 0.7302967433402215), (8, 0.7302967433402215), (6, 0.4472135954999579), (4, 0.36514837167011077), (7, 0.1690308509457033), (0, 0.0), (3, 0.0), (5, 0.0)]


In [18]:
#create a loop to print the first 7 similar jobs
j = 0
print('The 5 most recommended job to',title,'are:\n')

for item in sorted_scores:
  job_title = df[df.job_id == item[0]]['title'].values[0] 
  print(j+1,job_title)  
  j = j+1
  if j>4:
    break


The 5 most recommended job to Google Summer Intern are:

1 Google Summer Intern
2 Grammerly Part-time software engineer
3 Facebook Senior Software engineer
4 Evercore Finance Analyst
5 Google site reliablity engineer
