<a href="https://colab.research.google.com/github/Siliconvalley4uYouthProjects/SpringBoard-Swatcloud/blob/main/Recommendation_system_content_based_CV_without_wanted_unwanted_keywords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
# Importing libraries

import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [59]:
# Reading in the jobs

df = pd.read_csv('tech_jobs_labeled.csv',header=None,names=['Company','Job Title','Job Description','Industry'],skiprows=1)
df.reset_index(inplace=True, drop=True)
#Reordering columns to have industry first
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]
print(df.shape)


(2534, 4)


In [60]:
df.head()

Unnamed: 0,Industry,Company,Job Title,Job Description
0,Software Engineering,Amazon,Senior Software Development Engineer,· 4+ years of professional software developmen...
1,Software Engineering,Amazon,Software Development Engineer - Payments,· programming experience with at least one mod...
2,Software Engineering,Amazon,Software Development Engineer - Fintech,bachelor’s degree in computer science or relat...
3,Software Engineering,Amazon,Software Development Engineer,1+ years of experience contributing to the sys...
4,Software Engineering,Amazon,"Embedded Software Development Engineer, Satell...",1+ years of experience contributing to the sys...


In [61]:
df['Job Description'][1]

"· programming experience with at least one modern language such as java, c++, or c# including object-oriented design· 1+ years of experience contributing to the architecture and design (architecture, design patterns, reliability and scaling) of new and current systems.· 2+ years of non-internship professional software development experience· bachelor's degree in engineering or equivalent· 3+ years of experience in object-oriented and component design· 2+ years delivering software solutions in distributed computing and soa· 2+ years of experience working with javascript/typescript front-end applications· experience in architecting solutions using native aws components and distributed computing· good written and verbal communication skills. · bachelor’s degree in computer science, computer engineering or related technical discipline· experience mentoring junior software engineers to improve their skills, and make them more effective, product software engineers· deal well with ambiguous/

In [62]:
# Text Cleaning tasks

# Removing new line characters
df['Job Description'] = df['Job Description'].apply(lambda x: x.replace('\n', ' '))
# Removing empty leading and trailing spaces 
df['Job Description'] = df['Job Description'].apply(lambda x: x.strip())
# Removing special characters
df['Job Description'] = df['Job Description'].replace(r'[^\w\s]+', '', regex=True)
# Converting the text to lowercase
df['Job Description'] = df['Job Description'].str.lower()
# Splitting each word
df['Job Description'] = df['Job Description'].apply(lambda x: x.split(' '))

In [63]:
df['Job Description'][1][0:10]

['',
 'programming',
 'experience',
 'with',
 'at',
 'least',
 'one',
 'modern',
 'language',
 'such']

In [64]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [65]:
from nltk.corpus import stopwords
stopwords = list(stopwords.words('english'))
stopwords[0:5]

['i', 'me', 'my', 'myself', 'we']

In [66]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [67]:

# Removing stop words and lemmatizing the words
lemmatizer = WordNetLemmatizer()
from nltk.stem import WordNetLemmatizer

for i in range(len(df['Job Description'])):
    text = []
    for word in df['Job Description'][i]:
        if word not in stopwords:
            word = lemmatizer.lemmatize(word)
            text.append(word)
    df['Job Description'][i] = text
            

In [68]:
df['Job Description'][1][0:10]

['',
 'programming',
 'experience',
 'least',
 'one',
 'modern',
 'language',
 'java',
 'c',
 'c']

In [69]:
# joining the words back together

df['Job Description'] = df['Job Description'].apply(lambda x: ' '.join(x))
df['Job Description'][1]

' programming experience least one modern language java c c including objectoriented design 1 year experience contributing architecture design architecture design pattern reliability scaling new current system 2 year noninternship professional software development experience bachelor degree engineering equivalent 3 year experience objectoriented component design 2 year delivering software solution distributed computing soa 2 year experience working javascripttypescript frontend application experience architecting solution using native aws component distributed computing good written verbal communication skill  bachelor degree computer science computer engineering related technical discipline experience mentoring junior software engineer improve skill make effective product software engineer deal well ambiguousundefined problem ability think abstractly eager learn learn fast enjoy fast paced environment selfdirected demonstrate leadership potential team player excellent verbal written c

In [70]:
#Importing needed libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [71]:
cv = CountVectorizer(stop_words = 'english')
count_matrix = cv.fit_transform(df['Job Description'])

### Now we supply a new data point, and let the model output top 5 recommended job titles based on the calculated cosine similarity of this new data point and the existing job descriptions.

In [72]:
def top_x_recommendations(x,DataFrame,description, countVector, countMatrix):
  # type in the input data here, for example, 'cad drawings'
  new_data_input = [description]

  # transform the new data point using the same CountVectorizer
  new_data_transformed = countVector.transform(np.array(new_data_input))

  # calculate cosine similarities of the new data point with all of the job descriptions
  cosine_sim = cosine_similarity(new_data_transformed, countMatrix)

  # collect the top x recommendations
  top_x = pd.DataFrame(cosine_sim.T, columns=['Similarity Score']).sort_values(by='similarities', ascending=False)[1:x+1]
  top_x = top_x.reset_index()
  print(top_x)

  # print out the top x job descriptions
  print("\nApplicant's qualifications: ", new_data_input[0], '\n' )
  print('Recommended jobs:')
  for index in top_x['index']:
    print('\nJob Title: ', DataFrame['Job Title'][index])
    print('Company:', DataFrame['Company'][index])
    print('Job Description:', DataFrame['Job Description'][index])
  

In [73]:
# this is the result from the input data without indeed jobs
new_data = 'cad drawings'
top_x_recommendations(5,df,new_data, cv, count_matrix)

   index  similarities
0   1468      0.160644
1   1910      0.128037
2   2022      0.112509
3    595      0.099015
4    739      0.094072

Applicant's qualifications:  cad drawings 

Recommended jobs:

Job Title:  Optical Systems Engineer
Company: Meta
Job Description: experience reviewing geometric optical system cad tool zemax code v others experience sequential nonsequential ray tracing including stray light analysisbs field engineering physic optic related field 6 year experience developing integrating optical subsystem multidisciplinary teamsexperience specification optical tolerancesexperience optomechanical integration system designexperience optical metrologyexperience radiometry photometryexperience working optic lab environment experience designing modeling geometric optical system cad tool zemax code v othersexperience monte carlo optical tolerancingexperience design manufacturability dfm proven track record highvolume production10 year experience developing optical system m

In [74]:
# this is the result from the input data with indeed jobs
new_data = 'cad drawings'
top_x_recommendations(5,df,new_data, cv, count_matrix)

   index  similarities
0   1468      0.160644
1   1910      0.128037
2   2022      0.112509
3    595      0.099015
4    739      0.094072

Applicant's qualifications:  cad drawings 

Recommended jobs:

Job Title:  Optical Systems Engineer
Company: Meta
Job Description: experience reviewing geometric optical system cad tool zemax code v others experience sequential nonsequential ray tracing including stray light analysisbs field engineering physic optic related field 6 year experience developing integrating optical subsystem multidisciplinary teamsexperience specification optical tolerancesexperience optomechanical integration system designexperience optical metrologyexperience radiometry photometryexperience working optic lab environment experience designing modeling geometric optical system cad tool zemax code v othersexperience monte carlo optical tolerancingexperience design manufacturability dfm proven track record highvolume production10 year experience developing optical system m

Now we want to compare the jobs that this model recommends to our other model which focuses on technical keywords as opposed to the entire job description. As a reminder, in our other model, we used the qualification requirements from job description #10 (Senior Software Development Engineer @ Amazon) as the input for our hypothetical candidate.

In [75]:
#This is the keywords model
# get the index of id
def index_from_id(df,id):

 return df[df.index==id].index.values[0]

# id: the one you want to match, it can be student id or company id, if it is student, then retrieve all companys skillset, and create a dataframe,(match company for a given student)
# and if it is company id, then retrieve all students skillset, and create a dataframe(match students for a given company)
# df: dataframe needed to be make
#cosine_similarity_matrix: converted from dataframe[keyword] column
#number_of_recommendations: the number you want to match
def recommendations( id, df,cosine_similarity_matrix,number_of_recommendations):

  index = index_from_id(df,id)

  similarity_scores = list(enumerate(cosine_similarity_matrix[index]))

  similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

  recommendations_indices = [t[0] for t in similarity_scores_sorted[1:(number_of_recommendations+1)]]
  # this could be df['ID'] 
  return df['Title'].iloc[recommendations_indices]

In [76]:
df2 = pd.read_csv('jobs_wanted_unwanted_keywords_string.csv')
vect = CountVectorizer(stop_words='english')
# transform df['Technical Keywords'] into matrix
vect_matrix = vect.fit_transform(df2['Technical Keywords'])
vect_matrix
# get the cosine_similarity_matrix
cosine_similarity_matrix_count_based = cosine_similarity(vect_matrix, vect_matrix)
recommendations(10,df2,cosine_similarity_matrix_count_based,10)

42           Sr Software Dev Engineer, Business Payments
13                         Software Development Engineer
31                  Software Development Engineer , CTPS
249    Sr. Software Development Engineer - AWS DNS, A...
2                          Software Development Engineer
274    Software Development Engineer, AWS Elemental M...
294    Software Development Engineer, Kuiper Manufact...
164    Software Development Engineer II, AB Supplier ...
3      Embedded Software Development Engineer, Satell...
16     Sr. Embedded Software Development Engineer, Sa...
Name: Title, dtype: object

In [77]:
#Now let's try with the non-keyword model.
df.iloc[10]
df['Job Description'].iloc[10]

'candidate must bachelor computer science engineering related field equivalent experience8 year professional experience software developmentexperience contributing architecture design architecture design pattern reliability scaling new current system industry experience architecting designing scalable system interact multiple system designed expansion business growspossess extremely sound understanding basic area computer science algorithm data structure object oriented design databasesknowledge professional software engineering practice  best practice full software development life cycle including coding standard code review source control management build process testing operationsexperience building complex large scale software system using aws technology successfully delivered customersexperience communicating user technical team management collect requirement describe software product feature technical design remove bottleneck tech team throughout sdlc'

In [78]:
top_x_recommendations(10,df,df['Job Description'].iloc[10],cv,count_matrix)

   index  similarities
0    160      0.751005
1     63      0.745466
2    207      0.745420
3     25      0.739808
4    165      0.737304
5    220      0.733350
6     15      0.730831
7    417      0.725731
8    175      0.722166
9    242      0.722166

Applicant's qualifications:  candidate must bachelor computer science engineering related field equivalent experience8 year professional experience software developmentexperience contributing architecture design architecture design pattern reliability scaling new current system industry experience architecting designing scalable system interact multiple system designed expansion business growspossess extremely sound understanding basic area computer science algorithm data structure object oriented design databasesknowledge professional software engineering practice  best practice full software development life cycle including coding standard code review source control management build process testing operationsexperience building comple

**Testing Using a Non-Amazon Job**

In [79]:
#Now let's try to use an input from a non-Amazon job.
df.iloc[2514]

Industry                                               Data Analysis
Company                                               Phasorsoft LLC
Job Title                                               Data analyst
Job Description    good experience alteryx data analytics 5 year ...
Name: 2514, dtype: object

In [80]:
df2.iloc[2513]

Company                                                     Phasorsoft LLC
Title                                                         Data analyst
Technical Keywords       ['java', 'python', '', 'language', 'data', 'r'...
Nontechnical Keywords    ['experience', 'year', 'science', 'language', ...
Name: 2513, dtype: object

In [81]:
recommendations(2513,df2,cosine_similarity_matrix_count_based,10)

2155        Backend Telemetry Software Engineer, Autonomy
2510                                         Data Analyst
2086                              Sr Software QA Engineer
2090                             Sr. Software QA Engineer
2212    Senior Software Engineer, Charging Data and Mo...
2151     Software Engineer, Data Engineering, Diagnostics
2521                                         Data Analyst
287     Software Development Engineer I - Supply Chain...
288     Software Development Engineer I - Supply Chain...
2134     Software Engineer, Core Engineering Technologies
Name: Title, dtype: object

In [82]:
top_x_recommendations(10,df,df['Job Description'].iloc[2514],cv,count_matrix)

   index  similarities
0   1112      0.556815
1   2494      0.537220
2    621      0.513613
3    631      0.507119
4   1155      0.502148
5    688      0.497598
6    868      0.496321
7   2527      0.494788
8    423      0.493977
9   2498      0.487110

Applicant's qualifications:  good experience alteryx data analytics 5 year experience across data analytics business intelligence developing database model data science graduating college proficiency sql python alteryx  etl tool experience analytics software language sa java r python ideal candidate already hand experience alteryx core certifiedsf developer platform developer 1 certification would preferred 

Recommended jobs:

Job Title:  Principal Product Data Analyst
Company: Microsoft
Job Description: bachelor degree statistic mathematics analytics data science engineering computer science business economics related field 7 year experience data analysis reporting data science business intelligence business financial analysisor maste

The non-keyword model gave more data analyst recommendations that seem closer to the input than the keyword model's results (which were a mix of data analyst + software engineering jobs).

**Testing using a combined job pool which includes marketing jobs**

In [83]:
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
cols

['Job Description', 'Industry', 'Company', 'Job Title']

In [84]:
df

Unnamed: 0,Industry,Company,Job Title,Job Description
0,Software Engineering,Amazon,Senior Software Development Engineer,4 year professional software development expe...
1,Software Engineering,Amazon,Software Development Engineer - Payments,programming experience least one modern langu...
2,Software Engineering,Amazon,Software Development Engineer - Fintech,bachelor degree computer science related field...
3,Software Engineering,Amazon,Software Development Engineer,1 year experience contributing system design a...
4,Software Engineering,Amazon,"Embedded Software Development Engineer, Satell...",1 year experience contributing system design a...
...,...,...,...,...
2529,Data Analysis,Mainz Brady Group,e-Commerce Data Analyst,microsoft excel 1 year required data analysis ...
2530,Data Analysis,Brooksource,Data Analyst,bachelor degree computer science information s...
2531,Data Analysis,Internal Data Resources,Sr. Data Analyst,work involves conducting detailed analysis ext...
2532,Operations,Amware Fulfillment,Business Analyst,m excel include advanced feature pivot table l...


In [85]:
df_m = pd.read_csv('marketing_job_descriptions.csv',names=['index','Industry','Company','Job Title','Job Description'])

df_agg = pd.concat([df,df_m])
df_agg.reset_index(inplace=True)
df_agg.drop(columns=['level_0','index'],inplace=True)

In [86]:
df_agg

Unnamed: 0,Industry,Company,Job Title,Job Description
0,Software Engineering,Amazon,Senior Software Development Engineer,4 year professional software development expe...
1,Software Engineering,Amazon,Software Development Engineer - Payments,programming experience least one modern langu...
2,Software Engineering,Amazon,Software Development Engineer - Fintech,bachelor degree computer science related field...
3,Software Engineering,Amazon,Software Development Engineer,1 year experience contributing system design a...
4,Software Engineering,Amazon,"Embedded Software Development Engineer, Satell...",1 year experience contributing system design a...
...,...,...,...,...
3954,Marketing,JnJ,"[Communications/Janssen] Manager, Therapeutic ...",preferred minimum education: bachelor’s degre...
3955,Marketing,JnJ,2023 Supply Chain Fall Co-Op,candidates must be legally authorized to work...
3956,Marketing,JnJ,Operations Leader Janssen / Manufacturing Dire...,bachelor’s degree required mba/master desirab...
3957,Marketing,JnJ,"Director, Real World Population Health Research",a minimum of a bachelor’s degree is required....


In [88]:
# Text Cleaning tasks
df_agg['Job Description'] = df_agg['Job Description'].astype(str)

# Removing new line characters
df_agg['Job Description'] = df_agg['Job Description'].apply(lambda x: x.replace('\n', ' '))
# Removing empty leading and trailing spaces 
df_agg['Job Description'] = df_agg['Job Description'].apply(lambda x: x.strip())
# Removing special characters
df_agg['Job Description'] = df_agg['Job Description'].replace(r'[^\w\s]+', '', regex=True)
# Converting the text to lowercase
df_agg['Job Description'] = df_agg['Job Description'].str.lower()
# Splitting each word
df_agg['Job Description'] = df_agg['Job Description'].apply(lambda x: x.split(' '))

for i in range(len(df_agg['Job Description'])):
    text = []
    for word in df_agg['Job Description'][i]:
        if word not in stopwords:
            word = lemmatizer.lemmatize(word)
            text.append(word)
    df_agg['Job Description'][i] = text

# joining the words back together

df_agg['Job Description'] = df_agg['Job Description'].apply(lambda x: ' '.join(x))

In [89]:
cv2 = CountVectorizer(stop_words = 'english')
count_matrix2 = cv2.fit_transform(df_agg['Job Description'])

In [90]:
top_x_recommendations(10,df_agg,df_agg['Job Description'].iloc[10],cv2,count_matrix2)

   index  similarities
0    160      0.751005
1     63      0.745466
2    207      0.745420
3     25      0.739808
4    165      0.737304
5    220      0.733350
6     15      0.730831
7    417      0.725731
8    175      0.722166
9    242      0.722166

Applicant's qualifications:  candidate must bachelor computer science engineering related field equivalent experience8 year professional experience software developmentexperience contributing architecture design architecture design pattern reliability scaling new current system industry experience architecting designing scalable system interact multiple system designed expansion business growspossess extremely sound understanding basic area computer science algorithm data structure object oriented design databasesknowledge professional software engineering practice  best practice full software development life cycle including coding standard code review source control management build process testing operationsexperience building comple

In [91]:
top_x_recommendations(20,df_agg,df_agg['Job Description'].iloc[2655],cv2,count_matrix2)

    index  similarities
0    2655      1.000000
1    2661      1.000000
2    2656      0.920911
3    1285      0.448211
4    1189      0.392568
5    1765      0.334252
6    2828      0.332182
7    1723      0.330847
8    1705      0.330847
9    1676      0.325960
10   1756      0.321265
11   1739      0.321265
12   1648      0.321265
13   1631      0.321265
14   1628      0.321265
15   1708      0.321265
16   1758      0.321265
17   1670      0.321265
18   1750      0.321265
19   1719      0.321265

Applicant's qualifications:  law degree accredited law school 5 year experience lawlicense practice law least one state leading crossfunctional team 

Recommended jobs:

Job Title:  Senior Counsel Advertising and Media
Company: Walmart
Job Description: law degree accredited law school 5 year experience lawlicense practice law least one state leading crossfunctional team

Job Title:  Counsel III - Value-Based Care Technology & Data, Digital Citizenship
Company: Walmart
Job Description: law d

In [122]:
#Let's change the "simiarity" output to a more conventional qualitative description scale.

def top_x_recommendations(x,DataFrame,description, countVector, countMatrix):
  # type in the input data here, for example, 'cad drawings'
  new_data_input = [description]

  # transform the new data point using the same CountVectorizer
  new_data_transformed = countVector.transform(np.array(new_data_input))

  # calculate cosine similarities of the new data point with all of the job descriptions
  cosine_sim = cosine_similarity(new_data_transformed, countMatrix)

  # collect the top x recommendations
  top_x = pd.DataFrame(cosine_sim.T, columns=['Similarity Score']).sort_values(by='Similarity Score', ascending=False)[1:x+1]
  similarity = []
  for score in top_x['Similarity Score']:
    if score > 0.7:
      similarity.append('Very Strong Match')
    elif score > 0.45:
      similarity.append('Strong Match')
    elif score > 0.35:
      similarity.append('Good Match')
    else:
      similarity.append('Poor Match')
  top_x['Match Strength'] = similarity
  top_x = top_x.reset_index()
  print(top_x)

  # print out the top x job descriptions
  print("\nApplicant's qualifications: ", new_data_input[0], '\n' )
  print('Recommended jobs:')
  for i in range(len(top_x['index'])):
    print('\nJob Title: ', DataFrame['Job Title'][top_x.iloc[i]['index']])
    print('Company:', DataFrame['Company'][top_x.iloc[i]['index']])
    print('Job Description:', DataFrame['Job Description'][top_x.iloc[i]['index']])
    match_strength = top_x['Match Strength'][i]
    print('Match Strength:', match_strength)

In [123]:
top_x_recommendations(20,df_agg,df_agg['Job Description'].iloc[2655],cv2,count_matrix2)

    index  Similarity Score     Match Strength
0    2655          1.000000  Very Strong Match
1    2661          1.000000  Very Strong Match
2    2656          0.920911  Very Strong Match
3    1285          0.448211         Good Match
4    1189          0.392568         Good Match
5    1765          0.334252         Poor Match
6    2828          0.332182         Poor Match
7    1723          0.330847         Poor Match
8    1705          0.330847         Poor Match
9    1676          0.325960         Poor Match
10   1756          0.321265         Poor Match
11   1739          0.321265         Poor Match
12   1648          0.321265         Poor Match
13   1631          0.321265         Poor Match
14   1628          0.321265         Poor Match
15   1708          0.321265         Poor Match
16   1758          0.321265         Poor Match
17   1670          0.321265         Poor Match
18   1750          0.321265         Poor Match
19   1719          0.321265         Poor Match

Applicant's 