In [None]:
# Importing libraries

import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the job postings (Amazon, Microsoft and other companies).
#
# Earlier Colab source, kept for reference:
# df = pd.read_csv('./drive/MyDrive/Swatcloud/job_description - Amazon Microsoft others.csv',header=None,names=['Company','Job Title','Job Description'])

# skiprows=1 discards the file's own header row; header=None plus `names`
# then labels the columns explicitly. The index is reset to 0..n-1 so rows
# can be looked up by position-like integer labels later on.
df = pd.read_csv(
    'software_engineer_job_descriptions_without_indeed.csv',
    header=None,
    names=['Company', 'Job Title', 'Job Description'],
    skiprows=1,
).reset_index(drop=True)
print(df.shape)


(2293, 3)


In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Company,Job Title,Job Description
0,Amazon,Senior Software Development Engineer,· 4+ years of professional software developmen...
1,Amazon,Software Development Engineer - Payments,· programming experience with at least one mod...
2,Amazon,Software Development Engineer - Fintech,bachelor’s degree in computer science or relat...
3,Amazon,Software Development Engineer,1+ years of experience contributing to the sys...
4,Amazon,"Embedded Software Development Engineer, Satell...",1+ years of experience contributing to the sys...


In [None]:
# Inspect the job description stored under row label 1.
df['Job Description'][1]

"· programming experience with at least one modern language such as java, c++, or c# including object-oriented design· 1+ years of experience contributing to the architecture and design (architecture, design patterns, reliability and scaling) of new and current systems.· 2+ years of non-internship professional software development experience· bachelor's degree in engineering or equivalent· 3+ years of experience in object-oriented and component design· 2+ years delivering software solutions in distributed computing and soa· 2+ years of experience working with javascript/typescript front-end applications· experience in architecting solutions using native aws components and distributed computing· good written and verbal communication skills. · bachelor’s degree in computer science, computer engineering or related technical discipline· experience mentoring junior software engineers to improve their skills, and make them more effective, product software engineers· deal well with ambiguous/

In [None]:
# Text cleaning tasks
#
# Turn each raw description into a list of lower-case word tokens.
df['Job Description'] = (
    df['Job Description']
    # Treat a missing description as empty text instead of failing on NaN.
    .fillna('')
    # Swap punctuation and bullet characters for a space. Deleting them
    # outright glues neighbouring words together whenever the symbol was the
    # only separator (e.g. "teams·experience", "javascript/typescript").
    .str.replace(r'[^\w\s]+', ' ', regex=True)
    # Converting the text to lowercase
    .str.lower()
    # Split on any run of whitespace: this also covers newlines and
    # leading/trailing blanks, and never produces empty-string tokens.
    .str.split()
)

In [None]:
# Show the first ten tokens of row 1 to check the result of the split.
df['Job Description'][1][0:10]

['',
 'programming',
 'experience',
 'with',
 'at',
 'least',
 'one',
 'modern',
 'language',
 'such']

In [None]:
# Download the NLTK 'stopwords' package so nltk.corpus.stopwords can be read.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load NLTK's English stop words into a plain list and preview the first five.
# The corpus reader is imported under an alias so that the list can keep the
# name `stopwords` without shadowing the object it was read from.
from nltk.corpus import stopwords as stopwords_corpus

stopwords = list(stopwords_corpus.words('english'))
stopwords[:5]

['i', 'me', 'my', 'myself', 'we']

In [None]:
# Import the lemmatizer and download the 'wordnet' and 'omw-1.4' NLTK
# packages (presumably the data the lemmatizer reads at runtime - confirm
# against the installed NLTK version).
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:

# Removing stop words and lemmatizing the words
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership tests; scanning the list for every
# token of every description is needlessly slow.
stopword_set = set(stopwords)


def _lemmatize_without_stopwords(tokens):
    """Return `tokens` with stop words dropped and the remaining words lemmatized."""
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stopword_set]


# Assign the whole column in one step rather than writing row by row through
# `df[col][i] = ...`: that form is chained assignment, which triggers
# SettingWithCopyWarning and does not update `df` under pandas Copy-on-Write.
df['Job Description'] = df['Job Description'].apply(_lemmatize_without_stopwords)
            

In [None]:
# Show the first ten tokens of row 1 after stop-word removal and lemmatization.
df['Job Description'][1][0:10]

['',
 'programming',
 'experience',
 'least',
 'one',
 'modern',
 'language',
 'java',
 'c',
 'c']

In [None]:
# Rebuild every description as a single space-separated string, which is the
# input format the vectorizer expects, then display row 1 as a spot check.

df['Job Description'] = df['Job Description'].map(' '.join)
df.loc[1, 'Job Description']

' programming experience least one modern language java c c including objectoriented design 1 year experience contributing architecture design architecture design pattern reliability scaling new current system 2 year noninternship professional software development experience bachelor degree engineering equivalent 3 year experience objectoriented component design 2 year delivering software solution distributed computing soa 2 year experience working javascripttypescript frontend application experience architecting solution using native aws component distributed computing good written verbal communication skill  bachelor degree computer science computer engineering related technical discipline experience mentoring junior software engineer improve skill make effective product software engineer deal well ambiguousundefined problem ability think abstractly eager learn learn fast enjoy fast paced environment selfdirected demonstrate leadership potential team player excellent verbal written c

In [None]:
#Importing needed libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fit a bag-of-words vocabulary on the job descriptions and encode them as a
# document-term count matrix (one row per posting). stop_words='english'
# additionally drops scikit-learn's built-in English stop-word list.
cv = CountVectorizer(stop_words = 'english')
count_matrix = cv.fit_transform(df['Job Description'])

### Now we supply a new data point and let the model output the top 5 recommended job titles, ranked by the cosine similarity between this new data point and the existing job descriptions.

In [None]:
def top_5_recommendations(description, countVector, countMatrix):
  """Print the five job postings most similar to an applicant description.

  Parameters
  ----------
  description : str
      Free-text applicant qualifications, for example 'cad drawings'. The
      text is vectorized exactly as given; no cleaning or lemmatization is
      applied inside this function.
  countVector : fitted vectorizer
      Vectorizer whose `transform` method encodes `description`. It should be
      the one that produced `countMatrix` so the feature columns line up.
  countMatrix : matrix
      Document-term matrix with one row per job posting. Row i is assumed to
      describe row label i of the global `df`, which is where the printed
      titles, companies and descriptions are read from.

  Returns
  -------
  None
      The similarity ranking and the matching postings are printed.
  """
  new_data_input = [description]

  # Encode the query with the vectorizer passed by the caller rather than a
  # notebook-level global, so the function honours its own arguments.
  new_data_transformed = countVector.transform(new_data_input)

  # calculate cosine similarities of the new data point with all of the job descriptions
  cosine_sim = cosine_similarity(new_data_transformed, countMatrix)

  # Keep the five highest scores. The query is not a row of countMatrix, so
  # the first-ranked row is a genuine match and must not be skipped.
  top_5 = (
      pd.DataFrame(cosine_sim.T, columns=['similarities'])
      .sort_values(by='similarities', ascending=False)
      .head(5)
  )
  # Move the original row labels into an 'index' column for the lookups below.
  top_5 = top_5.reset_index()
  print(top_5)

  # print out the top 5 job descriptions
  print("\nApplicant's qualifications: ", new_data_input[0], '\n' )
  print('Recommended jobs:')
  for index in top_5['index']:
    print('\nJob Title: ', df['Job Title'][index])
    print('Company:', df['Company'][index])
    print('Job Description:', df['Job Description'][index])
  

In [None]:
# Query the recommender with a sample applicant description.
# The output below was produced from the dataset without Indeed jobs.
new_data = 'cad drawings'
top_5_recommendations(new_data, cv, count_matrix)

   index  similarities
0   1468      0.160644
1   1910      0.128037
2   2022      0.112509
3    595      0.099015
4    739      0.094072

Applicant's qualifications:  cad drawings 

Recommended jobs:

Job Title:  Optical Systems Engineer
Company: Meta
Job Description: experience reviewing geometric optical system cad tool zemax code v others experience sequential nonsequential ray tracing including stray light analysisbs field engineering physic optic related field 6 year experience developing integrating optical subsystem multidisciplinary teamsexperience specification optical tolerancesexperience optomechanical integration system designexperience optical metrologyexperience radiometry photometryexperience working optic lab environment experience designing modeling geometric optical system cad tool zemax code v othersexperience monte carlo optical tolerancingexperience design manufacturability dfm proven track record highvolume production10 year experience developing optical system m

In [None]:
# Labelled as the result from the input data with Indeed jobs.
# NOTE(review): this notebook only loads the 'without_indeed' CSV and the
# output below is identical to the previous cell - re-run after loading the
# Indeed data, or remove this duplicate cell.
new_data = 'cad drawings'
top_5_recommendations(new_data, cv, count_matrix)

   index  similarities
0   1468      0.160644
1   1910      0.128037
2   2022      0.112509
3    595      0.099015
4    739      0.094072

Applicant's qualifications:  cad drawings 

Recommended jobs:

Job Title:  Optical Systems Engineer
Company: Meta
Job Description: experience reviewing geometric optical system cad tool zemax code v others experience sequential nonsequential ray tracing including stray light analysisbs field engineering physic optic related field 6 year experience developing integrating optical subsystem multidisciplinary teamsexperience specification optical tolerancesexperience optomechanical integration system designexperience optical metrologyexperience radiometry photometryexperience working optic lab environment experience designing modeling geometric optical system cad tool zemax code v othersexperience monte carlo optical tolerancingexperience design manufacturability dfm proven track record highvolume production10 year experience developing optical system m

In [None]:
# TODO: delete this cell. Its saved output is from a previous run: the row
# indices differ from the cells above and no 'Company' line is printed.
new_data = 'cad drawings'
top_5_recommendations(new_data, cv, count_matrix)

   index  similarities
0   1733      0.160644
1   1008      0.094072
2   1007      0.092848
3    862      0.091287
4   1044      0.083918

Applicant's qualifications:  cad drawings 

Recommended jobs:

Job Title:  Optical Systems Engineer
Job Description: experience reviewing geometric optical system cad tool zemax code v others experience sequential nonsequential ray tracing including stray light analysisbs field engineering physic optic related field 6 year experience developing integrating optical subsystem multidisciplinary teamsexperience specification optical tolerancesexperience optomechanical integration system designexperience optical metrologyexperience radiometry photometryexperience working optic lab environment experience designing modeling geometric optical system cad tool zemax code v othersexperience monte carlo optical tolerancingexperience design manufacturability dfm proven track record highvolume production10 year experience developing optical system multidisciplina

In [None]:
# The model seems to have recommended jobs.