In [None]:
# Importing libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Reading the original data file
df = pd.read_csv('keywords.csv')
df.head()

Unnamed: 0,ID,title,key word
0,1,backend1,java python web developement spring
1,2,backend2,C++ java python django
2,3,backend3,Object oriented language data structure web de...
3,4,frontend1,html css js react node.js vue
4,5,frontend2,ui design ajax jquery html js


In [None]:
# Getting the keywords from the original data file

keywords = []
for word in df.iloc[:,2]:
  words = word.split(' ')
  for keyword in words:
    if keyword not in keywords:
      keywords.append(keyword.lower())
print(keywords)

['java', 'python', 'web', 'developement', 'spring', 'c++', 'django', '', 'object', 'oriented', 'language', 'data', 'structure', 'development', 'html', 'css', 'js', 'react', 'node.js', 'vue', 'ui', 'design', 'ajax', 'jquery', 'asynchronous', 'request', 'analysis', 'management', 'storage', 'mysql', 'warehouse', 'r', 'visualization']


In [None]:
# Reading new data file - Amazon, Microsoft and other jobs

df2 = pd.read_csv('software_engineer_job_descriptions_with_indeed.csv',header=None,names=['Company','Job Title','Job Description'])
df2 = df2[1:]
df2.head()

Unnamed: 0,Company,Job Title,Job Description
1.0,Amazon,Software Development Engineer - Payments,· programming experience with at least one mod...
2.0,Amazon,Software Development Engineer - Fintech,bachelor’s degree in computer science or relat...
3.0,Amazon,Software Development Engineer,1+ years of experience contributing to the sys...
4.0,Amazon,"Embedded Software Development Engineer, Satell...",1+ years of experience contributing to the sys...
5.0,Amazon,Embedded Software Engineer,"ba/bs degree in computer science, computer eng..."


In [None]:
#Lemmatizing and then combining all job descriptions into one list to feed into CountVectorizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
agg = ''
agg2 = []
for i in range(len(df2.iloc[:,2])):
  description = df2.iloc[i,2].split(' ')
  agg = ''
  for word in description: 
    if word.isalpha():
     lemmatized_word = lemmatizer.lemmatize(word)
     agg = agg + ' ' + lemmatized_word
  agg2.append(agg)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Vectorizing the words

vectorizer = CountVectorizer(ngram_range = (1,4),stop_words = 'english')
cv = vectorizer.fit_transform(agg2)

In [None]:
# words & phrases with their frequency
counts = pd.DataFrame(vectorizer.fit_transform(agg2).sum(axis=0),
                      columns=vectorizer.get_feature_names())
counts = counts.T



In [None]:
# top 200 common words or phrases

common_words = counts[0].sort_values(ascending=False)[:50]
print(common_words)

experience              8238
year                    4961
software                3466
year experience         3310
degree                  2583
related                 2484
computer                2475
technical               2385
data                    1948
equivalent              1764
development             1679
design                  1542
engineering             1488
working                 1413
work                    1149
ability                 1143
team                    1056
product                  999
including                923
management               921
programming              897
knowledge                894
business                 884
skill                    874
field                    846
building                 844
project                  843
software development     821
communication            813
science                  801
degree computer          774
application              722
technology               720
using                    686
language      

In [None]:
# manually selected technical keywords out of the top 200 frequently appearing keywords

technical_words = ['software','data','development','design','engineering','systems','computer','code',
                   'analysis','application', 'product','python','engineer','programming','sql','java',
                   'cloud','machine learning','web','models', 'bachelor', 'business', 'architecture', 
                   'communication', 'mathematics', 'analysis', 'testing', 'aws', 'database', 'agile', 
                   'coding', 'security', 'troubleshooting', 'scaling','unix', 'linux', 'statistics', 'c++', 
                   'algorithms', 'networking', 'javascript','object oriented','cloud based','web based','software development',
                   'computer science']
non_technical_keywords = []
for word in common_words.index:
  if word not in technical_words:
    non_technical_keywords.append(word)
for word in technical_words:
  if word not in keywords:
    keywords.append(word)
#keywords.remove('')
print(keywords)

['java', 'python', 'web', 'developement', 'spring', 'c++', 'django', '', 'object', 'oriented', 'language', 'data', 'structure', 'development', 'html', 'css', 'js', 'react', 'node.js', 'vue', 'ui', 'design', 'ajax', 'jquery', 'asynchronous', 'request', 'analysis', 'management', 'storage', 'mysql', 'warehouse', 'r', 'visualization', 'software', 'engineering', 'systems', 'computer', 'code', 'application', 'product', 'engineer', 'programming', 'sql', 'cloud', 'machine learning', 'models', 'bachelor', 'business', 'architecture', 'communication', 'mathematics', 'testing', 'aws', 'database', 'agile', 'coding', 'security', 'troubleshooting', 'scaling', 'unix', 'linux', 'statistics', 'algorithms', 'networking', 'javascript', 'object oriented', 'cloud based', 'web based', 'software development', 'computer science']


In [None]:
# adding additional bigrams to unwanted keywords list.
non_technical_keywords.append('problem solving')
non_technical_keywords.append('fast paced')
non_technical_keywords.append('cross functional')

In [None]:
#Aggregate Dataframe

df3 = df2[['Company','Job Title']]
wanted_keywords = []
unwanted_keywords = []
for jd in df2['Job Description']:
  job = jd.lower()
  wanted_keywords_sub = []
  unwanted_keywords_sub = []
  for word in keywords:
    if word in job:
      wanted_keywords_sub.append(word)
  for word in non_technical_keywords:
    if word in job:
      unwanted_keywords_sub.append(word)
  wanted_keywords.append(wanted_keywords_sub)
  unwanted_keywords.append(unwanted_keywords_sub)
df3['Wanted Keywords'] = wanted_keywords
df3['Unwanted Keywords'] = unwanted_keywords
df3.head()

Unnamed: 0,Company,Job Title,Wanted Keywords,Unwanted Keywords
1.0,Amazon,Software Development Engineer - Payments,"[java, c++, , object, oriented, language, deve...","[experience, year, degree, related, technical,..."
2.0,Amazon,Software Development Engineer - Fintech,"[java, c++, , object, oriented, language, data...","[experience, year, degree, related, technical,..."
3.0,Amazon,Software Development Engineer,"[web, , language, development, ui, design, man...","[experience, year, degree, related, technical,..."
4.0,Amazon,"Embedded Software Development Engineer, Satell...","[, language, structure, development, ui, desig...","[experience, year, degree, related, technical,..."
5.0,Amazon,Embedded Software Engineer,"[python, , language, data, development, ui, r,...","[experience, year, degree, related, equivalent..."


In [None]:
# renaming columns to match the column names from the Count Vectorizer model (1st model)

df3.columns= ['Company', 'Title', 'Technical Keywords', 'Nontechnical Keywords']
df3.head(3)

Unnamed: 0,Company,Title,Technical Keywords,Nontechnical Keywords
1.0,Amazon,Software Development Engineer - Payments,"[java, c++, , object, oriented, language, deve...","[experience, year, degree, related, technical,..."
2.0,Amazon,Software Development Engineer - Fintech,"[java, c++, , object, oriented, language, data...","[experience, year, degree, related, technical,..."
3.0,Amazon,Software Development Engineer,"[web, , language, development, ui, design, man...","[experience, year, degree, related, technical,..."


In [None]:
# saving to a csv file

df3.to_csv('jobs_wanted_unwanted_keywords.csv',index=False)

In [62]:
df4 = df3.reset_index(drop=True)

In [None]:
# Converting the key word column to one long string instead of a list of words.
# Saving this file because the previous model expects a string in the key word column

for i in range(len(df4['Technical Keywords'])):
    df4['Technical Keywords'][i] = " ".join(df4['Technical Keywords'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['Technical Keywords'][i] = " ".join(df4['Technical Keywords'][i])


In [None]:
df4.head(3)

In [63]:
# saving to a csv file. This file has the key word column as string, instead of a list of strings.
df4.to_csv('jobs_wanted_unwanted_keywords_string.csv',index=False)