# Job - Pre-processing and Modelling Iteration 1

In [1]:
# libraries import

import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
%matplotlib inline

import re
import datetime
from datetime import date
from time import strptime

import RAKE as rake
import operator


######################################################################################

# Working on Job description Text Data
######################################################################################   

In [2]:
# reading my sorted job csv
job = pd.read_csv('C:/Users/CHARAN SRI SAI/Downloads/rsjbrproj/data_gathering_eda/sorted_jobs_master_new2.csv')

In [3]:
job.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38941 entries, 0 to 38940
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             38941 non-null  int64  
 1   company           38941 non-null  object 
 2   education         38941 non-null  object 
 3   experience        38941 non-null  int64  
 4   industry          38941 non-null  object 
 5   jobdescription    38941 non-null  object 
 6   jobtitle          38941 non-null  object 
 7   payrate           38941 non-null  object 
 8   postdate          38908 non-null  object 
 9   skills            38941 non-null  object 
 10  experience_range  38941 non-null  int64  
 11  Salary_range      38941 non-null  float64
 12  is_grad           38941 non-null  float64
 13  is_postgrad       38941 non-null  float64
 14  is_doc            38941 non-null  float64
 15  j_id              38941 non-null  int64  
 16  location          38941 non-null  object

###########################################################################################################################
# Understanding Job_description column (using NLP)
###########################################################################################################################


# 1. NLP - NLTK application to understand most used words

In [4]:
#Import all the dependencies
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
set(stopwords.words('english'))
# nltk.download('abc')
# from nltk.corpus import abc
# from nltk import RegexpTokenizer

import string
stopwords = set(stopwords.words("english"))
import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

[nltk_data] Downloading package wordnet to C:\Users\CHARAN SRI
[nltk_data]     SAI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# defining tokenizer 
def my_tokenizer(text):
    # 1. split at whitespace
    text = text.split(' ')
    
    #2. lowercase
    text = [word.lower() for word in text]
    
    #3. Remove puncutation
    #table to replace puncuation
    punc_table = str.maketrans('','',string.punctuation)
    
    #call translate()
    text = [word.translate(punc_table) for word in text]
    
    #4. remove stopwords
    text = [word for word in text if word not in stopwords]
    
    #5. lemmmatize
    lemmatizer = WordNetLemmatizer()
    
    text = [lemmatizer.lemmatize(word, pos='v') for word in text]
    text = [lemmatizer.lemmatize(word, pos='n') for word in text]
    text = [lemmatizer.lemmatize(word, pos='a') for word in text]
    
    #6. remove empty strings
    text = [word for word in text if word !='']
    
    return text 

In [6]:
#apply count vectorizor for tokenization
from sklearn.feature_extraction.text import CountVectorizer
# # # Now, time to count vectorize to get most used words in job description
X_train = job['jobdescription']
X_train.shape

# 1. Instantiate
bagofwords = CountVectorizer(min_df = 15, tokenizer = my_tokenizer)
# 2. Fit
bagofwords.fit(X_train)
# 3. Transform
X_train_p = bagofwords.transform(X_train)

# analysing JD words in dataframe
word_counts = np.array(np.sum(X_train_p, axis=0)).reshape((-1,))
words = np.array(bagofwords.get_feature_names_out())
words_df = pd.DataFrame({"word":words, 
                         "count":word_counts})
words_rank = words_df.sort_values(by="count", ascending=False)
words_rank.to_csv('jd_words_rank_.csv') # Storing for inspection 
words_rank.head()



Unnamed: 0,word,count
5415,job,140463
10756,,87571
7596,profile,84883
3031,description,75494
2388,company,72167


In [7]:
# Visualizing top 10 words
import seaborn as sns
plt.figure(figsize=(12,8))
rk_w = words_rank['count'].astype(str)
sns.barplot(words_rank['word'][:10], words_rank['count'][:10].astype(str), palette=sns.color_palette("YlOrRd"))
plt.title('Top 10 Most Common Words in Job Description')
plt.show()

TypeError: barplot() takes from 0 to 1 positional arguments but 2 positional arguments (and 1 keyword-only argument) were given

<Figure size 1200x800 with 0 Axes>

# 2. NLP - TF-IDF application to get a list of all tokens 
-- This helped me to gather what words needed to be in stop-words list

In [8]:
df_job_descriptions = job[['j_id','jobtitle','company' ]]
df_job_descriptions['jd_combo'] = job['jobtitle'] +" "+job['jobdescription']+ " "+job['skills'] + " " + job['industry']
df_job_descriptions.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_job_descriptions['jd_combo'] = job['jobtitle'] +" "+job['jobdescription']+ " "+job['skills'] + " " + job['industry']


Unnamed: 0,j_id,jobtitle,company,jd_combo
0,0,walkin data entry operator (night shift),MM Media Pvt Ltd,walkin data entry operator (night shift) Job D...
1,1,work based onhome based part time.,find live infotech,work based onhome based part time. Job Descrip...
2,2,pl/sql developer - sql,Softtech Career Infosystem Pvt. Ltd,pl/sql developer - sql Job Description Send ...
3,3,manager/ad/partner - indirect tax - ca,Onboard HRServices LLP,manager/ad/partner - indirect tax - ca Job Des...
4,3,manager/ad/partner - indirect tax - ca,Onboard HRServices LLP,manager/ad/partner - indirect tax - ca Job Des...


In [9]:
#apply tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('ã¯æ’ëœ')
#Transforms words to TFIDF
vectorizer = TfidfVectorizer(stop_words = stopwords)

index = 0
keys = {}

for jd in df_job_descriptions.itertuples() :
    key = jd[1]
    keys[key] = index
    index += 1

#Fit the vectorizer to the data
vectorizer.fit(df_job_descriptions['jd_combo'].fillna(''))

#Transform the data
tfidf_scores = vectorizer.transform(df_job_descriptions['jd_combo'].fillna(''))

print(tfidf_scores.shape)
print(df_job_descriptions.shape)



(38941, 58510)
(38941, 4)


In [10]:
#get all words 
import pandas as pd

# Convert sparse TF-IDF scores to a sparse DataFrame
test = pd.DataFrame.sparse.from_spmatrix(tfidf_scores, columns=vectorizer.get_feature_names_out())


In [11]:
#Show
test.head()

Unnamed: 0,00,000,0000,00000,0000gmt,0001pt,00029,00034,000402,00053,...,ïƒ,ïƒ¼,ïƒž,œ100,œmost,œrecognition,œto,šâ,šã,žâ
0,0.0,0.055441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.065396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


As count vectorizer and Tf-Idf are only exploding my column numbers. It might not be wise to proceed with any of these. Moveover, I need to compare job description with Resume, that may not with fair comparison. So I will use these results so far for customizing stop word list. And will later use Doc2Vec to train my model.

# Creating my Stopword list 

### As seen there are so many unwanted tokens like numbers,ïƒ¼ etc , I need to add them in "stop words" list to train model 

In [12]:
#getting list of all tokens
word_list = test.columns.tolist()

In [13]:
##Getting a list of unwanted words as s_words and adding to stopwords
s_words =[]
for word in word_list:
    #print(word)
    if re.search("^\W|^\d",word):
        s_words.append(word)
        

In [14]:
s_words.append('')        
from nltk.corpus import stopwords
stopword_set = set(stopwords.words('english'))
stopword_set = list(stopword_set)
stopword_set.extend(s_words)

## Collecting all text data for DOC2VEC modelling

In [15]:
# using concatenated text columns
docs = df_job_descriptions['jd_combo']
docs_sample = docs.head(10)
docs_sample

0    walkin data entry operator (night shift) Job D...
1    work based onhome based part time. Job Descrip...
2    pl/sql developer - sql Job Description   Send ...
3    manager/ad/partner - indirect tax - ca Job Des...
4    manager/ad/partner - indirect tax - ca Job Des...
5    manager/ad/partner - indirect tax - ca Job Des...
6    manager/ad/partner - indirect tax - ca Job Des...
7    manager/ad/partner - indirect tax - ca Job Des...
8    manager/ad/partner - indirect tax - ca Job Des...
9    java technical lead (6-8 yrs) - Job Descriptio...
Name: jd_combo, dtype: object

In [16]:
#pre-processing with custom stop word list
def preprocess(text):
    stop_words = stopword_set
    #0. split words by whitespace
    text = text.split()
    
    
    # 1. lower case
    text = [word.lower() for word in text]
    
    # 2. remove punctuations
    punc_table = str.maketrans('','',string.punctuation)
    text = [word.translate(punc_table) for word in text]
    
    # 3. remove stop words
    text = [word for word in text if word not in stop_words]
    
    return text

In [17]:
# calling my pre-process to tokenize 
tokenized_doc = []
doc = df_job_descriptions['jd_combo']
#doc = docs_sample
for d in doc:
    tokenized_doc.append(preprocess(d))
#tokenized_doc

In [18]:
# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]

In [19]:
num_doc = len(tagged_data) #should be 38941
num_doc

38941

In [20]:
#settings to show epoch progress
from gensim.test.utils import get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec

class EpochSaver(CallbackAny2Vec):

    def __init__(self, path_prefix):
        self.path_prefix = path_prefix
        self.epoch = 0

    def on_epoch_end(self, model):
        output_path = get_tmpfile('{}_epoch{}.model'.format(self.path_prefix, self.epoch))
        model.save(output_path)
        self.epoch += 1

In [21]:
#settings to show epoch progress
class EpochLogger(CallbackAny2Vec):
    
    def __init__(self):
        self.epoch = 0
        
    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1

In [22]:
epoch_logger = EpochLogger()
## Train doc2vec model
model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs = 100, callbacks=[epoch_logger])


Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end
Epoch #10 start
Epoch #10 end
Epoch #11 start
Epoch #11 end
Epoch #12 start
Epoch #12 end
Epoch #13 start
Epoch #13 end
Epoch #14 start
Epoch #14 end
Epoch #15 start
Epoch #15 end
Epoch #16 start
Epoch #16 end
Epoch #17 start
Epoch #17 end
Epoch #18 start
Epoch #18 end
Epoch #19 start
Epoch #19 end
Epoch #20 start
Epoch #20 end
Epoch #21 start
Epoch #21 end
Epoch #22 start
Epoch #22 end
Epoch #23 start
Epoch #23 end
Epoch #24 start
Epoch #24 end
Epoch #25 start
Epoch #25 end
Epoch #26 start
Epoch #26 end
Epoch #27 start
Epoch #27 end
Epoch #28 start
Epoch #28 end
Epoch #29 start
Epoch #29 end
Epoch #30 start
Epoch #30 end
Epoch #31 start
Epoch #31 end
Epoch #32 start
Epoch #32 end
Epoch #33 start
Epoch #33 end


In [23]:
# Save trained doc2vec model
model.save("my_doc2vec_new.model")

In [24]:
## Load saved doc2vec model
model= Doc2Vec.load("my_doc2vec_new.model")

In [25]:
#confirm length (should be 38941)
len(tokenized_doc)

38941

In [26]:
## Get vector value
vec = np.empty([38941,20])

for k,i in enumerate(tokenized_doc):
    
    #print(i)
    vector = model.infer_vector(i)
    vec[k] = vector
    #vec = np.append(vector)
    #vecf = np.append(vec,vector)

# reshape into 2D
new_arr = np.reshape(vec,(-1,20))

In [27]:
rng = range(1, 21)
vec_df = pd.DataFrame(new_arr, columns=['vec_' + str(i) for i in rng])

In [28]:
#check vectors 
vec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38941 entries, 0 to 38940
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   vec_1   38941 non-null  float64
 1   vec_2   38941 non-null  float64
 2   vec_3   38941 non-null  float64
 3   vec_4   38941 non-null  float64
 4   vec_5   38941 non-null  float64
 5   vec_6   38941 non-null  float64
 6   vec_7   38941 non-null  float64
 7   vec_8   38941 non-null  float64
 8   vec_9   38941 non-null  float64
 9   vec_10  38941 non-null  float64
 10  vec_11  38941 non-null  float64
 11  vec_12  38941 non-null  float64
 12  vec_13  38941 non-null  float64
 13  vec_14  38941 non-null  float64
 14  vec_15  38941 non-null  float64
 15  vec_16  38941 non-null  float64
 16  vec_17  38941 non-null  float64
 17  vec_18  38941 non-null  float64
 18  vec_19  38941 non-null  float64
 19  vec_20  38941 non-null  float64
dtypes: float64(20)
memory usage: 5.9 MB


In [29]:
# merging vectors to main dataset (job)
con_job = pd.concat([job, vec_df], axis=1)

In [30]:
con_job.to_csv('con_job_new.csv', index=False)

In [31]:
###sample with 10 rows

# tokenized_doc = []
# #doc = df_job_descriptions['jd_combo']
# doc = docs_sample
# for d in doc:
#     tokenized_doc.append(preprocess(d))
# #tokenized_doc

In [32]:
# # Convert tokenized document into gensim formated tagged data
# tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]

In [33]:
# num_doc = len(tagged_data)
# num_doc

In [34]:
# ## Get vector value
# vec = np.empty([9,20])
# for i in tokenized_doc:
#     #print(i)
#     vector = model.infer_vector(i)
#     vecf = np.append(vec,vector)

# # reshape into 2D
# new_arr = np.reshape(vecf,(-1,20))

In [35]:
# rng = range(1, 21)
# vec_df = pd.DataFrame(new_arr, columns=['vec_' + str(i) for i in rng])