# Capstone - Pre- processing

In [1]:
# libraries import

import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
%matplotlib inline

import re
import datetime
from datetime import date
from time import strptime

import RAKE as rake
import operator


######################################################################################

# Working on Job description Data
######################################################################################   

In [2]:
# reading my sorted job csv
job = pd.read_csv('WIP/sorted_jobs_master_new.csv')

In [3]:
job.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38941 entries, 0 to 38940
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   company           38941 non-null  object 
 1   education         38941 non-null  object 
 2   experience        38941 non-null  int64  
 3   industry          38941 non-null  object 
 4   jobdescription    38941 non-null  object 
 5   jobtitle          38941 non-null  object 
 6   payrate           38941 non-null  object 
 7   skills            38941 non-null  object 
 8   experience_range  38941 non-null  int64  
 9   industry_enum     38941 non-null  int64  
 10  Salary_range      38941 non-null  float64
 11  j_id              38941 non-null  int64  
 12  is_grad           38941 non-null  int64  
 13  is_postgrad       38941 non-null  int64  
 14  is_doc            38941 non-null  int64  
 15  location          38941 non-null  int64  
 16  loc_name          38941 non-null  object

###########################################################################################################################
# Understanding Job_description column (using NLP)
###########################################################################################################################
* This section also  includes cosine similarity to find jobs closer to each other

# 1. NLP - NLTK application to understand most used words

In [4]:
#Import all the dependencies
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
set(stopwords.words('english'))
# nltk.download('abc')
# from nltk.corpus import abc
# from nltk import RegexpTokenizer

import string
stopwords = set(stopwords.words("english"))
import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shail\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# defining tokenizer 
def my_tokenizer(text):
    # 1. split at whitespace
    text = text.split(' ')
    
    #2. lowercase
    text = [word.lower() for word in text]
    
    #3. Remove puncutation
    #table to replace puncuation
    punc_table = str.maketrans('','',string.punctuation)
    
    #call translate()
    text = [word.translate(punc_table) for word in text]
    
    #4. remove stopwords
    text = [word for word in text if word not in stopwords]
    
    #5. lemmmatize
    lemmatizer = WordNetLemmatizer()
    
    text = [lemmatizer.lemmatize(word, pos='v') for word in text]
    text = [lemmatizer.lemmatize(word, pos='n') for word in text]
    text = [lemmatizer.lemmatize(word, pos='a') for word in text]
    
    #6. remove empty strings
    text = [word for word in text if word !='']
    
    return text 

# 2. NLP - TF-IDF application to get a list of all tokens 
-- This helped to gather what words needed to be in stop-words list

In [16]:
#z = job['jobdescription'].str.rstrip('job description   send me jobs like this')

In [7]:
# job['jobdescription'] = job.jobdescription.str[40:]
job['jobdescription']

0         Qualifications: - == > 10th To Graduation & A...
1         Qualifications: - == > 10th To Graduation & A...
2         - as a developer in providing application des...
3         - Involved with all stages of indirect taxati...
4         - Involved with all stages of indirect taxati...
                               ...                        
38936     Looking for candidates with strong programmin...
38937     Work with tech lead to architect and develop ...
38938     We are looking for a Senior UI Developers and...
38939     We are looking for a Senior UI Developers and...
38940     Job description : Experience of 5-10 years wi...
Name: jobdescription, Length: 38941, dtype: object

In [23]:
# t= job.copy()
# t.to_csv('WIP.sorted_jobs_master_new.csv', index=False)

In [8]:
df_job_descriptions = job[['j_id','jobtitle','company' ]]
df_job_descriptions['jd_combo'] = job['jobtitle'] +" " +  job['jobdescription'] 
df_job_descriptions.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,j_id,jobtitle,company,jd_combo
0,0,walkin data entry operator (night shift),MM Media Pvt Ltd,walkin data entry operator (night shift) Qual...
1,1,work based onhome based part time.,find live infotech,work based onhome based part time. Qualificat...
2,2,pl/sql developer - sql,Softtech Career Infosystem Pvt. Ltd,pl/sql developer - sql - as a developer in pr...
3,3,manager/ad/partner - indirect tax - ca,Onboard HRServices LLP,manager/ad/partner - indirect tax - ca - Invo...
4,4,manager/ad/partner - indirect tax - ca,Onboard HRServices LLP,manager/ad/partner - indirect tax - ca - Invo...


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('ã¯æ’ëœ')
#Transforms words to TFIDF
vectorizer = TfidfVectorizer(stop_words = stopwords)

index = 0
keys = {}

for jd in df_job_descriptions.itertuples() :
    key = jd[1]
    keys[key] = index
    index += 1

#Fit the vectorizer to the data
vectorizer.fit(df_job_descriptions['jd_combo'].fillna(''))

#Transform the data
tfidf_scores = vectorizer.transform(df_job_descriptions['jd_combo'].fillna(''))

print(tfidf_scores.shape)
print(df_job_descriptions.shape)

  'stop_words.' % sorted(inconsistent))


(38941, 58510)
(38941, 4)


In [10]:
type(tfidf_scores)

scipy.sparse.csr.csr_matrix

In [11]:
test = pd.DataFrame(tfidf_scores.toarray(), columns = vectorizer.get_feature_names())

In [12]:
test.head()

Unnamed: 0,00,000,0000,00000,0000gmt,0001pt,00029,00034,000402,00053,...,ïƒ,ïƒ¼,ïƒž,œ100,œmost,œrecognition,œto,šâ,šã,žâ
0,0.0,0.056499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.068273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Creating my Stopword list 

### As seen there are so many unwanted tokens like numbers,ïƒ¼ etc , I need to add them in "stop words" list to train model 

In [13]:
#getting list of all tokens
word_list = test.columns.tolist()

In [14]:
##Getting a list of unwanted words as s_words and adding to stopwords
s_words =[]
for word in word_list:
    #print(word)
    if re.search("^\W|^\d",word):
        s_words.append(word)
        

In [15]:
s_words.append('')        
from nltk.corpus import stopwords
stopword_set = set(stopwords.words('english'))
stopword_set = list(stopword_set)
stopword_set.extend(s_words)

## Collecting all text data for DOC2VEC modelling

In [16]:
# df_job_descriptions = job[['j_id','jobtitle','company' ]]
# df_job_descriptions['jd_combo'] = job['jobtitle'] +" " +  job['jobdescription'] 
df_job_descriptions.head()

Unnamed: 0,j_id,jobtitle,company,jd_combo
0,0,walkin data entry operator (night shift),MM Media Pvt Ltd,walkin data entry operator (night shift) Qual...
1,1,work based onhome based part time.,find live infotech,work based onhome based part time. Qualificat...
2,2,pl/sql developer - sql,Softtech Career Infosystem Pvt. Ltd,pl/sql developer - sql - as a developer in pr...
3,3,manager/ad/partner - indirect tax - ca,Onboard HRServices LLP,manager/ad/partner - indirect tax - ca - Invo...
4,4,manager/ad/partner - indirect tax - ca,Onboard HRServices LLP,manager/ad/partner - indirect tax - ca - Invo...


In [17]:
docs = df_job_descriptions['jd_combo']
docs_sample = docs.head(10)
docs_sample

0    walkin data entry operator (night shift)  Qual...
1    work based onhome based part time.  Qualificat...
2    pl/sql developer - sql  - as a developer in pr...
3    manager/ad/partner - indirect tax - ca  - Invo...
4    manager/ad/partner - indirect tax - ca  - Invo...
5    manager/ad/partner - indirect tax - ca  - Invo...
6    manager/ad/partner - indirect tax - ca  - Invo...
7    manager/ad/partner - indirect tax - ca  - Invo...
8    manager/ad/partner - indirect tax - ca  - Invo...
9    java technical lead (6-8 yrs) -  Please share ...
Name: jd_combo, dtype: object

In [18]:
def preprocess(text):
    stop_words = stopword_set
    #0. split words by whitespace
    text = text.split()
    
    
    # 1. lower case
    text = [word.lower() for word in text]
    
    # 2. remove punctuations
    punc_table = str.maketrans('','',string.punctuation)
    text = [word.translate(punc_table) for word in text]
    
    # 3. remove stop words
    text = [word for word in text if word not in stop_words]
    
    return text

In [19]:
tokenized_doc = []
doc = df_job_descriptions['jd_combo']
#doc = docs_sample
for d in doc:
    tokenized_doc.append(preprocess(d))
#tokenized_doc

In [20]:
# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]

In [21]:
num_doc = len(tagged_data)
num_doc

38941

In [24]:
from gensim.test.utils import get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec

class EpochSaver(CallbackAny2Vec):

    def __init__(self, path_prefix):
        self.path_prefix = path_prefix
        self.epoch = 0

    def on_epoch_end(self, model):
        output_path = get_tmpfile('{}_epoch{}.model'.format(self.path_prefix, self.epoch))
        model.save(output_path)
        self.epoch += 1

In [25]:
class EpochLogger(CallbackAny2Vec):
    
    def __init__(self):
        self.epoch = 0
        
    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1

In [27]:
epoch_logger = EpochLogger()
## Train doc2vec model
model1 = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs = 200, callbacks=[epoch_logger])


Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end
Epoch #10 start
Epoch #10 end
Epoch #11 start
Epoch #11 end
Epoch #12 start
Epoch #12 end
Epoch #13 start
Epoch #13 end
Epoch #14 start
Epoch #14 end
Epoch #15 start
Epoch #15 end
Epoch #16 start
Epoch #16 end
Epoch #17 start
Epoch #17 end
Epoch #18 start
Epoch #18 end
Epoch #19 start
Epoch #19 end
Epoch #20 start
Epoch #20 end
Epoch #21 start
Epoch #21 end
Epoch #22 start
Epoch #22 end
Epoch #23 start
Epoch #23 end
Epoch #24 start
Epoch #24 end
Epoch #25 start
Epoch #25 end
Epoch #26 start
Epoch #26 end
Epoch #27 start
Epoch #27 end
Epoch #28 start
Epoch #28 end
Epoch #29 start
Epoch #29 end
Epoch #30 start
Epoch #30 end
Epoch #31 start
Epoch #31 end
Epoch #32 start
Epoch #32 end
Epoch #33 start
Epoch #33 end


In [28]:
# Save trained doc2vec model
model1.save("Model/my_doc2vec_v2.model")

In [30]:
## Load saved doc2vec model
model1= Doc2Vec.load("Model/my_doc2vec_v2.model")

In [31]:
#confirm length (should be 38941)
len(tokenized_doc)

38941

In [35]:
## Get vector value
vec = np.empty([38941,20])

for k,i in enumerate(tokenized_doc):
    
    #print(i)
    vector = model1.infer_vector(i)
    vec[k] = vector
    #vec = np.append(vector)
    #vecf = np.append(vec,vector)

# reshape into 2D
new_arr = np.reshape(vec,(-1,20))

In [36]:
rng = range(1, 21)
vec_df = pd.DataFrame(new_arr, columns=['vec_' + str(i) for i in rng])

In [37]:
vec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38941 entries, 0 to 38940
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   vec_1   38941 non-null  float64
 1   vec_2   38941 non-null  float64
 2   vec_3   38941 non-null  float64
 3   vec_4   38941 non-null  float64
 4   vec_5   38941 non-null  float64
 5   vec_6   38941 non-null  float64
 6   vec_7   38941 non-null  float64
 7   vec_8   38941 non-null  float64
 8   vec_9   38941 non-null  float64
 9   vec_10  38941 non-null  float64
 10  vec_11  38941 non-null  float64
 11  vec_12  38941 non-null  float64
 12  vec_13  38941 non-null  float64
 13  vec_14  38941 non-null  float64
 14  vec_15  38941 non-null  float64
 15  vec_16  38941 non-null  float64
 16  vec_17  38941 non-null  float64
 17  vec_18  38941 non-null  float64
 18  vec_19  38941 non-null  float64
 19  vec_20  38941 non-null  float64
dtypes: float64(20)
memory usage: 5.9 MB


In [38]:
con_job_1 = pd.concat([job, vec_df], axis=1)

In [39]:
con_job_1.to_csv('wip/con_job_1.csv', index=False)


# Ignore the code below.... This will be used for future reference - Helping jobseekers to better tune resumes by providing them relavant keywords 

###############################################################################################################################

In [16]:
from rake_nltk import Rake
import operator
r = Rake(min_length=1, max_length=2)

# from nlp_rake import rake
# stoppath = 'data/stoplists/SmartStoplist.txt'
# rake_object = rake.Rake(stoppath)

In [17]:

text='Hello World!!! miss, india new can zero cannot afford'
a=r.extract_keywords_from_text(text)
b=r.get_ranked_phrases()
c=r.get_ranked_phrases_with_scores()
print(b)
print(c)


['india new', 'miss']
[(4.0, 'india new'), (1.0, 'miss')]


In [22]:
#jd = pd.read_csv('data/working_jd_sample.csv')
#jd.head()

a=r.extract_keywords_from_text(X[0])
b=r.get_ranked_phrases()
c=r.get_ranked_phrases_with_scores()
print(c)

# keywords = rake_object.run(X[0])
# print ("keywords: ", keywords)

[(4.0, 'xml configuration'), (4.0, 'working knowledge'), (4.0, 'working across'), (4.0, 'wide selection'), (4.0, 'team lead'), (4.0, 'solutions company'), (4.0, 'preferred skills'), (4.0, 'open source'), (4.0, 'office presence'), (4.0, 'new zealand'), (4.0, 'mobility apps'), (4.0, 'majorly focusing'), (4.0, 'jobs like'), (4.0, 'job description'), (4.0, 'interpersonal skills'), (4.0, 'inr 7'), (4.0, 'good exposure'), (4.0, 'design role'), (4.0, 'data structure'), (4.0, 'company profile'), (4.0, 'basic knowledge'), (4.0, '9 years'), (3.666666666666667, 'web services'), (3.666666666666667, 'strong knowledge'), (3.666666666666667, 'strong analytical'), (3.666666666666667, 'socket programming'), (3.666666666666667, 'mom services'), (3.666666666666667, 'java programming'), (3.5, 'industry verticals'), (3.5, 'engineering graduate'), (3.5, '000 p'), (1.6666666666666667, 'strong'), (1.6666666666666667, 'services'), (1.6666666666666667, 'programming'), (1.5, 'industry'), (1.5, 'graduate'), (1.5,