# **APPENDIX**
### MSCI623- Big Data Analytics  
###### University of Waterloo - spring 2020

This code has two goals:
1.   Using the k-means algorithm to cluster job titles - *Unsupervised learning*
2.   Using a neural network to detect "fraud" and "not_fraud" job posts - *Supervised learning*  

Both are implemented in Python.


## Preparing coding environment

This code was run in **Google Colab**, and the coding environment has been set up accordingly.  
  
1.   Importing essential libraries and downloading some extra package components (the tensorflow library needs to be upgraded)
2.   Setting random states so that results are consistent on every run
3.   Turning warnings off
4.   Uploading data 
5.   Saving data as a dataframe
6.   Setting a folder on Google Drive to be able to save models, pictures and graphs
7.   Checking system GPU
8.   Defining the custom functions that will be used in the rest of the code


In [None]:
# Upgrade tensorflow library
# NOTE(review): the three installs below overlap -- the plain install is followed by
# `--upgrade` of the same package, and tf-nightly is then installed on top of it.
# Confirm which build the notebook is meant to run against.
!pip install tensorflow
!pip install --upgrade tensorflow
!pip install tf-nightly

In [None]:
# Import libraries
import io
import os
import warnings
import random
import statistics
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import re 
import nltk
import gensim
from nltk.tokenize import word_tokenize,sent_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from wordcloud import WordCloud
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from  tensorflow.keras.utils import plot_model
from sklearn.metrics import classification_report,confusion_matrix

print('All required libraries were imported')

In [None]:
# Fetch the NLTK resources used by this notebook, then build the stop-word set
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
# English stop words as a set; clean_text() filters tokens against it
stopwords = set(nltk.corpus.stopwords.words("english"))
print('\n', '*'*15, 'All extra packages downloaded and set', '*'*15)

In [None]:
# Set random seeds so repeated runs give consistent results
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
# A bare `PYTHONHASHSEED = 123` only binds a Python variable; export the value
# through the process environment instead.
# NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this is
# presumably only picked up by processes launched afterwards -- confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
PYTHONHASHSEED = SEED  # original variable kept so existing references still work
print('Random seeds set')

In [None]:
# Suppress every warning for the rest of the session
warnings.filterwarnings(action="ignore")
print('All warnings turned off')

In [None]:
# Upload the data file and wait until the upload reaches 100%. This might take some time.
# NOTE(review): the next cell reads 'fake_job_postings.csv', so presumably that is the
# file expected here -- confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the uploaded CSV file into a pandas dataframe
csv_path = 'fake_job_postings.csv'
job_posts = pd.read_csv(csv_path)
print('CSV file has been saved in pandas dataframe as "job_posts"')

In [None]:
# Set a folder to save images and tables
from google.colab import drive
drive.mount('/content/gdrive')
# Use the absolute path below the mount point: the original relative path only
# resolved while the working directory was the parent of 'gdrive', so re-running
# this cell after the chdir failed.
os.chdir('/content/gdrive/My Drive/MSCI 623-Project')
# Later cells write into 'Images/' and 'data/'; create them up front so the
# first savefig()/to_csv()/save() does not fail on a fresh project folder.
for folder in ('Images', 'data'):
    os.makedirs(folder, exist_ok=True)
print(os.getcwd())
print('MSCI623-Project in google drive set as repository')

In [None]:
# Show the name of the GPU device TensorFlow sees
# (when using the Google Colab GPU the output should be "/device:GPU:0")
tf.test.gpu_device_name()

In [None]:
# Normalise a raw text string and hand it back as one cleaned string
def clean_text(text):
  """Lower-case *text*, keep letters only, and drop English stop words.

  Every run of characters other than a-z is replaced by a single space, the
  result is tokenized with ``word_tokenize``, and tokens found in the
  module-level ``stopwords`` set are discarded. The remaining tokens are
  returned joined by single spaces.
  """
  lowered = text.lower()
  letters_only = re.sub(r"[^a-z]+", ' ', lowered)
  kept_tokens = [token for token in word_tokenize(letters_only) if token not in stopwords]
  return ' '.join(kept_tokens)

print('Personalized function have been loaded')

## Exploratory data analysis (EDA)

Next codes are to explore the original data.

In [None]:
# Report the dimensions of the raw data
n_rows, n_cols = job_posts.shape
print('# of columns:', n_cols)
print('# of rows:', n_rows)

In [None]:
# Preview the first rows of the raw dataframe
job_posts.head()

In [None]:
# Summary statistics of the numeric columns of the data
job_posts.describe()

The "job_id" column is the table key, and the other four numeric columns, "telecommuting", "has_company_logo", "has_questions" and "fraudulent", are binary variables.  
The "fraudulent" column is the target column.


In [None]:
# Count how many posts fall into each class of the target column
fake_count = (job_posts["fraudulent"] == 1).sum()
true_count = (job_posts["fraudulent"] == 0).sum()
total = fake_count + true_count
# Both lines share one template; the original second line was missing its closing ')'
share_template = '# of {} posts: {} ({:4.2f} % of total)'
print(share_template.format('fake', fake_count, fake_count * 100 / total))
print(share_template.format('true', true_count, true_count * 100 / total))

In [None]:
# Pie chart of the class balance in the target column
fig, axes = plt.subplots()
plt.tight_layout()

class_counts = job_posts["fraudulent"].value_counts()
pie_labels = ['not_fraudulent \n (~95%)', 'fraudulent \n (~5%)']
class_counts.plot(kind='pie', labels=pie_labels, colormap='Set3')
axes.set_ylabel(' ')
plt.title('Fraudulent and legitimate(not_fraudulent) share of data', fontsize=13)

fig.savefig('Images/datashare.png')
plt.show()

Data is considerably imbalanced.

In [None]:
# Visualizing the share of the other three binary variables, split by the target value
def _plot_binary_share(column, labels, ylabel, startangle, out_path):
    """Draw one pie per target class for a binary column and save the figure.

    column     -- name of the binary column in job_posts
    labels     -- mapping {1: ..., 0: ...} used to rename the crosstab columns
    ylabel     -- y-axis label shown next to the left-hand pie
    startangle -- start angle passed to both pie plots
    out_path   -- file the figure is saved to
    Returns the crosstab that was plotted.
    """
    fig, ax = plt.subplots(1, 2)
    plt.tight_layout()

    tab = pd.crosstab(job_posts['fraudulent'], job_posts[column])
    tab.rename(columns=labels, inplace=True)
    for i, cat in enumerate(tab.index):
        tab.loc[cat].plot.pie(ax=ax[i], startangle=startangle, colors=np.array(['skyblue', 'khaki']))

    # Titles and axis labels do not depend on the loop variable, so they are set
    # once after plotting instead of on every iteration.
    ax[0].set_title('True_job_posts', fontweight='bold')
    ax[1].set_title('Fake_job_posts', fontweight='bold')
    ax[0].set_ylabel(ylabel)
    ax[0].yaxis.labelpad = 20.0
    ax[1].set_ylabel('')

    fig.savefig(out_path)
    return tab


tab1 = _plot_binary_share('telecommuting', {1: 'telecom', 0: 'not_telecom'},
                          'telecommuting position', 260, 'Images/telecom_pos_share.png')
tab2 = _plot_binary_share('has_company_logo', {1: 'has_logo', 0: 'no_logo'},
                          'has_company_logo in posts', 45, 'Images/Logo_share.png')
# 'has_question' replaces the misspelled pie label 'has_questin'
tab3 = _plot_binary_share('has_questions', {1: 'has_question', 0: 'no_question'},
                          'has_question in posts', 180, 'Images/question_share.png')

While most real job posts show the company logo, a large portion of fake job posts have no logo. Moreover, a smaller share of fake job posts have questions.

In [None]:
# Pearson correlation between the three binary variables and the target value
binary_cols = ['fraudulent','telecommuting','has_company_logo','has_questions']
cor_data = job_posts[binary_cols]
corr = cor_data.corr(method="pearson")

fig, axis = plt.subplots(figsize=(8, 8))
plt.tight_layout()

heatmap_opts = dict(cmap="RdBu", center=0.00, annot=True, fmt='.1g', cbar_kws={'label': 'correlation'})
sns.heatmap(corr, **heatmap_opts)
# NOTE(review): the font scale is set after the heatmap is drawn, so it presumably
# only affects later plots -- confirm this ordering is intended.
sns.set(font_scale=1.3)

fig.savefig('Images/binary_cor.png')
plt.show()

Though there is no high correlation between variables, telecommuting position shows a bigger positive correlation with target value in comparison to the other two.

In [None]:
# Count missing values per column, broken down by target class, and save the table
missing_flags = job_posts.isna()
NA = missing_flags.groupby(job_posts.fraudulent).sum().T
NA['Total'] = NA.sum(axis=1)
NA.to_csv('data/NA_table.csv', index=True, encoding='utf-8')
NA

The data set has a lot of missing information. However, based on the text mining approach in this project, all text columns will be considered as a package of text information. Thus, the only concern will be the salary range column, which could be eliminated.

### Cleaning data

All 12 text columns, "title", "location", "department", "company_profile", "description", "requirements", "benefits", "employment_type", "required_experience", "required_education", "industry" and "function", will be combined into an "information" column, and the original columns will be removed, except for the "title" column. The "title" column is needed for unsupervised learning (clustering).

In [None]:
# Work on a copy so the original dataset stays untouched
data = job_posts.copy()
data.fillna(" ", inplace=True)

# Text columns merged into the single 'information' column, in this order
text_columns = ['title', 'location', 'department', 'company_profile', 'description',
                'requirements', 'benefits', 'employment_type', 'required_experience',
                'required_education', 'industry', 'function']

# Join the columns with exactly one space between neighbours. The original
# expression had no separator between 'department' and 'company_profile', which
# fused the last word of one with the first word of the other.
information = data[text_columns[0]]
for column in text_columns[1:]:
    information = information + ' ' + data[column]
data['information'] = information

# Keep only the columns used downstream; this also drops job_id, salary_range,
# the binary flags and the merged text columns.
data = data[['title', 'information', 'fraudulent']]
data.head()

The new data set has no missing value in the remaining columns and data is ready for the next steps.

## Unsupervised learning - Clustering

The **k-means clustering** algorithm will be used to cluster **job titles** for unsupervised learning.   
The k-means algorithm expects numeric data as input. Thus, the **doc2vec** approach will be applied to convert the textual "title" data to a numeric format.

In [None]:
# Clean both text columns on a working copy and preview it (this might take some time)
df = data.copy()
for text_col in ('information', 'title'):
    df[text_col] = df[text_col].apply(clean_text)
df.head()

### Doc2vec for job titles

Document to vector(doc2vec) model of "gensim" library will be implemented to convert titles to vectors.

In [None]:
# Length (number of characters) of every cleaned title, with a few summary statistics
title_len = df.title.map(len).tolist()
print('"Title length information"')
shortest = int(np.min(title_len))
print(' Minimun:', shortest)
most_common = int(statistics.mode(title_len))
print(' Mode:', most_common)
pct_99 = int(np.percentile(title_len, 99))
print(' 99 percentile:', pct_99)
longest = int(np.max(title_len))
print(' Maximum:', longest)

In [None]:
# Tokenize each cleaned title and tag it with its row position for doc2vec training
titles = [word_tokenize(title) for title in df.title]
tagged_titles = [TaggedDocument(tokens, [position]) for position, tokens in enumerate(titles)]
print('Title data are ready to be used to train doc2vec model')

In [None]:
# Build, train and save the doc2vec model for titles (this might take some time)
title_d2v_params = dict(vector_size=20, min_count=2, epochs=10)
d2v_titels = Doc2Vec(**title_d2v_params)
d2v_titels.build_vocab(tagged_titles)
d2v_titels.train(
    tagged_titles,
    total_examples=d2v_titels.corpus_count,
    epochs=d2v_titels.epochs,
)
print('Doc2Vec model for titles trained')

d2v_titels.save('data/d2v_titels.model')
print('Doc2vec model for titles saved in the data folder')

In [None]:
# Extracting the doc2vec vectors for the job titles
# NOTE(review): `docvecs.vectors_docs` looks like the attribute path of older gensim
# releases; presumably newer releases expose the array under a different name --
# confirm against the installed gensim version.
titles_vec = d2v_titels.docvecs.vectors_docs
print('All',len(titles_vec),'job titles converted to their vectors.')

### k-means clustering

To find the optimal number of clusters, the visual approach of "Elbow method" will be used.


In [None]:
# Fit k-means for k = 1..24 and record the SSE (inertia) of each fit (this might take some time)
distortations = {}
for n_clusters in range(1, 25):
  kmeans = KMeans(n_clusters=n_clusters, init='k-means++').fit(titles_vec)
  distortations[n_clusters] = kmeans.inertia_
print('Data for different amount of K has been saved')

In [None]:
# Plotting the elbow curve (SSE against number of clusters)
fig, axes = plt.subplots(figsize=(8, 8))
plt.tight_layout()

plt.plot(list(distortations.keys()), list(distortations.values()), color='red')
plt.title('Elbow curve to find number of clusters for titles')
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
axes.set_facecolor('white')
# The visibility flag is passed positionally: the `b=` keyword used originally was
# renamed in newer matplotlib releases, while the positional form works in both.
plt.grid(True, which='major', color='lightgrey', linestyle='--', axis='x')
plt.grid(None, which='minor', color='lightgrey', linestyle=':', axis='x')
plt.minorticks_on()
for side in ('bottom', 'top', 'right', 'left'):
    axes.spines[side].set_color('0.5')

fig.savefig('Images/elbow.png')
plt.show()

Thus, the best number of clusters could be 6.

In [None]:
# Fit the final k-means model with six clusters and keep the cluster label of every title
n_title_clusters = 6
km_model = KMeans(n_clusters=n_title_clusters, init='k-means++').fit(titles_vec)
labels = km_model.labels_.tolist()
print('k-mean clustering model has been created with k=6')

The "Principal Component Analysis(PCA)" will be used to plot the clusters and show how the model performs visually. PCA makes it easier to see clusters.


In [None]:
# To visualize the clustering model
# Re-use the assignments of the already fitted model. Calling fit_predict() here
# re-trained k-means, so the plotted clusters and centroids could differ from the
# `labels` taken from the first fit.
predict = km_model.labels_
# Project vectors and centroids onto the first two principal components for plotting
pca = PCA(n_components=2).fit(titles_vec)
datapoint = pca.transform(titles_vec)

fig, axes = plt.subplots(figsize=(8, 8))
plt.tight_layout()

color_theme = ['darkgray','lightsalmon','powderblue','steelblue','gold', 'darkkhaki','turquoise','hotpink']
color = [color_theme[i] for i in predict]

plt.scatter(datapoint[:, 0], datapoint[:, 1], c=color)
centroids = km_model.cluster_centers_
centroidpoint = pca.transform(centroids)
plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s=150, c='black')
plt.title('K-Means Classification')
axes.set_facecolor('white')
axes.grid()
for side in ('bottom', 'top', 'right', 'left'):
    axes.spines[side].set_color('0.5')

fig.savefig('Images/clusters.png')
plt.show()

Now we will add labels to our dataset.

In [None]:
# Attach the cluster label of every title and put the columns in display order
df['title_cluster'] = labels
column_order = ['title', 'title_cluster', 'information', 'fraudulent']
df = df[column_order]
df.head()

Take a look at the number of data points in each cluster:

In [None]:
# Number of titles in each cluster
per_cluster = df.groupby('title_cluster').count()
c_table = per_cluster.drop(['information', 'fraudulent'], axis=1)
c_table

## Supervised learning - Classification

The goal is to classify job posts into fraudulent and legitimate based on the text information.

### Data preparation

To apply text mining approaches to the information column, it is good to get a better insight into the texts in that column.

In [None]:
# Prepare the classification data: cleaned text, its tokens and the target
cdata = df.copy()
cdata['tokenized_info'] = cdata['information'].apply(word_tokenize)
# Selecting the three needed columns also removes 'title' and 'title_cluster'
cdata = cdata[['information', 'tokenized_info', 'fraudulent']]
cdata.head()

In [None]:
# Token counts of the information column, separately for fake and true posts
token_counts = cdata['tokenized_info'].apply(len)
fake_info_len = list(token_counts[cdata["fraudulent"] == 1])
true_info_len = list(token_counts[cdata["fraudulent"] == 0])
all_info_len = true_info_len + fake_info_len
print('number of fake sentences:', len(fake_info_len))
print('number of true sentences:', len(true_info_len))
print('Maximum number of words in information part for fake posts:', max(fake_info_len))
print('Maximum number of words in information part for true posts:', max(true_info_len))

In [None]:
# Histograms of the number of tokens per post, one panel per class
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 5))

panels = [
    (ax1, fake_info_len, 'lightsalmon', 'fraudulent posts'),
    (ax2, true_info_len, 'steelblue', 'not_fraudulent Post'),
]
for panel, lengths, bar_color, panel_title in panels:
    panel.hist(lengths, bins=20, color=bar_color)
    panel.set_title(panel_title)
    panel.set_xlabel('Number of words', fontweight='bold', fontsize=11)
    panel.set_facecolor('white')
    for side in ('bottom', 'top', 'right', 'left'):
        panel.spines[side].set_color('0.5')

fig.suptitle('Number of words in information part', fontweight='bold', fontsize=12)

fig.savefig('Images/info_len.png')
plt.show()

The "word cloud" is a famous visual method in natural language processing to get an insight into text data.

In [None]:
# Word clouds of the most frequent tokens, one per class
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 5), dpi=100)
plt.tight_layout()


def _make_cloud(texts):
    """Build a word cloud from an iterable of cleaned text strings."""
    cloud = WordCloud(width=1400, height=800, max_words=500, background_color='white',
                      stopwords=stopwords, min_font_size=8)
    return cloud.generate(" ".join(texts))


wc_fake = _make_cloud(cdata[cdata.fraudulent == 1].information)
ax1.imshow(wc_fake)
ax1.set_title('fraudulent posts')
ax1.axis('off')

wc_true = _make_cloud(cdata[cdata.fraudulent == 0].information)
ax2.imshow(wc_true)
ax2.set_title('not_fraudulent posts')
ax2.axis('off')

plt.axis("off")
# Switch the grid off explicitly. The original `plt.grid(b=None)` toggled the grid
# rather than disabling it and relied on a keyword that newer matplotlib renamed.
plt.grid(False)

fig.savefig('Images/wordclouds.png')
plt.show()

In [None]:
# Print the dtype of every column of the classification dataset
print('Dataset data types')
print(cdata.dtypes)

For supervised learning, the data is split into three parts. The "Training" set will be used to train the model, the "Validation" set will be used for tuning the model's hyperparameters, and the "Test" set will be used to check the performance of the model.

In [None]:
# Splitting data into three disjoint sets: training (80%), validation (10%), test (10%)
x = list(cdata.tokenized_info)
y = to_categorical(np.array(cdata.fraudulent))

# Step 1: hold out 10% of all data as the test set.
rest_x, test_x, rest_y, test_y = train_test_split(x, y, test_size=0.1, random_state=123)
# Step 2: split the REMAINING data into training and validation. The original code
# split the full data twice with the same seed, which made the validation and test
# sets identical and left 90% (not 80%) for training. Passing an integer makes the
# validation set exactly as large as the test set.
train_x, val_x, train_y, val_y = train_test_split(rest_x, rest_y, test_size=len(test_x), random_state=123)

print('Data randomly splited to the training set(80%), validation set(10%) and test set(10%)')
print('*'*20)
print('training set size=', len(train_x))
print('validation set size=', len(val_x))
print('testing set size=', len(test_x))


### Doc2vec for job posts text data

Document to vector model will be applied to the information column which contains all text information about the job posts.

In [None]:
# Tag every training document with its position so it can be fed to doc2vec
tagged_info = [TaggedDocument(tokens, [position]) for position, tokens in enumerate(train_x)]
print('Training data is ready to be used for doc2vec model')

In [None]:
# Build, train and save the doc2vec model for the information column
info_d2v_params = dict(vector_size=100, min_count=1, epochs=20)
d2v_info = Doc2Vec(**info_d2v_params)
d2v_info.build_vocab(tagged_info)
d2v_info.train(
    tagged_info,
    total_examples=d2v_info.corpus_count,
    epochs=d2v_info.epochs,
)
print('Doc2Vec model has been trained for information column')

d2v_info.save('data/d2v_info.model')
print('Doc2vec model for information column saved in data folder')

### Pre Processing data

All three data sets information have to be converted into vectors to be used as an input of the neural network.

In [None]:
# Convert the token lists of all three sets to doc2vec vectors (this might take some time)
train_vec, val_vec, test_vec = (
    np.array([d2v_info.infer_vector(tokens) for tokens in documents])
    for documents in (train_x, val_x, test_x)
)
print('pre-processing data is finished')

### Classification models

Two classifiers will be used for classification: **Decision tree** and **Neural networks**.  

#### Decision tree Model

In [None]:
# Create the decision tree and fit it on the training vectors
dt_model = DecisionTreeClassifier(class_weight='balanced', criterion='entropy')
# The targets are one-hot encoded; argmax turns them back into class indices
train_labels = np.argmax(train_y, axis=1)
dt_model.fit(train_vec, train_labels)

##### Testing decision tree model performance

In [None]:
# Predict on the test set and print the classification report of the decision tree
dt_pred = dt_model.predict(test_vec)
test_labels = np.argmax(test_y, axis=1)
dt_report = classification_report(test_labels, dt_pred, target_names=['0', '1'])
print(dt_report)

In [None]:
# Confusion matrix of the decision tree as a heatmap (rows: actual, columns: predicted)
dt_counts = confusion_matrix(np.argmax(test_y, axis=1), dt_pred)
dt_cm = pd.DataFrame(dt_counts).rename_axis('Actual').rename_axis('Predicted', axis='columns')

fig, axis = plt.subplots(figsize=(8, 8))
plt.tight_layout()

sns.heatmap(dt_cm, cmap="Blues", annot=True, fmt='')

fig.savefig('Images/dt_confusion_matrix.png')
plt.show()

#### Neural network - Sequential model

In [None]:
# Defining the model: three hidden blocks (Dense -> Dropout -> BatchNormalization)
# followed by a two-unit output layer
model = Sequential(name='Neural_Model')
model.add(Dense(512, activation='relu', input_dim=100))  # input size 100, as in the original definition
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())
# Softmax instead of sigmoid: the targets are one-hot encoded and the loss is
# categorical cross-entropy, so the two outputs should form one probability
# distribution rather than two independent probabilities.
model.add(Dense(2, activation='softmax'))

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Draw the model architecture (with layer shapes and names) and save it as an image
plot_model(model, to_file='Images/Neural_model.png', show_shapes=True, show_layer_names=True)

In [None]:
# Train the network (this takes some time)
# Class weights passed to fit(): class 1 is weighted 20 times heavier than class 0
weights = {0: 5, 1: 100}
history = model.fit(
    train_vec,
    train_y,
    class_weight=weights,
    batch_size=64,
    epochs=100,
    validation_data=(val_vec, val_y),
)
print('\n', '*'*20, 'Model trained', '*'*20)

In [None]:
# Saving the sequential model to the data folder
model.save('data/sequential_model')
print('Sequential model has been saved')

In [None]:
# Plotting the accuracy and loss of the training and validation sets per epoch
train_acc = history.history['accuracy']
train_loss = history.history['loss']
val_acc = history.history['val_accuracy']
val_loss = history.history['val_loss']
# Take the epoch count from the recorded history instead of hard-coding 100, so
# the plot still works when the number of training epochs changes.
epochs = list(range(len(train_acc)))

# 20x10 inches is the size the original figure ended up with after its
# set_size_inches() call; it is requested directly here.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 10), dpi=100)

ax1.plot(epochs, train_acc, 'co-', label='Training Accuracy')
ax1.plot(epochs, val_acc, 'yo-', label='Validation Accuracy')
ax1.set_title('Training & Validation Accuracy per epochs', fontsize=12, fontweight='bold')
ax1.legend()
ax1.set_ylabel("Accuracy")

ax2.plot(epochs, train_loss, 'co-', label='Training Loss')
ax2.plot(epochs, val_loss, 'yo-', label='Validation Loss')
ax2.set_title('Training & Validation Loss per epochs', fontsize=12, fontweight='bold')
ax2.legend(fontsize=13)
ax2.set_ylabel("Loss")

# Shared cosmetics for both panels
for panel in (ax1, ax2):
    panel.set_xlabel("Epochs")
    panel.set_facecolor('white')
    for side in ('bottom', 'top', 'right', 'left'):
        panel.spines[side].set_color('0.5')

fig.suptitle('Fitting model history per epoches', fontweight='bold')

fig.savefig('Images/learning_loss.png')
plt.show()

##### Testing Neural model performance

In [None]:
# To apply model on test set
pred_prop = model.predict(test_vec)
# Pick the most probable class per row. Rounding each of the two outputs
# independently could produce rows with no class or with both classes set.
pred_labels = np.argmax(pred_prop, axis=1)
# `pred` stays one-hot encoded because the confusion-matrix cell applies argmax to it
pred = to_categorical(pred_labels, num_classes=pred_prop.shape[1])
# Report on class indices, in the same way as the decision-tree report
report = classification_report(np.argmax(test_y, axis=1), pred_labels, target_names=['0', '1'])
print(report)

In [None]:
# Confusion matrix of the neural network as a heatmap (rows: actual, columns: predicted)
actual_labels = np.argmax(test_y, axis=1)
predicted_labels = np.argmax(pred, axis=1)
cm = pd.DataFrame(confusion_matrix(actual_labels, predicted_labels))
cm = cm.rename_axis('Actual').rename_axis('Predicted', axis='columns')

fig, axis = plt.subplots(figsize=(8, 8))
plt.tight_layout()

sns.heatmap(cm, cmap="Blues", annot=True, fmt='')

fig.savefig('Images/confusion_matrix.png')
plt.show()