# Fitting Logistic Regression model with SMOTE

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import nltk
import string
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix



Let us load the dataset and check the dataset

In [3]:
#Loading the data
df = pd.read_csv('Data/fake_job_posting_clean.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,country,has_company_logo,employment_type,required_experience,required_education,function,fraudulent,text
0,0,US,1,Other,Internship,Unspecified,Marketing,0,"Marketing Intern We're Food52, and we've creat..."
1,1,NZ,1,Full-time,Not Applicable,Unspecified,Customer Service,0,Customer Service - Cloud Video Production 90 S...
2,2,US,1,Other,Not Applicable,Unspecified,Other,0,Commissioning Machinery Assistant (CMA) Valor ...
3,3,US,1,Full-time,Mid-Senior level,Bachelor's Degree,Sales,0,Account Executive - Washington DC Our passion ...
4,4,US,1,Full-time,Mid-Senior level,Bachelor's Degree,Health Care Provider,0,Bill Review Manager SpotSource Solutions LLC i...


We will drop the 'Unnamed :0' index column

In [4]:
##Removing the index column
df.drop('Unnamed: 0', axis=1, inplace = True)

We will now split the dataset into training and test dataset

In [4]:
## Train Test Split of data
X = df.drop('fraudulent', axis =1)
y = df['fraudulent']

X_train,X_test, y_train,y_test = train_test_split(X,y,test_size = 0.20, stratify = y, random_state = 1)

The index of the train and test dataset are reset

In [5]:
##Resetting the index the test and train data
X_train = X_train.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

The categorical variable are takeninto a separate dataframe and using onehot encoder all the categorical features are coverted in dummy variables in the column transformer function.

In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
X_train_ohe = X_train.drop(columns = ['has_company_logo', 'text'], axis = 1)
X_test_ohe = X_test.drop(columns = ['has_company_logo', 'text'], axis = 1)

In [8]:
X_train_ohe.head()
X_test_ohe.head()

Unnamed: 0,country,employment_type,required_experience,required_education,function
0,US,Full-time,Associate,Bachelor's Degree,Marketing
1,US,Full-time,Not Applicable,Unspecified,Other
2,CA,Full-time,Not Applicable,Unspecified,Information Technology
3,US,Part-time,Not Applicable,Unspecified,Customer Service
4,GB,Full-time,Associate,Bachelor's Degree,Sales


In [9]:
col_transform = [('OHE_transform', OneHotEncoder(handle_unknown = 'ignore'), ['country',
                                                                              'employment_type',
                                                                              'required_education',
                                                                              'required_experience',
                                                                              'function'])]
col_trans = ColumnTransformer(col_transform)
col_trans.fit(X_train_ohe)
X_train_e = col_trans.transform(X_train_ohe)
X_test_e = col_trans.transform(X_test_ohe)

The column transformer returned as a sparse matrix and hence converting the array into a dataframe

In [10]:
##Converting the one hot encoded array to a dataframe
train_ohe = pd.DataFrame(columns=col_trans.get_feature_names(), data=X_train_e.toarray())
test_ohe= pd.DataFrame(columns=col_trans.get_feature_names(), data=X_test_e.toarray())

As we did during the modelling, the tokenniser is defined.

In [11]:
nltk.download('stopwords')
stemmer = nltk.stem.PorterStemmer()

from nltk.corpus import stopwords 
ENGLISH_STOP_WORDS = stopwords.words('english')

def my_tokenizer(sentence):
    
    for punctuation_mark in string.punctuation:
        # Remove punctuation and set to lower case
        sentence = sentence.replace(punctuation_mark,'').lower()

    # split sentence into words
    listofwords = sentence.split(' ')
    listofstemmed_words = []
    
        
    # Remove stopwords and any tokens that are just empty strings
    for word in listofwords:
        if (not word in ENGLISH_STOP_WORDS) and (word!=''):
            # Stem words
            stemmed_word = stemmer.stem(word)
            listofstemmed_words.append(stemmed_word)

    return listofstemmed_words

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaiganeshkannan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The text columns is stored ina separate dataframe to convert it into token using TF-IDF vectoriser.

In [12]:
X_train_t = X_train['text']
X_test_t = X_test['text']

In [13]:
##Creating a sparse matrix for text
text = TfidfVectorizer(min_df=20, tokenizer=my_tokenizer, ngram_range = (1,3))
text.fit(X_train_t)
X_train_t = text.transform(X_train_t)
X_test_t = text.transform(X_test_t)



In [None]:
##Converting the array to a dataframe
train_t = pd.DataFrame(columns=text.get_feature_names(), data=X_train_t.toarray())
test_t= pd.DataFrame(columns=text.get_feature_names(), data=X_test_t.toarray())

We will now concatinate the onehotencoded dataframe and vectorised token dataframe with the original dataframe. We willthen drop the original columns that are transformed.

In [15]:
#combining the dataset, on=hehotencoded dataset and vectorised dataset
X_train_ct = pd.concat([X_train,train_t,train_ohe], axis = 1)
X_test_ct = pd.concat([X_test,test_t,test_ohe], axis = 1)

In [16]:
#Dropping the dupliated columns
X_train_ct.drop(columns = ['text','country','employment_type','required_education',
                           'required_experience','function'] , axis=1, inplace = True)
X_test_ct.drop(columns = ['text','country','employment_type','required_education',
                           'required_experience','function'] , axis=1, inplace = True)

We will now over sample the train data using SMOTE

In [17]:
# instantiate SMOTE
sm = SMOTE(sampling_strategy = 0.3, random_state = 1)

# performing the SMOTE oversampling
X_train_sm, y_train_sm = sm.fit_resample(X_train_ct, y_train)



Logistic regression model is run with the hyper parameter values from th ebest estimator identified.

In [18]:
%%time
#Instantiate the model
lg_s_model = LogisticRegression(solver = 'liblinear', C=0.5, penalty = 'l2', random_state = 10)
#Fit the model
lg_s_model.fit(X_train_sm, y_train_sm)

CPU times: user 6.46 s, sys: 1.36 s, total: 7.82 s
Wall time: 6.73 s


LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=10, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
#scoring the model
print(lg_s_model.score(X_test_ct, y_test))
print(lg_s_model.score(X_train_sm, y_train_sm))

0.9792613636363636
0.9803583735354927


We can now retrieve the positive and the negative coeffients using `.coef_`and iterpret it.

In [19]:
##Storing the coefficients to a dataframe
coeff = pd.DataFrame(lg_s_model.coef_, columns = X_train_ct.columns)

In [20]:
#Transposing the coefficients
coeff_col = coeff.T

In [21]:
#Identifying the top 5 negative coefficients
coeff_col.sort_values(by = [0], ascending=True).head(5)

Unnamed: 0,0
has_company_logo,-2.092332
OHE_transform__x4_Health Care Provider,-1.92884
OHE_transform__x0_GR,-1.671207
team,-1.66867
OHE_transform__x1_Temporary,-1.618331


In [22]:
#Identifying the top 5 positive coefficients
coeff_col.sort_values(by = [0], ascending=False).head(5)

Unnamed: 0,0
unkc,3.221086
earn,3.164142
money,2.639044
OHE_transform__x0_MY,2.44814
assist,2.172617
