In [None]:
import os
import string
import random
import operator
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import matplotlib.pyplot as plt
import math
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

In [None]:
train_df= pd.read_csv("256/train.csv")

UNDERSAMPLING

In [None]:
sincerequestions=train_df[:][train_df['target']==0]
insincerequestions=train_df[:][train_df['target']==1]
sincerequestions_under=sincerequestions.sample(len(insincerequestions))
train_under = pd.concat([sincerequestions_under,insincerequestions], axis=0)
train_temp = train_under.drop(['target'],axis=1)

LOADING EMBEDDINGS FROM GLOVE

In [None]:
embeddings_index = {}
f = open('256/embeddings/glove.840B.300d/glove.840B.300d.txt')
for line in tqdm(f):
    values = line.split(" ")
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

PREPROCESSING

In [None]:
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

parser = English()
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [None]:
train_temp["question_text"] = train_temp["question_text"].apply(lambda x: spacy_tokenizer(x))

SPLITTING TRAIN AND TEST

In [None]:
train_x, val_x,train_y,val_y = train_test_split(train_temp,train_under['target'],test_size=.20, random_state=0)

In [None]:
# Convert values to embeddings
def text_to_array(text):
    empyt_emb = np.zeros(300)
    text = text[:-1].split()[:30]
    embeds = [embeddings_index.get(x, empyt_emb) for x in text]
    embeds+= [empyt_emb] * (30 - len(embeds))
    return np.array(embeds)

X_train = [text_to_array(X_text) for X_text in tqdm(train_x["question_text"])]
X_val = [text_to_array(X_text) for X_text in tqdm(val_x["question_text"])]


RESHAPING 3D ARRAY

In [None]:
import numpy as np
trainvects=np.asarray(X_train)
valvects=np.asarray(X_val)
nsamples, nx, ny = trainvects.shape
X_train = trainvects.reshape((nsamples,nx*ny))
msamples, mx, my = valvects.shape
X_val = valvects.reshape((msamples,mx*my))

LOGISTIC REGRESSION

In [30]:
from sklearn.linear_model import LogisticRegression
logr = LogisticRegression()
logr.fit(X_train,train_y)
y_pred1=logr.predict(X_val)



XG BOOST CLASSIFIER

In [16]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train,train_y)
y_pred2 = xgb.predict(X_val)

BERNOULLI NAIVE BAYES CLASSIFIER

In [23]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB(alpha=0.01)
bnb.fit(X_train,train_y)
y_pred3 = bnb.predict(X_val)

EVALUATION USING ACCURACY SCORE and F1 SCORE

In [20]:
from sklearn.metrics import f1_score, balanced_accuracy_score

In [21]:
def evaluate(model,y_predict):
    print(model)
    f1=f1_score(val_y,y_predict)
    accuracy= balanced_accuracy_score(val_y,y_predict)
    print("F1 score:",f1)
    print("Accuracy:",accuracy)
    return f1,accuracy
    

In [None]:
f1_logr,acc_logr=evaluate("STANDARD VECTOR CLASSIFIER",y_pred1)

In [22]:
f1_xgb,acc_xgb=evaluate("XGBOOST CLASSIFIER",y_pred2)

XGBOOST CLASSIFIER
F1 score: 0.7118743794358842
Accuracy: 0.7589469339048308


In [24]:
f1_bnb,acc_bnb=evaluate("BERNOULLI NAIVE BAYES",y_pred3)

BERNOULLI NAIVE BAYES
F1 score: 0.8307089186540346
Accuracy: 0.8401706755764279


EVALUATION GRAPH

In [None]:
import matplotlib.pyplot as plt

In [None]:
objects = ('STANDARD VECTOR CLASSIFIER', 'XGBOOST CLASSIFIER', 'BERNOULLI NAIVE BAYES')
y_pos = np.arange(len(objects))
performance1 = [f1_logr,f1_xgb,f1_bnb]
performance2=[acc_logr,acc_xgb,acc_bnb]

In [None]:
plt.bar(y_pos, performance1, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('F1 score')
plt.title('Classifier')

plt.show()

In [None]:
plt.bar(y_pos, performance2, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy')
plt.title('Classifier')

plt.show()