In [49]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.utils import resample

In [50]:
train=pd.read_csv("train.csv")
val=pd.read_csv("validation.csv")
test=pd.read_csv("test.csv")

In [51]:
print("Train size =",len(train))
print("Test size =",len(test))
print("Validation size =",len(val))

Train size = 3900
Test size = 836
Validation size = 836


In [52]:
y_train,X_train=train["label"],train["text"]
y_val,X_val=val["label"],val["text"]
y_test,X_test=val["label"],val["text"]

In [53]:
train['label'].value_counts()

0    3383
1     517
Name: label, dtype: int64

Highly imbalanced data. Hence, we oversample the minority class.

In [54]:
train_majority = train[train["label"]==0]
train_minority = train[train["label"]==1]

In [55]:
train_minority_upsampled = resample(train_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(train_majority),    # to match majority class
                                 random_state=1234) # reproducible results

In [56]:
train = pd.concat([train_majority, train_minority_upsampled])

In [57]:
y_train,X_train=train["label"],train["text"]
y_val,X_val=val["label"],val["text"]
y_test,X_test=val["label"],val["text"]

In [58]:
train['label'].value_counts()

0    3383
1    3383
Name: label, dtype: int64

In [59]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase='True')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [60]:
print(X_train.shape)
print(X_train_features.shape)

(6766,)
(6766, 6997)


# Logistic Regression

In [61]:
model1 = LogisticRegression()

In [62]:
# training the Logistic Regression model with the training data
model1.fit(X_train_features, y_train)

LogisticRegression()

In [63]:
# prediction on training data

prediction_on_training_data = model1.predict(X_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [64]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9979308306237068


In [65]:
# prediction on test data

y_pred = model1.predict(X_test_features)
accuracy_on_test_data = accuracy_score(y_test, y_pred)

In [66]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9892344497607656


In [67]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       725
           1       0.99      0.93      0.96       111

    accuracy                           0.99       836
   macro avg       0.99      0.96      0.98       836
weighted avg       0.99      0.99      0.99       836



# MLP Classifier

In [68]:
model2=MLPClassifier()
model2.fit(X_train_features, y_train)

MLPClassifier()

In [69]:
# prediction on training data

prediction_on_training_data = model2.predict(X_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [70]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  1.0


In [71]:
# prediction on test data

y_pred = model2.predict(X_test_features)
accuracy_on_test_data = accuracy_score(y_test, y_pred)

In [72]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9880382775119617


In [73]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       725
           1       0.99      0.92      0.95       111

    accuracy                           0.99       836
   macro avg       0.99      0.96      0.97       836
weighted avg       0.99      0.99      0.99       836



# Naive Bayes Classifier

### Gaussian NB Classifier

In [74]:
model3 = GaussianNB()
model3.fit(X_train_features.toarray() , y_train)

GaussianNB()

In [75]:
# prediction on training data

prediction_on_training_data = model3.predict(X_train_features.toarray())
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [76]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9694058527933787


In [77]:
# prediction on test data

y_pred = model3.predict(X_test_features.toarray())
accuracy_on_test_data = accuracy_score(y_test, y_pred)

In [78]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.8779904306220095


In [79]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.89      0.93       725
           1       0.53      0.80      0.64       111

    accuracy                           0.88       836
   macro avg       0.75      0.85      0.78       836
weighted avg       0.91      0.88      0.89       836



### Multinomial NB

In [80]:
model4 = MultinomialNB()
model4.fit(X_train_features.toarray() , y_train)

MultinomialNB()

In [81]:
# prediction on training data

prediction_on_training_data = model4.predict(X_train_features.toarray())
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [82]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9921667159326042


In [83]:
# prediction on test data

y_pred = model4.predict(X_test_features.toarray())
accuracy_on_test_data = accuracy_score(y_test, y_pred)

In [84]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9808612440191388


In [85]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       725
           1       0.89      0.97      0.93       111

    accuracy                           0.98       836
   macro avg       0.94      0.98      0.96       836
weighted avg       0.98      0.98      0.98       836



# The Logistic Regression model gives the best accuracy of 98.9%.