In [2]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [3]:
train=pd.read_csv("train.csv")
val=pd.read_csv("validation.csv")
test=pd.read_csv("test.csv")

In [4]:
print("Train size =",len(train))
print("Test size =",len(test))
print("Validation size =",len(val))

Train size = 3900
Test size = 836
Validation size = 836


In [5]:
y_train,X_train=train["label"],train["text"]
y_val,X_val=val["label"],val["text"]
y_test,X_test=val["label"],val["text"]

In [6]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase='True')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [7]:
print(X_train.shape)
print(X_train_features.shape)

(3900,)
(3900, 6997)


# Logistic Regression

In [8]:
model1 = LogisticRegression()

In [9]:
# training the Logistic Regression model with the training data
model1.fit(X_train_features, y_train)

LogisticRegression()

In [10]:
# prediction on training data

prediction_on_training_data = model1.predict(X_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [11]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9648717948717949


In [12]:
# prediction on test data

y_pred = model1.predict(X_test_features)
accuracy_on_test_data = accuracy_score(y_test, y_pred)

In [13]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9545454545454546


In [14]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       725
           1       1.00      0.66      0.79       111

    accuracy                           0.95       836
   macro avg       0.98      0.83      0.88       836
weighted avg       0.96      0.95      0.95       836



# MLP Classifier

In [15]:
model2=MLPClassifier()
model2.fit(X_train_features, y_train)

MLPClassifier()

In [16]:
# prediction on training data

prediction_on_training_data = model2.predict(X_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [17]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  1.0


In [18]:
# prediction on test data

y_pred = model2.predict(X_test_features)
accuracy_on_test_data = accuracy_score(y_test, y_pred)

In [19]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9880382775119617


In [20]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       725
           1       0.98      0.93      0.95       111

    accuracy                           0.99       836
   macro avg       0.99      0.96      0.97       836
weighted avg       0.99      0.99      0.99       836



# Naive Bayes Classifier

### Gaussian NB Classifier

In [21]:
model3 = GaussianNB()
model3.fit(X_train_features.toarray() , y_train)

GaussianNB()

In [22]:
# prediction on training data

prediction_on_training_data = model3.predict(X_train_features.toarray())
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [23]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.946923076923077


In [24]:
# prediction on test data

y_pred = model3.predict(X_test_features.toarray())
accuracy_on_test_data = accuracy_score(y_test, y_pred)

In [25]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.8851674641148325


In [26]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.89      0.93       725
           1       0.54      0.87      0.67       111

    accuracy                           0.89       836
   macro avg       0.76      0.88      0.80       836
weighted avg       0.92      0.89      0.90       836



### Multinomial NB

In [27]:
model4 = MultinomialNB()
model4.fit(X_train_features.toarray() , y_train)

MultinomialNB()

In [28]:
# prediction on training data

prediction_on_training_data = model4.predict(X_train_features.toarray())
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [29]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9823076923076923


In [30]:
# prediction on test data

y_pred = model4.predict(X_test_features.toarray())
accuracy_on_test_data = accuracy_score(y_test, y_pred)

In [31]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9593301435406698


In [32]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       725
           1       1.00      0.69      0.82       111

    accuracy                           0.96       836
   macro avg       0.98      0.85      0.90       836
weighted avg       0.96      0.96      0.96       836



# The MLP classifier gives the best accuracy of 98%.