In [2]:
import pandas as pd
import random
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
def read_data(filename, encoding='latin-1'):
    """Read a text file and return its lines, line terminators included.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.
    encoding : str, optional
        Codec used to decode the file. The default, ``'latin-1'``, maps every
        possible byte to a character, so decoding never raises and the result
        no longer depends on the platform's locale encoding.
        NOTE(review): this assumes the rt-polarity files are not UTF-8;
        pass ``encoding='utf-8'`` if your copy has been re-encoded.

    Returns
    -------
    list of str
        One entry per line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # An explicit codec keeps the notebook reproducible across Windows
    # (cp1252 default) and Linux/macOS (UTF-8 default).
    with open(filename, 'r', encoding=encoding) as f:
        return f.readlines()

In [4]:
# Fetch the NLTK resources this notebook requests: the stop-word corpus and
# the 'punkt' tokenizer models. Each call is a network/disk side effect.
for resource in ('stopwords', 'punkt'):
    fetched = nltk.download(resource)
# Echo the status of the last download as the cell's displayed value.
fetched

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yamin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yamin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [5]:
# Load the raw lines of the positive and negative review files
# (one list entry per line of each file).
posi = read_data('./rt-polarity.pos')
negi = read_data('./rt-polarity.neg')

In [6]:
# Cache for the stop-word set so the corpus is read once, not once per sentence.
_STOP_WORDS_BY_LANGUAGE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if 'english' not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_BY_LANGUAGE['english']


def preprocess_data(text):
    """Normalise one sentence for bag-of-words vectorisation.

    The text is split on whitespace and lower-cased; a token is kept only if
    it is not an English stop word and consists solely of alphabetic
    characters. Because splitting is whitespace-only, a word with punctuation
    or digits attached (e.g. ``"good,"``) fails the alphabetic check and is
    dropped entirely.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``''`` if none survive).
    """
    stop_words = _english_stop_words()
    kept = [
        token
        for token in (raw.lower() for raw in text.split())
        if token not in stop_words and token.isalpha()
    ]
    return ' '.join(kept)

In [7]:
# Clean every sentence of both classes; the raw lists are replaced by their
# preprocessed counterparts.
posi = list(map(preprocess_data, posi))
negi = list(map(preprocess_data, negi))

In [8]:
# Positional split per class: the first 4000 sentences train the models,
# the next 500 form the validation set, and whatever remains is the test set.
train_pos, val_pos, test_pos = posi[:4000], posi[4000:4500], posi[4500:]
train_neg, val_neg, test_neg = negi[:4000], negi[4000:4500], negi[4500:]

In [9]:
# Concatenate positives then negatives for each split and build the matching
# label vectors (1 = positive, 0 = negative).
# Label counts come from the actual slice lengths rather than literals: if the
# input files are shorter or longer than expected, hard-coded counts would
# leave texts and labels misaligned, and a later zip() would silently truncate.
train_data = train_pos + train_neg
train_labels = [1] * len(train_pos) + [0] * len(train_neg)

val_data = val_pos + val_neg
val_labels = [1] * len(val_pos) + [0] * len(val_neg)

test_data = test_pos + test_neg
test_labels = [1] * len(test_pos) + [0] * len(test_neg)

In [10]:

# Bind every sentence to its label so the two stay aligned when reordered.
combined_train = [(text, label) for text, label in zip(train_data, train_labels)]
combined_val = [(text, label) for text, label in zip(val_data, val_labels)]
combined_test = [(text, label) for text, label in zip(test_data, test_labels)]

In [11]:
# Shuffle each split in place so positives and negatives are interleaved.
# A dedicated, seeded generator makes the ordering identical on every
# Restart-and-Run-All and leaves the global `random` state untouched.
shuffle_rng = random.Random(42)
shuffle_rng.shuffle(combined_train)
shuffle_rng.shuffle(combined_val)
shuffle_rng.shuffle(combined_test)

In [12]:

# Split the (text, label) pairs back into parallel tuples of texts and labels,
# one pair of tuples per split.
(train_data, train_labels), (val_data, val_labels), (test_data, test_labels) = (
    zip(*pairs) for pairs in (combined_train, combined_val, combined_test)
)


In [13]:
# Learn the TF-IDF vocabulary and IDF weights from the training texts only,
# then project the validation and test texts with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data)
X_val, X_test = (vectorizer.transform(docs) for docs in (val_data, test_data))

In [14]:
# Multinomial Naive Bayes with default hyperparameters, trained on the TF-IDF
# training matrix and then applied to the test matrix.
nb_model = MultinomialNB().fit(X_train, train_labels)
nb_preds = nb_model.predict(X_test)

In [15]:
# Naive Bayes results on the test split: per-class metrics, the confusion
# matrix, and overall accuracy expressed as a percentage.
print(
    "Naive Bayes Report:",
    classification_report(test_labels, nb_preds),
    "Confusion Matrix (Naive Bayes):",
    confusion_matrix(test_labels, nb_preds),
    "Accuracy Score (Naive Bayes):",
    accuracy_score(test_labels, nb_preds)*100,
    sep="\n",
)

Naive Bayes Report:
              precision    recall  f1-score   support

           0       0.77      0.78      0.78       831
           1       0.78      0.77      0.78       831

    accuracy                           0.78      1662
   macro avg       0.78      0.78      0.78      1662
weighted avg       0.78      0.78      0.78      1662

Confusion Matrix (Naive Bayes):
[[649 182]
 [190 641]]
Accuracy Score (Naive Bayes):
77.6173285198556


In [16]:
# Logistic regression with default hyperparameters, trained on the TF-IDF
# training matrix and then applied to the test matrix.
log_reg_model = LogisticRegression().fit(X_train, train_labels)
log_reg_preds = log_reg_model.predict(X_test)

In [17]:
# Logistic regression results on the test split: per-class metrics, the
# confusion matrix, and overall accuracy expressed as a percentage.
print(
    "Logistic Regression Report:",
    classification_report(test_labels, log_reg_preds),
    "Confusion Matrix (Logistic Regression):",
    confusion_matrix(test_labels, log_reg_preds),
    "Accuracy Score (Logistic Regression):",
    accuracy_score(test_labels, log_reg_preds)*100,
    sep="\n",
)

Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.75      0.76      0.75       831
           1       0.76      0.74      0.75       831

    accuracy                           0.75      1662
   macro avg       0.75      0.75      0.75      1662
weighted avg       0.75      0.75      0.75      1662

Confusion Matrix (Logistic Regression):
[[634 197]
 [216 615]]
Accuracy Score (Logistic Regression):
75.15042117930204


In [18]:
# Support-vector classifier with default hyperparameters, trained on the
# TF-IDF training matrix and then applied to the test matrix.
svm_model = SVC().fit(X_train, train_labels)
svm_preds = svm_model.predict(X_test)

In [19]:
# SVM results on the test split: per-class metrics, the confusion matrix,
# and overall accuracy expressed as a percentage.
print(
    "SVM Report:",
    classification_report(test_labels, svm_preds),
    "Confusion Matrix (SVM):",
    confusion_matrix(test_labels, svm_preds),
    "Accuracy Score (SVM):",
    accuracy_score(test_labels, svm_preds)*100,
    sep="\n",
)

SVM Report:
              precision    recall  f1-score   support

           0       0.74      0.78      0.76       831
           1       0.77      0.73      0.75       831

    accuracy                           0.75      1662
   macro avg       0.75      0.75      0.75      1662
weighted avg       0.75      0.75      0.75      1662

Confusion Matrix (SVM):
[[646 185]
 [227 604]]
Accuracy Score (SVM):
75.21058965102286
