In [1]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
# Download the NLTK data packages this notebook requests.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed.
# NOTE(review): no visible cell uses a POS tagger — confirm this package is needed.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usaid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usaid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usaid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Usaid\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Read the training and test splits from CSV files in the working directory.
imdb_data, imdb_data_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# The stop-word set and lemmatizer are built on first use and then reused, so
# they are not re-created for every single review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Tokenize ``text``, drop English stop words and lemmatize what remains.

    Args:
        text: Raw review text. ``None`` and float NaN (a missing cell in a
            pandas column) are treated as an empty review; any other
            non-string value is converted with ``str()``.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces, or
        ``''`` for a missing value.
    """
    # ``text != text`` is only true for NaN; guard before touching the tokenizer.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stop_words, lemmatizer = _get_preprocess_resources()
    tokens = word_tokenize(str(text))
    # Stop-word matching is case-insensitive; the token keeps its original case.
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.lower() not in stop_words
    ]
    return ' '.join(kept)

In [5]:
# Add a cleaned copy of every review to both splits.
for frame in (imdb_data, imdb_data_test):
    frame['processed_review'] = frame['review'].apply(preprocess_text)

In [6]:
# Inputs are the preprocessed review text; targets are the sentiment column.
x_train, y_train = imdb_data['processed_review'], imdb_data['sentiment']
x_test, y_test = imdb_data_test['processed_review'], imdb_data_test['sentiment']

In [7]:
# TF-IDF features over 1- to 3-grams, limited to 10000 features.
tfidf_params = {'ngram_range': (1, 3), 'max_features': 10000}
tfidf_vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the training text only, then apply the fitted vectorizer to the test text.
X_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
X_test_tfidf = tfidf_vectorizer.transform(x_test)

In [8]:
# Raw count features over 1- to 3-grams, limited to 10000 features.
count_params = {'ngram_range': (1, 3), 'max_features': 10000}
count_vectorizer = CountVectorizer(**count_params)

# Fit on the training text only, then apply the fitted vectorizer to the test text.
X_train_count = count_vectorizer.fit_transform(x_train)
X_test_count = count_vectorizer.transform(x_test)

In [9]:
# Labels must not go through the text vectorizer: its ``transform`` expects raw
# review documents, and the models in this notebook are fitted on ``y_train`` /
# scored against ``y_test`` directly. The names are kept as plain aliases of
# the label Series so any cell that still refers to them keeps working.
y_train_tfidf = y_train
y_test_tfidf = y_test

In [10]:
from sklearn.ensemble import GradientBoostingClassifier


# Candidate models keyed by the display name used when reporting scores.
# The two tree ensembles are stochastic, so they are seeded to make a
# Restart & Run All reproduce the same accuracies.
classifiers = {
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "k-NN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

In [11]:
# Fit and score every candidate on both feature representations.
for name, classifier in classifiers.items():
    print(f"Training {name}...")

    # TF-IDF features.
    y_pred = classifier.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy (TF-IDF): {accuracy:.2f}")

    # Same estimator instance, refitted on the raw count features.
    y_pred_count = classifier.fit(X_train_count, y_train).predict(X_test_count)
    accuracy_count = accuracy_score(y_test, y_pred_count)
    print(f"{name} Accuracy (CountVectorizer): {accuracy_count:.2f}")


Training Naive Bayes...
Naive Bayes Accuracy (TF-IDF): 0.87
Naive Bayes Accuracy (CountVectorizer): 0.86
Training Random Forest...
Random Forest Accuracy (TF-IDF): 0.86
Random Forest Accuracy (CountVectorizer): 0.85
Training k-NN...
k-NN Accuracy (TF-IDF): 0.73
k-NN Accuracy (CountVectorizer): 0.59
Training Gradient Boosting...
Gradient Boosting Accuracy (TF-IDF): 0.81
Gradient Boosting Accuracy (CountVectorizer): 0.81


In [12]:
from sklearn.naive_bayes import BernoulliNB

In [13]:
# Bernoulli Naive Bayes with the smoothing parameter alpha set to 15.
naiveB = BernoulliNB(alpha=15)

# Score on the TF-IDF features.
y_pred = naiveB.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes Accuracy (TF-IDF): {accuracy:.2f}")

# Refit the same estimator on the count features and score again.
y_pred_count = naiveB.fit(X_train_count, y_train).predict(X_test_count)
accuracy_count = accuracy_score(y_test, y_pred_count)
print(f"Naive Bayes Accuracy (CountVectorizer): {accuracy_count:.2f}")

Naive Bayes Accuracy (TF-IDF): 0.87
Naive Bayes Accuracy (CountVectorizer): 0.87


In [14]:
# Logistic regression, allowing up to 1000 solver iterations.
lr = LogisticRegression(max_iter=1000)

# Score on the TF-IDF features.
y_pred = lr.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy (TF-IDF): {accuracy:.2f}")

# Refit the same estimator on the count features and score again.
y_pred_count = lr.fit(X_train_count, y_train).predict(X_test_count)
accuracy_count = accuracy_score(y_test, y_pred_count)
print(f"Logistic Regression Accuracy (CountVectorizer): {accuracy_count:.2f}")

Logistic Regression Accuracy (TF-IDF): 0.89
Logistic Regression Accuracy (CountVectorizer): 0.87


In [16]:
# Support vector classifier with scikit-learn's default settings.
svm = SVC()

# Score on the TF-IDF features.
y_pred = svm.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Accuracy (TF-IDF): {accuracy:.2f}")

# Refit the same estimator on the count features and score again.
y_pred_count = svm.fit(X_train_count, y_train).predict(X_test_count)
accuracy_count = accuracy_score(y_test, y_pred_count)
print(f"SVM Accuracy (CountVectorizer): {accuracy_count:.2f}")

SVM Accuracy (TF-IDF): 0.90
SVM Accuracy (CountVectorizer): 0.88


### Task 3

In [10]:
from gensim.models import Word2Vec
import numpy as np

In [11]:
# Split every preprocessed training review on whitespace: one token list per document.
tokenized_data = [review.split() for review in x_train]

# Learn 100-dimensional word embeddings from the training documents only.
word2vec_model = Word2Vec(
    sentences=tokenized_data,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [12]:
def document_vector(word2vec_model, doc):
    """Represent a document as the mean of its in-vocabulary word vectors.

    Args:
        word2vec_model: Object exposing a ``wv`` attribute that supports
            ``token in wv``, ``wv[token]`` and ``wv.vector_size``.
        doc: Iterable of token strings.

    Returns:
        A 1-D array of length ``wv.vector_size``. When ``doc`` is empty or
        none of its tokens is in the vocabulary, a zero vector is returned
        instead of the scalar NaN that averaging an empty list produces, so
        every document yields a row of the same shape.
    """
    keyed_vectors = word2vec_model.wv
    known_vectors = [keyed_vectors[word] for word in doc if word in keyed_vectors]
    if not known_vectors:
        # float32 is assumed to match the dtype of the stored word vectors.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(known_vectors, axis=0)

# One averaged embedding per training document.
X_train_word2vec = [document_vector(word2vec_model, tokens) for tokens in tokenized_data]

# Tokenize the test reviews the same way and embed them with the same model.
tokenized_data_test = [review.split() for review in x_test]
X_test_word2vec = [document_vector(word2vec_model, tokens) for tokens in tokenized_data_test]

In [14]:
# Models evaluated on the averaged Word2Vec document vectors.
# The random forest is stochastic, so it is seeded to keep re-runs reproducible.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "K-NN": KNeighborsClassifier(n_neighbors=5)
}

# Training and Evaluation
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_word2vec, y_train)
    y_pred = model.predict(X_test_word2vec)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print()

Training Random Forest...
Accuracy: 0.8032

Training K-NN...
Accuracy: 0.7460



In [15]:
# Logistic regression on the averaged Word2Vec document vectors.
lr = LogisticRegression(max_iter=1000)

y_pred = lr.fit(X_train_word2vec, y_train).predict(X_test_word2vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy (Word2Vec): {accuracy:.2f}")

Logistic Regression Accuracy (Word2Vec): 0.84


In [17]:
# Support vector classifier on the averaged Word2Vec document vectors.
svm = SVC()

y_pred = svm.fit(X_train_word2vec, y_train).predict(X_test_word2vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Accuracy (Word2Vec): {accuracy:.2f}")

SVM Accuracy (Word2Vec): 0.84
