In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing all necessary modules and packages

In [None]:
!pip install contractions
!pip install word2number

import re, string, subprocess, emoji, nltk, inflect, contractions, json
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from bs4 import BeautifulSoup
from wordcloud import WordCloud
from word2number import w2n
from tqdm import tqdm

# Load the auxiliary lexicons used by the preprocessing steps.
# The encoding is passed explicitly so the files are decoded the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): UTF-8 is assumed for all three files - confirm against the data.
with open('/kaggle/input/my_stopwords.txt', 'r', encoding='utf-8') as f:
    # one entry per line; stored as a set for fast membership tests
    custom_stopwords = set(f.read().splitlines())

with open("/kaggle/input/genz_dict.txt", "r", encoding='utf-8') as file:
    # despite the .txt extension, the content is parsed as JSON
    genz_dict = json.load(file)

with open("/kaggle/input/emoticon_dict.txt", "r", encoding='utf-8') as file:
    # despite the .txt extension, the content is parsed as JSON
    emoticon_dict = json.load(file)

# Make the WordNet corpus available to NLTK by downloading it into the working
# directory and unpacking it there.
# NOTE(review): the lookup uses the bare name 'wordnet.zip' (no 'corpora/' prefix);
# it looks like this lookup never succeeds, so the download branch runs on every
# execution. Left unchanged on purpose - confirm before altering the resource name.
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # Only the "resource not found" error is handled; anything else (including a
    # keyboard interrupt) is no longer silently swallowed by a bare `except:`.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    # -o: overwrite existing files without prompting, so re-running the cell
    # cannot stall on unzip's interactive "replace?" question.
    command = "unzip -o /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    # Register the directory only once, even if the cell is executed repeatedly.
    if '/kaggle/working/' not in nltk.data.path:
        nltk.data.path.append('/kaggle/working/')

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle, resample
from sklearn.ensemble import RandomForestClassifier

**dropping first column of indices as it was redundant**

In [None]:
# Read the review data, then discard the first column before previewing a few rows.
dataset = pd.read_csv("/kaggle/input/movie-review-train-data/Movie_reviews_non_comp.csv")
first_column = dataset.columns[0]
dataset = dataset.drop(columns=first_column)
sample = dataset.head()
print(sample)

# see class distribution

In [None]:
# Number of reviews per value of the 'Class' column.
class_counts = dataset['Class'].value_counts()
print(class_counts)

# Basic Data preprocessing

**removed html and url tags**

**converted emojis and emoticons to text**

In [None]:
# Start the cleaning pipeline from the raw review text.
dataset['clean_review'] = dataset['Review']
# Remove URLs. The dot after "www" is escaped so that only a literal "www." prefix
# is treated as a URL; an unescaped "." matches any character and would also delete
# ordinary words that merely start with "www" (e.g. "wwwhat").
dataset['clean_review'] = dataset['clean_review'].apply(lambda x: re.sub(r'http\S+|www\.\S+', '', x))
# Remove HTML markup, keeping only the text content of each review.
dataset['clean_review'] = dataset['clean_review'].apply(lambda x: BeautifulSoup(x, "lxml").text)

def convert_emojis_to_words(text):
    """Return ``text`` after passing it through ``emoji.demojize``."""
    return emoji.demojize(text)

def convert_emoticon_to_words(text, mapping=None):
    """Replace every known emoticon in ``text`` with its description.

    Args:
        text: The string to convert.
        mapping: Optional dict of ``emoticon -> description``. Defaults to the
            module-level ``emoticon_dict``.

    Returns:
        ``text`` with each emoticon from the mapping replaced. The input is
        returned unchanged when the mapping is empty or nothing matches.
    """
    if mapping is None:
        mapping = emoticon_dict

    # Accumulate the replacements on one string. Replacing on the original `text`
    # in every iteration kept only the last emoticon's substitution and left the
    # result undefined for an empty mapping.
    converted_text = text
    # Longest emoticons first, so that e.g. ":-))" is not partially consumed by ":-)".
    for emoticon in sorted(mapping, key=len, reverse=True):
        if not emoticon:
            # str.replace('') would insert the description between every character
            continue
        converted_text = converted_text.replace(emoticon, mapping[emoticon])
    return converted_text


# Convert emojis first, then emoticons, on every review.
dataset['clean_review'] = (
    dataset['clean_review']
    .apply(convert_emojis_to_words)
    .apply(convert_emoticon_to_words)
)

**converted all text to lowercase**

In [None]:
# Lowercase the text that has already been cleaned. Reading from 'clean_review'
# (not from the raw 'Review' column) keeps the URL/HTML removal and the
# emoji/emoticon conversion done in the previous steps; starting again from
# 'Review' would silently discard all of that work.
dataset['clean_review'] = dataset['clean_review'].apply(lambda x: x.lower())
print(dataset['clean_review'].head())

**We noticed that some reviews contain fractions where people gave a rating such as 4/5, so we try to capture their sentiment by replacing each fraction with a sentiment word (awesome / good / neutral / bad / worst).**

In [None]:
def fraction_to_sentiment(review):
    """Replace rating-like fractions (e.g. ``4/5``) with a sentiment word.

    Each ``<number>/<number>`` occurrence is converted to a ratio and mapped to:
    ``> 0.8`` awesome, ``0.6-0.8`` good, ``0.4-0.6`` neutral, ``0.2-0.4`` bad,
    otherwise worst. ``9/11`` and fractions with a zero denominator are left
    untouched.

    Args:
        review: The review text.

    Returns:
        The review with every qualifying fraction replaced.
    """
    pattern = r'(\d+(\.\d+)?)/(\d+(\.\d+)?)'

    def _sentiment_for(match):
        """Return the replacement text for one matched fraction."""
        num = float(match.group(1))
        denom = float(match.group(3))

        # 9/11 is skipped (presumably because it reads as a date, not a rating),
        # and a zero denominator cannot be turned into a ratio.
        if (num == 9 and denom == 11) or denom == 0:
            return match.group(0)

        result = num / denom

        if result > 0.8:
            return "awesome"
        elif 0.6 <= result <= 0.8:
            return "good"
        elif 0.4 <= result < 0.6:
            return "neutral"
        elif 0.2 <= result < 0.4:
            return "bad"
        return "worst"

    # Substituting match-by-match (instead of str.replace on the matched text)
    # guarantees that replacing "1/10" can never corrupt a longer fraction such
    # as "11/10" that happens to contain the same characters.
    return re.sub(pattern, _sentiment_for, review)

# Run the fraction-to-sentiment replacement over every review.
dataset['clean_review'] = dataset['clean_review'].apply(
    lambda review_text: fraction_to_sentiment(review_text)
)

**converted genz text lingo to embedding readable form**

**expanded contractions like don't to do not**

**removed punctuations**

In [None]:
def convert_genz_lingo(text):
    """Swap whitespace-separated words found in ``genz_dict`` for their mapped value.

    Words without an entry are kept as they are; the result is re-joined with
    single spaces.
    """
    return " ".join(genz_dict.get(token, token) for token in text.split())

# Three passes over every review, in this order:
#   1. replace slang via convert_genz_lingo
#   2. expand contractions
#   3. delete every character that is neither a word character nor whitespace
dataset['clean_review'] = (
    dataset['clean_review']
    .apply(convert_genz_lingo)
    .apply(lambda review_text: contractions.fix(review_text))
    .apply(lambda review_text: re.sub(r"[^\w\s]", '', review_text))
)

print(dataset['clean_review'].head())

# Tokenization

In [None]:
# Split each cleaned review into a list of tokens with NLTK's word_tokenize.
dataset['tokens'] = dataset['clean_review'].apply(word_tokenize)
print(dataset['tokens'].head())

# Exploratory Data Analysis 

**wordcloud before removing stop words and lemmatization**

In [None]:
# Flatten the per-review token lists into one corpus-wide list.
all_tokens = []
for review_tokens in dataset['tokens']:
    all_tokens.extend(review_tokens)

# Draw a word cloud sized by raw token frequency.
token_frequencies = dict(nltk.FreqDist(all_tokens))
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud = wordcloud.generate_from_frequencies(token_frequencies)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

**word length analysis**

In [None]:
# Distribution of token (word) lengths over the whole corpus.
word_lengths = [len(word) for word in all_tokens]
plt.figure(figsize=(8,5))
# one bin per integer length, from 1 up to the longest token
plt.hist(word_lengths, bins=range(1, max(word_lengths) + 2), edgecolor="black", alpha=0.7)
# The labels now name what is actually plotted: this histogram shows word
# lengths, not sentence lengths.
plt.xlabel("Word Length")
plt.ylabel("Frequency")
plt.title("Histogram of Word Lengths")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Longest token (the first one encountered wins ties) and the mean token length.
longest_word = max(all_tokens, key=lambda token: len(token))
print(f"Longest word: {longest_word} (Length: {len(longest_word)})")

avg_word_length = np.mean(list(map(len, all_tokens)))
print(f"Average word length: {avg_word_length:.2f}")

**character frequency analysis**

In [None]:
# Count every character that occurs in any token.
char_counts = Counter()
for token in all_tokens:
    char_counts.update(token)

# Order the bars by character so the x-axis is sorted.
sorted_chars = sorted(char_counts)
sorted_counts = list(map(char_counts.get, sorted_chars))

plt.figure(figsize=(10, 5))
plt.bar(sorted_chars, sorted_counts, color="skyblue", edgecolor="black", alpha=0.7)
plt.title("Character Frequency Count")
plt.xlabel("Characters")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

**sentence length analysis**

In [None]:
# Number of tokens in each review, shown as a 20-bin histogram.
token_counts = dataset["tokens"].apply(len)

fig, ax = plt.subplots(figsize=(10,6))
ax.hist(token_counts, bins=20, color='skyblue', edgecolor='black', alpha=0.7)
ax.set_title("Histogram of Sentence lengths", fontsize=16)
ax.set_xlabel("Number of words", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
ax.grid(axis='y', alpha=0.75)
plt.show()

In [None]:
# Mean number of tokens per review.
avg_sentence_length = np.mean(np.asarray(token_counts))
print(f"Average sentence length: {avg_sentence_length:.2f}")

# removing stop words

In [None]:
# Keep only the tokens that are not listed in custom_stopwords.
dataset['tokens'] = dataset['tokens'].apply(
    lambda review_tokens: list(filter(lambda tok: tok not in custom_stopwords, review_tokens))
)
print(dataset['tokens'].head())

**converting numbers to word form**

In [None]:
def convert_numbers_to_words(tokens):
    """Replace each token that ``w2n.word_to_num`` can parse with the number as a string.

    Note that, despite the function's name, the conversion goes from word to
    number: a token is passed to ``w2n.word_to_num`` and the result is stored via
    ``str()``. Tokens for which that call raises ``ValueError`` are kept unchanged.
    """
    def _as_number_string(token):
        try:
            return str(w2n.word_to_num(token))
        except ValueError:
            return token

    return [_as_number_string(token) for token in tokens]

# Built once and reused: constructing a new inflect engine for every digit token
# is loop-invariant work.
_INFLECT_ENGINE = inflect.engine()

def convert_digits_to_words(tokens):
    """Spell out every all-digit token with ``inflect``'s ``number_to_words``.

    Args:
        tokens: List of string tokens.

    Returns:
        A new list in which each token for which ``str.isdigit()`` is true has
        been replaced by the engine's output; other tokens are kept unchanged.
    """
    return [_INFLECT_ENGINE.number_to_words(token) if token.isdigit() else token for token in tokens]

# Run convert_numbers_to_words first, then convert_digits_to_words, on every token list.
dataset['tokens'] = (
    dataset['tokens']
    .apply(convert_numbers_to_words)
    .apply(convert_digits_to_words)
)

# Lemmatization

In [None]:
lemmatizer = WordNetLemmatizer()
# First letter of the tag returned by nltk.pos_tag -> WordNet part-of-speech constant.
wordnet_map = dict(N=wordnet.NOUN, V=wordnet.VERB, J=wordnet.ADJ, R=wordnet.ADV)

def lemmatize_text(text):
    """Lemmatize a list of tokens using their part-of-speech tags.

    ``text`` is tagged with ``nltk.pos_tag``; each word is then lemmatized with
    the WordNet POS looked up from the first letter of its tag, falling back to
    noun when that letter is not in ``wordnet_map``.
    """
    return [
        lemmatizer.lemmatize(word, pos=wordnet_map.get(tag[0].upper(), wordnet.NOUN))
        for word, tag in nltk.pos_tag(text)
    ]

dataset['tokens'] = dataset['tokens'].apply(lambda review_tokens: lemmatize_text(review_tokens))

# wordcloud after data preprocessing

In [None]:
# Rebuild the flat token list from the current 'tokens' column.
all_tokens = []
for review_tokens in dataset['tokens']:
    all_tokens.extend(review_tokens)

# Word cloud sized by token frequency.
token_frequencies = dict(nltk.FreqDist(all_tokens))
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud = wordcloud.generate_from_frequencies(token_frequencies)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# n-grams after data preprocessing

In [None]:
# Unigrams can be counted on the flattened corpus. Bigrams and trigrams are built
# review by review: generating them from the flattened list would also produce
# n-grams that join the last words of one review to the first words of the next,
# which never occurred in any text.
unigrams = all_tokens
bigrams = [gram for review_tokens in dataset['tokens'] for gram in ngrams(review_tokens, 2)]
trigrams = [gram for review_tokens in dataset['tokens'] for gram in ngrams(review_tokens, 3)]

unigram_counts = Counter(unigrams)
bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

# Ten most frequent entries of each kind.
top_unigrams = unigram_counts.most_common(10)
top_bigrams = bigram_counts.most_common(10)
top_trigrams = trigram_counts.most_common(10)

# Split (label, count) pairs into parallel sequences; n-gram tuples are joined
# with spaces so they can be used as tick labels.
labels_uni, values_uni = zip(*top_unigrams)
labels_bi, values_bi = zip(*[(" ".join(bi), count) for bi, count in top_bigrams])
labels_tri, values_tri = zip(*[(" ".join(tri), count) for tri, count in top_trigrams])

# One bar chart per n-gram order, stacked vertically.
fig, axs = plt.subplots(3, 1, figsize=(12,18))
axs[0].bar(labels_uni, values_uni, color='skyblue')
axs[0].set_title("Top Unigrams")
axs[0].tick_params(axis='x', rotation=45)

axs[1].bar(labels_bi, values_bi, color='lightgreen')
axs[1].set_title("Top Bigrams")
axs[1].tick_params(axis='x', rotation=45)

axs[2].bar(labels_tri, values_tri, color='salmon')
axs[2].set_title("Top Trigrams")
axs[2].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# loading word embedding and check coverage


**we used google word2vec 300 dimensional embedding trained on google news** 

In [None]:
# Load the binary word2vec file; `limit` caps how many vectors are read.
path_to_model = '/kaggle/input/GoogleNews-vectors-negative300.bin' 
word2vec = KeyedVectors.load_word2vec_format(
    path_to_model,
    limit=100000,
    binary=True,
)

def check_coverage(tokenized_reviews, model):
    """Print how many token occurrences are present in the embedding model.

    Args:
        tokenized_reviews: Iterable of token lists.
        model: Any object supporting ``word in model``.

    Prints the total number of token occurrences and the number (and
    percentage) of them whose word is found in ``model``. Returns nothing.
    """
    word_counts = Counter()
    for tokens in tokenized_reviews:
        word_counts.update(tokens)

    total_words = sum(word_counts.values())
    known_words = sum(count for word, count in word_counts.items() if word in model)
    # An empty corpus has no meaningful ratio; report 0% instead of raising
    # ZeroDivisionError.
    coverage = (known_words / total_words) * 100 if total_words else 0.0

    print(f"Total words in dataset: {total_words}")
    print(f"Words covered in Word2Vec: {known_words} ({coverage:.2f}%)")

# Report embedding coverage for the preprocessed tokens.
check_coverage(tokenized_reviews=dataset['tokens'], model=word2vec)


**let's see the top out-of-vocabulary words**

In [None]:
# Collect every token occurrence that has no vector in word2vec, then count them.
oov_words = []
for review_tokens in dataset['tokens']:
    oov_words.extend(word for word in review_tokens if word not in word2vec)
oov_word_counts = Counter(oov_words)

print("Most frequent Out of Vocabulary words:")
print(oov_word_counts.most_common(20)) 


In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so columns added through `df` afterwards are also visible through `dataset`.
df = dataset

# converting sentences to fixed size vectors using word embedding
**with avg , sum and max norm**

In [None]:
def sentence_to_vector(tokenized_sentence, model, method="average"):
    """Pool the word vectors of a tokenized sentence into one fixed-size vector.

    Args:
        tokenized_sentence: Iterable of tokens.
        model: Embedding lookup supporting ``word in model`` and ``model[word]``.
        method: ``"sum"`` or ``"max"`` for element-wise sum / maximum; any other
            value (including the default ``"average"``) gives the element-wise mean.

    Returns:
        A 1-D NumPy array. When none of the tokens is in ``model`` a zero vector
        is returned.
    """
    words = [word for word in tokenized_sentence if word in model]
    if not words:
        # Take the dimensionality from the model when it exposes `vector_size`,
        # so the fallback matches embeddings of any size; 300 stays the default
        # for lookups that do not have that attribute.
        return np.zeros(getattr(model, "vector_size", 300))

    # shape: (number of known words, embedding dimension)
    word_vectors = np.array([model[word] for word in words])

    if method == "sum":
        return np.sum(word_vectors, axis=0)
    elif method == "max":
        return np.max(word_vectors, axis=0)
    else:
        return np.mean(word_vectors, axis=0)

# One column of pooled sentence vectors per pooling method (mean, sum, max).
df['sentence_vectors_avg'] = [
    sentence_to_vector(review_tokens, word2vec)
    for review_tokens in tqdm(df['tokens'], desc="Vectorizing Sentences with avg norm")
]
df['sentence_vectors_sum'] = [
    sentence_to_vector(review_tokens, word2vec, method="sum")
    for review_tokens in tqdm(df['tokens'], desc="Vectorizing Sentences with sum norm")
]
df['sentence_vectors_max'] = [
    sentence_to_vector(review_tokens, word2vec, method="max")
    for review_tokens in tqdm(df['tokens'], desc="Vectorizing Sentences with max norm")
]

# train test split (stratified)

In [None]:
# Stack each column of per-review vectors into a 2-D feature matrix.
X_avg = np.array(list(df['sentence_vectors_avg']))
X_sum = np.array(list(df['sentence_vectors_sum']))
X_max = np.array(list(df['sentence_vectors_max']))
y = df['Class']

# The same stratified 80/20 split (same seed) is applied to all three feature sets.
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
    X_avg, y, test_size=0.2, random_state=42, stratify=y
)
X_sum_train, X_sum_test, y_sum_train, y_sum_test = train_test_split(
    X_sum, y, test_size=0.2, random_state=42, stratify=y
)
X_max_train, X_max_test, y_max_train, y_max_test = train_test_split(
    X_max, y, test_size=0.2, random_state=42, stratify=y
)

# Batch gradient descent 
**will do it 3 times, once for each of the avg, sum and max norm sentence vectors**

**with avg norm**

In [None]:
# Logistic regression (one-vs-rest, lbfgs solver) on the mean-pooled vectors.
clf_batch_avg = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=1000)
clf_batch_avg.fit(X_avg_train, y_avg_train)

# Predict on both splits so train and test scores can be compared.
y_avg_pred_test = clf_batch_avg.predict(X_avg_test)
y_avg_pred_train = clf_batch_avg.predict(X_avg_train)

print(
    f"Accuracy_on_test_data: {accuracy_score(y_avg_test, y_avg_pred_test):.4f}",
    "Classification Report test data:",
    classification_report(y_avg_test, y_avg_pred_test),
    sep="\n",
)

print(
    f"Accuracy_on_train_data: {accuracy_score(y_avg_train, y_avg_pred_train):.4f}",
    "Classification Report train data:",
    classification_report(y_avg_train, y_avg_pred_train),
    sep="\n",
)

**with sum norm**

In [None]:
# Logistic regression (one-vs-rest, lbfgs solver) on the sum-pooled vectors.
clf_batch_sum = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=1000)
clf_batch_sum.fit(X_sum_train, y_sum_train)

# Predict on both splits so train and test scores can be compared.
y_sum_pred_test = clf_batch_sum.predict(X_sum_test)
y_sum_pred_train = clf_batch_sum.predict(X_sum_train)

print(
    f"Accuracy_on_test_data: {accuracy_score(y_sum_test, y_sum_pred_test):.4f}",
    "Classification Report test data:",
    classification_report(y_sum_test, y_sum_pred_test),
    sep="\n",
)

print(
    f"Accuracy_on_train_data: {accuracy_score(y_sum_train, y_sum_pred_train):.4f}",
    "Classification Report train data:",
    classification_report(y_sum_train, y_sum_pred_train),
    sep="\n",
)

**with max norm**

In [None]:
# Logistic regression (one-vs-rest, lbfgs solver) on the max-pooled vectors.
clf_batch_max = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=1000)
clf_batch_max.fit(X_max_train, y_max_train)

# Predict on both splits so train and test scores can be compared.
y_max_pred_test = clf_batch_max.predict(X_max_test)
y_max_pred_train = clf_batch_max.predict(X_max_train)

print(
    f"Accuracy_on_test_data: {accuracy_score(y_max_test, y_max_pred_test):.4f}",
    "Classification Report test data:",
    classification_report(y_max_test, y_max_pred_test),
    sep="\n",
)

print(
    f"Accuracy_on_train_data: {accuracy_score(y_max_train, y_max_pred_train):.4f}",
    "Classification Report train data:",
    classification_report(y_max_train, y_max_pred_train),
    sep="\n",
)

**as all 3 norms give similar accuracy, with avg norm being slightly better, we will use avg norm from now on**

In [None]:
# Stratified 80/20 split of the mean-pooled features, used by all models below.
X_train, X_test, y_train, y_test = train_test_split(
    X_avg,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

**Hyperparameter tuning to find number of iterations at which convergence was achieved**

In [None]:
# Refit the logistic regression with increasing iteration caps and record the
# test accuracy obtained for each cap.
max_iter_values = [100,300,500,1000] 
accuracies = []
for max_iter_value in max_iter_values:
    clf_batch = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=max_iter_value)
    clf_batch.fit(X_train, y_train)
    y_pred_test = clf_batch.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_test)
    accuracies += [accuracy]
    print(f"max_iter = {max_iter_value}, Accuracy on test data: {accuracy:.4f}")

# Accuracy as a function of the iteration cap.
fig, ax = plt.subplots()
ax.plot(max_iter_values, accuracies, marker='o')
ax.set_xlabel('max_iter')
ax.set_ylabel('Accuracy')
ax.set_title('Hyperparameter Tuning: max_iter vs Accuracy')
ax.grid(True)
plt.show()


# Stochastic Gradient descent

In [None]:
# Logistic-loss linear classifier trained with SGD and an L2 penalty.
clf_stochastic = SGDClassifier(
    loss='log_loss',
    penalty='l2',
    alpha=0.0001,
    max_iter=1000,
    tol=1e-4,
    random_state=42,
)
clf_stochastic.fit(X_train, y_train)

# Predict on both splits so train and test scores can be compared.
y_pred_test_stochastic = clf_stochastic.predict(X_test)
y_pred_train_stochastic = clf_stochastic.predict(X_train)

print(
    f"Accuracy_on_test_data: {accuracy_score(y_test, y_pred_test_stochastic):.4f}",
    "Classification Report test data:",
    classification_report(y_test, y_pred_test_stochastic),
    sep="\n",
)

print(
    f"Accuracy_on_train_data: {accuracy_score(y_train, y_pred_train_stochastic):.4f}",
    "Classification Report train data:",
    classification_report(y_train, y_pred_train_stochastic),
    sep="\n",
)

**Hyperparameter tuning to find number of iterations at which convergence was achieved**

In [None]:
# Refit the SGD classifier with increasing iteration caps and record the test
# accuracy obtained for each cap.
max_iter_values = [10,30,50,100,300,500,1000]
accuracies = []

for max_iter_value in max_iter_values:
    clf_stochastic = SGDClassifier(
        loss='log_loss',
        penalty='l2',
        alpha=0.0001,
        max_iter=max_iter_value,
        tol=1e-4,
        random_state=42,
    )
    clf_stochastic.fit(X_train, y_train)

    y_pred_test_stochastic = clf_stochastic.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_test_stochastic)
    accuracies += [accuracy]
    print(f"max_iter = {max_iter_value}, Accuracy on test data: {accuracy:.4f}")

# Accuracy as a function of the iteration cap.
fig, ax = plt.subplots()
ax.plot(max_iter_values, accuracies, marker='o')
ax.set_xlabel('max_iter')
ax.set_ylabel('Accuracy')
ax.set_title('Hyperparameter Tuning: max_iter vs Accuracy')
ax.grid(True)
plt.show()


# Mini-Batch gradient descent

In [None]:
# Mini-batch training: the classifier is updated with partial_fit on consecutive
# slices of a freshly shuffled training set, for a fixed number of epochs.
mini_batch_size = 100  
n_epochs = 10         
clf_minibatch = SGDClassifier(
    loss='log_loss',  
    max_iter=1,
    tol=None,
    random_state=42,
    penalty='l2',
    alpha=0.0001         
)

# partial_fit needs the complete label set on its first call, because a single
# mini-batch is not guaranteed to contain every class.
classes = np.unique(y_train)

for epoch in range(n_epochs):
    # Seed the shuffle per epoch: still reproducible, but each epoch sees the
    # batches in a different order. A constant seed would replay the identical
    # permutation in every epoch.
    X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train, random_state=42 + epoch)
    for i in range(0, len(X_train), mini_batch_size):
        X_mini_batch = X_train_shuffled[i:i + mini_batch_size]
        y_mini_batch = y_train_shuffled[i:i + mini_batch_size]
        # Passing `classes` on every call covers the required first call without
        # a separate warm-up fit, which trained on the first 100 rows one extra time.
        clf_minibatch.partial_fit(X_mini_batch, y_mini_batch, classes=classes)

y_pred_test_minibatch = clf_minibatch.predict(X_test)
y_pred_train_minibatch = clf_minibatch.predict(X_train)

print(f"Accuracy_on_test_data: {accuracy_score(y_test, y_pred_test_minibatch):.4f}")
print("Classification Report test data:")
print(classification_report(y_test, y_pred_test_minibatch))

print(f"Accuracy_on_train_data: {accuracy_score(y_train, y_pred_train_minibatch):.4f}")
print("Classification Report train data:")
print(classification_report(y_train, y_pred_train_minibatch))

# Random forest Classifier

In [None]:
# Random forest with fixed, hand-picked hyperparameters, using all CPU cores.
clf_rf = RandomForestClassifier(
    criterion='gini',
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)
clf_rf.fit(X_train, y_train)

# Predict on both splits so train and test scores can be compared.
y_pred_test_rf = clf_rf.predict(X_test)
y_pred_train_rf = clf_rf.predict(X_train)

print(
    f"Accuracy_on_test_data: {accuracy_score(y_test, y_pred_test_rf):.4f}",
    "Classification Report test data:",
    classification_report(y_test, y_pred_test_rf),
    sep="\n",
)

print(
    f"Accuracy_on_train_data: {accuracy_score(y_train, y_pred_train_rf):.4f}",
    "Classification Report train data:",
    classification_report(y_train, y_pred_train_rf),
    sep="\n",
)

**Hyperparameter tuning using Randomized Search for random forest classifier**

In [None]:
from sklearn.model_selection import RandomizedSearchCV,StratifiedShuffleSplit

# Candidate values sampled by the randomized search.
param_dist = dict(
    n_estimators=[100, 200, 500],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini'],
)

clf_rf = RandomForestClassifier(random_state=42, n_jobs=-1)
# A single stratified 80/20 split of the training data serves as the validation scheme.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Try 20 sampled parameter combinations, scored by accuracy on the validation split.
random_search = RandomizedSearchCV(
    clf_rf,
    param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=sss,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)
print(f"Best Accuracy from Validation Split:  {random_search.best_score_:.4f}")

# Score the best estimator found by the search on the held-out test set.
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"test accuracy: {test_accuracy:.2f}")