<a href="https://colab.research.google.com/github/Tanishq758/IMDb-Sentiment-Analysis-ML/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


--2025-09-07 12:18:40--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2025-09-07 12:18:56 (5.21 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [23]:
!tar -xzf aclImdb_v1.tar.gz


In [24]:
import os

print("Train folders:", os.listdir("aclImdb/train"))
print("Test folders:", os.listdir("aclImdb/test"))





Train folders: ['pos', 'urls_neg.txt', 'unsupBow.feat', 'urls_pos.txt', 'urls_unsup.txt', 'labeledBow.feat', 'neg', 'unsup']
Test folders: ['pos', 'urls_neg.txt', 'urls_pos.txt', 'labeledBow.feat', 'neg']


In [25]:
import glob
import pandas as pd

def load_imdb_dataset(directory):
    data = []
    for label, sentiment in enumerate(["neg", "pos"]):  # 0=neg, 1=pos
        files = glob.glob(os.path.join(directory, sentiment, "*.txt"))
        for file in files:
            with open(file, encoding="utf-8") as f:
                text = f.read()
                data.append((text, label))
    return pd.DataFrame(data, columns=["review", "label"])

train_df = load_imdb_dataset("aclImdb/train")
test_df = load_imdb_dataset("aclImdb/test")

print(train_df.shape)
print(test_df.shape)
train_df.head()
train_df.tail()


(25000, 2)
(25000, 2)


Unnamed: 0,review,label
24995,I think James Cameron might be becoming my fav...,1
24996,"Yeah, it's a chick flick and it moves kinda sl...",1
24997,In Mexico this movie was aired only in PayTV. ...,1
24998,Have you heard the story about the reluctant h...,1
24999,The entire 10:15 minute presentation is done i...,1


In [7]:
import glob
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report


In [8]:
def clean_text(text):
    text = text.lower()                                     # Lowercase
    text = re.sub(r"<.*?>", " ", text)                      # Remove HTML tags
    text = re.sub(r"http\S+|www\S+", " ", text)             # Remove URLs
    text = re.sub(r"\d+", " ", text)                        # Remove numbers
    text = re.sub(r"[^a-zA-Z\s]", " ", text)                # Remove punctuation/symbols
    text = re.sub(r"\s+", " ", text).strip()                # Remove extra spaces
    return text

In [9]:
df = pd.concat([train_df, test_df]).reset_index(drop=True)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
   df["review"], df["label"], test_size=0.2, random_state=42
)

In [11]:
vectorizer = TfidfVectorizer(max_features=20000, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [12]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, y_train)
y_pred_lr = log_reg.predict(X_test_tfidf)

In [14]:
print("=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

=== Logistic Regression ===
Accuracy: 0.8908
F1 Score: 0.8906906906906907
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5055
           1       0.88      0.90      0.89      4945

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [15]:
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_test_tfidf)

In [16]:
print("\n=== Naive Bayes ===")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("F1 Score:", f1_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


=== Naive Bayes ===
Accuracy: 0.8587
F1 Score: 0.856153924462995
              precision    recall  f1-score   support

           0       0.86      0.87      0.86      5055
           1       0.86      0.85      0.86      4945

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [17]:
svm = LinearSVC()
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)

In [18]:
print("\n=== SVM ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


=== SVM ===
Accuracy: 0.8835
F1 Score: 0.882761396799839
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      5055
           1       0.88      0.89      0.88      4945

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

