In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import pickle

In [3]:
# Load dataset
# Location of the IMDB reviews CSV; kept in one named constant so it is easy to repoint.
DATASET_PATH = '/content/IMDB Dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [4]:
# Download stopwords
# Fetch the NLTK stopword corpus, then keep the English list as a set so that
# membership checks in the preprocessing step are constant-time.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Preprocessing functions
def remove_html_tags(text):
    """Replace every HTML tag in ``text`` with a single space.

    A tag is matched as ``<``, the shortest possible run of non-newline
    characters, then ``>``. Each tag is replaced by a space rather than
    deleted so that words separated only by markup stay separate
    (``"great<br />film"`` becomes ``"great film"``, not ``"greatfilm"``).
    Runs of spaces are not collapsed here.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each tag substituted by one space.
    """
    return re.sub(r'<.*?>', ' ', text)

In [6]:
def remove_stopwords(text):
    """Return ``text`` without the tokens found in the module-level ``stop_words``.

    The text is split on whitespace; a token is dropped when its lowercase
    form is in ``stop_words``. Surviving tokens keep their original casing
    and are re-joined with single spaces.
    """
    kept = filter(lambda token: token.lower() not in stop_words, text.split())
    return ' '.join(kept)

In [7]:
# Apply preprocessing
# Clean the review text in three ordered steps: strip HTML tags, lowercase,
# then drop stopwords. The 'review' column is overwritten with the result.
df['review'] = (
    df['review']
    .apply(remove_html_tags)
    .str.lower()
    .apply(remove_stopwords)
)

In [8]:
# Split data
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both parts, and the fixed seed makes the
# split repeatable.
reviews, labels = df['review'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=0,
)

In [9]:
# Train model
# TF-IDF features feed a linear SVM. The classifier is seeded (same seed as the
# train/test split) so that re-running the notebook reproduces the same fitted
# model instead of depending on the solver's random state.
clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=0)),
])
clf.fit(X_train, y_train)

In [10]:
# Evaluate model
# Predict on the held-out reviews and print per-class precision, recall and F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    negative       0.90      0.89      0.90      5000
    positive       0.89      0.90      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [12]:
# Save the model
# Serialize the fitted pipeline (vectorizer + classifier) to disk. The HTML and
# stopword cleaning is applied to the DataFrame outside this pipeline, so it is
# not part of the pickle.
MODEL_PATH = 'sentiment_analysis.pkl'
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(clf, model_file)