In [None]:
import pandas as pd

In [None]:
# Read the reviews CSV into a DataFrame. The path is relative, so it resolves
# against the directory the notebook kernel was started in.
file_path = '../data/IMDB Dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Quick look at the loaded frame: leading rows, per-column summary, dimensions.
print("First 5 rows of the dataset:")
display(df.head(n=5))

print("\nDataset Info:")
df.info()  # writes its summary to stdout itself, so it is not wrapped in print()

print(f"\nDataset Shape: {df.shape}")

In [None]:
import re

def clean_text(text):
    """Normalise a raw review string before it is vectorized.

    Steps, in order:
      1. Replace every HTML-style tag (``<...>``) with a single space.
      2. Drop every character that is not an ASCII letter or whitespace.
      3. Lower-case the result.
      4. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: The raw text to clean; must be a ``str``.

    Returns:
        The cleaned text, made up only of ``a-z`` and single spaces.
    """
    # Substitute a space rather than an empty string: a tag frequently sits
    # directly between two words (e.g. "good<br /><br />movie"), and deleting
    # it outright would glue them into one bogus token ("goodmovie").
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    # split() with no argument breaks on any whitespace run, which also removes
    # the extra spaces introduced by the tag replacement above.
    return ' '.join(text.split())

# Keep the raw text and store the normalised version in a separate column.
df['cleaned_review'] = df['review'].map(clean_text)

In [None]:
# Show raw and cleaned text side by side for the leading rows as a sanity check.
print("Original vs. Cleaned Text:")
display(df.head()[['review', 'cleaned_review']])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Inputs are the cleaned review text; targets are the sentiment column.
X, y = df['cleaned_review'], df['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Vocabulary is capped at 5000 features. The vectorizer is fitted on the
# training split only and then reused as-is to transform the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Report both matrix shapes.
print("Shape of the TF-IDF Training Matrix:", X_train_tfidf.shape)
print("Shape of the TF-IDF Testing Matrix:", X_test_tfidf.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic-regression classifier with a fixed seed and an iteration cap of 1000.
model = LogisticRegression(max_iter=1000, random_state=42)

# Fit on the TF-IDF training matrix.
print("Training the model...")
model.fit(X_train_tfidf, y_train)
print("Training complete.")

# Predict labels for the held-out test matrix.
print("\nMaking predictions on the test set...")
y_pred = model.predict(X_test_tfidf)
print("Prediction complete.")

# Compare the predictions with the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("\n--- MODEL PERFORMANCE ---")
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
import joblib
import os

# Create the output folder if needed (no error when it already exists).
# NOTE(review): 'models' is resolved against the current working directory,
# while the dataset is read from '../data' — confirm this is the intended location.
os.makedirs(name='models', exist_ok=True)

# Persist the fitted vectorizer and the trained classifier as separate files.
# NOTE(review): the clean_text step is not saved with them — presumably any
# consumer must apply the same cleaning before vectorizing; confirm.
joblib.dump(value=tfidf, filename='models/tfidf_vectorizer.joblib')
joblib.dump(value=model, filename='models/sentiment_model.joblib')

print("\nModel and Vectorizer have been saved to the 'models' directory.")