In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Pipeline overview: label every critic review with the tomatometer_status of
# the movie it belongs to, then train a TF-IDF + logistic-regression text
# classifier that predicts that status from the review text alone.

# 1. Load Data
df_movies = pd.read_csv('datasets/rotten_tomatoes_movies.csv')
df_reviews = pd.read_csv('datasets/rotten_tomatoes_critic_reviews_50k.csv')

# 2. Merge Dataframes on 'rotten_tomatoes_link'
# Each review row receives the status of the movie it links to.
# The movie side is first reduced to one labelled row per link: a duplicated
# key there would silently multiply review rows in the inner join (and could
# attach conflicting labels to the same review text).
movie_status = (
    df_movies[['rotten_tomatoes_link', 'tomatometer_status']]
    .dropna(subset=['tomatometer_status'])
    .drop_duplicates(subset='rotten_tomatoes_link')
)
df_merged = pd.merge(df_reviews, movie_status, on='rotten_tomatoes_link', how='inner')

# 3. Check for missing values in target and relevant features
# TfidfVectorizer needs string input, so rows without review text are dropped;
# the status check is kept as a safety net after the join.
df_merged = df_merged.dropna(subset=['review_content', 'tomatometer_status'])

# 4. Define Target and Features
# Target: tomatometer_status (Rotten, Fresh, Certified-Fresh)
# Feature: review_content
X = df_merged['review_content']
y = df_merged['tomatometer_status']

# 5. Train/Test Split
# Stratify to maintain class distribution
# NOTE(review): the label is per movie but the split is per review, so reviews
# of the same movie can land in both train and test. That may inflate the
# reported scores -- consider a group-aware split on 'rotten_tomatoes_link'
# (e.g. sklearn's StratifiedGroupKFold) if generalisation to unseen movies is
# the goal. Left unchanged here to keep results comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 6. Verify the split
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

# 7. Feature Extraction (TF-IDF)
# Fit the vocabulary on the training split only so no test-set statistics leak
# into the features; the test split is transformed with the fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 8. Logistic Regression Model
# max_iter=1000 to ensure convergence
# multi_class is intentionally not passed: 'auto' is already the default, and
# the parameter is deprecated in recent scikit-learn releases (FutureWarning).
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_tfidf, y_train)

# 9. Evaluation
y_pred = log_reg.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Training set shape: (32933,)
Test set shape: (8234,)




Accuracy: 0.6051736701481661

Classification Report:
                  precision    recall  f1-score   support

Certified-Fresh       0.58      0.52      0.55      2380
          Fresh       0.52      0.40      0.45      2304
         Rotten       0.65      0.80      0.72      3550

       accuracy                           0.61      8234
      macro avg       0.58      0.57      0.57      8234
   weighted avg       0.59      0.61      0.59      8234

