In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tqdm.auto import tqdm
import re
from datetime import datetime
import joblib

In [None]:
print("Loading dataset...")
dataset_path = "/content/Rotten_Tomatoes_Movies dataset(1).csv"
df = pd.read_csv(dataset_path, header=0, index_col=None)

Loading dataset...


In [2]:
# Load data
data = pd.read_csv("/content/Rotten_Tomatoes_Movies dataset(1).csv")

# Handle missing values
imputer = SimpleImputer(strategy="most_frequent")
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Bin audience ratings
bins = np.linspace(0, 100, 21)
labels = np.arange(20)
data['audience_rating'] = pd.cut(data['audience_rating'], bins=bins, labels=labels, right=True).astype(float)
data['audience_rating'] = data['audience_rating'].fillna(data['audience_rating'].median()).astype(int)

# Encode categorical data
rating_map = {'G': 1, 'PG': 2, 'PG-13': 3, 'R': 4, 'NC-17': 5, 'PG-13)': 3, 'R)': 4, 'NC17': 5, 'NR': 0}
status_map = {'Rotten': 0, 'Fresh': 1, 'Certified Fresh': 2}
data['rating'] = data['rating'].map(rating_map)
data['tomatometer_status'] = data['tomatometer_status'].map(status_map)

# Process date features
data['in_theaters_date'] = pd.to_datetime(data['in_theaters_date'], format='%m/%d/%Y', errors='coerce')
data['on_streaming_date'] = pd.to_datetime(data['on_streaming_date'], format='%m/%d/%Y', errors='coerce')
data['days_in_theaters'] = (data['on_streaming_date'] - data['in_theaters_date']).dt.days
data['release_year'] = data['in_theaters_date'].dt.year
data['release_month'] = data['in_theaters_date'].dt.month
data['release_quarter'] = data['in_theaters_date'].dt.quarter
current_year = datetime.now().year
data['movie_age'] = current_year - data['release_year']

# Extract studio and genre features
studio_size = data['studio_name'].value_counts()
data['studio_size'] = data['studio_name'].map(studio_size)
data['genre_count'] = data['genre'].fillna('').str.count(',') + 1

data['cast_size'] = data['cast'].fillna('').str.count(',') + 1
data['director_count'] = data['directors'].fillna('').str.count(',') + 1
data['writer_count'] = data['writers'].fillna('').str.count(',') + 1

# Drop unnecessary columns
data = data.drop(['in_theaters_date', 'on_streaming_date'], axis=1)

# Normalize numerical features
numerical_features = ['runtime_in_minutes', 'tomatometer_rating', 'tomatometer_count', 'days_in_theaters', 'release_year', 'movie_age', 'studio_size', 'cast_size', 'director_count', 'writer_count', 'genre_count']
pt = PowerTransformer(method='yeo-johnson')
data[numerical_features] = data[numerical_features].fillna(0)
data[numerical_features] = pt.fit_transform(data[numerical_features])

# Prepare features
target = data['audience_rating']
non_text_features = data.drop(columns=['audience_rating', 'movie_title', 'movie_info', 'critics_consensus', 'genre', 'cast', 'directors', 'writers', 'studio_name'])


  data[numerical_features] = data[numerical_features].fillna(0)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


In [3]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(non_text_features, target, test_size=0.2, random_state=42)

# Process text features
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', ' ', str(text))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

text_data = data[['movie_title', 'movie_info', 'critics_consensus', 'genre', 'cast']]
text_data = text_data.fillna('')
combined_texts = [
    f"TITLE: {preprocess_text(row['movie_title'])} INFO: {preprocess_text(row['movie_info'])} CONSENSUS: {preprocess_text(row['critics_consensus'])} GENRE: {preprocess_text(row['genre'])} CAST: {preprocess_text(row['cast'])}"
    for _, row in text_data.iterrows()
]

tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
text_features = tfidf.fit_transform(combined_texts)

X_train_text, X_test_text, _, _ = train_test_split(text_features, target, test_size=0.2, random_state=42)


In [4]:
# Train models
rf = RandomForestRegressor(n_estimators=200, max_depth=20, random_state=42)
xgb = XGBRegressor(random_state=42)
cat = CatBoostRegressor(random_state=42, verbose=0)

rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
cat.fit(X_train, y_train)

# Train neural network for text features
text_model = Sequential([
    Dense(1024, activation='relu', input_shape=(1000,)),
    Dropout(0.4),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1)
])
text_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
text_model.fit(X_train_text.toarray(), y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 32ms/step - loss: 32.0502 - mae: 4.3457 - val_loss: 13.4272 - val_mae: 3.0056
Epoch 2/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - loss: 13.2201 - mae: 2.9635 - val_loss: 13.7270 - val_mae: 3.0548
Epoch 3/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 29ms/step - loss: 11.1761 - mae: 2.7187 - val_loss: 14.2664 - val_mae: 3.1110
Epoch 4/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 29ms/step - loss: 7.6559 - mae: 2.2093 - val_loss: 14.4469 - val_mae: 3.0502
Epoch 5/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - loss: 5.1384 - mae: 1.8069 - val_loss: 13.8931 - val_mae: 3.0140
Epoch 6/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 38ms/step - loss: 4.0142 - mae: 1.5860 - val_loss: 16.3611 - val_mae: 3.3313
Epoch 7/20
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x7d8ea8b67750>

In [5]:
# Generate predictions
text_predictions = text_model.predict(X_test_text.toarray()).squeeze()
non_text_predictions = [rf.predict(X_test), xgb.predict(X_test), cat.predict(X_test)]

# Ensemble stacking
meta_features = np.vstack([text_predictions, *non_text_predictions]).T
meta_model = XGBRegressor(random_state=42, n_estimators=100)
meta_model.fit(meta_features, y_test)

# Evaluate model
y_pred = meta_model.predict(meta_features)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Evaluation Results:")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R^2 Score: {r2}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
Evaluation Results:
Mean Squared Error (MSE): 1.4927054643630981
Mean Absolute Error (MAE): 0.9017853140830994
R^2 Score: 0.9098700284957886
Root Mean Squared Error (RMSE): 1.2217632603590183
