# Environment setup

In [144]:
!pip install pandas numpy scikit-learn openpyxl joblib



In [145]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [146]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [147]:
!pip install pyxlsb
!pip install xlrd



# Data_Loading_and_Exploration

## Load the Excel data

In [148]:
# Build the path from the current user's home directory instead of hard-coding
# one machine's absolute path. For a macOS user whose home is
# /Users/vimlendusharma this resolves to the same file as before.
file_path = os.path.expanduser('~/Downloads/Rotten_Tomatoes_Movies3.xls')

# Read the first sheet of the workbook into a DataFrame.
df = pd.read_excel(file_path)

In [149]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,movie_title,movie_info,critics_consensus,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,studio_name,tomatometer_status,tomatometer_rating,tomatometer_count,audience_rating
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,20th Century Fox,Rotten,49,144,53.0
1,Please Give,Kate has a lot on her mind. There's the ethics...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,Sony Pictures Classics,Certified Fresh,86,140,64.0
2,10,Blake Edwards' 10 stars Dudley Moore as George...,,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,Waner Bros.,Fresh,68,22,53.0
3,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,2001-03-06,95.0,Criterion Collection,Certified Fresh,100,51,97.0
4,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,2003-05-20,127.0,Disney,Fresh,89,27,74.0


In [150]:
# Report the overall size of the loaded table.
row_count, column_count = df.shape
print(f"Dataset contains {row_count} rows and {column_count} columns.")

Dataset contains 16638 rows and 16 columns.


In [151]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16638 entries, 0 to 16637
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   movie_title         16638 non-null  object        
 1   movie_info          16614 non-null  object        
 2   critics_consensus   8309 non-null   object        
 3   rating              16638 non-null  object        
 4   genre               16621 non-null  object        
 5   directors           16524 non-null  object        
 6   writers             15289 non-null  object        
 7   cast                16354 non-null  object        
 8   in_theaters_date    15823 non-null  datetime64[ns]
 9   on_streaming_date   16636 non-null  datetime64[ns]
 10  runtime_in_minutes  16483 non-null  float64       
 11  studio_name         16222 non-null  object        
 12  tomatometer_status  16638 non-null  object        
 13  tomatometer_rating  16638 non-null  int64     

In [152]:
# Summary statistics (count, mean, quartiles, ...) for the columns describe() covers.
df.describe()

Unnamed: 0,in_theaters_date,on_streaming_date,runtime_in_minutes,tomatometer_rating,tomatometer_count,audience_rating
count,15823,16636,16483.0,16638.0,16638.0,16386.0
mean,1999-10-24 22:10:52.973519488,2008-08-13 13:35:49.266650624,102.391494,60.466522,56.607104,60.470829
min,1914-06-01 00:00:00,1935-06-06 00:00:00,1.0,0.0,5.0,0.0
25%,1993-03-29 00:00:00,2003-01-28 00:00:00,90.0,38.0,12.0,45.0
50%,2006-08-04 00:00:00,2008-04-08 00:00:00,99.0,66.0,28.0,62.0
75%,2013-06-28 00:00:00,2014-05-12 00:00:00,111.0,86.0,76.0,77.0
max,2019-12-07 00:00:00,2019-11-01 00:00:00,2000.0,100.0,497.0,100.0
std,,,25.028011,28.58723,66.3838,20.462368


In [153]:
# Count the null entries in every column and print them under a heading.
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
movie_title              0
movie_info              24
critics_consensus     8329
rating                   0
genre                   17
directors              114
writers               1349
cast                   284
in_theaters_date       815
on_streaming_date        2
runtime_in_minutes     155
studio_name            416
tomatometer_status       0
tomatometer_rating       0
tomatometer_count        0
audience_rating        252
dtype: int64


# Data_Preprocessing

## Handle missing values

In [154]:
# Rows without the target value cannot be used for supervised training.
# .copy() makes the result an independent frame, so column assignments made on
# it afterwards do not raise SettingWithCopyWarning.
df = df.dropna(subset=['audience_rating']).copy()

In [155]:
# Display the frame as it stands after dropping rows with a missing target.
df

Unnamed: 0,movie_title,movie_info,critics_consensus,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,studio_name,tomatometer_status,tomatometer_rating,tomatometer_count,audience_rating
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,20th Century Fox,Rotten,49,144,53.0
1,Please Give,Kate has a lot on her mind. There's the ethics...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,Sony Pictures Classics,Certified Fresh,86,140,64.0
2,10,Blake Edwards' 10 stars Dudley Moore as George...,,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,Waner Bros.,Fresh,68,22,53.0
3,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,2001-03-06,95.0,Criterion Collection,Certified Fresh,100,51,97.0
4,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,2003-05-20,127.0,Disney,Fresh,89,27,74.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16633,Zoot Suit,"Based on a 1940s Los Angeles murder trial, thi...",,R,"Drama, Musical & Performing Arts",Luis Valdez,Luis Valdez,"Daniel Valdez, Edward James Olmos, Charles Aid...",1981-10-02,2003-05-27,104.0,MCA Universal Home Video,Rotten,56,9,74.0
16634,Zootopia,The modern mammal metropolis of Zootopia is a ...,The brilliantly well-rounded Zootopia offers a...,PG,"Action & Adventure, Animation, Comedy","Byron Howard, Rich Moore, Jared Bush","Jared Bush, Phil Johnston","Ginnifer Goodwin, Jason Bateman, Idris Elba, J...",2016-03-04,2016-06-07,108.0,Walt Disney Animation Studios,Certified Fresh,97,279,92.0
16635,Zorba the Greek,If ever there was a role that Anthony Quinn wa...,,NR,"Action & Adventure, Art House & International,...",,,"Anthony Quinn, Alan Bates, Irene Papas, Lila K...",1964-12-17,2004-08-03,142.0,Fox,Fresh,78,9,87.0
16636,Zulu,"Filmed on a grand scale, Zulu is a rousing rec...",,PG,"Classics, Drama","Cy Endfield, Cyril Endfield","Cy Endfield, John Prebble","Stanley Baker, Jack Hawkins, Ulla Jacobsson, J...",1964-06-17,2001-02-02,139.0,Paramount Pictures,Fresh,95,21,91.0


## Define column groups and fill missing text values with empty strings

In [156]:
# Column groups by kind of data.
numerical_cols = [
    'runtime_in_minutes',
    'tomatometer_rating',
    'tomatometer_count',
]
categorical_cols = [
    'rating',
    'genre',
    'directors',
    'writers',
    'studio_name',
    'tomatometer_status',
]
text_cols = [
    'movie_title',
    'movie_info',
    'critics_consensus',
    'cast',
]

# Missing free-text values become empty strings.
df.loc[:, text_cols] = df[text_cols].fillna('')

## Convert the date columns to datetime and fill missing dates with a placeholder

In [157]:
date_cols = ['in_theaters_date', 'on_streaming_date']

# Sentinel used for dates that are missing or cannot be parsed.
# NOTE(review): any feature derived from these columns will treat 1900-01-01
# as a real date for the affected rows -- confirm that is intended.
placeholder_date = pd.Timestamp('1900-01-01')

for col in date_cols:
    # errors='coerce' turns unparseable values into NaT before the fill.
    parsed = pd.to_datetime(df[col], errors='coerce')
    df[col] = parsed.fillna(placeholder_date)

# Feature_Engineering

## Extract date features

In [158]:
# Whole days between the theatrical release and the streaming release.
release_gap = df['on_streaming_date'] - df['in_theaters_date']
df['release_gap_days'] = release_gap.dt.days

In [159]:
# Floor negative gaps (streaming date earlier than the theatrical date) at 0.
# Series.clip does this in one vectorized call instead of invoking a Python
# lambda per row. Unlike the lambda, it leaves NaN values as NaN.
df['release_gap_days'] = df['release_gap_days'].clip(lower=0)

## Features and target

In [160]:
# Column the model is trained to predict.
target = 'audience_rating'

# Columns handed to the model pipeline as inputs.
features = [
    'movie_title',
    'movie_info',
    'critics_consensus',
    'rating',
    'genre',
    'directors',
    'writers',
    'cast',
    'in_theaters_date',
    'on_streaming_date',
    'runtime_in_minutes',
    'studio_name',
    'tomatometer_status',
    'tomatometer_rating',
    'tomatometer_count',
    'release_gap_days',
]

# Feature_Transformation_and_Pipeline_Setup

#Identify_Column_Types

In [161]:
# Columns that get one-hot encoded.
categorical_cols = [
    'rating',
    'genre',
    'directors',
    'writers',
    'studio_name',
    'tomatometer_status',
]

# Free-text columns.
text_cols = [
    'movie_title',
    'movie_info',
    'critics_consensus',
    'cast',
]

# Numeric columns, including the engineered release gap.
numerical_cols = [
    'runtime_in_minutes',
    'tomatometer_rating',
    'tomatometer_count',
    'release_gap_days',
]

#Define_Preprocessing_Steps

In [162]:
# Numeric features: fill gaps with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Text features: TF-IDF vectors capped at 5000 terms.
text_steps = [
    ('tfidf', TfidfVectorizer(max_features=5000)),
]
text_pipeline = Pipeline(steps=text_steps)

## Combine all preprocessing steps

In [172]:
# One entry per (name, transformer, columns). The two text columns are given
# as plain strings rather than lists.
# NOTE(review): the same text_pipeline object is listed twice; this presumably
# relies on ColumnTransformer fitting a separate clone per entry -- confirm.
column_transformers = [
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
    ('movie_info', text_pipeline, 'movie_info'),
    ('critics_consensus', text_pipeline, 'critics_consensus'),
]

# Columns not named above are dropped from the model input.
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder='drop',
)

# Model_Building_and_Training

#complete_pipeline

In [173]:
# Preprocessing followed by a random-forest regressor with a fixed seed.
forest = RandomForestRegressor(random_state=42)
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', forest),
    ]
)

## Build X and y and split into train and test sets

In [174]:
# Model inputs and the target column.
X = df[features]
y = df[target]

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

## Model training (preceded by pre-fit sanity checks)

In [175]:
# Inspect the engineered release-gap column.
df['release_gap_days']

0          137
1          172
2         6536
3        16033
4        18036
         ...  
16633     7907
16634       95
16635    14474
16636    13379
16637     9419
Name: release_gap_days, Length: 16386, dtype: int64

In [176]:
# List every column currently present in the frame.
print("Columns in the DataFrame:", df.columns.tolist(), sep="\n")

Columns in the DataFrame:
['movie_title', 'movie_info', 'critics_consensus', 'rating', 'genre', 'directors', 'writers', 'cast', 'in_theaters_date', 'on_streaming_date', 'runtime_in_minutes', 'studio_name', 'tomatometer_status', 'tomatometer_rating', 'tomatometer_count', 'audience_rating', 'release_gap_days']


In [177]:
# Show the columns selected as model inputs.
print("Feature Columns:", features, sep="\n")

Feature Columns:
['movie_title', 'movie_info', 'critics_consensus', 'rating', 'genre', 'directors', 'writers', 'cast', 'in_theaters_date', 'on_streaming_date', 'runtime_in_minutes', 'studio_name', 'tomatometer_status', 'tomatometer_rating', 'tomatometer_count', 'release_gap_days']


In [178]:
# Count rows that still lack a target value.
missing_target = df[target].isna().sum()
print(f"Missing values in '{target}': {missing_target}")

# NOTE(review): X and y were already taken from df in an earlier cell, so rows
# dropped here would not be removed from the existing train/test split.
if missing_target:
    df = df.dropna(subset=[target])
    print(f"Dropped {missing_target} rows with missing '{target}'.")


Missing values in 'audience_rating': 0


In [179]:
# Preview the target next to the input columns, target first.
preview_cols = [target, *features]
display(df[preview_cols].head())

Unnamed: 0,audience_rating,movie_title,movie_info,critics_consensus,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,studio_name,tomatometer_status,tomatometer_rating,tomatometer_count,release_gap_days
0,53.0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,20th Century Fox,Rotten,49,144,137
1,64.0,Please Give,Kate has a lot on her mind. There's the ethics...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,Sony Pictures Classics,Certified Fresh,86,140,172
2,53.0,10,Blake Edwards' 10 stars Dudley Moore as George...,,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,Waner Bros.,Fresh,68,22,6536
3,97.0,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,2001-03-06,95.0,Criterion Collection,Certified Fresh,100,51,16033
4,74.0,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,2003-05-20,127.0,Disney,Fresh,89,27,18036


In [180]:
# Fit the preprocessing steps and the random-forest regressor on the training split.
model_pipeline.fit(X_train, y_train)

In [181]:
# Show which column is being predicted.
print(target)

audience_rating


# Model_Evaluation

In [182]:
# Predict the target for the held-out test split with the fitted pipeline.
y_pred=model_pipeline.predict(X_test)

## Calculate evaluation metrics

In [183]:
# Test-set metrics for the baseline pipeline: R² and RMSE (square root of MSE).
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

report_lines = [
    "Model Performance on Test Set:",
    f"RMSE: {rmse:.2f}",
    f"R² Score: {r2:.2f}",
]
print("\n".join(report_lines))

Model Performance on Test Set:
RMSE: 14.23
R² Score: 0.51


# Cross_Validation_and_Hyperparameter_tuning

In [184]:
# Candidate hyperparameters for the pipeline's 'regressor' step
# (2 x 3 x 2 = 12 combinations).
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[None, 10, 20],
    regressor__min_samples_split=[2, 5],
)

## Initialize and run GridSearchCV

In [185]:
# 5-fold cross-validated search over param_grid, scored by R²,
# using all available cores and verbose progress output.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=200; total time=14.1min
[CV] END regressor__max_depth=None, regressor__min_samples_split=5, regressor__n_estimators=200; total time=10.1min
[CV] END regressor__max_depth=10, regressor__min_samples_split=5, regressor__n_estimators=100; total time= 1.4min
[CV] END regressor__max_depth=10, regressor__min_samples_split=5, regressor__n_estimators=200; total time= 2.6min
[CV] END regressor__max_depth=20, regressor__min_samples_split=2, regressor__n_estimators=100; total time= 4.1min
[CV] END regressor__max_depth=20, regressor__min_samples_split=5, regressor__n_estimators=100; total time= 3.3min
[CV] END regressor__max_depth=20, regressor__min_samples_split=5, regressor__n_estimators=100; total time= 3.2min
[CV] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=100; total time= 7.0min
[CV] END regressor__m

In [186]:
# Show the hyperparameter combination the grid search selected.
print(f"Best parameters: {grid_search.best_params_}")

Best parameters: {'regressor__max_depth': None, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 200}


In [187]:
# Keep the pipeline that the grid search selected as best.
best_model = grid_search.best_estimator_

#predict_with_best_model

In [188]:
# Predict the target for the held-out test split with the tuned pipeline.
y_pred_best=best_model.predict(X_test)

#Evaluation_of_model

In [189]:
# Test-set metrics for the tuned pipeline: R² and RMSE (square root of MSE).
r2_best = r2_score(y_test, y_pred_best)
mse_best = mean_squared_error(y_test, y_pred_best)
rmse_best = np.sqrt(mse_best)

In [190]:
# Print the tuned model's test-set metrics as a three-line report.
best_report_lines = [
    "Best Model Performance on Test Set:",
    f"RMSE: {rmse_best:.2f}",
    f"R² Score: {r2_best:.2f}",
]
print("\n".join(best_report_lines))

Best Model Performance on Test Set:
RMSE: 14.22
R² Score: 0.51


# Model_Saving

In [193]:
import joblib

# File name of the serialized model, relative to the working directory.
model_filename = 'audience_rating_model.joblib'

# Serialize best_model to disk.
joblib.dump(best_model, model_filename)

print(f"Model saved to '{model_filename}' successfully.")


Model saved to 'audience_rating_model.joblib' successfully.


In [192]:
# Report whether the name best_model exists in the current namespace.
# This checks only that the name is defined, not that the model was fitted.
status_message = (
    "best_model is trained and available."
    if 'best_model' in locals()
    else "best_model is not defined. Please ensure the model is trained successfully."
)
print(status_message)


best_model is trained and available.


In [194]:
import os

# Confirm the model file is on disk and report its size in bytes.
if not os.path.isfile(model_filename):
    print(f"Failed to save the model to '{model_filename}'.")
else:
    file_size = os.path.getsize(model_filename)
    print(f"'{model_filename}' exists and is {file_size} bytes.")


'audience_rating_model.joblib' exists and is 93186208 bytes.


# Load_the_Model

In [196]:
import joblib

# Same file name the model was saved under.
model_filename = 'audience_rating_model.joblib'

# joblib files are pickle-based: only load artifacts from a trusted source.
loaded_model = joblib.load(model_filename)

print(f"Model loaded from '{model_filename}' successfully.")


Model loaded from 'audience_rating_model.joblib' successfully.


In [197]:
# Show the type of the object that was deserialized.
print(f"Loaded model type: {type(loaded_model)}")

Loaded model type: <class 'sklearn.pipeline.Pipeline'>


In [200]:
# NOTE(review): this writes the same best_model object that was already saved
# under model_filename, so the two files hold the same pipeline -- confirm both
# artifacts are needed.
pipeline_filename = 'audience_rating_pipeline.joblib'
joblib.dump(best_model, pipeline_filename)
print(f"Entire pipeline saved to '{pipeline_filename}' successfully.")


Entire pipeline saved to 'audience_rating_pipeline.joblib' successfully.
