<a href="https://colab.research.google.com/github/Saherpathan/ASR/blob/main/Untitled38.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Assignment: Audience Rating Prediction

---
Submitted by:
Saher Pathan

sahergpathan@gmail.com

Date: 20/12/24



[Dataset used](https://docs.google.com/spreadsheets/d/1iudRNFp8vSp2K-BG7u276yhgEUoLq0MU/edit?usp=sharing&ouid=107464109217356733956&rtpof=true&sd=true)

Importing necessary libraries


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import ipywidgets as widgets
from IPython.display import display


Loading the data

In [2]:
# Location of the Rotten Tomatoes spreadsheet (this is the Colab upload directory)
DATA_PATH = "/content/Rotten_Tomatoes_Movies3.xls"

# Read the whole workbook into a DataFrame
df = pd.read_excel(DATA_PATH)


Viewing the given dataset

In [3]:
# Preview the first rows to see which columns are available and what they contain
df.head()

Unnamed: 0,movie_title,movie_info,critics_consensus,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,studio_name,tomatometer_status,tomatometer_rating,tomatometer_count,audience_rating
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,20th Century Fox,Rotten,49,144,53.0
1,Please Give,Kate has a lot on her mind. There's the ethics...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,Sony Pictures Classics,Certified Fresh,86,140,64.0
2,10,Blake Edwards' 10 stars Dudley Moore as George...,,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,Waner Bros.,Fresh,68,22,53.0
3,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,2001-03-06,95.0,Criterion Collection,Certified Fresh,100,51,97.0
4,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,2003-05-20,127.0,Disney,Fresh,89,27,74.0


In [4]:
# Summarise column dtypes and non-null counts to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16638 entries, 0 to 16637
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   movie_title         16638 non-null  object        
 1   movie_info          16614 non-null  object        
 2   critics_consensus   8309 non-null   object        
 3   rating              16638 non-null  object        
 4   genre               16621 non-null  object        
 5   directors           16524 non-null  object        
 6   writers             15289 non-null  object        
 7   cast                16354 non-null  object        
 8   in_theaters_date    15823 non-null  datetime64[ns]
 9   on_streaming_date   16636 non-null  datetime64[ns]
 10  runtime_in_minutes  16483 non-null  float64       
 11  studio_name         16222 non-null  object        
 12  tomatometer_status  16638 non-null  object        
 13  tomatometer_rating  16638 non-null  int64     

Selecting the features and the target, then splitting the data into training and test sets

In [5]:

# Columns the model is allowed to see, and the column it learns to predict
feature_columns = [
    'movie_info', 'critics_consensus', 'tomatometer_rating', 'tomatometer_count',
    'runtime_in_minutes', 'genre', 'rating', 'studio_name',
]
target_column = 'audience_rating'

X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


Imputing missing values

In [6]:
def impute_features(features, runtime_fill, genre_fill):
    """Return a copy of ``features`` with missing values filled in.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature frame containing the columns 'movie_info', 'critics_consensus',
        'runtime_in_minutes', 'genre' and 'studio_name'.
    runtime_fill : float
        Value used for missing 'runtime_in_minutes' entries.
    genre_fill : str
        Value used for missing 'genre' entries.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified, so re-running the cell is safe
        and no SettingWithCopyWarning is triggered.
    """
    return features.assign(
        # Text columns: an empty document is valid input for TfidfVectorizer, NaN is not
        movie_info=features['movie_info'].fillna(''),
        critics_consensus=features['critics_consensus'].fillna(''),
        # Numerical column: fill with the supplied statistic
        runtime_in_minutes=features['runtime_in_minutes'].fillna(runtime_fill),
        # Categorical columns: most frequent genre, explicit "missing" bucket for studio
        genre=features['genre'].fillna(genre_fill),
        studio_name=features['studio_name'].fillna('Unknown'),
    )


# Fill values are computed on the training split only, so no information from
# the test split leaks into preprocessing.
runtime_fill = X_train['runtime_in_minutes'].mean()
genre_fill = X_train['genre'].mode()[0]

# Apply the SAME imputation to both splits; leaving NaN in X_test makes
# predict() fail inside the text vectorizer.
X_train = impute_features(X_train, runtime_fill, genre_fill)
X_test = impute_features(X_test, runtime_fill, genre_fill)

# Impute missing values in the training target with the mean (original modelling choice).
# NOTE(review): this fabricates labels; dropping unlabelled training rows is usually
# preferable — confirm which behaviour is wanted.
y_train = y_train.fillna(y_train.mean())

# Rows without a true audience rating cannot be scored, and NaN in y_test makes
# the metric functions raise, so exclude them from the evaluation set.
has_target = y_test.notna()
X_test = X_test.loc[has_target]
y_test = y_test.loc[has_target]


Defining pre-processing pipeline

In [7]:
# Column groups, named once so the transformer list below stays readable
numeric_columns = ['tomatometer_rating', 'tomatometer_count', 'runtime_in_minutes']
categorical_columns = ['genre', 'rating', 'studio_name']

# Free text -> TF-IDF weights over the 100 most frequent terms
text_transformer = TfidfVectorizer(max_features=100)

# Numbers -> zero mean / unit variance
numerical_transformer = StandardScaler()

# Categories -> one-hot columns; categories unseen during fit encode as all zeros
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Each entry is (name, transformer, column selection). The two text columns are
# given as plain strings because TfidfVectorizer expects a 1-D sequence of documents.
# The same vectorizer object appears twice; ColumnTransformer clones transformers
# when fitting, so each column still learns its own vocabulary.
column_steps = [
    ('text', text_transformer, 'movie_info'),
    ('critic_text', text_transformer, 'critics_consensus'),
    ('num', numerical_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
]

preprocessor = ColumnTransformer(transformers=column_steps)


Training the model

In [None]:
# Random forest with 100 trees; the fixed seed keeps training reproducible
forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Chain preprocessing and the regressor so raw feature frames can be passed straight in
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

# Fit the preprocessing steps and the forest on the training split
model_pipeline.fit(X_train, y_train)

# Persist the fitted pipeline so it can be reloaded without retraining
joblib.dump(model_pipeline, 'audience_rating_model.pkl')


Making the predictions

In [None]:
# Make predictions on the test set.
# The pipeline applies its fitted preprocessing to X_test before the regressor runs.
y_pred = model_pipeline.predict(X_test)


Evaluating the model


In [None]:
# Compare held-out predictions with the true ratings:
# mean squared error (lower is better) and R² (closer to 1 is better)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Report both metrics, one per line, rounded to two decimals
print(f"Mean Squared Error: {mse:.2f}", f"R² Score: {r2:.2f}", sep="\n")


Interactive Prediction Interface


In [None]:
# Load the trained model back from disk.
# NOTE: joblib.load unpickles arbitrary Python objects — only load files you created yourself.
model = joblib.load('audience_rating_model.pkl')

# The one-hot encoder ignores categories it never saw during training
# (handle_unknown='ignore'), so a hand-written option such as 'Sci-Fi' would
# silently contribute nothing to the prediction. Offer only the values the
# encoder actually learned. categories_ follows the column order passed to the
# 'cat' transformer: genre, rating, studio_name.
fitted_encoder = model.named_steps['preprocessor'].named_transformers_['cat']
known_genres = [g for g in fitted_encoder.categories_[0] if isinstance(g, str)]
known_ratings = [r for r in fitted_encoder.categories_[1] if isinstance(r, str)]


def _default_choice(options, preferred):
    """Return ``preferred`` if it is a valid option, else the first option (or None)."""
    if preferred in options:
        return preferred
    return options[0] if options else None


# Widgets for user input.
# Hints are shown via `placeholder` rather than `value`, so an untouched field is
# submitted as an empty string instead of feeding the hint text to the model.
movie_info_input = widgets.Textarea(value='', placeholder="Enter movie info here...", description="Movie Info:", layout=widgets.Layout(width='80%'))
critics_consensus_input = widgets.Textarea(value='', placeholder="Enter critics' consensus...", description="Critics:", layout=widgets.Layout(width='80%'))
tomatometer_rating_input = widgets.IntSlider(value=50, min=0, max=100, description="Tomatometer:")
tomatometer_count_input = widgets.IntText(value=10, description="Tomato Count:")
runtime_input = widgets.IntText(value=90, description="Runtime (mins):")
genre_input = widgets.Dropdown(options=known_genres, value=_default_choice(known_genres, 'Drama'), description="Genre:")
rating_input = widgets.Dropdown(options=known_ratings, value=_default_choice(known_ratings, 'PG'), description="Rating:")
studio_input = widgets.Text(value='', placeholder="Enter studio name", description="Studio:")

# Prediction Button
predict_button = widgets.Button(description="Predict Rating")

# Output widget for displaying result
output = widgets.Output()


def predict_rating(_):
    """Button callback: predict the audience rating for the current widget values.

    Builds a one-row DataFrame with the same column names the pipeline was
    trained on, runs it through the loaded model and prints the prediction
    into the ``output`` widget. The argument is the clicked button, supplied
    by ipywidgets, and is not used.
    """
    # Collect input data
    input_data = pd.DataFrame({
        'movie_info': [movie_info_input.value],
        'critics_consensus': [critics_consensus_input.value],
        'tomatometer_rating': [tomatometer_rating_input.value],
        'tomatometer_count': [tomatometer_count_input.value],
        'runtime_in_minutes': [runtime_input.value],
        'genre': [genre_input.value],
        'rating': [rating_input.value],
        'studio_name': [studio_input.value]
    })

    # Get prediction (predict returns an array; there is exactly one row)
    prediction = model.predict(input_data)[0]

    # Display result, replacing whatever the previous click printed
    with output:
        output.clear_output(wait=True)
        print(f"🌟 Predicted Audience Rating: {prediction:.2f}")


predict_button.on_click(predict_rating)

# Display the input widgets and prediction button
display(movie_info_input, critics_consensus_input, tomatometer_rating_input,
        tomatometer_count_input, runtime_input, genre_input, rating_input,
        studio_input, predict_button, output)
