In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
data = pd.read_csv('/Users/dk/Documents/IMDb Movies India.csv',encoding='latin1')
data.head(10)


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
5,...Aur Pyaar Ho Gaya,(1997),147 min,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,(2005),142 min,"Drama, Romance, War",7.4,1086.0,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
7,.in for Motion,(2008),59 min,Documentary,,,Anirban Datta,,,
8,?: A Question Mark,(2012),82 min,"Horror, Mystery, Thriller",5.6,326.0,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia
9,@Andheri,(2014),116 min,"Action, Crime, Thriller",4.0,11.0,Biju Bhaskar Nair,Augustine,Fathima Babu,Byon


In [3]:
print(data.isnull().sum())

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64


In [7]:
# Drop rows with missing target value
data= data.dropna(subset=['Rating'])


In [19]:
# Fill missing values
data['Year'] = data['Year'].fillna(data['Year'].mode()[0])
data['Duration'] = data['Duration'].fillna(data['Duration'].mode()[0])
data['Genre'] = data['Genre'].fillna(data['Genre'].mode()[0])
data['Director'] = data['Director'].fillna(data['Director'].mode()[0])
data['Actor 1'] = data['Actor 1'].fillna(data['Actor 1'].mode()[0])
data['Actor 2'] = data['Actor 2'].fillna(data['Actor 2'].mode()[0])
data['Actor 3'] = data['Actor 3'].fillna(data['Actor 3'].mode()[0])
data['Votes'] = pd.to_numeric(data['Votes'].astype(str).str.replace(',', ''), errors='coerce')
data['Votes'] = data['Votes'].fillna(data['Votes'].median())


# Remove commas and convert Votes to numeric. Since in votes there are commas so we cannot convert it into numeric directly.
data['Votes'] = data['Votes'].astype(str).str.replace(',', '')
data['Votes'] = pd.to_numeric(data['Votes'], errors='coerce')
# If there is a missing value, we have to handle it. So we filling those with median
data['Votes'] = data['Votes'].fillna(data['Votes'].median())


In [21]:
# Feature Engineering: Director success rate
director_success = data.groupby('Director')['Rating'].mean().rename('Director_Success')
data = data.merge(director_success, on='Director', how='left')

In [23]:
# Feature Engineering: Genre average rating
genre_avg_rating = data.groupby('Genre')['Rating'].mean().rename('Genre_Avg_Rating')
data = data.merge(genre_avg_rating, on='Genre', how='left')

In [25]:
# Define features and target
features = ['Year', 'Duration', 'Votes', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3',
            'Director_Success', 'Genre_Avg_Rating']
X = data[features]
y = data['Rating']

In [33]:
data['Year'] = data['Year'].astype(str).str.extract('(\d{4})')  # extract 4-digit year
data['Year'] = pd.to_numeric(data['Year'], errors='coerce')
data['Year'] = data['Year'].fillna(data['Year'].mode()[0])


  data['Year'] = data['Year'].astype(str).str.extract('(\d{4})')  # extract 4-digit year


In [35]:
X = data.drop(['Rating'], axis=1)
y = data['Rating']

# One-hot encode categorical variables
X = pd.get_dummies(X)

# Confirm all numeric
print(X.dtypes)


Year                                         int64
Votes                                        int64
Director_Success                           float64
Genre_Avg_Rating                           float64
Name_#Gadhvi (He thought he was Gandhi)       bool
                                            ...   
Actor 3_Zeishan Quadri                        bool
Actor 3_Zenobia Shroff                        bool
Actor 3_Zohra                                 bool
Actor 3_Zoya Hussain                          bool
Actor 3_Zulfi Sayed                           bool
Length: 19613, dtype: object


In [37]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [39]:
# Predict and evaluate
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse:.2f}')
print(f'R² Score: {r2:.2f}')


RMSE: 0.78
R² Score: 0.67


# Observations

In [43]:
# Several key columns had a high number of missing values, including Duration, Rating, Votes, and Actors.
# The Votes column contained string data with commas (e.g: "1,234"), which had to be cleaned for numerical processing.
# The Year column contained extra characters (like parentheses), requiring regex cleaning to extract valid 4-digit years.
# Missing values were handled using a mix of mode (for categorical columns), median (for Votes), and mean (for group-derived columns).
# Categorical columns such as Genre, Director, and Actors were encoded using One-Hot Encoding.
# All features were ensured to be numeric before feeding into the model.
# Director Success Rate: Calculated as the average rating of all movies by the same director, helping capture their historical performance.
# Genre Average Rating: Represented typical performance of a genre, adding domain insight to the model.
# A Random Forest Regressor was chosen due to its robustness,ability to handle non-linear relationships, and reduced risk of overfitting on tabular data.

# Evaluations

In [46]:
# RMSE (Root Mean Square Error): 0.78 — indicates a reasonably low error in predicted ratings.
# R2 Score: 0.67 — the model explains 67% of the variance in the movie ratings.

# Coclusion

In [None]:
# The model is reasonably effective in predicting IMDb movie ratings based on metadata such as year, duration, genre, director, and cast.
# Feature engineering significantly improved performance, particularly with aggregated features like Director Success Rate and Genre Average Rating.