In [18]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display plots inside notebook
%matplotlib inline


In [19]:
# Read the raw CSV into `df`. The file is decoded as latin1 rather than
# pandas' default UTF-8.
df = pd.read_csv(
    "IMDb Movies India.csv",
    encoding="latin1",
)

# Preview the top rows as the cell's output
df.head()


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [34]:
# Pull the 4-digit year out of values such as "(2019)"; values without four
# consecutive digits (including missing ones) become NaN.
# - raw string: '\d' in a normal string literal is an invalid escape sequence
#   and triggers a warning on current Python versions.
# - expand=False: return a Series instead of a one-column DataFrame.
# - astype(str) keeps the cell re-runnable regardless of the column's dtype.
df['Year'] = df['Year'].astype(str).str.extract(r'(\d{4})', expand=False)


In [20]:
# Only the LAST bare expression of a cell is rendered automatically, so the
# intermediate results are emitted explicitly; otherwise the shape and the
# summary statistics are computed and silently discarded.

# Shape of the data
print(df.shape)

# Info about datatypes and nulls (info() prints by itself)
df.info()

# Summary stats (especially for Rating, Votes)
# `display` is provided by the IPython/Jupyter kernel without an import.
display(df.describe())

# Check nulls (last expression -> shown as the cell output)
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [21]:
# Rows without a Rating cannot be used as training targets.
df = df.dropna(subset=['Rating'])

# Drop the two sparsest columns. errors='ignore' keeps the cell re-runnable
# after the columns are already gone.
df = df.drop(columns=['Actor 3', 'Duration'], errors='ignore')

# Drop every row that still has a missing value in any remaining column.
# The former `fillna('Unknown')` loop ran AFTER this dropna, so it never had
# anything to fill; it is removed because it was dead code. If the intent is
# to keep those rows instead, fill the categorical columns BEFORE this line.
# .copy() makes `df` an independent frame so the assignment below is safe.
df = df.dropna().copy()

# Votes arrive as strings with thousands separators (e.g. "1,234").
# astype(str) lets this line run again once the column is already numeric;
# regex=False treats ',' as a literal. Unparseable values become NaN.
df['Votes'] = pd.to_numeric(
    df['Votes'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)



In [22]:
# Re-inspect dtypes and non-null counts after cleaning
df.info()

# Per-column count of remaining missing values (rendered as the cell output)
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
Index: 7641 entries, 1 to 15508
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      7641 non-null   object 
 1   Year      7641 non-null   object 
 2   Genre     7641 non-null   object 
 3   Rating    7641 non-null   float64
 4   Votes     7641 non-null   int64  
 5   Director  7641 non-null   object 
 6   Actor 1   7641 non-null   object 
 7   Actor 2   7641 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 537.3+ KB


Name        0
Year        0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
dtype: int64

In [35]:
# pandas (pd) and numpy (np) are already imported in the notebook's first
# cell, so they are not re-imported here.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Clean Year: keep the 4-digit year as a float.
# - raw string avoids the invalid '\d' escape warning
# - astype(str) lets the cell be re-run once Year is already numeric
# - expand=False returns a Series rather than a one-column DataFrame
df['Year'] = df['Year'].astype(str).str.extract(r'(\d{4})', expand=False)
df['Year'] = df['Year'].astype(float)

# Select features
features = ['Genre', 'Director', 'Votes', 'Year', 'Actor 1', 'Actor 2']
# .copy() so the column assignment below writes to an independent frame
# instead of a filtered view of the previous `df`.
df = df[features + ['Rating']].dropna().copy()

# Clean Votes: strip thousands separators and convert to int.
# Going through pd.to_numeric also accepts float-formatted text such as
# "8.0", which a direct astype(int) on strings would reject.
df['Votes'] = pd.to_numeric(
    df['Votes'].astype(str).str.replace(',', '', regex=False)
).astype(int)

# Encode categorical features.
# Each distinct string becomes its own indicator column; a multi-genre value
# such as "Drama, Musical" is treated as a single category.
X = pd.get_dummies(df[features], drop_first=True)

# Target
y = df['Rating']

# Train/test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model
model = RandomForestRegressor(n_estimators=150, max_depth=15, random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated and removed in newer scikit-learn
# releases, while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\n✅ RMSE: {rmse:.3f}")
print(f"✅ R² Score: {r2:.3f}\n")

# Show sample predictions
comparison_df = pd.DataFrame({
    'Actual Rating': y_test.values,
    'Predicted Rating': y_pred
})
print("🎬 Sample Predictions (first 10):")
print(comparison_df.head(10))



✅ RMSE: 1.125
✅ R² Score: 0.337

🎬 Sample Predictions (first 10):
   Actual Rating  Predicted Rating
0            8.2          7.827763
1            2.6          5.179469
2            5.3          4.800635
3            3.2          6.483569
4            5.2          4.664931
5            7.5          6.433688
6            4.2          4.982085
7            5.7          6.134204
8            5.7          5.587684
9            6.6          6.288945
