# Task 2: Movie Rating Prediction with Python



**Dataset Source:**  
https://www.kaggle.com/datasets/adrianmcmahon/imdb-india-movies



---

In [6]:
# Step 1: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Step 2: Load and clean data
df = pd.read_csv("IMDb Movies India.csv", encoding='ISO-8859-1')

# Drop rows where rating is missing
df = df[df['Rating'].notna()].copy()

# Clean 'Votes' column
df['Votes'] = df['Votes'].astype(str).str.replace(",", "")
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

# Extract first genre only
df['Genre'] = df['Genre'].astype(str).str.split(',').str[0]

# Keep only selected columns
df = df[['Year', 'Votes', 'Director', 'Genre', 'Rating']]
df = df.dropna()

# Encode categorical columns
label_cols = ['Director', 'Genre', 'Year']
for col in label_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# Step 3: Define input and target
X = df.drop(columns='Rating')
y = df['Rating']

# Train-test split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=7)

# Step 4: Train Gradient Boosting Regressor
model = GradientBoostingRegressor()
model.fit(xtrain, ytrain)

# Step 5: Predict and evaluate
ypred = model.predict(xtest)

mae = mean_absolute_error(ytest, ypred)
rmse = np.sqrt(mean_squared_error(ytest, ypred))
r2 = r2_score(ytest, ypred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R2 Score:", r2)

MAE: 0.888100375960727
RMSE: 1.1530576357303295
R2 Score: 0.33228703358060896
