In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MultiLabelBinarizer
from xgboost import XGBRegressor

In [4]:
df = pd.read_csv(r'C:\Users\prakh\Desktop\Immersivify Project\movie-rating-prediction\data\IMDb Movies India.csv', 
                 encoding='cp1252')


In [5]:
df['Year'] = df['Year'].astype(str).str.replace(r'[()]', '', regex=True)
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

In [7]:
# Convert Duration (remove "min", parse numeric)
df['Duration'] = (
    df['Duration'].astype(str)
    .str.replace('min', '', regex=False)
    .str.strip()
)

In [8]:
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')

In [9]:
# Convert Votes (remove commas, parse numeric)
df['Votes'] = (
    df['Votes'].astype(str)
    .str.replace(',', '', regex=False)
    .str.strip()
)
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')


In [10]:
# Drop rows with no Rating
df.dropna(subset=['Rating'], inplace=True)

# Impute numeric columns
df['Duration'].fillna(df['Duration'].median(), inplace=True)
for cat_col in ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']:
    if cat_col in df.columns:
        df[cat_col].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Duration'].fillna(df['Duration'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[cat_col].fillna('Unknown', inplace=True)


In [11]:
# 3. FEATURE ENGINEERING
# Log transform Votes
df['Log_Votes'] = np.log1p(df['Votes'].fillna(0))

# Create Decade
df['Decade'] = (df['Year'] // 10) * 10

# Split Genre into dummy columns
if 'Genre' in df.columns:
    df['Genre_List'] = df['Genre'].apply(lambda x: [g.strip() for g in x.split(',')])
    mlb = MultiLabelBinarizer()
    genre_dummies = mlb.fit_transform(df['Genre_List'])
    genre_df = pd.DataFrame(genre_dummies, columns=mlb.classes_, index=df.index)
    df = pd.concat([df, genre_df], axis=1)
    df.drop(columns=['Genre_List'], inplace=True)


In [12]:
# 4. DEFINE FEATURES AND TARGET
genre_cols = [
    'Action','Adventure','Animation','Biography','Comedy','Crime','Documentary',
    'Drama','Family','Fantasy','History','Horror','Music','Musical','Mystery',
    'News','Romance','Sci-Fi','Sport','Thriller','Unknown','War','Western'
]
features = ['Duration', 'Log_Votes', 'Decade'] + genre_cols
target = 'Rating'

df = df.dropna(subset=features)  # Ensure no NaNs in features
X = df[features].copy()
y = df[target].copy()


In [13]:
# 5. TRAIN/TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
# 6. XGBOOST WITH RANDOMIZEDSEARCHCV
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}


In [15]:
xgb_model = XGBRegressor(random_state=42)

search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=3,
    random_state=42,
    n_jobs=-1
)
search.fit(X_train, y_train)

best_model = search.best_estimator_
y_pred = best_model.predict(X_test)


In [16]:
# 7. EVALUATION
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("BEST PARAMETERS:", search.best_params_)
print(f"RMSE: {rmse:.3f}")
print(f"R^2:  {r2:.3f}")

BEST PARAMETERS: {'subsample': 1.0, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
RMSE: 1.074
R^2:  0.380
