In [1]:
import pandas as pd

# Load dataset (modify path if using Google Drive)
df = pd.read_csv("IMDb Movies India.csv", encoding="latin1")

# Display first few rows
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [2]:
# Fill missing values in numerical columns with median
df["Rating"].fillna(df["Rating"].median(), inplace=True)

# Fill missing values in categorical columns with "Unknown"
categorical_cols = ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"]
for col in categorical_cols:
    df[col].fillna("Unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Rating"].fillna(df["Rating"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("Unknown", inplace=True)


In [3]:
# Convert Year to numeric (remove parentheses)
df["Year"] = df["Year"].str.extract("(\d+)").astype(float)

# Convert Votes to numeric
df["Votes"] = pd.to_numeric(df["Votes"], errors="coerce").fillna(0).astype(int)


In [5]:
#Director Success Rate
director_avg_rating = df.groupby("Director")["Rating"].mean().to_dict()
df["Director_Avg_Rating"] = df["Director"].map(director_avg_rating)

#Average Genre Rating
genre_avg_rating = df.groupby("Genre")["Rating"].mean().to_dict()
df["Genre_Avg_Rating"] = df["Genre"].map(genre_avg_rating)

#Average Rating of Lead Actors
for actor in ["Actor 1", "Actor 2", "Actor 3"]:
    actor_avg_rating = df.groupby(actor)["Rating"].mean().to_dict()
    df[f"{actor}_Avg_Rating"] = df[actor].map(actor_avg_rating)

#Total Votes for Director (Indicates Popularity)
director_total_votes = df.groupby("Director")["Votes"].sum().to_dict()
df["Director_Total_Votes"] = df["Director"].map(director_total_votes)

#Movie Duration in Minutes
df["Duration"] = df["Duration"].str.extract("(\d+)").astype(float)

#Genre-Based Success Rate
genre_success_rate = df.groupby("Genre")["Rating"].apply(lambda x: (x > 7.0).mean()).to_dict()
df["Genre_Success_Rate"] = df["Genre"].map(genre_success_rate)


In [10]:
# Ensure all required columns exist before handling missing values
missing_cols = ["Year_Avg_Rating", "Actor_Pair_Frequency"]
for col in missing_cols:
    if col not in df.columns:
        df[col] = None  # Create the column if missing

# Define the numerical columns again (after ensuring they exist)
numerical_cols = ["Duration", "Director_Avg_Rating", "Genre_Avg_Rating", "Director_Movie_Count",
                  "Director_Total_Votes", "Genre_Success_Rate", "Year_Avg_Rating", "Actor_Pair_Frequency"]

# Fill missing values with median
df[numerical_cols] = df[numerical_cols].apply(lambda x: x.fillna(x.median()))


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  df[numerical_cols] = df[numerical_cols].apply(lambda x: x.fillna(x.median()))
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  df[numerical_cols] = df[numerical_cols].apply(lambda x: x.fillna(x.median()))


In [11]:
from sklearn.preprocessing import StandardScaler

# Define numerical features for scaling
numerical_features = ["Duration", "Director_Avg_Rating", "Genre_Avg_Rating", "Director_Movie_Count",
                      "Director_Total_Votes", "Genre_Success_Rate", "Year_Avg_Rating", "Actor_Pair_Frequency", "Votes"]

# Apply StandardScaler
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [12]:
from sklearn.preprocessing import LabelEncoder

# Define categorical columns
categorical_features = ["Director", "Genre", "Actor 1", "Actor 2", "Actor 3"]

# Encode categorical features
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])


In [13]:
from sklearn.model_selection import train_test_split

# Define target variable (Rating)
y = df["Rating"]

# Define features (all except Rating)
features = numerical_features + categorical_features
X = df[features]

# Train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
from sklearn.ensemble import RandomForestRegressor

# Initialize model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train model
model.fit(X_train, y_train)


In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict movie ratings
y_pred = model.predict(X_test)

# Evaluate model
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("R-squared Score (R²):", r2_score(y_test, y_pred))


Mean Absolute Error (MAE): 0.325487819686224
Mean Squared Error (MSE): 0.3836731830303926
R-squared Score (R²): 0.6054408296707124


In [16]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10]
}

# Grid search for best hyperparameters
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring="neg_mean_absolute_error", n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Train final model with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)


Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}


In [17]:
import joblib

# Save the trained model
joblib.dump(model, "movie_rating_predictor.pkl")

# Load model if needed later
loaded_model = joblib.load("movie_rating_predictor.pkl")
