In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score

# Load the dataset
df = pd.read_csv("IMDb Movies India.csv", encoding="ISO-8859-1")

# Convert 'Year' and 'Duration' to numeric
df["Year"] = df["Year"].str.extract("(\d+)").astype(float)
df["Duration"] = df["Duration"].str.extract("(\d+)").astype(float)

# Convert 'Votes' to numeric (handling commas and non-numeric values)
df["Votes"] = pd.to_numeric(df["Votes"].str.replace(",", "", regex=True), errors="coerce")

# Drop rows where 'Rating' (target variable) is missing
df = df.dropna(subset=["Rating"])

# Fill missing values
df["Duration"].fillna(df["Duration"].median(), inplace=True)
df["Year"].fillna(df["Year"].median(), inplace=True)
df["Votes"].fillna(0, inplace=True)
df.fillna("Unknown", inplace=True)  # Fill categorical missing values with 'Unknown'

# Label encode categorical columns
categorical_cols = ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"]
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col].astype(str))

# Drop 'Name' column as it's not useful for prediction
df.drop(columns=["Name"], inplace=True)

# Define features and target variable
X = df.drop(columns=["Rating"])  # Features
y = df["Rating"]  # Target

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
ev = explained_variance_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Explained Variance Score: {ev}")


  df["Year"] = df["Year"].str.extract("(\d+)").astype(float)
  df["Duration"] = df["Duration"].str.extract("(\d+)").astype(float)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Duration"].fillna(df["Duration"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Year"].fillna(df["Year"].median(), in

RMSE: 1.103787863267195
R² Score: 0.3446731938987354
Mean Absolute Error: 0.8353049242424243
Explained Variance Score: 0.34719739458981236


