## Movie Rating Prediction

In [1]:
##  Importing libaries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score



### Importing dataset

In [2]:
# Load the dataset
df = pd.read_csv("IMDb Movies India.csv", encoding='latin1')

### Handlng missing values

In [3]:

# Handle missing values before dropping columns
df['Genre'] = df['Genre'].fillna('Unknown')
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())
df['Director'] = df['Director'].fillna('Unknown')
df['Actor 1'] = df['Actor 1'].fillna('Unknown')
df['Actor 2'] = df['Actor 2'].fillna('Unknown')
df['Actor 3'] = df['Actor 3'].fillna('Unknown')

# Process 'Votes'
df['Votes'] = df['Votes'].astype(str).str.replace(',', '')
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')
df['Votes'] = df['Votes'].fillna(df['Votes'].median())

# Process 'Year'
df['Year'] = df['Year'].astype(str).str.extract(r'(\d{4})')
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df['Year'] = df['Year'].fillna(df['Year'].median())

# Process 'Duration'
df['Duration'] = df['Duration'].astype(str).replace(r'\D', '', regex=True)
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')
df['Duration'] = df['Duration'].fillna(df['Duration'].median())

# Drop unnecessary columns
df = df.drop(columns=['Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Name'])

# One-hot encode 'Genre' (consider using drop_first=True to reduce dimensionality)
df = pd.get_dummies(df, columns=['Genre'], drop_first=True)



### Scaling numerical Features

In [4]:
# Scale numerical features
numerical_cols = ['Duration', 'Votes', 'Year']
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

### Split data into train and test 

In [5]:
# Define features and target variable
X = df.drop(['Rating'], axis=1)
y = df['Rating']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Train Model

In [6]:
# Train the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

### Model Evaluation 

In [7]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R²: {r2}')

RMSE: 0.8373756889735804
R²: 0.2714654247179271
