In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [None]:

df = pd.read_csv('IMDbMoviesIndia.csv' , encoding='ISO-8859-1')
df.head()



# Preprocessing

In [8]:
df_model = df.dropna(subset=['Rating'])
features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
X = df_model[features]
y = df_model['Rating']


# Build Model Pipeline

In [9]:
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), features)
])
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


# Train and Evaluate

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R² Score: {r2_score(y_test, y_pred):.2f}")


Mean Squared Error: 8.14
R² Score: -3.38
