In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the raw IMDb India movies table.
# The CSV is not UTF-8, so it is decoded as latin-1.
df = pd.read_csv(
    '/content/IMDb_Movies_India.csv',
    encoding='latin-1',
)

In [None]:
# Data Cleaning
# Keep only rows that have a target value. The explicit .copy() gives an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
df = df.dropna(subset=['Rating']).copy()

# Pull the first run of digits out of 'Year' and store it as float.
# Raw string avoids the invalid '\d' escape; expand=False returns a Series
# rather than a one-column DataFrame.
df['Year'] = df['Year'].str.extract(r'(\d+)', expand=False).astype(float)

# Strip the literal ' min' suffix and the thousands separators. These are
# plain substrings, so regex matching is switched off.
df['Duration'] = df['Duration'].str.replace(' min', '', regex=False).astype(float)
df['Votes'] = df['Votes'].str.replace(',', '', regex=False).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = df['Year'].str.extract('(\d+)').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Duration'] = df['Duration'].str.replace(' min', '', regex=True).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Votes'] = df['Votes'].str.replace(',', '', regex=True).astype(float)

In [None]:
# Replace missing values in the text columns with the placeholder 'Unknown'.
categorical_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
filled_categoricals = df[categorical_cols].fillna('Unknown')
df[categorical_cols] = filled_categoricals

In [None]:
# Feature Engineering: One-Hot Encoding for 'Genre'
# handle_unknown='ignore' encodes categories unseen during fit as all zeros;
# sparse_output=False returns a dense array that can be wrapped in a DataFrame.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_genre = encoder.fit_transform(df[['Genre']])

# Reuse df's index for the encoded frame. df has gaps in its index after the
# rows without a rating were dropped, whereas a bare DataFrame(...) would get
# a fresh 0..n-1 RangeIndex. pd.concat(axis=1) aligns on index labels, so a
# mismatched index would produce extra rows with NaN ratings and NaN features.
encoded_genre_df = pd.DataFrame(
    encoded_genre,
    columns=encoder.get_feature_names_out(['Genre']),
    index=df.index,
)

In [None]:
# Remove the free-text columns, then attach the one-hot genre columns.
# NOTE: concat with axis=1 lines rows up by index label, not by position.
text_columns = ['Name', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
numeric_df = df.drop(columns=text_columns)
df = pd.concat([numeric_df, encoded_genre_df], axis=1)

In [None]:
# Separate the target ('Rating') from the feature matrix.
y = df['Rating']
X = df.drop(columns=['Rating'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report how many target values are missing in each split.
train_nan_count = y_train.isna().sum()
test_nan_count = y_test.isna().sum()
print(f"NaN values in y_train: {train_nan_count}")
print(f"NaN values in y_test: {test_nan_count}")

NaN values in y_train: 3019
NaN values in y_test: 755


In [None]:
# Keep only the rows whose target is present, in both splits.
# The masks are built from the targets and applied to features and targets
# alike, so each X/y pair stays row-aligned.
train_has_target = y_train.notna()
test_has_target = y_test.notna()

X_train, y_train = X_train[train_has_target], y_train[train_has_target]
X_test, y_test = X_test[test_has_target], y_test[test_has_target]

In [None]:
# Impute missing feature values with per-column medians.
# The medians come from the training split only and are reused for the test
# split. Filling a column's NaNs with its own median leaves that median
# unchanged, so computing it once matches computing it again after the fill.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

In [None]:
# Fit an ordinary least-squares regression on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)

# Goodness of fit, and error in rating units (RMSE is the square root of MSE).
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

In [None]:
# Summarise both metrics, rounded to two decimals.
performance_report = f" Model Performance:\nRMSE: {rmse:.2f}\nR² Score: {r2:.2f}"
print(performance_report)

 Model Performance:
RMSE: 1.36
R² Score: 0.04
