In [19]:
import numpy as np 
import pandas as pd

import warnings 
warnings.filterwarnings('ignore')

In [20]:
#ISO-8859-1 encoding
data = pd.read_csv("IMDb Movies India.csv", encoding='ISO-8859-1')

In [21]:
data

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


In [22]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None


In [23]:
#feature engineering
data[['Actor 1', 'Actor 2', 'Actor 3']] = data[['Actor 1', 'Actor 2', 'Actor 3']].fillna('')
data['Actors'] = data[['Actor 1', 'Actor 2', 'Actor 3']].apply(lambda row: ', '.join(row), axis=1)

In [24]:
# Drop the original individual actor columns 
data = data.drop(columns=['Actor 1', 'Actor 2', 'Actor 3'])

In [25]:
data

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actors
0,,,,Drama,,,J.S. Randhawa,"Manmauji, Birbal, Rajendra Bhatia"
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,"Rasika Dugal, Vivek Ghamande, Arvind Jangid"
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,"Sayani Gupta, Plabita Borthakur, Roy Angana"
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,"Prateik, Ishita Raj, Siddhant Kapoor"
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,"Rajat Kapoor, Rituparna Sengupta, Antara Mali"
...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,"Naseeruddin Shah, Sumeet Saigal, Suparna Anand"
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,"Akshay Kumar, Twinkle Khanna, Aruna Irani"
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,"Sangeeta Tiwari, ,"
15507,Zulmi Shikari,(1988),,Action,,,,", ,"


In [26]:
print(data.columns)  # Inspect the column names

# Update the selected columns based on the actual column names
selected_columns = ['Genre','Director','Actors','Rating']  # Use the correct column names
data = data[selected_columns]

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actors'],
      dtype='object')


In [27]:
data.dropna(subset=['Rating'], inplace=True) #Handling missing values

In [28]:
# Apply a simple imputer to fill any missing categorical values
for column in ['Genre', 'Director', 'Actors']:
    data[column].fillna("Unknown", inplace=True)

In [29]:
# Encoding categorical features
# Label encoding and one-hot encoding for categorical variables

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

encoder = ColumnTransformer(
    transformers=[
        ('Genre', OneHotEncoder(handle_unknown='ignore'), ['Genre']),
        ('Director', OneHotEncoder(handle_unknown='ignore'), ['Director']),
        ('Actors', OneHotEncoder(handle_unknown='ignore'), ['Actors'])
    ], remainder='passthrough'
)


In [30]:
X = data.drop('Rating', axis=1)
y = data['Rating']

In [31]:
X

Unnamed: 0,Genre,Director,Actors
1,Drama,Gaurav Bakshi,"Rasika Dugal, Vivek Ghamande, Arvind Jangid"
3,"Comedy, Romance",Ovais Khan,"Prateik, Ishita Raj, Siddhant Kapoor"
5,"Comedy, Drama, Musical",Rahul Rawail,"Bobby Deol, Aishwarya Rai Bachchan, Shammi Kapoor"
6,"Drama, Romance, War",Shoojit Sircar,"Jimmy Sheirgill, Minissha Lamba, Yashpal Sharma"
8,"Horror, Mystery, Thriller",Allyson Patel,"Yash Dave, Muntazir Ahmad, Kiran Bhatia"
...,...,...,...
15501,"Action, Crime, Drama",Bharat Rangachary,"Dharmendra, Moushumi Chatterjee, Govinda"
15503,"Action, Crime, Drama",S.P. Muthuraman,"Chiranjeevi, Jayamalini, Rajinikanth"
15504,Action,Mahendra Shah,"Naseeruddin Shah, Sumeet Saigal, Suparna Anand"
15505,"Action, Drama",Kuku Kohli,"Akshay Kumar, Twinkle Khanna, Aruna Irani"


In [32]:
y

1        7.0
3        4.4
5        4.7
6        7.4
8        5.6
        ... 
15501    5.3
15503    5.8
15504    4.6
15505    4.5
15508    6.2
Name: Rating, Length: 7919, dtype: float64

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# Build pipeline for model training

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor



pipeline = Pipeline([
    ('preprocessor', encoder),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [35]:
pipeline.fit(X_train, y_train)

In [36]:
y_pred = pipeline.predict(X_test)

In [37]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)


In [40]:
print("Model Performance: ")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R^2 Score: {r2}")

Model Performance: 
Root Mean Squared Error (RMSE): 1.2933586062893314
R^2 Score: 0.10024427469683406
