In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import missingno as msno
import seaborn as sns
%matplotlib inline
matplotlib.style.use('ggplot')

import warnings
warnings.filterwarnings('ignore')

## Load the Dataset 

In [4]:
df = pd.read_csv("C:/Users/sreel/OneDrive/Desktop/MSc DataScience/Data sets/IMDb Movies India.csv", encoding='ISO-8859-1')

In [5]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


## Data Preprocessing

In [6]:
#To get basic information about dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [7]:
#Checking for missing values
df.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [8]:
#To ge the statistical summary of numerical features
df.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [9]:
#Unique values in categorical features
categorical_columns=["Genre","Director","Actor 1","Actor 2","Actor 3"]
categorical_col=df[categorical_columns]
for column in categorical_col:
    print(f"Unique values in {column}: {df[column].nunique()}")

Unique values in Genre: 485
Unique values in Director: 5938
Unique values in Actor 1: 4718
Unique values in Actor 2: 4891
Unique values in Actor 3: 4820


## Data Cleaning 

In [12]:
#Filling or dropping the missing values
#Dropping the rows where Rating is missing
df=df.dropna(subset=["Rating"])
#Filling other missing values with "Unknown"
df=df.fillna("Unknown")

In [13]:
df.isnull().sum()

Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

## Convert categorical variables to numerical 

In [17]:
from sklearn.preprocessing import OneHotEncoder
categorical_columns = ["Genre","Director","Actor 1","Actor 2","Actor 3"]

# Initialize OneHotEncoder with handle_unknown='ignore'
encoder = OneHotEncoder(handle_unknown='ignore')


# Fit and transform the categorical columns
encoded_features = encoder.fit_transform(df[categorical_columns])

# Convert the encoded features to a DataFrame
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=encoder.get_feature_names_out())

# Drop original categorical columns from movies DataFrame
df = df.drop(columns=categorical_columns)

# Concatenate movies DataFrame with encoded_df along columns axis
df = pd.concat([df, encoded_df], axis=1)
print(df.head())  

                                 Name    Year Duration  Rating  Votes  \
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min     7.0      8   
3                             #Yaaram  (2019)  110 min     4.4     35   
5                ...Aur Pyaar Ho Gaya  (1997)  147 min     4.7    827   
6                           ...Yahaan  (2005)  142 min     7.4  1,086   
8                  ?: A Question Mark  (2012)   82 min     5.6    326   

   Genre_Action  Genre_Action, Adventure  Genre_Action, Adventure, Biography  \
1           0.0                      0.0                                 0.0   
3           0.0                      0.0                                 0.0   
5           0.0                      0.0                                 0.0   
6           0.0                      0.0                                 0.0   
8           0.0                      0.0                                 0.0   

   Genre_Action, Adventure, Comedy  Genre_Action, Adventure, Crime  ...  \
1    

In [19]:
# Clean and convert 'Year', 'Duration', and 'Votes' columns to numeric
df['Year'] = pd.to_numeric(df['Year'].str.replace(r'[()]', '', regex=True), errors='coerce')
df['Duration'] = pd.to_numeric(df['Duration'].str.replace(r'\D', '', regex=True), errors='coerce')
df['Votes'] = pd.to_numeric(df['Votes'].str.replace(',', '', regex=True), errors='coerce')

## Building Gradient Boosting 

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

In [21]:
X=df.drop(["Rating","Name"],axis=1)
y=df["Rating"]

In [23]:
#Split the data into training and testing data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [24]:
# Drop rows with NaN values from X_train and y_train
X_train.dropna(inplace=True)
y_train = y_train[y_train.index.isin(X_train.index)]  

# Drop rows with NaN values from X_test and y_test 
X_test.dropna(inplace=True)
y_test = y_test[y_test.index.isin(X_test.index)]

In [26]:
#Initialize Gradient Boosting Regressor
model=GradientBoostingRegressor(n_estimators=100,learning_rate=0.1,max_depth=3,random_state = 42)

In [27]:
#Fit the model
model.fit(X_train,y_train)

In [28]:
#Predict on the test data
y_pred=model.predict(X_test)
y_pred

array([5.11560736, 6.65512389, 4.66350104, 5.69004399, 6.48624601,
       6.5188394 , 6.59072434, 7.09008   , 6.72719684, 5.18052645,
       5.78433231, 5.96303899, 6.23377336, 5.84931974, 6.61845091,
       5.6408953 , 5.11560736, 6.03873   , 6.88185655, 5.17682393,
       5.11560736, 5.87208237, 6.7265262 , 5.97017661, 6.61845091,
       5.43011438, 7.34186967, 6.05662223, 7.70832194, 5.11560736,
       5.98828199, 6.16646237, 6.11638232, 5.48741587, 5.11560736,
       5.76202957, 6.32887592, 5.22860062, 5.93911219, 6.48624601,
       6.7265262 , 5.11560736, 5.11560736, 5.78615959, 5.11560736,
       4.82624007, 5.93686281, 6.3287295 , 5.11560736, 6.19434803,
       5.11560736, 5.68947225, 6.11080863, 5.11560736, 5.58059197,
       5.82771659, 5.96864433, 6.85262944, 5.11560736, 6.8286517 ,
       6.18672947, 6.7285649 , 6.18449774, 5.17290886, 6.15268709,
       5.83600434, 6.55986418, 5.31613551, 6.55986418, 5.82771659,
       6.55986418, 5.82771659, 5.17290886, 5.69004399, 5.11560

## Evaluation of the model

In [30]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
import numpy as np

mse=mean_squared_error(y_test,y_pred)
rmse=np.sqrt(mse)
r2=r2_score(y_test,y_pred)
mae=mean_absolute_error(y_test,y_pred)

print("Mean Squared Error:",mse)
print("Root Mean Squared Error:",rmse)
print("R-squared:",r2)
print("Mean Absolute Score:",mae)

Mean Squared Error: 1.371883736710272
Root Mean Squared Error: 1.1712744070926642
R-squared: 0.23295603965191003
Mean Absolute Score: 0.8854814647028669
