In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
movie_rating_data = pd.read_csv("/content/IMDb Movies India.csv", encoding="ISO-8859-1")
movie_rating_data

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


In [3]:
movie_rating_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [4]:
movie_rating_data.shape

(15509, 10)

In [5]:
movie_rating_data.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

***Handling the missing values***

In [6]:
movie_rating_data.dropna(subset=['Name', 'Year', 'Duration', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], inplace=True)


In [7]:
movie_rating_data.isnull().sum()

Name         0
Year         0
Duration     0
Genre       29
Rating       0
Votes        0
Director     0
Actor 1      0
Actor 2      0
Actor 3      0
dtype: int64

In [8]:
movie_rating_data['Year'] = movie_rating_data['Year'].str.replace(r'\(|\)', '', regex=True)

In [9]:
movie_rating_data.isnull().sum()

Name         0
Year         0
Duration     0
Genre       29
Rating       0
Votes        0
Director     0
Actor 1      0
Actor 2      0
Actor 3      0
dtype: int64

***Coverting all values to numeric***

In [10]:
movie_rating_data['Year']=movie_rating_data['Year'].str.replace(r'\(|\)','',regex=True)

In [11]:
movie_rating_data['Name']=movie_rating_data['Name'].str.extract('([A-Za-z\s\'\-]+)')

In [12]:
movie_rating_data['Duration'] = movie_rating_data['Duration'].apply(lambda x: str(x) if not isinstance(x, str) else x)
movie_rating_data['Duration']=pd.to_numeric(movie_rating_data['Duration'].str.replace(r'min','',regex=True), errors='coerce')

In [13]:
movie_rating_data['Votes'] = pd.to_numeric(movie_rating_data['Votes'].astype(str).str.replace(',', ''), errors='coerce')

In [14]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
movie_rating_data['Director']= label_encoder.fit_transform(movie_rating_data['Director'])
movie_rating_data['Actor 1']= label_encoder.fit_transform(movie_rating_data['Actor 1'])
movie_rating_data['Actor 2']= label_encoder.fit_transform(movie_rating_data['Actor 2'])
movie_rating_data['Actor 3']= label_encoder.fit_transform(movie_rating_data['Actor 3'])
movie_rating_data

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,Gadhvi,2019,109,Drama,7.0,8,630,1365,2287,322
3,Yaaram,2019,110,"Comedy, Romance",4.4,35,1340,1210,724,2162
5,Aur Pyaar Ho Gaya,1997,147,"Comedy, Drama, Musical",4.7,827,1535,384,75,2058
6,Yahaan,2005,142,"Drama, Romance, War",7.4,1086,2050,702,1121,2540
8,A Question Mark,2012,82,"Horror, Mystery, Thriller",5.6,326,135,1948,1184,1023
...,...,...,...,...,...,...,...,...,...,...
15493,Zubaan,2015,115,Drama,6.1,408,1228,1875,1813,1628
15494,Zubeidaa,2001,153,"Biography, Drama, History",6.2,1496,2065,774,1631,1194
15503,Zulm Ki Zanjeer,1989,125,"Action, Crime, Drama",5.8,44,1799,412,759,1698
15505,Zulmi,1999,129,"Action, Drama",4.5,655,1028,113,2177,317


In [15]:
genre_mean_rating = movie_rating_data.groupby('Genre')['Rating'].transform('mean')
movie_rating_data['Genre_mean_rating'] = genre_mean_rating

In [16]:
movie_rating_data['Name'].fillna(movie_rating_data['Name'].mode()[0], inplace=True)

In [17]:
movie_rating_data.isnull().sum()

Name                  0
Year                  0
Duration              0
Genre                29
Rating                0
Votes                 0
Director              0
Actor 1               0
Actor 2               0
Actor 3               0
Genre_mean_rating    29
dtype: int64

In [18]:
movie_rating_data['Genre'].fillna(movie_rating_data['Genre'].mode()[0], inplace=True)
movie_rating_data['Genre_mean_rating'].fillna(movie_rating_data['Genre_mean_rating'].mode()[0], inplace=True)

***Data Analysis***

In [19]:
movie_rating_data.describe()

Unnamed: 0,Duration,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Genre_mean_rating
count,5688.0,5688.0,5688.0,5688.0,5688.0,5688.0,5688.0,5688.0
mean,133.486639,5.900738,2683.977848,1216.101617,975.69462,1190.152075,1285.098277,5.901169
std,25.312671,1.379355,13617.99177,688.687613,568.54828,671.448259,724.107008,0.593489
min,21.0,1.1,5.0,0.0,0.0,0.0,0.0,2.4
25%,119.0,5.0,30.0,638.75,476.0,620.75,658.75,5.516505
50%,135.0,6.1,128.0,1204.5,983.0,1206.0,1316.5,5.906452
75%,150.0,6.9,910.75,1799.0,1485.0,1780.25,1879.25,6.415521
max,321.0,10.0,591417.0,2437.0,1973.0,2335.0,2571.0,9.4


In [20]:
movie_rating_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5688 entries, 1 to 15508
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               5688 non-null   object 
 1   Year               5688 non-null   object 
 2   Duration           5688 non-null   int64  
 3   Genre              5688 non-null   object 
 4   Rating             5688 non-null   float64
 5   Votes              5688 non-null   int64  
 6   Director           5688 non-null   int64  
 7   Actor 1            5688 non-null   int64  
 8   Actor 2            5688 non-null   int64  
 9   Actor 3            5688 non-null   int64  
 10  Genre_mean_rating  5688 non-null   float64
dtypes: float64(2), int64(6), object(3)
memory usage: 662.3+ KB


***Seperating target and features***

In [21]:
x=movie_rating_data[['Year', 'Duration', 'Genre_mean_rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']]
y=movie_rating_data[['Rating']]

In [22]:
print(x)
print(y)

       Year  Duration  Genre_mean_rating  Votes  Director  Actor 1  Actor 2  \
1      2019       109           6.415521      8       630     1365     2287   
3      2019       110           5.716822     35      1340     1210      724   
5      1997       147           6.242222    827      1535      384       75   
6      2005       142           6.820000   1086      2050      702     1121   
8      2012        82           5.477778    326       135     1948     1184   
...     ...       ...                ...    ...       ...      ...      ...   
15493  2015       115           6.415521    408      1228     1875     1813   
15494  2001       153           6.950000   1496      2065      774     1631   
15503  1989       125           5.743465     44      1799      412      759   
15505  1999       129           5.516505    655      1028      113     2177   
15508  1998       130           5.516505     20       898      476      758   

       Actor 3  
1          322  
3         2162  


***Splitting data into training and testing***

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [24]:
print(x.shape,x_train.shape,x_test.shape)

(5688, 8) (4550, 8) (1138, 8)


In [25]:
print(y.shape,y_train.shape,y_test.shape)

(5688, 1) (4550, 1) (1138, 1)


***Model Training:-RandomForestRegressor***

In [26]:
model = RandomForestRegressor()

In [27]:
model.fit(x_train,y_train)

  model.fit(x_train,y_train)


In [28]:
x_train_prediction=model.predict(x_train)
x_train_prediction

array([5.68 , 7.024, 4.434, ..., 7.87 , 5.297, 6.403])

In [29]:
x_test_prediction=model.predict(x_test)
x_test_prediction

array([5.274, 7.612, 6.366, ..., 5.116, 6.677, 4.224])

In [30]:
mae = mean_absolute_error(y_train, x_train_prediction)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.300160879120879


In [31]:
mse = mean_squared_error(y_train, x_train_prediction)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.16213981890109888


In [32]:
R2 = r2_score(y_train,x_train_prediction)
print("R2:-", R2)

R2:- 0.9148540666865362


***Model Testing***

In [33]:
x.head()

Unnamed: 0,Year,Duration,Genre_mean_rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,2019,109,6.415521,8,630,1365,2287,322
3,2019,110,5.716822,35,1340,1210,724,2162
5,1997,147,6.242222,827,1535,384,75,2058
6,2005,142,6.82,1086,2050,702,1121,2540
8,2012,82,5.477778,326,135,1948,1184,1023


In [34]:
y.head()

Unnamed: 0,Rating
1,7.0
3,4.4
5,4.7
6,7.4
8,5.6


In [35]:
data = {'Year': [2024], 'Duration': [166],'Genre_mean_rating': [98],'Votes': [1500],'Director': [120], 'Actor 1': [1537], 'Actor 2': [1069], 'Actor 3': [1032],}
df = pd.DataFrame(data)

In [36]:
predicted_rating = model.predict(df)
print("Predicted Rating:", predicted_rating[0])

Predicted Rating: 8.161999999999999
