In [73]:
#IMPORTING REQUIRED LIBRARIES
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score,accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the movie dataset
file=pd.read_csv("E:\CODSOFT\datasets\IMDb Movies India.csv") #Specify the path of the dataset
file.head(5)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,-2021.0,90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,-2010.0,105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [3]:
#data cleaning
file.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [4]:
file.dropna(inplace=True)

In [5]:
file.isnull().sum()

Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

In [6]:
file.drop_duplicates(inplace=True)
file['Year'] = file['Year'].abs()
file['Votes'] = file['Votes'].str.replace(',', '').astype(int)
file['Duration'] = file['Duration'].str.extract('(\d+)')
file['Duration'] = pd.to_numeric(file['Duration'], errors='coerce')
file.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),2019.0,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,2019.0,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,1997.0,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,2005.0,142,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,2012.0,82,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [7]:
file['Actors'] = file['Actor 1'] + ',' + file['Actor 2'] + ',' + file['Actor 3']
file["Actor"] = file['Actors'].astype('category').cat.codes
file["Directors"] = file['Director'].astype('category').cat.codes
file["Genres"] = file['Genre'].astype('category').cat.codes
file.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Actors,Actor,Directors,Genres
1,#Gadhvi (He thought he was Gandhi),2019.0,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,"Rasika Dugal,Vivek Ghamande,Arvind Jangid",3875,629,229
3,#Yaaram,2019.0,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,"Prateik,Ishita Raj,Siddhant Kapoor",3343,1335,184
5,...Aur Pyaar Ho Gaya,1997.0,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,"Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor",1119,1530,157
6,...Yahaan,2005.0,142,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,"Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma",2086,2044,289
8,?: A Question Mark,2012.0,82,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,"Yash Dave,Muntazir Ahmad,Kiran Bhatia",5562,135,320


In [8]:
file.shape

(5659, 14)

In [9]:
#Remove outliners 
Q1 = file['Genres'].quantile(0.25)
Q3 = file['Genres'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
file = file[(file['Genres'] >= lower_bound) & (file['Genres'] <= upper_bound)]

In [10]:
Q1 = file['Actor'].quantile(0.25)
Q3 = file['Actor'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
file = file[(file['Actor'] >= lower_bound) & (file['Actor'] <= upper_bound)]

In [11]:
Q1 = file['Directors'].quantile(0.25)
Q3 = file['Directors'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
file = file[(file['Directors'] >= lower_bound) & (file['Directors'] <= upper_bound)]

In [12]:
file.shape

(5659, 14)

In [13]:
#feature and target selection
X = file[['Genres','Votes','Year','Duration', 'Directors', 'Actor']]
Y = file['Rating']

In [21]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5659 entries, 1 to 15508
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Genres     5659 non-null   int16  
 1   Votes      5659 non-null   int32  
 2   Year       5659 non-null   float64
 3   Duration   5659 non-null   int64  
 4   Directors  5659 non-null   int16  
 5   Actor      5659 non-null   int16  
dtypes: float64(1), int16(3), int32(1), int64(1)
memory usage: 187.9 KB


In [22]:
# Splitting the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [35]:
model = LinearRegression()

# Training the model
model.fit(X_train, Y_train)

# Make predictions on the test set
Y_pred = model.predict(X_test)

In [55]:
#Accuracy of model
LRscore = r2_score(Y_test,Y_pred)*1000
print('Accuracy : {:0.2f}%'.format(r2))
LRmsr = mean_squared_error(Y_test, Y_pred, squared=False)
print("Mean Squared Error = {:0.2f}\n".format(LRmsr))


Accuracy : 98.70%
Mean Squared Error = 1.29



In [49]:
#Building other models
RFR = RandomForestRegressor(n_estimators=100, random_state=1)
RFR.fit(X_train, Y_train)
RFR_pred = RFR.predict(X_test)


In [56]:
RFRscore = r2_score(Y_test,RFR_pred)*1000
print('Accuracy : {:0.2f}%'.format(RFRscore))
RFRmsr = mean_squared_error(Y_test, RFR_pred, squared=False)
print("Mean Squared Error = {:0.2f}\n".format(RFRmsr))

Accuracy : 331.95%
Mean Squared Error = 1.11



In [53]:
DTR = DecisionTreeRegressor(random_state=1)
DTR.fit(X_train, Y_train)
DTR_pred = DTR.predict(X_test)

In [57]:
DTRscore = r2_score(Y_test,DTR_pred)*1000
print('Accuracy : {:0.2f}%'.format(DTRscore))
DTRmsr = mean_squared_error(Y_test, DTR_pred, squared=False)
print("Mean Squared Error = {:0.2f}\n".format(DTRmsr))

Accuracy : -283.08%
Mean Squared Error = 1.54



In [72]:
score=pd.DataFrame({
    "Accuracy": [LRscore,RFRscore,DTRscore],
    "Mean Sqaure Error": [LRmsr,RFRmsr,DTRmsr]},
    index=['LinearRegression','RandomForestClassifier','DecisionTreeRegressor']
)
score.sort_values(by='Accuracy',ascending=False)

Unnamed: 0,Accuracy,Mean Sqaure Error
RandomForestClassifier,331.954083,1.111593
LinearRegression,98.704447,1.291148
DecisionTreeRegressor,-283.084518,1.54053
