# Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Load and Explore the Data

In [29]:
# Load the dataset
df = pd.read_csv('IMDbMoviesIndia.csv', encoding='ISO-8859-1')
df.head()


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [30]:
df.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [31]:
df.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


# Data Preprocessing

In [34]:
df.drop(['Name', 'Year', 'Duration', 'Votes'],axis=1,inplace =True)
df.head()

Unnamed: 0,Genre,Rating,Director,Actor 1,Actor 2,Actor 3
0,Drama,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,Drama,7.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,"Drama, Musical",,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,"Comedy, Romance",4.4,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,Drama,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [35]:
df.isnull().sum()

Genre       1877
Rating      7590
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [36]:
print('mean  -' +str(df.Rating.mean()))

print('median  -'+str(df.Rating.median()))

print('mode  -'+str(df.Rating.mode()))


mean  -5.841621416845562
median  -6.0
mode  -0    6.2
Name: Rating, dtype: float64


In [37]:
df

Unnamed: 0,Genre,Rating,Director,Actor 1,Actor 2,Actor 3
0,Drama,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,Drama,7.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,"Drama, Musical",,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,"Comedy, Romance",4.4,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,Drama,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...
15504,Action,4.6,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,"Action, Drama",4.5,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Action,,Kiran Thej,Sangeeta Tiwari,,
15507,Action,,,,,


In [39]:
d=df
d

Unnamed: 0,Genre,Rating,Director,Actor 1,Actor 2,Actor 3
0,Drama,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,Drama,7.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,"Drama, Musical",,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,"Comedy, Romance",4.4,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,Drama,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...
15504,Action,4.6,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,"Action, Drama",4.5,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Action,,Kiran Thej,Sangeeta Tiwari,,
15507,Action,,,,,


In [41]:
df.Rating.fillna(df.Rating.median(),inplace=True)
df.isnull().sum()

Genre       1877
Rating         0
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [38]:
df.isnull().sum()

Genre       1877
Rating      7590
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [28]:
df

Unnamed: 0,Genre,Rating,Director,Actor 1,Actor 2,Actor 3
1,Drama,7.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,"Comedy, Romance",4.4,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,"Comedy, Drama, Musical",4.7,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,"Drama, Romance, War",7.4,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,"Horror, Mystery, Thriller",5.6,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia
...,...,...,...,...,...,...
15501,"Action, Crime, Drama",5.3,Bharat Rangachary,Dharmendra,Moushumi Chatterjee,Govinda
15503,"Action, Crime, Drama",5.8,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth
15504,Action,4.6,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,"Action, Drama",4.5,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani


In [None]:
# categorical_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

# for column in categorical_columns:
#     data[column].fillna('Unknown', inplace=True)

In [42]:
dff=df

In [43]:
dff.dropna(inplace=True)
dff


Unnamed: 0,Genre,Rating,Director,Actor 1,Actor 2,Actor 3
0,Drama,6.0,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,Drama,7.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,"Drama, Musical",6.0,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,"Comedy, Romance",4.4,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,Drama,6.0,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...
15502,Action,6.0,Dinesh-Ramanesh,Ramesh Puri,Jalal Agha,Valerie Agha
15503,"Action, Crime, Drama",5.8,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth
15504,Action,4.6,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,"Action, Drama",4.5,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani


In [44]:
categorical_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

for column in categorical_columns:
    dff[column].fillna('Unknown', inplace=True)

In [49]:
dff.isnull().sum()

Genre       0
Rating      0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

In [50]:
dc=dff
dc

Unnamed: 0,Genre,Rating,Director,Actor 1,Actor 2,Actor 3
0,Drama,6.0,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,Drama,7.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,"Drama, Musical",6.0,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,"Comedy, Romance",4.4,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,Drama,6.0,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...
15502,Action,6.0,Dinesh-Ramanesh,Ramesh Puri,Jalal Agha,Valerie Agha
15503,"Action, Crime, Drama",5.8,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth
15504,Action,4.6,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,"Action, Drama",4.5,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani


In [53]:
from sklearn.preprocessing import OneHotEncoder
# Define categorical columns
categorical_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

# Create a one-hot encoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit and transform the categorical columns
X_categorical_encoded = encoder.fit_transform(data[categorical_columns])

# Get the feature names after one-hot encoding
encoded_feature_names = encoder.get_feature_names_out(input_features=categorical_columns)

# Convert the encoded data into a DataFrame with feature names
X_categorical_encoded_df = pd.DataFrame(X_categorical_encoded, columns=encoded_feature_names)

# Append the encoded data to the existing DataFrame 'dff'
dff = pd.concat([dff, X_categorical_encoded_df], axis=1)

# Display the first few rows of the updated DataFrame 'dff'
print(dff.head())



             Genre  Rating            Director       Actor 1  \
0            Drama     6.0       J.S. Randhawa      Manmauji   
1            Drama     7.0       Gaurav Bakshi  Rasika Dugal   
2   Drama, Musical     6.0  Soumyajit Majumdar  Sayani Gupta   
3  Comedy, Romance     4.4          Ovais Khan       Prateik   
4            Drama     6.0        Amol Palekar  Rajat Kapoor   

              Actor 2          Actor 3  Genre_Action  Genre_Action, Adventure  \
0              Birbal  Rajendra Bhatia           0.0                      0.0   
1      Vivek Ghamande    Arvind Jangid           0.0                      0.0   
2   Plabita Borthakur       Roy Angana           0.0                      0.0   
3          Ishita Raj  Siddhant Kapoor           0.0                      0.0   
4  Rituparna Sengupta      Antara Mali           0.0                      0.0   

   Genre_Action, Adventure, Biography  Genre_Action, Adventure, Comedy  ...  \
0                                 0.0            

In [54]:
dff

Unnamed: 0,Genre,Rating,Director,Actor 1,Actor 2,Actor 3,Genre_Action,"Genre_Action, Adventure","Genre_Action, Adventure, Biography","Genre_Action, Adventure, Comedy",...,Actor 3_Zeeshan Khan,Actor 3_Zeishan Quadri,Actor 3_Zenobia Shroff,Actor 3_Zohra,Actor 3_Zoya Hussain,Actor 3_Zubeida,Actor 3_Zuber K. Khan,Actor 3_Zulfi Sayed,Actor 3_Zunaid Memon,Actor 3_nan
0,Drama,6.0,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Drama,7.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Drama, Musical",6.0,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Comedy, Romance",4.4,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Drama,6.0,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15495,,,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
15496,,,,,,,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
15500,,,,,,,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
15506,,,,,,,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [55]:
# Define the list of categorical columns you want to remove
categorical_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

# Remove the original categorical columns
dff = dff.drop(columns=categorical_columns)

# Display the updated DataFrame without the original categorical columns
print(dff.head())


   Rating  Genre_Action  Genre_Action, Adventure  \
0     6.0           0.0                      0.0   
1     7.0           0.0                      0.0   
2     6.0           0.0                      0.0   
3     4.4           0.0                      0.0   
4     6.0           0.0                      0.0   

   Genre_Action, Adventure, Biography  Genre_Action, Adventure, Comedy  \
0                                 0.0                              0.0   
1                                 0.0                              0.0   
2                                 0.0                              0.0   
3                                 0.0                              0.0   
4                                 0.0                              0.0   

   Genre_Action, Adventure, Crime  Genre_Action, Adventure, Drama  \
0                             0.0                             0.0   
1                             0.0                             0.0   
2                             0.0  

In [58]:
dff.Rating.fillna(dff.Rating.median(),inplace=True)
dff.isnull().sum()

Rating                                0
Genre_Action                          0
Genre_Action, Adventure               0
Genre_Action, Adventure, Biography    0
Genre_Action, Adventure, Comedy       0
                                     ..
Actor 3_Zubeida                       0
Actor 3_Zuber K. Khan                 0
Actor 3_Zulfi Sayed                   0
Actor 3_Zunaid Memon                  0
Actor 3_nan                           0
Length: 20858, dtype: int64

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define the features (X) and the target variable (y)
X = dff.drop(columns=['Rating'])  # Features
y = dff['Rating']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")


Mean Squared Error: 1.917611431569451e+24
Root Mean Squared Error: 1384778477435.814
R-squared: -1.9817136720109662e+24


In [60]:
from sklearn.metrics import accuracy_score
accuracy_score(X_test,y_pred)

  return x.astype(dtype, copy=copy, casting=casting)


ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous targets

In [113]:
# Remove special characters and extra spaces from actor and director names
data['Director'] = data['Director'].str.replace('[^\w\s]', '').str.strip()
data['Actor 1'] = data['Actor 1'].str.replace('[^\w\s]', '').str.strip()
data['Actor 2'] = data['Actor 2'].str.replace('[^\w\s]', '').str.strip()
data['Actor 3'] = data['Actor 3'].str.replace('[^\w\s]', '').str.strip()


  data['Director'] = data['Director'].str.replace('[^\w\s]', '').str.strip()
  data['Actor 1'] = data['Actor 1'].str.replace('[^\w\s]', '').str.strip()
  data['Actor 2'] = data['Actor 2'].str.replace('[^\w\s]', '').str.strip()
  data['Actor 3'] = data['Actor 3'].str.replace('[^\w\s]', '').str.strip()


In [114]:
data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),2019,109.0,[Drama],7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,2021,90.0,"[Drama, Musical]",0.0,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,2019,110.0,"[Comedy, Romance]",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,2010,105.0,[Drama],0.0,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
5,...Aur Pyaar Ho Gaya,1997,147.0,"[Comedy, Drama, Musical]",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor


In [115]:
# Split the 'Genre' column into binary columns for each genre
genre_dummies = data['Genre'].str.get_dummies(', ')

# Concatenate the binary genre columns with the original data
data = pd.concat([data, genre_dummies], axis=1)

# Drop the original 'Genre' column
data.drop(['Genre'], axis=1, inplace=True)
 

In [116]:
# Define the numerical features to be standardized
numerical_features = ['Rating', 'Votes']

# Initialize the StandardScaler
scaler = StandardScaler()

# Standardize the selected numerical features
data[numerical_features] = scaler.fit_transform(data[numerical_features])
 

In [117]:
# Fill missing values in numerical columns with their means
data['Rating'].fillna(data['Rating'].mean(), inplace=True)
data['Votes'].fillna(data['Votes'].mean(), inplace=True)
data['Year'].fillna(data['Year'].mode()[0], inplace=True)

# For categorical columns, fill missing values with 'Unknown'
categorical_columns = ['Director', 'Actor 1', 'Actor 2', 'Actor 3']
data[categorical_columns] = data[categorical_columns].fillna('Unknown')
 

In [121]:
# Remove extra characters from column names
data.columns = data.columns.str.replace(r"[\[\]\'\"]", '', regex=True)



In [119]:
data.columns

Index(['Name', 'Year', 'Duration', 'Rating', 'Votes', 'Director', 'Actor 1',
       'Actor 2', 'Actor 3', ''Action'', ''Action']', ''Adventure'',
       ''Adventure']', ''Biography'', ''Biography']', ''Comedy'', ''Comedy']',
       ''Crime'', ''Crime']', ''Drama'', ''Drama']', ''Family'', ''Family']',
       ''Fantasy'', ''Fantasy']', ''History'', ''History']', ''Horror'',
       ''Horror']', ''Music'', ''Music']', ''Musical'', ''Musical']',
       ''Mystery'', ''Mystery']', ''News']', ''Reality-TV']', ''Romance'',
       ''Romance']', ''Sci-Fi'', ''Sci-Fi']', ''Short']', ''Sport'',
       ''Sport']', ''Thriller'', ''Thriller']', ''War']', ''Western']', '['']',
       '['Action'', '['Action']', '['Adventure'', '['Adventure']',
       '['Animation'', '['Animation']', '['Biography'', '['Biography']',
       '['Comedy'', '['Comedy']', '['Crime'', '['Crime']', '['Documentary'',
       '['Documentary']', '['Drama'', '['Drama']', '['Family'', '['Family']',
       '['Fantasy'', '['Fantasy']',

In [122]:
data.head()

Unnamed: 0,Name,Year,Duration,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Action,...,Mystery,Reality-TV,Romance,Romance.1,Sci-Fi,Sci-Fi.1,Sport,Thriller,Thriller.1,War
1,#Gadhvi (He thought he was Gandhi),2019,109.0,1.303153,-0.1663892,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,0,...,0,0,0,0,0,0,0,0,0,0
2,#Homecoming,2021,90.0,-0.967693,2.69145e-18,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana,0,...,0,0,0,0,0,0,0,0,0,0
3,#Yaaram,2019,110.0,0.459696,-0.1640618,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,0,...,0,0,0,0,0,0,0,0,0,0
4,...And Once Again,2010,105.0,-0.967693,2.69145e-18,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali,0,...,0,0,0,0,0,0,0,0,0,0
5,...Aur Pyaar Ho Gaya,1997,147.0,0.557018,-0.09579166,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,0,...,0,0,0,0,0,0,0,0,0,0
