# **Task2**
## Movie Rating Prediction 

## Import Necessary Libraries

In [27]:
import pandas as pd
import numpy as np 
import seaborn as sns 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## *Data Collection*

In [7]:
# Read the IMDb India movie listing (decoded as ISO-8859-1) and preview the first rows.
df_movie = pd.read_csv(
    'IMDb Movies India.csv',
    encoding='ISO-8859-1',
    sep=',',
)
df_movie.head()


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


## *Data Preprocessing*

In [8]:
# Count the missing entries in every column.
df_movie.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [30]:
# List the current column labels.
# NOTE(review): the recorded output below already includes 'genre_encoded' and
# 'director_encoded', which are only created by the label-encoding cell further
# down, so the saved output appears to come from out-of-order execution —
# confirm with Restart Kernel -> Run All.
df_movie.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3', 'genre_encoded', 'director_encoded'],
      dtype='object')

In [16]:
# Keep only the rows that have both a rating (the prediction target) and a genre
# (the feature used below); df_movie is modified in place.
required_columns = ['Rating', 'Genre']
df_movie.dropna(subset=required_columns, inplace=True)


In [12]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns. Each column gets its own encoder:
# refitting one shared encoder on 'Director' would overwrite the fitted genre
# classes, making the genre codes impossible to map back to genre names.
genre_encoder = LabelEncoder()
df_movie['genre_encoded'] = genre_encoder.fit_transform(df_movie['Genre'])

# `label_encoder` ends this cell fitted on 'Director'.
# NOTE(review): only rows missing 'Rating' or 'Genre' have been dropped so far,
# so 'Director' may still contain NaN at this point — confirm how the installed
# scikit-learn version encodes it. The director section later drops those rows
# and recomputes this column.
label_encoder = LabelEncoder()
df_movie['director_encoded'] = label_encoder.fit_transform(df_movie['Director'])


In [14]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the encoded genre and the rating with min-max scaling, overwriting
# both columns in df_movie.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split, and the target 'Rating' itself is replaced by its scaled value, so the
# RMSE reported later is in scaled units rather than rating points — confirm
# this is intended.
scaled_columns = ['genre_encoded', 'Rating']
scaler = MinMaxScaler()
df_movie[scaled_columns] = scaler.fit_transform(df_movie[scaled_columns])


## *Model Training*

In [19]:
# Hold out 20% of the rows (fixed seed) with the encoded genre as the only feature.
genre_features = df_movie[['genre_encoded']]
ratings = df_movie['Rating']
X_train, X_test, y_train, y_test = train_test_split(
    genre_features, ratings, test_size=0.2, random_state=42
)

# Fit a linear regression on the training rows.
model = LinearRegression().fit(X_train, y_train)

# Predict ratings for the held-out rows.
y_pred = model.predict(X_test)



## *Model Evaluation*

In [20]:
# Score the held-out predictions: RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 0.1501071144022377


# Rating prediction Based on Director

In [22]:
# Data Preprocessing
# Drop the rows without a rating or a director; df_movie is modified in place.
df_movie.dropna(subset=['Rating', 'Director'], inplace=True)

# Integer-encode the director names.
label_encoder = LabelEncoder()
df_movie['director_encoded'] = label_encoder.fit_transform(df_movie['Director'])

# Min-max scale only the encoded feature. 'Rating' was already min-max scaled in
# the genre section; refitting a scaler on it here, on the rows that remain after
# the dropna above, could apply a different min/max and put this section's RMSE
# on a different scale from the genre RMSE it is compared with.
scaler = MinMaxScaler()
df_movie[['director_encoded']] = scaler.fit_transform(df_movie[['director_encoded']])

### Model Evaluation & Prediction 

In [28]:
# Hold out 20% of the rows (fixed seed) with the encoded director as the only feature.
director_features = df_movie[['director_encoded']]
ratings = df_movie['Rating']
X_train, X_test, y_train, y_test = train_test_split(
    director_features, ratings, test_size=0.2, random_state=42
)

# Fit a linear regression on the training rows and predict the held-out ratings.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the root mean squared error of the held-out predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")


Root Mean Squared Error: 0.15592618472115258


# Movie Rating Based on actors 

In [31]:

# Join the three actor columns into one comma-separated string per movie.
# Concatenating with a missing value is expected to yield a missing result, so a
# movie lacking any of the three actors gets a missing 'actors' value and is
# removed by the dropna below.
df_movie['actors'] = df_movie['Actor 1'] + ', ' + df_movie['Actor 2'] + ', ' + df_movie['Actor 3']

# Copy of the frame without the individual actor columns.
# NOTE(review): `df` is not referenced by any later cell visible here — confirm
# whether it is still needed.
df = df_movie.drop(['Actor 1', 'Actor 2', 'Actor 3'], axis=1)

# Data Preprocessing
# Drop the rows without a rating or a combined actor string; df_movie is modified in place.
df_movie.dropna(subset=['Rating', 'actors'], inplace=True)

# Integer-encode the combined actor strings.
label_encoder = LabelEncoder()
df_movie['actor_encoded'] = label_encoder.fit_transform(df_movie['actors'])

# Min-max scale only the encoded feature. 'Rating' was already min-max scaled in
# the genre section; refitting a scaler on it here, on the rows that remain after
# the dropna above, could apply a different min/max and put this section's RMSE
# on a different scale from the other sections it is compared with.
scaler = MinMaxScaler()
df_movie[['actor_encoded']] = scaler.fit_transform(df_movie[['actor_encoded']])


## Model Evaluation & Predictions

In [32]:
# Hold out 20% of the rows (fixed seed) with the encoded actor string as the only feature.
actor_features = df_movie[['actor_encoded']]
ratings = df_movie['Rating']
X_train, X_test, y_train, y_test = train_test_split(
    actor_features, ratings, test_size=0.2, random_state=42
)

# Fit a linear regression on the training rows and predict the held-out ratings.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the root mean squared error of the held-out predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 0.15292526995132594


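### *The RMSE of the rating prediction is 0.15292526995132594 based on actors, 0.15592618472115258 based on directors, and 0.1501071144022377 based on genre. These values are measured on the min-max scaled (0–1) rating and are indicative of a movie rating prediction model that is performing well in terms of prediction accuracy; comparing them against a simple baseline (e.g. always predicting the mean rating) would be needed to confirm this.*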