**TASK 2: MOVIE RATING PREDICTION**

Author: Shashank

Domain: Data Science

Batch: June 24

In [1]:
#importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'plotly.io'

In [None]:
# Load the raw dataset from the notebook's working directory; the file is
# decoded with the 'unicode_escape' codec.
df=pd.read_csv('IMDb Movies India.csv',encoding='unicode_escape')
# Preview the first 10 rows.
df.head(10)

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

Data Cleaning

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of every fully duplicated row.
df = df.drop_duplicates()

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()


In [None]:
# (rows, columns) after removing duplicates and rows with missing values.
df.shape

In [None]:
# Sanity check: every column should now report zero missing values.
df.isnull().sum()

Data Pre-Processing

In [None]:
# Convert the Year column to a plain integer year.
# The 4-digit year is pulled out of the text form of each value, so this works
# for the negative numeric form this cell was written for (e.g. -2019) as well
# as for bracketed strings such as "(2019)", and it is safe to re-run once the
# column is already an integer.
df['Year'] = (
    df['Year']
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .astype(int)
)

In [None]:
# Strip the ' min' suffix from Duration and convert it to an integer.
# Casting to str first keeps the cell re-runnable: once the column is already
# an integer the .str accessor would otherwise raise. regex=False makes the
# literal-text replacement explicit.
df['Duration'] = (
    df['Duration']
    .astype(str)
    .str.replace(' min', '', regex=False)
    .astype(int)
)

In [None]:
# Give every (movie, genre) pair its own row so genres can be analysed one by one.

# Step 1: split the comma-separated Genre string into a list of genres.
# A value holding a single genre becomes a one-element list, so re-running this
# cell after the explode below leaves the frame unchanged.
df['Genre'] = df['Genre'].str.split(', ')

# Step 2: one row per genre. explode keeps the original row label, so rows that
# came from the same movie share an index label afterwards.
df = df.explode('Genre')

# Step 3: fill any missing genre with the most frequent one.
# Rows with missing values were dropped earlier in the notebook, so this is a
# safeguard. The result is assigned back to the column: calling
# fillna(inplace=True) on df['Genre'] is chained assignment, which pandas
# deprecates and which does not update df under Copy-on-Write.
genre_mode = df['Genre'].mode()
if not genre_mode.empty:
    df['Genre'] = df['Genre'].fillna(genre_mode[0])

In [None]:
# Remove the thousands separator from Votes and convert it to an integer.
# Casting to str first keeps the cell re-runnable: once the column is already
# an integer the .str accessor would otherwise raise. regex=False makes the
# literal-text replacement explicit.
df['Votes'] = (
    df['Votes']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)

In [None]:
# Display the pre-processed frame (one row per movie/genre pair).
df

In [None]:
# Confirm the dtypes after the Year, Duration and Votes conversions.
df.info()

Data Visualizing:

These plots show how the features in the dataset are distributed and how they relate to one another.

In [None]:
# Histogram of release years, normalised to a probability density (nbins=40).

year_hist=px.histogram(df, x='Year', histnorm='probability density', nbins=40)
year_hist.show()

In [None]:
# Mean rating for every (Year, Genre) combination, returned as a flat table
# with Year, Genre and Rating columns.
rating_by_year_genre = df.groupby(['Year', 'Genre'])['Rating'].mean()
avg_rating_df = rating_by_year_genre.reset_index()
avg_rating_df

In [None]:
# The 10 genres that occur on the most rows (value_counts orders by frequency,
# most frequent first); only the genre labels are kept.
top_genres=df['Genre'].value_counts().head(10).index
top_genres

In [None]:
# Restrict the per-year averages to the 10 most frequent genres.
in_top_genres = avg_rating_df['Genre'].isin(top_genres)
avg_rating_df = avg_rating_df.loc[in_top_genres]
avg_rating_df

In [None]:
# Line plot: average rating (y) against year (x), one coloured line per genre.
figure=px.line(avg_rating_df, x='Year', y='Rating', color='Genre', title='Average Rating by Year for Top 10 Genre')
figure.show()

In [None]:
# Histogram of ratings, normalised to a probability density (nbins=40).
rating_hist=px.histogram(df, x='Rating', histnorm='probability density', nbins=40)
rating_hist.show()

Feature Engineering

In [None]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# The movie title is not used as a model feature, so drop it.
# errors='ignore' keeps the cell re-runnable: a second run, when 'Name' is
# already gone, would otherwise raise KeyError.
df = df.drop(columns='Name', errors='ignore')

In [None]:
# Preview the frame after removing the Name column.
df.head()

In [None]:
# Encode each categorical column as the mean Rating of its category.
#
# The means are learned from the training rows only. Computing them over the
# whole frame would let the ratings of test rows leak into the features and
# inflate the evaluation scores.

# Categorical source column -> name of the numeric feature derived from it.
MEAN_RATING_FEATURES = {
    'Genre': 'Genre_mean_rating',
    'Director': 'Director_mean_rating',
    'Actor 1': 'Actor1_mean_rating',
    'Actor 2': 'Actor2_mean_rating',
    'Actor 3': 'Actor3_mean_rating',
}

# Reproduce the row partition of the train/test split cell further down.
# train_test_split shuffles by position, so the same number of rows with the
# same test_size and random_state yields the same train/test positions.
# Keep these two arguments in sync with that cell.
train_pos, test_pos = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)
train_rows = df.iloc[train_pos]

# Used for categories that never occur in the training rows.
fallback_rating = train_rows['Rating'].mean()

for source_col, feature_col in MEAN_RATING_FEATURES.items():
    category_means = train_rows.groupby(source_col)['Rating'].mean()
    df[feature_col] = df[source_col].map(category_means).fillna(fallback_rating)

In [None]:
# Preview the frame with the five mean-rating feature columns added.
df.head()

In [None]:
# Model inputs: the numeric columns plus the five mean-rating encodings.
feature_columns = [
    'Year',
    'Duration',
    'Votes',
    'Genre_mean_rating',
    'Director_mean_rating',
    'Actor1_mean_rating',
    'Actor2_mean_rating',
    'Actor3_mean_rating',
]
x = df[feature_columns]

# Prediction target.
y = df['Rating']

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): Genre was exploded into one row per genre, so rows that belong
# to the same movie can land on both sides of this split - consider a
# group-aware split if that matters for the evaluation.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

Model Building

In [None]:
# Fit a linear regression model on the training split.
lr=LinearRegression()
lr.fit(x_train,y_train)

In [None]:
# Predicted ratings for the held-out rows.
pred=lr.predict(x_test)

In [None]:
# Display the predicted ratings.
pred

In [None]:
# Display the true ratings of the held-out rows for comparison with `pred`.
y_test

In [None]:
# Report regression metrics for the held-out predictions.
# Each entry pairs the printed label with the scikit-learn metric function.
metric_reports = (
    ("Mean Squared Error: ", mean_squared_error),
    ("Mean Absolute Error: ", mean_absolute_error),
    ("R2 Score: ", r2_score),
)
for label, metric in metric_reports:
    print(label, metric(y_test, pred))

Model Testing

In [None]:
# Reminder of the feature columns and their order, used to build the sample row below.
x.head()

In [None]:
# First few target ratings, for reference.
y.head()

In [None]:
# Hand-written feature values for one hypothetical movie. The keys use the same
# names, in the same order, as the columns selected into `x`.
data = {
    'Year': 2018,
    'Duration': 120,
    'Votes': 1234,
    'Genre_mean_rating': 7.3,
    'Director_mean_rating': 5.3,
    'Actor1_mean_rating': 6.5,
    'Actor2_mean_rating': 5.8,
    'Actor3_mean_rating': 9,
}

# All values are scalars, so an explicit index is needed to get a one-row frame.
test = pd.DataFrame(data, index=[0])
test

In [None]:
# Predict the rating for the single hand-written row in `test` and print it.
test_pred=lr.predict(test)
print("Predicted Rating: ",test_pred[0])