## IMDB Score prediction for movies

**1. Importing Libraries**

In [1]:
#Import libraries 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

**2. Data Preprocessing**

The dataset under consideration contains information about various movies. It encompasses details such as directors, cast members (actors), critic reviews, audience reactions, and other related attributes. Among the crucial metrics used to gauge a movie's success, the IMDb score holds a prominent position. The IMDb score is a numerical representation of how well-received a movie is among the general audience and critics alike. It reflects the average rating given by viewers and provides insights into a movie's popularity and quality.

In [None]:
# Load the movie metadata CSV into a DataFrame

movie_IMDB_df=pd.read_csv("movie_metadata.csv")

In [None]:
movie_IMDB_df.head(10)

In [None]:
 movie_IMDB_df.dtypes

In [None]:
movie_IMDB_df.shape

In [None]:
# Summary statistics for the numeric columns, transposed so that each
# column of the dataset becomes one row of the summary

movie_IMDB_df.describe().T

In [None]:
# Drop the 'color' column: most movies are in colour, so it carries little
# information. The counts are printed explicitly because a bare expression
# that is not the last statement of a cell produces no output, which would
# leave the justification for the drop invisible.

print(movie_IMDB_df["color"].value_counts())

movie_IMDB_df.drop(columns="color", inplace=True)

In [None]:
# Drop the IMDb link column from the dataset

movie_IMDB_df.drop('movie_imdb_link', axis=1, inplace=True)

In [None]:
# List the columns still present in the dataset

movie_IMDB_df.columns

In [None]:
# Count the missing values in each column

movie_IMDB_df.isna().sum()

In [None]:
# Remove every row that has a missing value in any of the columns listed
# below (columns noted as having few nulls, so little data is lost).

required_columns = [
    'director_name',
    'num_critic_for_reviews',
    'duration',
    'director_facebook_likes',
    'actor_3_facebook_likes',
    'actor_2_name',
    'actor_1_facebook_likes',
    'actor_1_name',
    'actor_3_name',
    'facenumber_in_poster',
    'num_user_for_reviews',
    'language',
    'country',
    'actor_2_facebook_likes',
    'plot_keywords',
]
movie_IMDB_df.dropna(subset=required_columns, inplace=True)

In [None]:
# Row/column count of the dataset at this point in the notebook
movie_IMDB_df.shape

In [None]:
# Fill missing content ratings with "R" (noted as the most frequent rating).
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is chained assignment, which warns in pandas 2.x and no
# longer updates the DataFrame once Copy-on-Write is active.

movie_IMDB_df["content_rating"] = movie_IMDB_df["content_rating"].fillna("R")

In [None]:
# Fill missing aspect ratios with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is chained assignment, which warns in pandas 2.x and no
# longer updates the DataFrame once Copy-on-Write is active.

aspect_ratio_median = movie_IMDB_df["aspect_ratio"].median()
movie_IMDB_df["aspect_ratio"] = movie_IMDB_df["aspect_ratio"].fillna(aspect_ratio_median)

In [None]:
# Fill missing budgets with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is chained assignment, which warns in pandas 2.x and no
# longer updates the DataFrame once Copy-on-Write is active.

budget_median = movie_IMDB_df["budget"].median()
movie_IMDB_df["budget"] = movie_IMDB_df["budget"].fillna(budget_median)


In [None]:
# Fill missing gross values with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is chained assignment, which warns in pandas 2.x and no
# longer updates the DataFrame once Copy-on-Write is active.

gross_median = movie_IMDB_df['gross'].median()
movie_IMDB_df['gross'] = movie_IMDB_df['gross'].fillna(gross_median)

In [None]:
# Recheck the number of null values remaining in each column

movie_IMDB_df.isna().sum()



In [None]:
# Remove duplicated rows from the dataset, then show the resulting shape

movie_IMDB_df.drop_duplicates(inplace=True)
movie_IMDB_df.shape

In [None]:
# Number of movies per language

movie_IMDB_df["language"].value_counts()

In [None]:
# Most movies are in English, so the 'language' column is dropped

movie_IMDB_df.drop('language',axis=1,inplace=True)

In [None]:
# Net profit of each movie: revenue (gross) minus cost (budget).
# The operands were previously reversed (budget - gross), which made every
# profitable movie negative and turned the "top 20 by profit" ranking into a
# ranking of the largest losses.

movie_IMDB_df["Profit"] = movie_IMDB_df["gross"] - movie_IMDB_df["budget"]

movie_IMDB_df.head(5)

In [None]:
# Express the Profit column as a percentage of the gross

profit_share_of_gross = movie_IMDB_df["Profit"].div(movie_IMDB_df["gross"])
movie_IMDB_df['Profit_Percentage'] = profit_share_of_gross.mul(100)
movie_IMDB_df

In [None]:
# Number of movies per country; the result is kept in `value_counts`
# because the next cell takes its first two entries

value_counts=movie_IMDB_df["country"].value_counts()
print(value_counts)

In [None]:
# Keep the first two countries of the value counts and relabel every other
# country as 'other'

vals = value_counts.iloc[:2].index
print(vals)
is_kept_country = movie_IMDB_df['country'].isin(vals)
movie_IMDB_df['country'] = movie_IMDB_df['country'].where(is_kept_country, 'other')


In [None]:
# Country is now reduced to the two kept countries plus 'other'
movie_IMDB_df["country"].value_counts()

In [None]:
movie_IMDB_df.head(10)

**3. Data Visualization and EDA**

In [None]:
# Scatter plot of the derived Profit_Percentage column against the IMDb score
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=movie_IMDB_df, x='imdb_score', y='Profit_Percentage', ax=ax)
ax.set_title('Relationship between IMDb Score and Profit Percentage')
ax.set_xlabel('IMDb Score')
ax.set_ylabel('Profit Percentage')
plt.show()

In [None]:
# Line plot of the number of critic reviews against the IMDb score

(
    ggplot(movie_IMDB_df, aes(x='imdb_score', y='num_critic_for_reviews'))
    + geom_line()
    + labs(title='IMDB_Score vs. Critic Reviews', x='IMDB scores', y='Critic Reviews')
)

In [None]:
# Top 20 movies by the Profit column, plotted as budget against Profit with
# one hue per movie title.
# Note: movie_IMDB_df itself is re-assigned to the sorted frame, so the new
# row order persists in later cells.

plt.figure(figsize=(10, 8))
movie_IMDB_df = movie_IMDB_df.sort_values(by='Profit', ascending=False)
movie_IMDB_df_new = movie_IMDB_df.head(20)
ax = sns.pointplot(data=movie_IMDB_df_new, x='Profit', y='budget', hue='movie_title')
# Rotate the tick labels so the long values do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# Top 20 movies by Profit_Percentage, plotted as budget against
# Profit_Percentage with one hue per movie title.
# Note: movie_IMDB_df is again re-assigned to the sorted frame.

plt.figure(figsize=(10, 8))
movie_IMDB_df = movie_IMDB_df.sort_values(by='Profit_Percentage', ascending=False)
movie_IMDB_df_new = movie_IMDB_df.head(20)
ax = sns.pointplot(data=movie_IMDB_df_new, x='Profit_Percentage', y='budget', hue='movie_title')
# Rotate the tick labels so the long values do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.xlabel('Profit Percentage')
plt.ylabel('Budget')
plt.title('Top 20 Movies Based on Profit Percentage')
plt.tight_layout()
plt.show()


In [None]:
# Commercial success vs critical acclaim for the 20 movies with the highest
# Profit_Percentage.
# Note: movie_IMDB_df is re-assigned to the sorted frame, so the new row
# order persists in later cells.

movie_IMDB_df = movie_IMDB_df.sort_values(by='Profit_Percentage', ascending=False)
movie_IMDB_df_new = movie_IMDB_df.head(20)
(
    ggplot(movie_IMDB_df_new)
    # The axis label and the 600 / 700 reference values are in millions, so
    # gross is scaled to millions here; plotted unscaled, the reference line
    # and annotation end up far outside the range of the data.
    # Assumes gross is stored in plain dollars, as in the raw CSV.
    + aes(x='imdb_score', y='gross / 1e6', color='content_rating')
    + geom_point()
    + geom_hline(yintercept=600)
    # NOTE(review): a vertical line at 10 sits at the top of the IMDb scale,
    # to the right of the annotation at x = 8.5; confirm the intended threshold.
    + geom_vline(xintercept=10)
    + xlab("Imdb score")
    + ylab("Gross money earned in million dollars")
    + ggtitle("Commercial success Vs Critical acclaim")
    + annotate("text", x=8.5, y=700, label="High ratings \n & High gross")
)

In [None]:
# Lead actors (actor_1_name) of the 20 movies with the highest
# Profit_Percentage, as a point plot with one hue per movie title.
# Note: movie_IMDB_df is re-assigned to the sorted frame in each of these
# cells, so the row order changes for the cells that follow.

plt.figure(figsize=(10, 8))
movie_IMDB_df = movie_IMDB_df.sort_values(by='Profit_Percentage', ascending=False)
movie_IMDB_df_new = movie_IMDB_df.head(20)
ax = sns.pointplot(x='actor_1_name', y='Profit_Percentage', data=movie_IMDB_df_new, hue='movie_title')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()


In [None]:
# Same top-20 selection as the previous cell, drawn as a horizontal bar
# chart (actor on the y-axis, Profit_Percentage on the x-axis)

plt.figure(figsize=(12, 8))

movie_IMDB_df = movie_IMDB_df.sort_values(by='Profit_Percentage', ascending=False)
movie_IMDB_df_new = movie_IMDB_df.head(20)
ax = sns.barplot(x='Profit_Percentage', y='actor_1_name', data=movie_IMDB_df_new, hue='movie_title', dodge=False)
ax.set_xlabel('Profit Percentage')
ax.set_ylabel('Actor')
ax.set_title('Top 20 Actors Based on Commercial Success')
ax.invert_yaxis()
plt.tight_layout()
plt.show()


In [None]:
# Lead actors (actor_1_name) of the 20 movies with the highest IMDb score

plt.figure(figsize=(10, 8))
movie_IMDB_df = movie_IMDB_df.sort_values(by='imdb_score', ascending=False)
movie_IMDB_df_new = movie_IMDB_df.head(20)
ax = sns.pointplot(x='actor_1_name', y='imdb_score', data=movie_IMDB_df_new, hue='movie_title')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()



In [None]:
# Country (already reduced to the two kept countries plus 'other') of the
# 20 movies with the highest IMDb score

plt.figure(figsize=(10, 8))
movie_IMDB_df = movie_IMDB_df.sort_values(by='imdb_score', ascending=False)
movie_IMDB_df_new = movie_IMDB_df.head(20)
ax = sns.barplot(x='country', y='imdb_score', data=movie_IMDB_df_new, hue='movie_title')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()


**4.Data Preparation for the models - Dropping the columns with categorical values**

In [None]:
# Drop the name / free-text columns, one per cell, before modelling
movie_IMDB_df.drop('director_name', axis=1, inplace=True)

In [None]:
movie_IMDB_df.drop('actor_1_name',axis=1,inplace=True)

In [None]:
movie_IMDB_df.drop('actor_2_name',axis=1,inplace=True)

In [None]:
movie_IMDB_df.drop('actor_3_name',axis=1,inplace=True)

In [None]:
movie_IMDB_df.drop('movie_title',axis=1,inplace=True)

In [None]:
movie_IMDB_df.drop('plot_keywords',axis=1,inplace=True)

In [None]:
# Inspect the genre values before the column is dropped
movie_IMDB_df['genres'].value_counts()

In [None]:
movie_IMDB_df.drop('genres',axis=1,inplace =True)

In [None]:
# Profit and Profit_Percentage were derived from budget and gross, which
# both stay in the dataset
movie_IMDB_df.drop('Profit',axis=1,inplace=True)

In [None]:
movie_IMDB_df.drop('Profit_Percentage',axis=1,inplace=True)

In [None]:
# Correlation heat map of the numeric columns
import matplotlib.pyplot as plt
import seaborn as sns
# numeric_only=True is required: 'country' and 'content_rating' are still
# string columns at this point, and since pandas 2.0 corr() no longer skips
# non-numeric columns by default but raises on them.
corr = movie_IMDB_df.corr(numeric_only=True)
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13,7))
# Boolean mask that hides the strict upper triangle (the diagonal stays
# visible), so each pair of columns is shown only once
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask, 1)] = True
a = sns.heatmap(corr,mask=mask, annot=True, fmt='.2f')
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)

In [None]:
# Sum the Facebook likes of actor 2 and actor 3 into a single feature
# (the column name keeps its 'facebbok' spelling because the feature list
# further down refers to it by that exact name)
movie_IMDB_df['Other_actor_facebbok_likes']=movie_IMDB_df["actor_2_facebook_likes"] + movie_IMDB_df['actor_3_facebook_likes']



In [None]:
# Drop the actor 2 and actor 3 like columns now that their sum is stored

movie_IMDB_df.drop('actor_2_facebook_likes',axis=1,inplace=True)

In [None]:
movie_IMDB_df.drop('actor_3_facebook_likes',axis=1,inplace=True)

In [None]:
# Drop the cast-wide like total as well
movie_IMDB_df.drop('cast_total_facebook_likes',axis=1,inplace=True)

In [None]:
# Ratio of critic reviews to user reviews
# (num_critic_for_reviews / num_user_for_reviews)
# NOTE(review): a row with num_user_for_reviews == 0 would yield inf or NaN
# here; confirm that no such row exists

movie_IMDB_df['critic_review_ratio']=movie_IMDB_df['num_critic_for_reviews']/movie_IMDB_df['num_user_for_reviews']

In [None]:
# Drop both review-count columns now that their ratio is stored

movie_IMDB_df.drop('num_critic_for_reviews',axis=1,inplace=True)
movie_IMDB_df.drop('num_user_for_reviews',axis=1,inplace=True)

In [None]:
# Correlation heat map of the numeric columns after feature engineering

import matplotlib.pyplot as plt
import seaborn as sns
# numeric_only=True is required: 'country' and 'content_rating' are still
# string columns at this point, and since pandas 2.0 corr() no longer skips
# non-numeric columns by default but raises on them.
corr = movie_IMDB_df.corr(numeric_only=True)
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13,7))
# Boolean mask that hides the strict upper triangle (the diagonal stays
# visible), so each pair of columns is shown only once
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask, 1)] = True
a = sns.heatmap(corr,mask=mask, annot=True, fmt='.2f')
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)

Now we can see that none of the attributes are strongly correlated with each other: all pairwise correlations are below 0.7.

In [None]:
# Bin the IMDb score into four classes labelled 1-4:
# (0, 4] -> 1 bad, (4, 6] -> 2 average, (6, 8] -> 3 good, (8, 10] -> 4 excellent

score_bin_edges = [0, 4, 6, 8, 10]
zero_based_bin = pd.cut(movie_IMDB_df['imdb_score'], bins=score_bin_edges, right=True, labels=False)
movie_IMDB_df["imdb_binned_score"] = zero_based_bin + 1

In [None]:
# Drop imdb_score: imdb_binned_score replaces it as the target
movie_IMDB_df.drop('imdb_score',axis=1,inplace=True)

In [None]:
movie_IMDB_df.head(5)

**5. Handling the categorical data**

In [None]:
# One-hot encode both categorical columns in a single call; drop_first
# removes the first level of each column
categorical_columns = ['country', 'content_rating']
movie_IMDB_df = pd.get_dummies(
    movie_IMDB_df,
    columns=categorical_columns,
    prefix=categorical_columns,
    drop_first=True,
)

In [None]:
# Columns after one-hot encoding, including the generated dummy columns
movie_IMDB_df.columns

**6. Splitting the data into training and test data**

In [None]:
# Split the data into features (X) and target (y), then into a 70/30
# train/test split with a fixed seed.
from sklearn.model_selection import train_test_split

target_column = 'imdb_binned_score'
# Every column except the target is a feature. Deriving the list from the
# frame replaces a hard-coded list of dummy-column names: with that list, a
# category missing from the data produced an all-NaN feature column, and a
# category not in the list was silently left out of the model.
X = movie_IMDB_df.drop(columns=[target_column])
# y stays a one-column DataFrame, as the later cells flatten it with np.ravel
y = movie_IMDB_df[[target_column]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

**7.Feature scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same
# fitted scaler to both the training and the test features
sc_X = StandardScaler().fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

**8. Model Selection**

**SVC**

In [None]:
# Hyperparameter grid for the SVC, split per kernel so that no fit is spent
# on parameters a kernel ignores: 'degree' only affects 'poly', and 'gamma'
# is not used by 'linear'. This covers the same distinct models as the full
# cross product (144 candidates) with 63 candidates.
c_values = [0.1, 1, 10]
gamma_values = ['scale', 'auto', 0.1, 1]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['poly'], 'C': c_values, 'degree': [2, 3, 4], 'gamma': gamma_values},
    {'kernel': ['rbf', 'sigmoid'], 'C': c_values, 'gamma': gamma_values},
]

svc = SVC()
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=2, n_jobs=-1, verbose=2)

# y_train is a one-column DataFrame; the estimator expects a 1-D target
grid_search.fit(X_train, np.ravel(y_train, order='C'))
best_svc = grid_search.best_estimator_

# Evaluate the refitted best model on the held-out test set
svcpred = best_svc.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, svcpred)
print(cnf_matrix)
print("Accuracy for SVC:", metrics.accuracy_score(y_test, svcpred))


**Random Forest**

In [None]:
# Hyperparameter grid for the Random Forest classifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Seed the forest (same seed as the train/test split) so that the selected
# hyperparameters and the reported accuracy are reproducible between runs;
# an unseeded forest gives different results on every execution.
rfc = RandomForestClassifier(random_state=100)
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=2, n_jobs=-1, verbose=2)

# y_train is a one-column DataFrame; the estimator expects a 1-D target
grid_search.fit(X_train, np.ravel(y_train, order='C'))
best_rfc = grid_search.best_estimator_

# Evaluate the refitted best model on the held-out test set
rfcpred = best_rfc.predict(X_test)

cnf_matrix = metrics.confusion_matrix(y_test, rfcpred)
print(cnf_matrix)
print("Accuracy for Random Forest:", metrics.accuracy_score(y_test, rfcpred))


**Gradient Boosting**



In [None]:
# Hyperparameter grid for the Gradient Boosting classifier
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Seed the classifier (same seed as the train/test split) so that the
# selected hyperparameters and the reported accuracy are reproducible
# between runs.
gbcl = GradientBoostingClassifier(random_state=100)
grid_search = GridSearchCV(estimator=gbcl, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# y_train is a one-column DataFrame; the estimator expects a 1-D target
grid_search.fit(X_train, np.ravel(y_train, order='C'))
best_gbcl = grid_search.best_estimator_

# Evaluate the refitted best model on the held-out test set
test_pred = best_gbcl.predict(X_test)

cnf_matrix = metrics.confusion_matrix(y_test, test_pred)
print(cnf_matrix)
print("Accuracy for Gradient Boosting:", metrics.accuracy_score(y_test, test_pred))


**9. Model comparison**

In [None]:
# Print the per-class precision / recall / F1 report of each model
from sklearn.metrics import classification_report

print('SVC Reports\n', classification_report(y_test, svcpred))
print('Random Forests Reports\n', classification_report(y_test, rfcpred))
# Label brought in line with the other two: without the trailing newline
# the report header was printed on the same line as the label
print('Gradient Boosting Reports\n', classification_report(y_test, test_pred))

In [None]:
# Accuracy comparison of the three tuned models
import matplotlib.pyplot as plt

# Compute the accuracies from the actual test-set predictions. They were
# previously typed in as constants, so the chart kept showing stale numbers
# whenever the data, the split or the tuning changed.
svc_accuracy = metrics.accuracy_score(y_test, svcpred)
rfc_accuracy = metrics.accuracy_score(y_test, rfcpred)
gbcl_accuracy = metrics.accuracy_score(y_test, test_pred)

# Create a new plot
plt.figure(figsize=(10, 6))

model_names = ['SVC', 'Random Forests', 'Gradient Boosting']
accuracies = [svc_accuracy, rfc_accuracy, gbcl_accuracy]

# One bar() call per model so that each bar gets its own colour
for model_name, accuracy in zip(model_names, accuracies):
    plt.bar(model_name, accuracy)

plt.title('Accuracy Comparison')
plt.ylabel('Accuracy')
plt.ylim([0, 1])
plt.yticks([i / 10 for i in range(11)])

# Display the plot
plt.show()


In [None]:
# Collect the classification report of every model as a dictionary so the
# summary metrics can be plotted
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

model_names = ['SVC', 'Random Forests', 'Gradient Boosting']
metric_names = ['precision', 'recall', 'f1-score']

# Predictions listed in the same order as model_names
predictions = (svcpred, rfcpred, test_pred)
svc_report, rfc_report, gbcl_report = (
    classification_report(y_test, predicted, output_dict=True)
    for predicted in predictions
)

# Model name -> report dictionary
reports = dict(zip(model_names, (svc_report, rfc_report, gbcl_report)))

In [None]:
# One subplot per metric, each showing the weighted-average value of that
# metric for every model
fig, axes = plt.subplots(len(metric_names), 1, figsize=(10, 15))
tick_positions = [step / 10 for step in range(11)]

for ax, metric_name in zip(axes, metric_names):
    heading = metric_name.capitalize()
    ax.set_title(heading)

    # One bar() call per model so that each bar gets its own colour
    for model_name in model_names:
        ax.bar(model_name, reports[model_name]['weighted avg'][metric_name])

    ax.set_ylabel(heading)
    ax.set_ylim([0, 1])
    ax.set_yticks(tick_positions)

plt.tight_layout()
plt.show()