In [1]:
#Add Dependencies
import pandas as pd
import numpy as np
import psycopg2 as pg
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns

### Connection to the Postgres database and Retrive data from the tables

In [2]:
#Read Data from the PostgressSql
conn = pg.connect(user = "postgres",
                                  password = "akshaj10",
                                  host = "127.0.0.1",
                                  port = "5432",
                                  database = "movie_data")
sql1 = "SELECT * FROM oscar_metadata;"
combined_df = pd.read_sql_query(sql1, conn)
conn = None

In [3]:
#Load Data into Pandas Data frame and Display the dataframe.
combined_df.shape

(51147, 17)

In [4]:
# Filter all rows for US as the country
movie_us_df = combined_df[combined_df['country'] == 'United States of America'] 
movie_us_df.shape

(22647, 17)

In [5]:
# Find null values
for column in movie_us_df.columns:
    print(f"Column{column} has {movie_us_df[column].isnull().sum()} null values")

Columnimdb_id has 3 null values
Columnbudget has 0 null values
Columnoriginal_title has 0 null values
Columnpopularity has 0 null values
Columnrevenue has 0 null values
Columnruntime has 5 null values
Columntitle has 0 null values
Columnvote_average has 0 null values
Columnvote_count has 0 null values
Columnproduction has 0 null values
Columncountry has 0 null values
Columnnew_genre has 0 null values
ColumnRelease_year has 2 null values
Columnindex has 15911 null values
Columncategory has 15911 null values
Columnfilm has 15911 null values
Columnwinner has 15911 null values


In [6]:
#Set the index to IMDB_ID
movie_us_df = movie_us_df.set_index("imdb_id")
movie_us_df.head()

Unnamed: 0_level_0,budget,original_title,popularity,revenue,runtime,title,vote_average,vote_count,production,country,new_genre,Release_year,index,category,film,winner
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
tt0114709,30000000,Toy Story,30000000.0,373554033.0,81.0,Toy Story,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995,7548.0,WRITING (Screenplay Written Directly for the S...,Toy Story,False
tt0114709,30000000,Toy Story,30000000.0,373554033.0,81.0,Toy Story,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995,7514.0,MUSIC (Original Song),Toy Story,False
tt0114709,30000000,Toy Story,30000000.0,373554033.0,81.0,Toy Story,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995,7508.0,MUSIC (Original Musical or Comedy Score),Toy Story,False
tt0113497,65000000,Jumanji,65000000.0,262797249.0,104.0,Jumanji,6.9,2413.0,TriStar Pictures,United States of America,Adventure,1995,,,,
tt0113228,0,Grumpier Old Men,0.0,0.0,101.0,Grumpier Old Men,6.5,92.0,Warner Bros.,United States of America,Romance,1995,,,,


In [7]:
columns =['budget','original_title','popularity','revenue','runtime','vote_average','vote_count','production','country','new_genre','Release_year', 'winner']

target = ['winner']

In [8]:
movie_us_df = movie_us_df.loc[:, columns].copy()
movie_us_df.shape

(22647, 12)

In [9]:
#Convert winner column into numbers 
movie_us_df["winner"] = movie_us_df["winner"].replace({True: 1, False: 0})
movie_us_df.shape

(22647, 12)

In [10]:
#Fill null values in winner column to zero 
movie_us_df["winner"] = movie_us_df["winner"].fillna(value=0)
movie_us_df.shape

(22647, 12)

In [11]:
# convert the 'Date' column to datetime format 
movie_us_df['Release_year'] = movie_us_df['Release_year'].astype('datetime64[ns]')

In [12]:
# Extract year from date 
movie_us_df['Release_year'] = pd.DatetimeIndex(movie_us_df['Release_year']).year
movie_us_df.shape

(22647, 12)

In [17]:
# Filter all rows for movies from 1990
movie_us_df = movie_us_df[movie_us_df['Release_year'] >= 1989] 
movie_us_df.shape

(11828, 12)

In [18]:
movie_us_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11828 entries, tt0114709 to tt0303758
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   budget          11828 non-null  int64  
 1   original_title  11828 non-null  object 
 2   popularity      11828 non-null  float64
 3   revenue         11828 non-null  float64
 4   runtime         11824 non-null  float64
 5   vote_average    11828 non-null  float64
 6   vote_count      11828 non-null  float64
 7   production      11828 non-null  object 
 8   country         11828 non-null  object 
 9   new_genre       11828 non-null  object 
 10  Release_year    11828 non-null  float64
 11  winner          11828 non-null  float64
dtypes: float64(7), int64(1), object(4)
memory usage: 1.2+ MB


In [16]:
movie_us_df.head()

Unnamed: 0_level_0,budget,original_title,popularity,revenue,runtime,vote_average,vote_count,production,country,new_genre,Release_year,winner
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
tt0114709,30000000,Toy Story,30000000.0,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995.0,0.0
tt0114709,30000000,Toy Story,30000000.0,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995.0,0.0
tt0114709,30000000,Toy Story,30000000.0,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995.0,0.0
tt0113497,65000000,Jumanji,65000000.0,262797249.0,104.0,6.9,2413.0,TriStar Pictures,United States of America,Adventure,1995.0,0.0
tt0113228,0,Grumpier Old Men,0.0,0.0,101.0,6.5,92.0,Warner Bros.,United States of America,Romance,1995.0,0.0


In [19]:
# Create a title Dataframe.
title_df=pd.DataFrame(data=movie_us_df['original_title'],index=movie_us_df.index,copy=True)
title_df.head()

Unnamed: 0_level_0,original_title
imdb_id,Unnamed: 1_level_1
tt0114709,Toy Story
tt0114709,Toy Story
tt0114709,Toy Story
tt0113497,Jumanji
tt0113228,Grumpier Old Men


In [20]:
# Remove the movie title from the dataframe
movie_us_df = movie_us_df.drop('original_title',1)
movie_us_df.head()

Unnamed: 0_level_0,budget,popularity,revenue,runtime,vote_average,vote_count,production,country,new_genre,Release_year,winner
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
tt0114709,30000000,30000000.0,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995.0,0.0
tt0114709,30000000,30000000.0,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995.0,0.0
tt0114709,30000000,30000000.0,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995.0,0.0
tt0113497,65000000,65000000.0,262797249.0,104.0,6.9,2413.0,TriStar Pictures,United States of America,Adventure,1995.0,0.0
tt0113228,0,0.0,0.0,101.0,6.5,92.0,Warner Bros.,United States of America,Romance,1995.0,0.0


In [21]:
# Drop null rows
movie_us_df = movie_us_df.dropna(axis = 0,how= 'any')
movie_us_df.shape

(11824, 11)

In [22]:
# Generate our categorical variable list for movie dataset
movie_cat = movie_us_df.dtypes[movie_us_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
movie_us_df[movie_cat].nunique()

production    3323
country          1
new_genre       20
dtype: int64

### One Hot encoding production -Using top 10 values

In [23]:
# Check the production unique value counts to see if binning is required
production_counts= movie_us_df.production.value_counts()
production_counts

Other                       1764
Paramount Pictures           409
Universal Pictures           318
Columbia Pictures            283
Walt Disney Pictures         247
                            ... 
Bloody Disgusting              1
Radar Pictures                 1
Madcap Entertainment           1
Incendiary Features            1
Chicken And Egg Pictures       1
Name: production, Length: 3323, dtype: int64

In [24]:
# Determine which values to replace for bucketing
replace_production = list(production_counts[production_counts < 80].index)

# Replace in DataFrame
for production in replace_production:
    movie_us_df.production = movie_us_df.production.replace(production,"Other")


# Check to make sure binning was successful
movie_us_df.production.value_counts()

Other                                     9542
Paramount Pictures                         409
Universal Pictures                         318
Columbia Pictures                          283
Walt Disney Pictures                       247
Twentieth Century Fox Film Corporation     216
New Line Cinema                            212
Miramax Films                              207
Warner Bros.                               109
TriStar Pictures                           108
DreamWorks SKG                              93
Fox Searchlight Pictures                    80
Name: production, dtype: int64

In [25]:
# let's examine how many columns we will obtain after one hot encoding these variables
encode_df=pd.get_dummies(movie_us_df[movie_cat], drop_first=True)
encode_df.shape

(11824, 30)

In [26]:
# Merge one-hot encoded features and drop the originals
oscar_us_df = movie_us_df.merge(encode_df,left_index=True, right_index=True)
oscar_us_df = oscar_us_df.drop(movie_cat,1)
oscar_us_df.head(2)

Unnamed: 0_level_0,budget,popularity,revenue,runtime,vote_average,vote_count,Release_year,winner,production_DreamWorks SKG,production_Fox Searchlight Pictures,...,new_genre_History,new_genre_Horror,new_genre_Music,new_genre_Mystery,new_genre_Romance,new_genre_Science Fiction,new_genre_TV Movie,new_genre_Thriller,new_genre_War,new_genre_Western
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0035423,0,0.0,0.0,118.0,6.0,430.0,2001.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0081145,0,0.0,0.0,94.0,0.0,0.0,1993.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Classification (RandomForestClassification)

In [27]:
# Define the features set.
X = oscar_us_df.copy()
X = X.drop('winner', axis=1)
X.shape

(19904, 37)

In [28]:
# Define the target set(output label)
y = oscar_us_df['winner']
y.head(2)

imdb_id
tt0035423    0.0
tt0081145    0.0
Name: winner, dtype: float64

In [29]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [30]:
# Creating a StandardScaler instance.
scaler = StandardScaler()

In [31]:
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

In [32]:
# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [33]:
# Create a random forest classifier.
#rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [34]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [35]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([0., 0., 0., ..., 0., 0., 0.])

In [36]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4144,155
Actual 1,445,232


In [37]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.8794212218649518

In [38]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4144,155
Actual 1,445,232


Accuracy Score : 0.8794212218649518
Classification Report
              precision    recall  f1-score   support

         0.0       0.90      0.96      0.93      4299
         1.0       0.60      0.34      0.44       677

    accuracy                           0.88      4976
   macro avg       0.75      0.65      0.68      4976
weighted avg       0.86      0.88      0.86      4976



In [39]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([8.55269686e-02, 8.61312440e-02, 1.34067092e-01, 1.57039041e-01,
       1.27950714e-01, 1.74916579e-01, 8.72207172e-02, 2.85902478e-03,
       2.09517038e-03, 1.36946579e-02, 1.57732421e-03, 1.64463010e-02,
       1.53415662e-02, 1.43757311e-03, 6.39049403e-03, 3.44763489e-03,
       3.37849378e-03, 1.94605396e-03, 1.14338362e-02, 9.83911278e-03,
       6.37695794e-03, 4.09775030e-03, 3.29020965e-03, 1.67728436e-02,
       9.61195306e-04, 3.29522495e-03, 1.98971269e-07, 9.59872786e-04,
       2.59907095e-03, 2.13356412e-03, 1.56554943e-03, 3.17840605e-03,
       1.19903636e-03, 4.89963488e-03, 4.81990879e-03, 2.68196983e-04,
       8.42780126e-04])

In [40]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.17491657855115408, 'vote_count'),
 (0.15703904142877229, 'runtime'),
 (0.13406709170080267, 'revenue'),
 (0.12795071410161205, 'vote_average'),
 (0.08722071724572605, 'Release_year'),
 (0.086131243984054, 'popularity'),
 (0.08552696863841383, 'budget'),
 (0.016772843551914645, 'new_genre_Drama'),
 (0.016446300990701775, 'production_Other'),
 (0.015341566206327279, 'production_Paramount Pictures'),
 (0.013694657898790602, 'production_Miramax Films'),
 (0.01143383616644636, 'new_genre_Adventure'),
 (0.009839112783915041, 'new_genre_Animation'),
 (0.006390494028172891, 'production_Twentieth Century Fox Film Corporation'),
 (0.006376957940468445, 'new_genre_Comedy'),
 (0.004899634884874038, 'new_genre_TV Movie'),
 (0.004819908786982366, 'new_genre_Thriller'),
 (0.004097750302731667, 'new_genre_Crime'),
 (0.0034476348910244987, 'production_Universal Pictures'),
 (0.0033784937800449086, 'production_Walt Disney Pictures'),
 (0.003295224954137113, 'new_genre_Fantasy'),
 (0.0032902096467005