In [1]:
#Add Dependencies
import pandas as pd
import numpy as np
import psycopg2 as pg
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns

### Connection to the Postgres database and Retrive data from the tables

In [2]:
#Read Data from the PostgressSql
conn = pg.connect(user = "postgres",
                                  password = "akshaj10",
                                  host = "127.0.0.1",
                                  port = "5432",
                                  database = "movie_data")
sql1 = "SELECT * FROM oscar_metadata;"
combined_df = pd.read_sql_query(sql1, conn)
conn = None

In [3]:
#Load Data into Pandas Data frame and Display the dataframe.
combined_df.shape

(51147, 17)

In [4]:
# Filter all rows for US as the country
movie_us_df = combined_df[combined_df['country'] == 'United States of America'] 
movie_us_df.shape

(22647, 17)

In [5]:
# Find null values
for column in movie_us_df.columns:
    print(f"Column{column} has {movie_us_df[column].isnull().sum()} null values")

Columnimdb_id has 3 null values
Columnbudget has 0 null values
Columnoriginal_title has 0 null values
Columnpopularity has 0 null values
Columnrevenue has 0 null values
Columnruntime has 5 null values
Columntitle has 0 null values
Columnvote_average has 0 null values
Columnvote_count has 0 null values
Columnproduction has 0 null values
Columncountry has 0 null values
Columnnew_genre has 0 null values
ColumnRelease_year has 2 null values
Columnindex has 15911 null values
Columncategory has 15911 null values
Columnfilm has 15911 null values
Columnwinner has 15911 null values


In [6]:
#Set the index to IMDB_ID
movie_us_df = movie_us_df.set_index("imdb_id")
movie_us_df.head()

Unnamed: 0_level_0,budget,original_title,popularity,revenue,runtime,title,vote_average,vote_count,production,country,new_genre,Release_year,index,category,film,winner
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
tt0114709,30000000,Toy Story,30000000.0,373554033.0,81.0,Toy Story,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995,7548.0,WRITING (Screenplay Written Directly for the S...,Toy Story,False
tt0114709,30000000,Toy Story,30000000.0,373554033.0,81.0,Toy Story,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995,7514.0,MUSIC (Original Song),Toy Story,False
tt0114709,30000000,Toy Story,30000000.0,373554033.0,81.0,Toy Story,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995,7508.0,MUSIC (Original Musical or Comedy Score),Toy Story,False
tt0113497,65000000,Jumanji,65000000.0,262797249.0,104.0,Jumanji,6.9,2413.0,TriStar Pictures,United States of America,Adventure,1995,,,,
tt0113228,0,Grumpier Old Men,0.0,0.0,101.0,Grumpier Old Men,6.5,92.0,Warner Bros.,United States of America,Romance,1995,,,,


In [7]:
columns =['budget','original_title','popularity','revenue','runtime','vote_average','vote_count','production','country','new_genre','Release_year', 'winner']

target = ['winner']

In [8]:
movie_us_df = movie_us_df.loc[:, columns].copy()
movie_us_df.shape

(22647, 12)

In [9]:
#Convert winner column into numbers 
movie_us_df["winner"] = movie_us_df["winner"].replace({True: 1, False: 0})
movie_us_df.shape

(22647, 12)

In [10]:
#Fill null values in winner column to zero 
movie_us_df["winner"] = movie_us_df["winner"].fillna(value=0)
movie_us_df.shape

(22647, 12)

In [11]:
# convert the 'Date' column to datetime format 
movie_us_df['Release_year'] = movie_us_df['Release_year'].astype('datetime64[ns]')

In [12]:
# Extract year from date 
movie_us_df['Release_year'] = pd.DatetimeIndex(movie_us_df['Release_year']).year
movie_us_df.shape

(22647, 12)

In [13]:
# Filter all rows for movies from 1990
movie_us_df = movie_us_df[movie_us_df['Release_year'] >= 1990] 
movie_us_df.shape

(11828, 12)

In [14]:
# Drop the Release year after filtering the data.
movie_us_df = movie_us_df.drop('Release_year',1)

In [15]:
# Create a title Dataframe.
title_df=pd.DataFrame(data=movie_us_df['original_title'],index=movie_us_df.index,copy=True)
title_df.head()

Unnamed: 0_level_0,original_title
imdb_id,Unnamed: 1_level_1
tt0114709,Toy Story
tt0114709,Toy Story
tt0114709,Toy Story
tt0113497,Jumanji
tt0113228,Grumpier Old Men


In [16]:
# Remove the movie title from the dataframe
movie_us_df = movie_us_df.drop('original_title',1)
movie_us_df.head()

Unnamed: 0_level_0,budget,popularity,revenue,runtime,vote_average,vote_count,production,country,new_genre,winner
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt0114709,30000000,30000000.0,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,0.0
tt0114709,30000000,30000000.0,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,0.0
tt0114709,30000000,30000000.0,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,0.0
tt0113497,65000000,65000000.0,262797249.0,104.0,6.9,2413.0,TriStar Pictures,United States of America,Adventure,0.0
tt0113228,0,0.0,0.0,101.0,6.5,92.0,Warner Bros.,United States of America,Romance,0.0


In [17]:
# Drop null rows
movie_us_df = movie_us_df.dropna(axis = 0,how= 'any')
movie_us_df.shape

(11824, 10)

In [18]:
# Generate our categorical variable list for movie dataset
movie_cat = movie_us_df.dtypes[movie_us_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
movie_us_df[movie_cat].nunique()

production    3323
country          1
new_genre       20
dtype: int64

### One Hot encoding production -Using top 10 values

In [19]:
# Check the production unique value counts to see if binning is required
production_counts= movie_us_df.production.value_counts()
production_counts

Other                                  1764
Paramount Pictures                      409
Universal Pictures                      318
Columbia Pictures                       283
Walt Disney Pictures                    247
                                       ... 
Metro-Goldwyn-Mayer                       1
Alla Prima Productions                    1
Madcap Entertainment                      1
Virgin Produced                           1
National Film Board of Canada (NFB)       1
Name: production, Length: 3323, dtype: int64

In [20]:
# Determine which values to replace for bucketing
replace_production = list(production_counts[production_counts < 80].index)

# Replace in DataFrame
for production in replace_production:
    movie_us_df.production = movie_us_df.production.replace(production,"Other")


# Check to make sure binning was successful
movie_us_df.production.value_counts()

Other                                     9542
Paramount Pictures                         409
Universal Pictures                         318
Columbia Pictures                          283
Walt Disney Pictures                       247
Twentieth Century Fox Film Corporation     216
New Line Cinema                            212
Miramax Films                              207
Warner Bros.                               109
TriStar Pictures                           108
DreamWorks SKG                              93
Fox Searchlight Pictures                    80
Name: production, dtype: int64

In [21]:
# let's examine how many columns we will obtain after one hot encoding these variables
encode_df=pd.get_dummies(movie_us_df[movie_cat], drop_first=True)
encode_df.shape

(11824, 30)

In [22]:
# Merge one-hot encoded features and drop the originals
oscar_us_df = movie_us_df.merge(encode_df,left_index=True, right_index=True)
oscar_us_df = oscar_us_df.drop(movie_cat,1)
oscar_us_df.head(2)

Unnamed: 0_level_0,budget,popularity,revenue,runtime,vote_average,vote_count,winner,production_DreamWorks SKG,production_Fox Searchlight Pictures,production_Miramax Films,...,new_genre_History,new_genre_Horror,new_genre_Music,new_genre_Mystery,new_genre_Romance,new_genre_Science Fiction,new_genre_TV Movie,new_genre_Thriller,new_genre_War,new_genre_Western
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0035423,0,0.0,0.0,118.0,6.0,430.0,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
tt0081145,0,0.0,0.0,94.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
columns1 =['new_genre_Drama', 'production_Paramount Pictures', 'production_Other', 'budget', 'popularity', 'runtime', 'vote_average', 'revenue', 'vote_count', 'winner']

target = ['winner']

In [24]:
oscar_us_df = oscar_us_df.loc[:, columns1].copy()
oscar_us_df.shape

(19904, 10)

### Classification (RandomForestClassification)

In [25]:
# Define the features set.
X = oscar_us_df.copy()
X = X.drop('winner', axis=1)
X.shape

(19904, 9)

In [26]:
# Define the target set(output label)
y = oscar_us_df['winner']
y.head(2)

imdb_id
tt0035423    0.0
tt0081145    0.0
Name: winner, dtype: float64

In [27]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [28]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(14928, 9)
(4976, 9)
(14928,)
(4976,)


In [29]:
# Creating a StandardScaler instance.
scaler = StandardScaler()

In [30]:
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

In [31]:
# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [32]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [33]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)
predictions

array([0., 0., 0., ..., 0., 0., 0.])

In [34]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4124,175
Actual 1,447,230


In [35]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.875

In [36]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4124,175
Actual 1,447,230


Accuracy Score : 0.875
Classification Report
              precision    recall  f1-score   support

         0.0       0.90      0.96      0.93      4299
         1.0       0.57      0.34      0.43       677

    accuracy                           0.88      4976
   macro avg       0.74      0.65      0.68      4976
weighted avg       0.86      0.88      0.86      4976

