In [1]:
#Add Dependencies
import pandas as pd
import numpy as np
import psycopg2 as pg
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Connection to the Postgres database and Retrive data from the tables

In [2]:
#Read Data from the PostgressSql
conn = pg.connect(user = "postgres",
                                  password = "akshaj10",
                                  host = "127.0.0.1",
                                  port = "5432",
                                  database = "movie_data")
sql1 = "Select * From movie_metadata;"
sql2 = "Select * from award;"
movie_df = pd.read_sql_query(sql1, conn)
award_df = pd.read_sql_query(sql2, conn)
conn = None

In [3]:
#Load Data into Pandas Data frame and Display the dataframe.
movie_df.shape

(45006, 13)

In [4]:
#Load Data into Pandas Data frame and Display the dataframe.
award_df.shape

(10395, 8)

In [5]:
#Merge the two dataframe
combined_df = pd.merge(movie_df, award_df, left_on='title', right_on='film' )
combined_df.shape

(10477, 21)

In [6]:
# Filter all rows for US as the country
movie_us_df = combined_df[combined_df['country'] == 'United States of America'] 
movie_us_df.shape

(6740, 21)

In [7]:
# Find null values
for column in movie_us_df.columns:
    print(f"Column{column} has {movie_us_df[column].isnull().sum()} null values")

Columnimdb_id has 0 null values
Columnbudget has 0 null values
Columnoriginal_title has 0 null values
Columnpopularity has 0 null values
Columnrelease_date has 0 null values
Columnrevenue has 0 null values
Columnruntime has 0 null values
Columntitle has 0 null values
Columnvote_average has 0 null values
Columnvote_count has 0 null values
Columnproduction has 143 null values
Columncountry has 0 null values
Columnnew_genre has 0 null values
Columnindex has 0 null values
Columnyear_film has 0 null values
Columnyear_ceremony has 0 null values
Columnceremony has 0 null values
Columncategory has 0 null values
Columnname has 0 null values
Columnfilm has 0 null values
Columnwinner has 0 null values


In [8]:
#Set the index to IMDB_ID
movie_us_df = movie_us_df.set_index("imdb_id")

In [9]:
columns =['budget','original_title','popularity','revenue','runtime','vote_average','vote_count','production','country','new_genre','year_film', 'winner']

target = ['winner']

In [10]:
movie_us_df = movie_us_df.loc[:, columns].copy()
movie_us_df.shape

(6740, 12)

In [11]:
#Convert winner column into numbers 
movie_us_df["winner"] = movie_us_df["winner"].replace({True: 1, False: 0})
movie_us_df.shape

(6740, 12)

In [12]:
# Create a title Dataframe.
title_df=pd.DataFrame(data=movie_us_df['original_title'],index=movie_us_df.index,copy=True)
title_df.head()

Unnamed: 0_level_0,original_title
imdb_id,Unnamed: 1_level_1
tt0114709,Toy Story
tt0114709,Toy Story
tt0114709,Toy Story
tt0047437,Sabrina
tt0047437,Sabrina


In [13]:
# Remove the movie title from the dataframe
movie_us_df = movie_us_df.drop('original_title',1)
movie_us_df.head()

Unnamed: 0_level_0,budget,popularity,revenue,runtime,vote_average,vote_count,production,country,new_genre,year_film,winner
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
tt0114709,30000000,21.946943,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995,0
tt0114709,30000000,21.946943,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995,0
tt0114709,30000000,21.946943,373554033.0,81.0,7.7,5415.0,Pixar Animation Studios,United States of America,Animation,1995,0
tt0047437,2238813,7.359741,10000000.0,113.0,7.4,284.0,Paramount Pictures,United States of America,Comedy,1954,0
tt0047437,2238813,7.359741,10000000.0,113.0,7.4,284.0,Paramount Pictures,United States of America,Comedy,1954,0


In [14]:
movie_us_df['budget']=movie_us_df.budget.astype('int64')

In [15]:
# Generate our categorical variable list for movie dataset
movie_cat = movie_us_df.dtypes[movie_us_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
movie_us_df[movie_cat].nunique()

popularity    2390
production     527
country          1
new_genre       19
dtype: int64

In [16]:
# Drop null rows
movie_us_df = movie_us_df.dropna(axis = 0,how= 'any')
movie_us_df.shape


(6597, 11)

In [17]:
# let's examine how many columns we will obtain after one hot encoding these variables
encode_df=pd.get_dummies(movie_us_df[movie_cat], drop_first=True)
encode_df.shape

(6597, 2849)

In [18]:
# Merge one-hot encoded features and drop the originals
oscar_us_df = movie_us_df.merge(encode_df,left_index=True, right_index=True)
oscar_us_df = oscar_us_df.drop(movie_cat,1)
oscar_us_df.head(2)

Unnamed: 0_level_0,budget,revenue,runtime,vote_average,vote_count,year_film,winner,popularity_0.00118,popularity_0.001191,popularity_0.001224,...,new_genre_History,new_genre_Horror,new_genre_Music,new_genre_Mystery,new_genre_Romance,new_genre_Science Fiction,new_genre_TV Movie,new_genre_Thriller,new_genre_War,new_genre_Western
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0002101,0,0.0,88.0,4.0,1.0,1934,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0002101,0,0.0,88.0,4.0,1.0,1934,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Decision Tree

In [19]:
# Define the features set.
X = oscar_us_df.copy()
X = X.drop('winner', axis=1)
X.shape

(36263, 2855)

In [20]:
# Define the target set(output label)
y = oscar_us_df['winner']
y.head(2)

imdb_id
tt0002101    0
tt0002101    0
Name: winner, dtype: int64

In [21]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [22]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(27197, 2855)
(9066, 2855)
(27197,)
(9066,)


In [23]:
# Creating a StandardScaler instance.
scaler = StandardScaler()

In [24]:
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

In [25]:
# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [26]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [27]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)
predictions

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [28]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6141,560
Actual 1,1433,932


In [29]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.7801676593867196

In [30]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6141,560
Actual 1,1433,932


Accuracy Score : 0.7801676593867196
Classification Report
              precision    recall  f1-score   support

           0       0.81      0.92      0.86      6701
           1       0.62      0.39      0.48      2365

    accuracy                           0.78      9066
   macro avg       0.72      0.66      0.67      9066
weighted avg       0.76      0.78      0.76      9066

