In [1]:
#Add Dependencies
import pandas as pd
import numpy as np
import psycopg2 as pg
from path import Path
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Read Data from the PostgressSql
conn = pg.connect(user = "postgres",
                                  password = "akshaj10",
                                  host = "127.0.0.1",
                                  port = "5432",
                                  database = "movie_data")
sql1 = "Select * from award;"
award_df = pd.read_sql_query(sql1, conn)
conn = None

In [3]:
#Load Data into Pandas Data frame and Display the dataframe.
award_df.shape

(10395, 4)

In [4]:
# Loading 2020 movie data
file_path = Path("Resources/tmdb-movies-2020.csv")
df_movies = pd.read_csv(file_path)
df_movies.head()

Unnamed: 0,id,video,vote_count,vote_average,title,release_date,original_language,original_title,genre_ids/0,backdrop_path,...,overview,poster_path,popularity,id_imdb,genre_ids/1,genre_ids/2,genre_ids/3,genre_ids/4,genre_ids/5,genre_ids/6
0,634233,False,0,0.0,Class Action Park,,en,Class Action Park,99.0,,...,"Discover the legacy of Action Park, a very rea...",/h21s0GnRiI9aL4hIFQWJGPZSwGg.jpg,0.6,tt11015214,,,,,,
1,436786,False,0,0.0,Clapboard Jungle: Surviving the Independent Fi...,2020-03-26,en,Clapboard Jungle: Surviving the Independent Fi...,99.0,,...,A survival guide for the modern independent fi...,/3ZslUuPHnv5EngMHbWyQzb9PX0L.jpg,0.926,tt4284084,,,,,,
2,688335,False,2,7.5,Close Encounters of the Fifth Kind,2020-04-07,en,Close Encounters of the Fifth Kind,99.0,/efiDNmExd9GweIqmQKcwIt09hi7.jpg,...,"Dr. Steven Greer’s previous works, SIRIUS and ...",/R0rmE1wDbxMb2W0MyDE3HKPhOR.jpg,5.217,tt12108272,,,,,,
3,639103,False,1,4.0,Clover,2020-04-03,en,Clover,35.0,/hgvNpunbkYhPLKj46GotQqlTXng.jpg,...,Brothers Jackie and Mickey along with a teen w...,/wTp5SYAuTuHDGQGWePkF2z5sHTD.jpg,5.588,tt7801350,53.0,,,,,
4,674334,False,0,0.0,Climbing Blind,2020-03-20,en,Climbing Blind,99.0,/rjtXLRO0ixmbjoQM27Rj7rFRWe9.jpg,...,Blind climber Jesse Dufton's ascent of the Old...,/rODi66xlhEIaHs2krHlj4A8da5f.jpg,2.586,tt11801494,,,,,,


In [5]:
#Drop movies considered "Adult"
df_movies = df_movies.drop(df_movies[df_movies.adult == True].index)

In [6]:
#Keep the column which are relevant
columns1 = ['vote_count', 'vote_average','title',
       'release_date', 'original_language','id_imdb', 'popularity', 'genre_ids/0']

In [7]:
#drop the unnecessary columns
df_movies = df_movies.loc[:, columns1].copy()
df_movies.shape

(2471, 8)

In [8]:
#Rename the genre column to make it more readable
df_movies.rename(columns={'genre_ids/0': 'genre_ids'}, inplace=True)

In [9]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2471 entries, 0 to 2475
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   vote_count         2471 non-null   int64  
 1   vote_average       2471 non-null   float64
 2   title              2471 non-null   object 
 3   release_date       1909 non-null   object 
 4   original_language  2471 non-null   object 
 5   id_imdb            2471 non-null   object 
 6   popularity         2471 non-null   float64
 7   genre_ids          1940 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 173.7+ KB


In [10]:
# convert the 'Date' column to datetime format 
df_movies['release_date'] = df_movies['release_date'].astype('datetime64[ns]')

In [11]:
# Extract year from date 
df_movies['release_date'] = pd.DatetimeIndex(df_movies['release_date']).year
df_movies.shape

(2471, 8)

In [12]:
#Check the dataframe
df_movies.head()

Unnamed: 0,vote_count,vote_average,title,release_date,original_language,id_imdb,popularity,genre_ids
0,0,0.0,Class Action Park,,en,tt11015214,0.6,99.0
1,0,0.0,Clapboard Jungle: Surviving the Independent Fi...,2020.0,en,tt4284084,0.926,99.0
2,2,7.5,Close Encounters of the Fifth Kind,2020.0,en,tt12108272,5.217,99.0
3,1,4.0,Clover,2020.0,en,tt7801350,5.588,35.0
4,0,0.0,Climbing Blind,2020.0,en,tt11801494,2.586,99.0


In [13]:
# Filter all rows for movies from 2020
df_movies = df_movies[df_movies['release_date'] >= 2020] 
df_movies.shape

(1765, 8)

In [14]:
# Drop the Release year after filtering the data.
df_movies = df_movies.drop('release_date',1)

In [15]:
#Check the dataframe
df_movies.head()

Unnamed: 0,vote_count,vote_average,title,original_language,id_imdb,popularity,genre_ids
1,0,0.0,Clapboard Jungle: Surviving the Independent Fi...,en,tt4284084,0.926,99.0
2,2,7.5,Close Encounters of the Fifth Kind,en,tt12108272,5.217,99.0
3,1,4.0,Clover,en,tt7801350,5.588,35.0
4,0,0.0,Climbing Blind,en,tt11801494,2.586,99.0
6,1,10.0,Classic Again,th,tt10075442,0.6,18.0


In [16]:
# Drop null rows
df_movies = df_movies.dropna(axis = 0,how= 'any')
df_movies.shape

(1537, 7)

In [17]:
# Loading 2020 genre data
file_path_g = Path("Resources/tmdb-movies-2020-genre.csv")
df_genre = pd.read_csv(file_path_g)
df_genre.head()

Unnamed: 0,genres/id,genres/name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime


In [18]:
#Rename the columns to make it more readable
df_genre.rename(columns={'genres/id': 'genre_ids','genres/name': 'genre_name'}, inplace=True)

In [19]:
df_genre.head()

Unnamed: 0,genre_ids,genre_name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime


In [20]:
#Merge the two dataframe
df_movies2020 = pd.merge(df_movies, df_genre, how='left', left_on='genre_ids', right_on='genre_ids' )
df_movies2020.shape

(1537, 8)

In [21]:
#Check the dataframe
df_movies2020.head()

Unnamed: 0,vote_count,vote_average,title,original_language,id_imdb,popularity,genre_ids,genre_name
0,0,0.0,Clapboard Jungle: Surviving the Independent Fi...,en,tt4284084,0.926,99.0,Documentary
1,2,7.5,Close Encounters of the Fifth Kind,en,tt12108272,5.217,99.0,Documentary
2,1,4.0,Clover,en,tt7801350,5.588,35.0,Comedy
3,0,0.0,Climbing Blind,en,tt11801494,2.586,99.0,Documentary
4,1,10.0,Classic Again,th,tt10075442,0.6,18.0,Drama


In [22]:
# Drop the Release year after filtering the data.
df_movies2020 = df_movies2020.drop('genre_ids',1)
df_movies2020.head()

Unnamed: 0,vote_count,vote_average,title,original_language,id_imdb,popularity,genre_name
0,0,0.0,Clapboard Jungle: Surviving the Independent Fi...,en,tt4284084,0.926,Documentary
1,2,7.5,Close Encounters of the Fifth Kind,en,tt12108272,5.217,Documentary
2,1,4.0,Clover,en,tt7801350,5.588,Comedy
3,0,0.0,Climbing Blind,en,tt11801494,2.586,Documentary
4,1,10.0,Classic Again,th,tt10075442,0.6,Drama


In [23]:
#Merge the two dataframe
combined_df = pd.merge(df_movies2020, award_df,how='left', left_on='title', right_on='film' )
combined_df.shape

(1581, 11)

In [24]:
# Find null values
for column in combined_df.columns:
    print(f"Column{column} has {combined_df[column].isnull().sum()} null values")

Columnvote_count has 0 null values
Columnvote_average has 0 null values
Columntitle has 0 null values
Columnoriginal_language has 0 null values
Columnid_imdb has 0 null values
Columnpopularity has 0 null values
Columngenre_name has 0 null values
Columnindex has 1512 null values
Columncategory has 1512 null values
Columnfilm has 1512 null values
Columnwinner has 1512 null values


In [25]:
#Convert winner column into numbers 
combined_df["winner"] = combined_df["winner"].replace({True: 1, False: 0})
combined_df.shape

(1581, 11)

In [26]:
# Filter all rows of  movies from 2020
df_movieswinner = combined_df[combined_df['winner'] == 1] 
df_movieswinner

Unnamed: 0,vote_count,vote_average,title,original_language,id_imdb,popularity,genre_name,index,category,film,winner
34,0,0.0,Love Story,te,tt11384004,4.767,Romance,4835.0,MUSIC (Original Score),Love Story,1.0
263,0,0.0,Rebecca,en,tt2235695,5.528,Drama,971.0,CINEMATOGRAPHY (Black-and-White),Rebecca,1.0
267,0,0.0,Rebecca,en,tt2235695,5.528,Drama,1034.0,OUTSTANDING PRODUCTION,Rebecca,1.0
742,0,0.0,Death on the Nile,en,tt7657566,6.683,Crime,5650.0,COSTUME DESIGN,Death on the Nile,1.0
995,0,0.0,Cocoon,de,tt10107222,1.4,Drama,6366.0,ACTOR IN A SUPPORTING ROLE,Cocoon,1.0
996,0,0.0,Cocoon,de,tt10107222,1.4,Drama,6453.0,VISUAL EFFECTS,Cocoon,1.0
1053,0,0.0,Exodus,fa,tt11771036,1.4,Drama,3666.0,MUSIC (Music Score of a Dramatic or Comedy Pic...,Exodus,1.0
1141,0,0.0,West Side Story,en,tt3581652,11.005,Crime,3719.0,ACTOR IN A SUPPORTING ROLE,West Side Story,1.0
1142,0,0.0,West Side Story,en,tt3581652,11.005,Crime,3733.0,ACTRESS IN A SUPPORTING ROLE,West Side Story,1.0
1143,0,0.0,West Side Story,en,tt3581652,11.005,Crime,3743.0,ART DIRECTION (Color),West Side Story,1.0


In [27]:
#Fill null values in winner column to zero 
combined_df["winner"] = combined_df["winner"].fillna(value=0)
combined_df.shape

(1581, 11)

In [28]:
# Remove the movie title from the dataframe
combined_df = combined_df.drop(columns=['index','category','film'] )
combined_df.head()

Unnamed: 0,vote_count,vote_average,title,original_language,id_imdb,popularity,genre_name,winner
0,0,0.0,Clapboard Jungle: Surviving the Independent Fi...,en,tt4284084,0.926,Documentary,0.0
1,2,7.5,Close Encounters of the Fifth Kind,en,tt12108272,5.217,Documentary,0.0
2,1,4.0,Clover,en,tt7801350,5.588,Comedy,0.0
3,0,0.0,Climbing Blind,en,tt11801494,2.586,Documentary,0.0
4,1,10.0,Classic Again,th,tt10075442,0.6,Drama,0.0


In [29]:
#Reset the index to imdb id 
combined_df = combined_df.set_index('id_imdb')

In [30]:
combined_df.head()

Unnamed: 0_level_0,vote_count,vote_average,title,original_language,popularity,genre_name,winner
id_imdb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt4284084,0,0.0,Clapboard Jungle: Surviving the Independent Fi...,en,0.926,Documentary,0.0
tt12108272,2,7.5,Close Encounters of the Fifth Kind,en,5.217,Documentary,0.0
tt7801350,1,4.0,Clover,en,5.588,Comedy,0.0
tt11801494,0,0.0,Climbing Blind,en,2.586,Documentary,0.0
tt10075442,1,10.0,Classic Again,th,0.6,Drama,0.0


In [31]:
# Create a title Dataframe.
title_df=pd.DataFrame(data=combined_df['title'],index=combined_df.index,copy=True)
title_df.head()

Unnamed: 0_level_0,title
id_imdb,Unnamed: 1_level_1
tt4284084,Clapboard Jungle: Surviving the Independent Fi...
tt12108272,Close Encounters of the Fifth Kind
tt7801350,Clover
tt11801494,Climbing Blind
tt10075442,Classic Again


In [32]:
# Remove the movie title from the dataframe
combined_df = combined_df.drop('title',1)
combined_df.head()

Unnamed: 0_level_0,vote_count,vote_average,original_language,popularity,genre_name,winner
id_imdb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt4284084,0,0.0,en,0.926,Documentary,0.0
tt12108272,2,7.5,en,5.217,Documentary,0.0
tt7801350,1,4.0,en,5.588,Comedy,0.0
tt11801494,0,0.0,en,2.586,Documentary,0.0
tt10075442,1,10.0,th,0.6,Drama,0.0


In [33]:
# Generate our categorical variable list for movie dataset
movie_cat = combined_df.dtypes[combined_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
combined_df[movie_cat].nunique()

original_language    51
genre_name           19
dtype: int64

In [34]:
# Check the winner unique value counts to see if binning is required
lang_counts= combined_df.original_language.value_counts()
lang_counts

en    755
es     93
fr     88
de     57
hi     46
ru     40
ta     39
ja     39
te     35
id     33
it     31
pt     30
zh     24
fa     22
cs     19
fi     19
no     16
tr     16
ml     16
sv     15
nl     13
da     11
ko     11
pl     10
kn     10
tl      8
uk      8
ar      8
pa      8
sr      7
bn      7
hu      6
el      5
cn      4
sk      3
ro      3
ms      3
th      3
mr      3
lt      3
lv      2
et      2
he      2
eu      1
bg      1
gu      1
is      1
ca      1
sq      1
kk      1
hr      1
Name: original_language, dtype: int64

In [35]:
# Determine which values to replace for bucketing
replace_lang = list(lang_counts[lang_counts < 40].index)

# Replace in DataFrame
for lang in replace_lang:
    combined_df.original_language = combined_df.original_language.replace(lang,"Other")


# Check to make sure binning was successful
combined_df.original_language.value_counts()

en       755
Other    502
es        93
fr        88
de        57
hi        46
ru        40
Name: original_language, dtype: int64

In [36]:
# let's examine how many columns we will obtain after one hot encoding these variables
encode_df=pd.get_dummies(combined_df[movie_cat], drop_first=True)
encode_df.shape

(1581, 24)

In [37]:
# Merge one-hot encoded features and drop the originals
oscar_us_df = combined_df.merge(encode_df,left_index=True, right_index=True)
oscar_us_df = oscar_us_df.drop(movie_cat,1)
oscar_us_df.head(2)

Unnamed: 0_level_0,vote_count,vote_average,popularity,winner,original_language_de,original_language_en,original_language_es,original_language_fr,original_language_hi,original_language_ru,...,genre_name_History,genre_name_Horror,genre_name_Music,genre_name_Mystery,genre_name_Romance,genre_name_Science Fiction,genre_name_TV Movie,genre_name_Thriller,genre_name_War,genre_name_Western
id_imdb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0062336,0,0.0,1.27,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tt0120589,0,0.0,1.779,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Define the features set.
X = oscar_us_df.copy()
X = X.drop('winner', axis=1)
X.shape

(1905, 27)

In [39]:
# Define the target set(output label)
y = oscar_us_df['winner']
y.head(2)

id_imdb
tt0062336    0.0
tt0120589    0.0
Name: winner, dtype: float64

In [40]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [41]:
# Creating a StandardScaler instance.
scaler = StandardScaler()

In [42]:
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

In [43]:
# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [44]:
# Create a random forest classifier.
#rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [45]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [46]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
       1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0.

In [47]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,430,4
Actual 1,14,29


In [48]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.9622641509433962

In [49]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,430,4
Actual 1,14,29


Accuracy Score : 0.9622641509433962
Classification Report
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98       434
         1.0       0.88      0.67      0.76        43

    accuracy                           0.96       477
   macro avg       0.92      0.83      0.87       477
weighted avg       0.96      0.96      0.96       477

