# Predicting Movie Success
- Numann Malik
- March 18th 2023

Objective: We will build a Linear Regression model that predicts movie revenue, and use its results to make 3 recommendations on how to create a successful movie.

In [1]:
# Import relevant libraries
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

  # matplotlib 3.6 renamed the bundled 'seaborn-talk' style to 'seaborn-v0_8-talk' and later dropped the old name; use whichever this install provides
  plt.style.use(['seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available else 'seaborn-talk'])


## Function Definitions


In [28]:
## Function to obtain genre names as a list 
import ast
import json
def get_genre_name(x):
    """Return the genre names held in a stringified list of genre records.

    Parameters
    ----------
    x : str
        Text form of a list of dicts that each carry a 'name' key, e.g.
        "[{'id': 35, 'name': 'Comedy'}]". Both Python-literal text
        (single quotes, None) and JSON text (double quotes, null) are accepted.
        A missing cell (None or NaN) is treated as "no genres".

    Returns
    -------
    list
        The 'name' value of every record, in their original order.

    Raises
    ------
    json.JSONDecodeError
        If the text can be parsed neither as a Python literal nor as JSON.
    """
    # A missing cell carries no genres (NaN is the only float unequal to itself)
    if x is None or (isinstance(x, float) and x != x):
        return []

    try:
        # Parse the text as a Python literal: this copes with apostrophes
        # inside names and with None values, which a blanket quote swap breaks.
        records = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        # Fall back to the original strategy for JSON-style text (null/true/false)
        records = json.loads(x.replace("'", '"'))

    return [genre['name'] for genre in records]

## Load and Inspect Data

In [2]:
# Read the gzipped TMDB extract into a DataFrame.
# NOTE(review): lineterminator='\n' is presumably there so that other line-break
# characters inside text fields are not treated as row ends — confirm.
data_path = 'Data/final_tmdb_data_combined.csv.gz'
df = pd.read_csv(data_path, lineterminator='\n')

In [3]:
# Print column dtypes and non-null counts, then display the first five rows
df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60130 entries, 0 to 60129
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                60130 non-null  object 
 1   adult                  60130 non-null  float64
 2   backdrop_path          36994 non-null  object 
 3   belongs_to_collection  3738 non-null   object 
 4   budget                 60130 non-null  float64
 5   genres                 60130 non-null  object 
 6   homepage               14776 non-null  object 
 7   id                     60130 non-null  float64
 8   original_language      60130 non-null  object 
 9   original_title         60130 non-null  object 
 10  overview               58761 non-null  object 
 11  popularity             60130 non-null  float64
 12  poster_path            54382 non-null  object 
 13  production_companies   60130 non-null  object 
 14  production_countries   60130 non-null  object 
 15  re

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.4,21.0,
1,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,4.4,7.0,
2,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,0.0,0.0,
3,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.wkw-inthemoodforlove.com/,843.0,cn,花樣年華,...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.1,1868.0,PG
4,tt0118852,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,49511.0,en,Chinese Coffee,...,0.0,99.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a fine line between friendship and bet...,Chinese Coffee,0.0,6.9,46.0,R


In [4]:
## count the missing values in every column
df.isnull().sum()

imdb_id                      0
adult                        0
backdrop_path            23136
belongs_to_collection    56392
budget                       0
genres                       0
homepage                 45354
id                           0
original_language            0
original_title               0
overview                  1369
popularity                   0
poster_path               5748
production_companies         0
production_countries         0
release_date              1052
revenue                      0
runtime                    832
spoken_languages             0
status                       0
tagline                  38051
title                        0
video                        0
vote_average                 0
vote_count                   0
certification            45506
dtype: int64

In [7]:
# How often each value of the 'adult' flag occurs
df.loc[:, 'adult'].value_counts()

0.0    60025
1.0      105
Name: adult, dtype: int64

In [36]:
# How often each 'status' value occurs
df.loc[:, 'status'].value_counts()

KeyError: 'status'

In [16]:
# Occurrences of each 'id' value, with missing values counted as well
df.loc[:, 'id'].value_counts(dropna=False)

62127.0     1
448776.0    1
491831.0    1
283984.0    1
607201.0    1
           ..
282908.0    1
65664.0     1
771035.0    1
67793.0     1
630392.0    1
Name: id, Length: 60130, dtype: int64

In [18]:
# Number of movies per original-language code
df.loc[:, 'original_language'].value_counts()

en    36201
fr     2659
ja     2645
es     2396
zh     1606
      ...  
ti        1
ks        1
gd        1
fy        1
tw        1
Name: original_language, Length: 118, dtype: int64

In [19]:
# Repeated 'overview' texts, most frequent first
df.loc[:, 'overview'].value_counts()

No overview found.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               45
The story follows a boy named Quon and others who suddenly wake up with supernatural powers.                                                                                                                                                                                                                                                                                                                                                                                                        

In [20]:
# How often each 'popularity' score occurs
df.loc[:, 'popularity'].value_counts()

0.600     10120
1.400      1481
0.840       627
1.960       238
1.176        81
          ...  
10.790        1
7.920         1
6.573         1
9.618         1
22.184        1
Name: popularity, Length: 14506, dtype: int64

In [21]:
# Occurrences of each 'poster_path', with missing values counted as well
df.loc[:, 'poster_path'].value_counts(dropna=False)

/sgG8c1wZ8Brwt2TCSpXtGbAZS4x.jpg    2
/dwBbI5JcSrjNMhoRkdbYHCi662d.jpg    2
/eTVQ3wrch24qY7r460OrEHmJ9i1.jpg    1
/t87zsKyNQTmvCWFewysvAYaEue.jpg     1
/2ORi4ZGduwRwACJQT89nM8jhGn5.jpg    1
                                   ..
/7fJCmHvZZxVuy3LCKWbzvZZ1sz8.jpg    1
/f23LnCDFsWAGQ3x6YcvLvN4Xy3r.jpg    1
/1TsSb6ifZasB8lk2AwKe6FRjtkT.jpg    1
/rBmo10OOO7jOwtZ3OdVZwtuwfnC.jpg    1
/rYobaxr1JHhmOSszqQh5Aj4Viiu.jpg    1
Name: poster_path, Length: 54380, dtype: int64

In [22]:
# How often each 'production_companies' entry occurs
df.loc[:, 'production_companies'].value_counts()

[]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   20878
[{'id': 4676, 'logo_path': '/fu7gOKwS2l2fiFlEQFamITFEWDk.png', 'name': 'Star Cinema – ABS-CBN Film Productions', 'origin_country': 'PH'}]                                                                                                                                                                                                                                                                    

In [23]:
# How often each 'production_countries' entry occurs
df.loc[:, 'production_countries'].value_counts()

[{'iso_3166_1': 'US', 'name': 'United States of America'}]                                                                                                                                      16203
[]                                                                                                                                                                                              13222
[{'iso_3166_1': 'JP', 'name': 'Japan'}]                                                                                                                                                          2487
[{'iso_3166_1': 'IN', 'name': 'India'}]                                                                                                                                                          2440
[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]                                                                                                                                                 1733
          

In [35]:
# How often each value of the 'video' column occurs
df.loc[:, 'video'].value_counts()

KeyError: 'video'

In [25]:
# Movies per collection entry, with missing values counted as well
df.loc[:, 'belongs_to_collection'].value_counts(dropna=False)

{'id': 39199, 'name': 'Detective Conan Collection', 'poster_path': '/1wBfr532NOQK68wlo5ApjCmiQIB.jpg', 'backdrop_path': '/9bogrpii4e61SR6a9qLHow7I46U.jpg'}       18
{'id': 148065, 'name': 'Doraemon Collection', 'poster_path': '/4TLSP1KD1uAlp2q1rTrc6SFlktX.jpg', 'backdrop_path': '/rc6OFcSasL5YxBRPUQVwxmVF6h5.jpg'}             16
{'id': 403643, 'name': 'Troublesome Night Collection', 'poster_path': '/bPTx3TP4UJTHQfcLx4qIub9LXmi.jpg', 'backdrop_path': '/n3a7zF5GuxM2X8oPF6pKXqYS6ER.jpg'}    15
{'id': 23456, 'name': 'One Piece Collection', 'poster_path': '/nvAPotUDNcKStSOv2ojGZBNOX8A.jpg', 'backdrop_path': '/3RqSKjokWlXyTBUt3tcR9CrOG57.jpg'}             13
{'id': 534673, 'name': 'Madea - Collection', 'poster_path': '/bO9NvwpSElW8lp33fGyy8VaS5s9.jpg', 'backdrop_path': '/qxLbjNPrMKTgYrMhIIDzj4zgYjf.jpg'}              11
                                                                                                                                                                  ..
{'id': 950

In [33]:
# How often each raw 'genres' entry occurs
df.loc[:, 'genres'].value_counts()

KeyError: 'genres'

In [34]:
# Movies per 'certification' label, with missing values counted as well
df.loc[:, 'certification'].value_counts(dropna=False)

NaN                                45506
R                                   6097
NR                                  3261
PG-13                               3224
PG                                  1432
G                                    442
NC-17                                156
Unrated                                5
-                                      1
UR                                     1
Not Rated                              1
ScreamFest Horror Film Festival        1
R                                      1
PG-13                                  1
10                                     1
Name: certification, dtype: int64

In [None]:
# Movies per 'release_date' value, with missing values counted as well
df.loc[:, 'release_date'].value_counts(dropna=False)

## Feature Engineering

In [26]:
## Remove the columns judged unlikely to be relevant to revenue
drop_cols = [
    'backdrop_path', 'original_title', 'overview',
    'poster_path', 'tagline', 'id', 'homepage',
    'production_countries', 'video', 'spoken_languages',
    'original_language',
]
df = df.drop(drop_cols, axis=1)
df

Unnamed: 0,imdb_id,belongs_to_collection,budget,genres,popularity,production_companies,release_date,revenue,runtime,title,vote_average,vote_count,certification
0,tt0113026,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",2.491,"[{'id': 51207, 'logo_path': None, 'name': 'Sul...",2000-09-22,0.0,86.0,The Fantasticks,5.4,21.0,
1,tt0113092,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",1.974,"[{'id': 7405, 'logo_path': '/rfnws0uY8rsNAsrLb...",2000-11-15,0.0,100.0,For the Cause,4.4,7.0,
2,tt0116391,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",1.341,[],2000-04-14,0.0,152.0,Gang,0.0,0.0,
3,tt0118694,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",22.411,"[{'id': 539, 'logo_path': None, 'name': 'Block...",2000-09-29,12854953.0,99.0,In the Mood for Love,8.1,1868.0,PG
4,tt0118852,,0.0,"[{'id': 18, 'name': 'Drama'}]",4.889,"[{'id': 67930, 'logo_path': None, 'name': 'Cha...",2000-09-02,0.0,99.0,Chinese Coffee,6.9,46.0,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60125,tt9895024,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",0.840,"[{'id': 154995, 'logo_path': None, 'name': 'AS...",2021-09-08,0.0,118.0,Heiko's World,0.0,0.0,
60126,tt9896876,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",3.665,"[{'id': 737, 'logo_path': '/dK1GZ1u4our8sJW0St...",2021-11-19,0.0,101.0,India Sweets and Spices,6.0,2.0,PG-13
60127,tt9898844,,0.0,"[{'id': 27, 'name': 'Horror'}]",421.143,"[{'id': 160303, 'logo_path': None, 'name': 'Mo...",2022-01-21,0.0,91.0,The Hunting,4.9,20.0,
60128,tt9900940,,0.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",3.416,"[{'id': 91999, 'logo_path': None, 'name': 'Luc...",2021-12-07,0.0,87.0,The Scrapper,6.0,2.0,


In [27]:
## Turn 'belongs_to_collection' into a Boolean feature: True wherever a value is present
in_collection = ~df['belongs_to_collection'].isna()
df['belongs_to_collection'] = in_collection
df

Unnamed: 0,imdb_id,belongs_to_collection,budget,genres,popularity,production_companies,release_date,revenue,runtime,title,vote_average,vote_count,certification
0,tt0113026,False,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",2.491,"[{'id': 51207, 'logo_path': None, 'name': 'Sul...",2000-09-22,0.0,86.0,The Fantasticks,5.4,21.0,
1,tt0113092,False,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",1.974,"[{'id': 7405, 'logo_path': '/rfnws0uY8rsNAsrLb...",2000-11-15,0.0,100.0,For the Cause,4.4,7.0,
2,tt0116391,False,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",1.341,[],2000-04-14,0.0,152.0,Gang,0.0,0.0,
3,tt0118694,False,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",22.411,"[{'id': 539, 'logo_path': None, 'name': 'Block...",2000-09-29,12854953.0,99.0,In the Mood for Love,8.1,1868.0,PG
4,tt0118852,False,0.0,"[{'id': 18, 'name': 'Drama'}]",4.889,"[{'id': 67930, 'logo_path': None, 'name': 'Cha...",2000-09-02,0.0,99.0,Chinese Coffee,6.9,46.0,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60125,tt9895024,False,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",0.840,"[{'id': 154995, 'logo_path': None, 'name': 'AS...",2021-09-08,0.0,118.0,Heiko's World,0.0,0.0,
60126,tt9896876,False,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",3.665,"[{'id': 737, 'logo_path': '/dK1GZ1u4our8sJW0St...",2021-11-19,0.0,101.0,India Sweets and Spices,6.0,2.0,PG-13
60127,tt9898844,False,0.0,"[{'id': 27, 'name': 'Horror'}]",421.143,"[{'id': 160303, 'logo_path': None, 'name': 'Mo...",2022-01-21,0.0,91.0,The Hunting,4.9,20.0,
60128,tt9900940,False,0.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",3.416,"[{'id': 91999, 'logo_path': None, 'name': 'Luc...",2021-12-07,0.0,87.0,The Scrapper,6.0,2.0,


In [29]:
# Parse each row's genres with get_genre_name, then explode so every
# (movie, genre name) pair gets its own row in df_explode
parsed_genres = df['genres'].apply(get_genre_name)
df['genres_list'] = parsed_genres
df_explode = df.explode(column='genres_list')

In [30]:
## collect the distinct genre names, ignoring missing entries
exploded_genres = df_explode['genres_list']
unique_genres = exploded_genres.dropna().unique()

In [31]:
## Manually One-Hot-Encode Genres
# Test exact membership in each row's parsed genre list instead of searching the
# raw text: a substring search would also flag a movie whenever one genre name
# happens to appear inside another piece of the stringified records.
for genre in unique_genres:
    df[f"Genre_{genre}"] = df['genres_list'].apply(lambda names: genre in names)

In [32]:
## Remove the raw and parsed genre columns; the Genre_* indicator columns replace them
genre_source_cols = ['genres', 'genres_list']
df = df.drop(genre_source_cols, axis=1)
df

Unnamed: 0,imdb_id,belongs_to_collection,budget,popularity,production_companies,release_date,revenue,runtime,title,vote_average,...,Genre_Fantasy,Genre_Horror,Genre_Thriller,Genre_History,Genre_Family,Genre_Mystery,Genre_Western,Genre_War,Genre_TV Movie,Genre_Documentary
0,tt0113026,False,10000000.0,2.491,"[{'id': 51207, 'logo_path': None, 'name': 'Sul...",2000-09-22,0.0,86.0,The Fantasticks,5.4,...,False,False,False,False,False,False,False,False,False,False
1,tt0113092,False,0.0,1.974,"[{'id': 7405, 'logo_path': '/rfnws0uY8rsNAsrLb...",2000-11-15,0.0,100.0,For the Cause,4.4,...,False,False,False,False,False,False,False,False,False,False
2,tt0116391,False,0.0,1.341,[],2000-04-14,0.0,152.0,Gang,0.0,...,False,False,False,False,False,False,False,False,False,False
3,tt0118694,False,150000.0,22.411,"[{'id': 539, 'logo_path': None, 'name': 'Block...",2000-09-29,12854953.0,99.0,In the Mood for Love,8.1,...,False,False,False,False,False,False,False,False,False,False
4,tt0118852,False,0.0,4.889,"[{'id': 67930, 'logo_path': None, 'name': 'Cha...",2000-09-02,0.0,99.0,Chinese Coffee,6.9,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60125,tt9895024,False,0.0,0.840,"[{'id': 154995, 'logo_path': None, 'name': 'AS...",2021-09-08,0.0,118.0,Heiko's World,0.0,...,False,False,False,False,False,False,False,False,False,False
60126,tt9896876,False,0.0,3.665,"[{'id': 737, 'logo_path': '/dK1GZ1u4our8sJW0St...",2021-11-19,0.0,101.0,India Sweets and Spices,6.0,...,False,False,False,False,False,False,False,False,False,False
60127,tt9898844,False,0.0,421.143,"[{'id': 160303, 'logo_path': None, 'name': 'Mo...",2022-01-21,0.0,91.0,The Hunting,4.9,...,False,True,False,False,False,False,False,False,False,False
60128,tt9900940,False,0.0,3.416,"[{'id': 91999, 'logo_path': None, 'name': 'Luc...",2021-12-07,0.0,87.0,The Scrapper,6.0,...,False,False,True,False,False,False,False,False,False,False


In [None]:
# Clean MPAA rating feature
# Labels that should collapse into 'NR'; '10' is blanked out as a missing value
repl_cert = {
    'UR': 'NR',
    'Not Rated': 'NR',
    'Unrated': 'NR',
    '-': 'NR',
    '10': np.nan,
    'ScreamFest Horror Film Festival': 'NR',
}

# Trim surrounding whitespace first so padded labels merge with their clean twins
trimmed_cert = df['certification'].str.strip()
df['certification'] = trimmed_cert.replace(repl_cert)
df['certification'].value_counts(dropna=False)

In [None]:
# Splitting release_date on '-' into three numeric columns
new_cols = ['year','month','day']
date_parts = df['release_date'].str.split('-', expand=True)
df[new_cols] = date_parts.astype(float)

## remove the original text feature
df = df.drop('release_date', axis=1)
df

In [None]:
## Keep only the movies whose status is 'Released', then discard the status column
is_released = df['status'].eq('Released')
df = df[is_released].drop('status', axis=1)
df

In [None]:
## Keep only the movies that report both a positive budget and a positive revenue
has_financials = df['budget'].gt(0) & df['revenue'].gt(0)
df = df[has_financials]
df

## Test Assumptions