In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# <u>**TMDB BOX OFFICE PREDICTION**</u>

## Introduction

In a world… where movies made an estimated $41.7 billion in 2018, the film industry is more popular than ever. But what movies make the most money at the box office? How much does a director matter? Or the budget? For some movies, it's "You had me at 'Hello.'" For others, the trailer falls short of expectations and you think "What we have here is a failure to communicate."

## Import Libraries

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt 
import matplotlib.dates as md
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer
import ast
from scipy import stats
from sklearn.linear_model import LinearRegression
# Render every column and every row when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and 3.8
# removed the old names, so use whichever name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Default figure size (width, height) in inches for every plot in the notebook.
mpl.rcParams['figure.figsize'] = (15,5)
from collections import Counter

## Read Data

In [None]:
# data from https://www.kaggle.com/c/tmdb-box-office-prediction/data
# Read the competition's train and test CSV files from the Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/tmdb-box-office-prediction/train.csv')
df_test = pd.read_csv('/kaggle/input/tmdb-box-office-prediction/test.csv')

In [None]:
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(df_train.shape)
df_train.head()

In [None]:
# Print the (rows, columns) shape, then show the first three rows as the cell output.
print(df_test.shape)
df_test.head(3)

## Describe the Datasets
### <font color = 'blue'>Training Dataframe </font>

In [None]:
# Summary statistics of the training frame.
df_train.describe()

In [None]:
# Column names, non-null counts and dtypes of the training frame.
df_train.info()

### <font color = 'blue'>Testing Dataframe </font>

In [None]:
# Summary statistics of the testing frame.
df_test.describe()

### <font color = 'blue'>Read Data Summary </font>

In [None]:
# Compare the headers of the two frames and report the first column that exists
# only in the training frame.
col_vary = []
train_col = df_train.columns
test_col = df_test.columns
var = [header for header in set(train_col) - set(test_col)]

print(f"The Training DataFrame contains all the same Headers as the Testing DataFrame except {str(var[0])}")

From our initial look at the data we see that we have 2 different datasets. 

<b>Training:</b> 
<ul>
    <li>Has 3000 rows of data and 23 columns </li>
    <li>Looking at the median and the mean, there seem to be some outliers in <b>Budget</b> and <b>Runtime</b></li>
    <li>It is odd that there are runtime and budgets of 0</li>            
</ul>
<b>Test:</b>
<ul>
    <li>This will be used to run our analysis on to predict <b>Revenue</b> </li>
    <li>Has 4398 Rows of Data </li>
</ul>
      

---

## Outliers

From Reading our data we noticed Outlier values. Here we will create a secondary Dataframe that will remove outliers for both <b>Runtime</b> and <b>Budget</b>

### <font color = 'blue'>Runtime</font>

In [None]:
# Box plot of film runtime in the training data; the title is set on the returned axes.
runtime_ax = sns.boxplot(x=df_train['runtime'])
runtime_ax.set_title('Boxplot of Runtime')

 ### <font color = 'blue'>Budget</font>

In [None]:
# Box plot of film budget in the training data; the title is set on the returned axes.
budget_ax = sns.boxplot(x=df_train['budget'])
budget_ax.set_title('Boxplot of Budget')

In [None]:
# Missing-value count per column, smallest first, drawn as horizontal bars.
df_train_na = df_train.isna().sum().sort_values()
df_train_na.plot.barh()
plt.title("Distribution of NA")

# Write each count next to its bar (bars are drawn in the same ascending order).
for bar_position, na_count in enumerate(sorted(df_train_na)):
    plt.text(na_count, bar_position, str(na_count), va="center")

 ### <font color = 'blue'>Remove Outliers</font>

In [None]:
# 1st and 99th percentile cut-offs for budget and runtime.
bq_low, bq_hi = df_train['budget'].quantile([0.01, 0.99])
rq_low, rq_hi = df_train['runtime'].quantile([0.01, 0.99])

# Keep only rows strictly inside both ranges. Because the comparisons are strict,
# rows exactly equal to a cut-off are dropped as well.
budget_in_range = df_train['budget'].gt(bq_low) & df_train['budget'].lt(bq_hi)
runtime_in_range = df_train['runtime'].gt(rq_low) & df_train['runtime'].lt(rq_hi)
df_train_filtered = df_train[budget_in_range & runtime_in_range]
df_train_filtered.shape

In [None]:
# Runtime box plot after the percentile filter; the title is set on the returned axes.
runtime_filtered_ax = sns.boxplot(x=df_train_filtered['runtime'])
runtime_filtered_ax.set_title('Boxplot of Runtime(Outliers Removed)')

In [None]:
# Budget box plot after the percentile filter; the title is set on the returned axes.
budget_filtered_ax = sns.boxplot(x=df_train_filtered['budget'])
budget_filtered_ax.set_title('Boxplot of Budget(Outliers Removed)')

In [None]:
# Number of training rows that the percentile filter removed.
len(df_train) - len(df_train_filtered)

 ### <font color = 'blue'> Outliers Summary</font>
Removing Outliers removes 885 rows. If the dataset was larger it may be useful to remove outliers but since the Dataset is so small it may be useful to keep the outliers in training our model

---

## Dictionary Columns
Reading our data, we noticed that several columns hold dictionaries stored as text. We need to convert these columns into actual Python objects that we can use.

In [None]:
# from this kernel: https://www.kaggle.com/gravix/gradient-in-a-box
# Columns whose cells hold Python-literal text (lists of dicts) rather than plain values.
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

def text_to_dict(df):
    """Parse the stringified columns listed in ``dict_columns`` into Python objects.

    Each cell is evaluated with ``ast.literal_eval``; missing cells become ``{}``.
    Cells that are already a list or dict are left untouched, so running this
    function (or the notebook cell that calls it) a second time is harmless
    instead of raising inside ``literal_eval``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``dict_columns``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    def _parse_cell(value):
        # Already parsed on a previous run: keep the container as it is.
        if isinstance(value, (list, dict)):
            return value
        return {} if pd.isna(value) else ast.literal_eval(value)

    for column in dict_columns:
        df[column] = df[column].apply(_parse_cell)
    return df
        
# Parse the stringified columns of both frames; text_to_dict modifies the frame it
# is given and returns that same frame.
df_train = text_to_dict(df_train)
df_test = text_to_dict(df_test)

In [None]:
# Show the first two rows to inspect the parsed columns.
df_train.head(2)

---

## Manipulating Data

### Belongs to Collection
#### <font color = 'blue'>Training Dataframe</font>

In [None]:
# original code from: https://www.kaggle.com/artgor/eda-feature-engineering-and-model-interpretation

# Name of the first collection entry for each film, or 0 when the cell is the empty dict.
df_train['collection_name'] = df_train['belongs_to_collection'].map(
    lambda entry: 0 if entry == {} else entry[0]['name']
)
df_train['collection_name'].head()

In [None]:
# Number of collection entries per film; an empty dict has length 0, so films
# without a collection get 0.
df_train['has_collection'] = df_train['belongs_to_collection'].map(len)
df_train['has_collection'].head()

In [None]:
# Frequency of each has_collection value (0 = no collection entry).
col_count = df_train['has_collection'].value_counts()
# Index 0 holds the films WITHOUT a collection, so the number of films that are in a
# collection is the sum of every other bucket.
films_in_collection = int(col_count.drop(0, errors='ignore').sum())
print(" In the Trainining data there are " + str(films_in_collection)+" Films in a collection")

#### <font color = 'blue'>Testing Dataframe</font>

In [None]:
# Name of the first collection entry for each test film, or 0 when the cell is the empty dict.
df_test['collection_name'] = df_test['belongs_to_collection'].map(
    lambda entry: 0 if entry == {} else entry[0]['name']
)
df_test['collection_name'].head()

In [None]:
# Number of collection entries per test film; an empty dict has length 0, so films
# without a collection get 0.
df_test['has_collection'] = df_test['belongs_to_collection'].map(len)
df_test['has_collection'].head()

### Genres
#### <font color = 'blue'>Training Dataframe</font>

In [None]:
# original code from: https://www.kaggle.com/artgor/eda-feature-engineering-and-model-interpretation

# Per-film list of genre names (empty list when the cell is the empty dict).
list_of_genres = [
    [entry['name'] for entry in film_genres] if film_genres != {} else []
    for film_genres in df_train['genres']
]
# Number of genre entries attached to each film.
df_train['num_genres'] = df_train['genres'].map(len)
# All genre names of a film, sorted alphabetically and joined with single spaces.
df_train['all_genres'] = [' '.join(sorted(names)) for names in list_of_genres]
# Distinct genre names, kept in order of first appearance.
unique_genres = list(dict.fromkeys(name for names in list_of_genres for name in names))
# One 0/1 indicator column per genre, set when the genre name occurs in the joined string.
for g in unique_genres:
    df_train['genre_' + g] = df_train['all_genres'].map(lambda joined: int(g in joined))
df_train.head(3)

#### <font color = 'blue'>Testing Dataframe</font>

In [None]:
# Per-film list of genre names for the test frame (empty list when the cell is the empty dict).
list_of_genres = [
    [entry['name'] for entry in film_genres] if film_genres != {} else []
    for film_genres in df_test['genres']
]
# Number of genre entries attached to each test film.
df_test['num_genres'] = df_test['genres'].map(len)
# All genre names of a test film, sorted alphabetically and joined with single spaces.
df_test['all_genres'] = [' '.join(sorted(names)) for names in list_of_genres]

# Reuse the genre list collected from the training frame so both frames get the same
# indicator columns.
for g in unique_genres:
    df_test['genre_' + g] = df_test['all_genres'].map(lambda joined: int(g in joined))

df_test.head(3)

### Production Company
#### <font color = 'blue'>Training Dataframe</font>

In [None]:
# Per-film list of production-company names (empty list when the cell is the empty dict).
list_of_production = list(df_train['production_companies'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
# Number of production-company entries attached to each film.
df_train['num_prod_co'] = df_train['production_companies'].apply(lambda x: len(x) if x != {} else 0)
# All production-company names of a film, sorted and joined with single spaces.
df_train['all_prod_co'] = df_train['production_companies'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
# Count every company once in a single pass. A Counter keeps its keys in first-seen
# order, so it provides both the distinct names and their frequencies without the
# quadratic "if name not in list" scan.
prod_co_counts = Counter(name for names in list_of_production for name in names)
unique_prod_co = list(prod_co_counts)
print(" There are "+ str(len(unique_prod_co)) + " unique Production Companies")
# 0/1 indicator column for each of the 15 most frequent production companies.
# NOTE(review): `g in x` is a substring test on the joined names, so a company whose
# name is contained in a longer company name is flagged too — confirm this is intended.
top_prod_co = [name for name, _ in prod_co_counts.most_common(15)]
for g in top_prod_co:
    df_train['prod_co_' + g] = df_train['all_prod_co'].apply(lambda x: 1 if g in x else 0)
    
top_prod_co

#### <font color = 'blue'>Testing Dataframe</font>

In [None]:
# Per-film list of production-company names for the test frame.
list_of_production = list(df_test['production_companies'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
# Number of production-company entries attached to each test film.
df_test['num_prod_co'] = df_test['production_companies'].apply(lambda x: len(x) if x != {} else 0)
# All production-company names of a test film, sorted and joined with single spaces.
df_test['all_prod_co'] = df_test['production_companies'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
# Build the indicator columns from the top companies chosen on the TRAINING frame
# (`top_prod_co`). Recomputing the top 15 from the test data would give the test frame
# a different set of prod_co_* columns than the model's feature list expects.
for g in top_prod_co:
    df_test['prod_co_' + g] = df_test['all_prod_co'].apply(lambda x: 1 if g in x else 0)

### Keywords
#### <font color = 'blue'>Training Dataframe</font>

In [None]:
# Per-film list of keyword names (empty list when the cell is the empty dict).
list_of_keywords = list(df_train['Keywords'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
# Number of keyword entries attached to each film.
df_train['num_keywords'] = df_train['Keywords'].apply(lambda x: len(x) if x != {} else 0)
# All keyword names of a film, sorted and joined with single spaces.
df_train['all_keywords'] = df_train['Keywords'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
# Count every keyword once in a single pass. A Counter keeps its keys in first-seen
# order, so it provides both the distinct names and their frequencies without the
# quadratic "if name not in list" scan.
keyword_counts = Counter(name for names in list_of_keywords for name in names)
unique_keywords = list(keyword_counts)
print(" There are "+ str(len(unique_keywords)) + " unique Keywords")
# 0/1 indicator column for each of the 15 most frequent keywords.
# NOTE(review): `g in x` is a substring test on the joined keywords, so a keyword that
# is contained in a longer keyword is flagged too — confirm this is intended.
top_keywords = [name for name, _ in keyword_counts.most_common(15)]
for g in top_keywords:
    df_train['keyword_' + g] = df_train['all_keywords'].apply(lambda x: 1 if g in x else 0)
    
top_keywords


#### <font color ='blue'>Testing Dataframe</font>

In [None]:
# Per-film list of keyword names for the test frame (empty list when the cell is the empty dict).
list_of_keywords = [
    [entry['name'] for entry in film_keywords] if film_keywords != {} else []
    for film_keywords in df_test['Keywords']
]
# Number of keyword entries attached to each test film.
df_test['num_keywords'] = df_test['Keywords'].map(len)
# All keyword names of a test film, sorted and joined with single spaces.
df_test['all_keywords'] = [' '.join(sorted(names)) for names in list_of_keywords]
# Indicator columns for the top keywords selected on the training frame.
for g in top_keywords:
    df_test['keyword_' + g] = df_test['all_keywords'].map(lambda joined: int(g in joined))


### Cast
#### <font color = 'blue'>Training Dataframe</font>

In [None]:
# Per-film list of cast member names (empty list when the cell is the empty dict).
list_of_cast = list(df_train['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
# Number of cast entries attached to each film.
df_train['num_cast'] = df_train['cast'].apply(lambda x: len(x) if x != {} else 0)
# All cast names of a film, sorted and joined with single spaces.
df_train['all_cast'] = df_train['cast'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
# Count every cast member once in a single pass. A Counter keeps its keys in
# first-seen order, so it provides both the distinct names and their frequencies
# without the quadratic "if name not in list" scan.
cast_counts = Counter(name for names in list_of_cast for name in names)
unique_cast = list(cast_counts)
print(" There are "+ str(len(unique_cast)) + " unique cast")
# 0/1 indicator column for each of the 15 most frequent cast members.
# NOTE(review): `g in x` is a substring test on the joined names — confirm this is intended.
top_cast = [name for name, _ in cast_counts.most_common(15)]
for g in top_cast:
    df_train['cast_' + g] = df_train['all_cast'].apply(lambda x: 1 if g in x else 0)
top_cast

#### <font color ='blue'>Testing Dataframe</font>

In [None]:
# Per-film list of cast member names for the test frame (empty list when the cell is the empty dict).
list_of_cast = [
    [entry['name'] for entry in film_cast] if film_cast != {} else []
    for film_cast in df_test['cast']
]
# Number of cast entries attached to each test film.
df_test['num_cast'] = df_test['cast'].map(len)
# All cast names of a test film, sorted and joined with single spaces.
df_test['all_cast'] = [' '.join(sorted(names)) for names in list_of_cast]
# Indicator columns for the top cast members selected on the training frame.
for g in top_cast:
    df_test['cast_' + g] = df_test['all_cast'].map(lambda joined: int(g in joined))

### Crew
#### <font color = 'blue'> Training Dataframe</font>

In [None]:
# Per-film list of crew member names (empty list when the cell is the empty dict).
list_of_crew = list(df_train['crew'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
# Number of crew entries attached to each film.
df_train['num_crew'] = df_train['crew'].apply(lambda x: len(x) if x != {} else 0)
# All crew names of a film, sorted and joined with single spaces.
df_train['all_crew'] = df_train['crew'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
# Count every crew member once in a single pass. A Counter keeps its keys in
# first-seen order, so it provides both the distinct names and their frequencies
# without the quadratic "if name not in list" scan.
crew_counts = Counter(name for names in list_of_crew for name in names)
unique_crew = list(crew_counts)
# The original message said "unique cast" here although it counts crew members.
print(" There are "+ str(len(unique_crew)) + " unique crew")
# 0/1 indicator column for each of the 15 most frequent crew members.
# NOTE(review): `g in x` is a substring test on the joined names — confirm this is intended.
top_crew = [name for name, _ in crew_counts.most_common(15)]
for g in top_crew:
    df_train['crew_' + g] = df_train['all_crew'].apply(lambda x: 1 if g in x else 0)
top_crew

#### <font color = 'blue'>Testing Dataframe</font>


In [None]:
# Per-film list of crew member names for the test frame (empty list when the cell is the empty dict).
list_of_crew = [
    [entry['name'] for entry in film_crew] if film_crew != {} else []
    for film_crew in df_test['crew']
]
# Number of crew entries attached to each test film.
df_test['num_crew'] = df_test['crew'].map(len)
# All crew names of a test film, sorted and joined with single spaces.
df_test['all_crew'] = [' '.join(sorted(names)) for names in list_of_crew]
# Indicator columns for the top crew members selected on the training frame.
for g in top_crew:
    df_test['crew_' + g] = df_test['all_crew'].map(lambda joined: int(g in joined))

### Month / DOW
#### <font color = 'blue'> Training Dataframe</font>

In [None]:
# Parse with an explicit month/day/two-digit-year format instead of dayfirst=True,
# which reads an ambiguous value such as '5/3/15' as 5 March rather than May 3.
# NOTE(review): assumes the raw strings are 'm/d/yy' (e.g. '2/20/15') — confirm against the CSV.
release_train = pd.to_datetime(df_train['release_date'], format='%m/%d/%y')
# '%y' maps 00-68 to 20xx, so older films land in the future and get the wrong weekday;
# move those back one century.
# NOTE(review): 2019 is assumed to be the latest real release year in this dataset.
df_train['release_date'] = release_train.where(
    release_train.dt.year <= 2019, release_train - pd.DateOffset(years=100)
)
# Calendar month (1-12) and day of week (Monday=0 ... Sunday=6) of the release.
df_train['release_month'] = df_train['release_date'].dt.month
df_train['release_DOW'] = df_train['release_date'].dt.dayofweek

#### <font color = 'blue'>Testing Dataframe</font>

In [None]:
# Parse with an explicit month/day/two-digit-year format instead of dayfirst=True,
# which reads an ambiguous value such as '5/3/15' as 5 March rather than May 3.
# NOTE(review): assumes the raw strings are 'm/d/yy' (e.g. '2/20/15') — confirm against the CSV.
release_test = pd.to_datetime(df_test['release_date'], format='%m/%d/%y')
# '%y' maps 00-68 to 20xx, so older films land in the future and get the wrong weekday;
# move those back one century. Missing dates stay missing (NaT).
# NOTE(review): 2019 is assumed to be the latest real release year in this dataset.
df_test['release_date'] = release_test.where(
    release_test.dt.year <= 2019, release_test - pd.DateOffset(years=100)
)
# Calendar month (1-12) and day of week (Monday=0 ... Sunday=6) of the release.
df_test['release_month'] = df_test['release_date'].dt.month
df_test['release_DOW'] = df_test['release_date'].dt.dayofweek

## Analysis

In [None]:
fig = plt.figure(figsize=(15, 20))
# Correlate the numeric columns only: the frame also holds text, list and datetime
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 instead of silently
# skipping them.
df_train_corr = df_train.select_dtypes(include='number').corr()
ax = sns.heatmap(df_train_corr)

In [None]:
# Correlation of every column with revenue, strongest positive first.
df_train_corr[['revenue']].sort_values(by=['revenue'],ascending = False)

---

## Predictions
### <font color = 'green'>Linear Regression</font>
With the R value for budget and revenue being pretty high at  0.75 we want to find a model that produces better results than relying on a linear regression model. 

In [None]:
# Least-squares fit of revenue on budget; the fitted equation is shown in the legend.
slope, intercept, r_value, p_value, std_err = stats.linregress(df_train['budget'], df_train['revenue'])
fit_label = f"y={slope:.1f}x+{intercept:.1f}"
ax = sns.regplot(x='budget', y='revenue', data=df_train, line_kws={'label': fit_label})
ax.legend()
plt.show()

In [None]:
# Single-feature baseline: regress revenue on budget, both shaped as column vectors.
X_linear = df_train['budget'].to_numpy().reshape(-1, 1)
y_linear = df_train['revenue'].to_numpy().reshape(-1, 1)
model_linear = LinearRegression()
model_linear.fit(X_linear, y_linear)

# Errors are measured on the same rows the model was fitted on.
y_pred = model_linear.predict(X_linear)
val_mae = mean_absolute_error(y_linear, y_pred)
relative_error = np.abs((y_linear - y_pred) / y_linear)
val_mape = np.mean(relative_error) * 100
print('MAE: ' + str(val_mae))
print('MAPE: ' + str(val_mape))


With Linear Regression we get a Mean Absolute Error value of <b>45M</b>. While the R value is pretty high, such a high MAE is not a great predictor of Box office revenue.

### <font color = 'green'>Cross Validation with Random Forest</font>

First we need to confirm Training and Testing Dataframe contain all the same columns besides revenue

In [None]:
col_vary = []
test_col = df_test.columns
train_col = df_train.columns
# Report EVERY header that differs, in sorted order. Printing only var[0] picked an
# arbitrary element of a set and hid any other mismatch left by feature engineering.
var = sorted(set(train_col) - set(test_col))
only_in_test = sorted(set(test_col) - set(train_col))

print("The Training DataFrame contains all the same Headers as the Testing DataFrame except " + ", ".join(map(str, var)))
if only_in_test:
    print("Headers found only in the Testing DataFrame: " + ", ".join(map(str, only_in_test)))


In [None]:
# Columns excluded from the model inputs: identifiers, free text, the raw parsed
# columns, the collection name, and the target itself.
drop_col = [
    'id', 'imdb_id', 'original_title', 'belongs_to_collection', 'genres',
    'homepage', 'overview', 'poster_path', 'production_countries',
    'production_companies', 'spoken_languages', 'status', 'tagline', 'title',
    'Keywords', 'cast', 'crew', 'revenue', 'collection_name',
]
# Every remaining training column, in its original order, is a model feature.
features = [column for column in df_train.columns if column not in drop_col]

features
        

In [None]:
# Training inputs and target, plus the test inputs restricted to the same feature columns.
X=df_train[features].copy()
y=df_train['revenue'].copy()
X_test = df_test[features].copy()

In [None]:
# Column names, non-null counts and dtypes of the feature matrix.
X.info()

In [None]:
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
# Text columns with fewer than 10 distinct values are treated as categorical
# (convenient but arbitrary threshold).
categorical_cols = [
    cname for cname in X.columns
    if X[cname].dtype == "object" and X[cname].nunique() < 10
]

# Integer and float columns are treated as numerical.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Numerical data: fill missing values with a constant.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical data: fill missing values with the most frequent value, then one-hot
# encode, ignoring categories not seen during fitting.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

def rmsle_error(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Both inputs must have the same length; the error is computed on ``log1p`` of
    each value.
    """
    assert len(y_true) == len(y_pred)
    log_difference = np.log1p(y_pred) - np.log1p(y_true)
    mean_squared_log_error = np.mean(np.square(log_difference))
    return np.sqrt(mean_squared_log_error)

# Scorer built from rmsle_error; greater_is_better=False marks it as a loss.
rmsle_score = make_scorer(rmsle_error, greater_is_better=False)

# Preprocessing followed by a small random forest with a fixed seed.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=5, n_jobs=-1, min_samples_split=6, random_state=0)),
])

# 5-fold cross-validation; the sign is flipped so the reported error is positive.
scores = -cross_val_score(my_pipeline, X, y, cv=5, scoring=rmsle_score)
results = scores.mean()
results

In [None]:

# Out-of-fold prediction for every training row (5-fold cross-validation).
pred = cross_val_predict(my_pipeline, X, y, cv=5)
col = ['id', 'revenue']
# Actual revenue next to the prediction and the residual (actual minus predicted).
df_compare = df_train[col].assign(
    predicted=pred,
    residual=lambda frame: frame['revenue'] - frame['predicted'],
)


In [None]:
# Predicted versus actual revenue. The line plots revenue against itself, i.e. the
# diagonal on which a perfect prediction would fall.
plt.scatter(df_compare['predicted'],df_compare['revenue'], label='Films')
plt.plot(df_compare['revenue'],df_compare['revenue'], label='Perfect prediction')
# Label the figure so it can be read on its own.
plt.xlabel('Predicted revenue')
plt.ylabel('Actual revenue')
plt.title('Predicted vs. actual revenue (cross-validated)')
plt.legend()
plt.show()

In [None]:
# Equal weight of 1/N per row, so the bar heights are fractions of rows that sum to 1.
residual_values = df_compare['residual']
weights = np.ones_like(residual_values) / len(residual_values)
plt.hist(residual_values, bins=10, weights=weights)
plt.xlabel('Residual')
plt.ylabel('Probability');


In [None]:
# Fit the pipeline on the full training data, then predict revenue for the test features.
my_pipeline.fit(X,y)
y_pred = my_pipeline.predict(X_test)
y_pred


In [None]:
# Two-column submission: the test ids paired with the predicted revenue, written
# without the index.
submission = df_test[['id']].copy()
submission['revenue'] = y_pred
submission.to_csv("submission.csv", index=False)