# Career Exploration Final Project: TMDB Box Office Prediction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import json

## Data Loading

In [2]:
# Load the labelled training set; its 'revenue' column is the prediction target.
train = pd.read_csv('data/train.csv')

In [3]:
# Load the test set; its 'id' column later keys the submission file.
test = pd.read_csv('data/test.csv')

In [48]:
# Separate the 'revenue' target from the training features.
y_train = train['revenue']
X_train = train.drop(columns='revenue')

# The test frame is used as the test feature set unchanged.
X_test = test

In [5]:
# Stack the train and test features into one frame so that column-wise
# transformations are applied to both at once.
# ignore_index=True: each input carries its own 0-based row index, so a plain
# concat would repeat row labels and make label-based lookups ambiguous.
df = pd.concat((X_train, X_test), axis=0, sort=False, ignore_index=True)

## Exploratory Data Analysis

In [6]:
def json_to_list_of_ids(col):
    """Extract the ``id`` values from a column of stringified lists of dicts.

    Each non-null cell must be the text of a Python literal that evaluates to
    a list of dicts with an ``'id'`` key, e.g. ``"[{'id': 1, 'name': 'x'}]"``.
    Null cells are treated as an empty list.

    Args:
        col: Iterable of cell values (for example a ``pd.Series``).

    Returns:
        A ``pd.Series`` with a fresh default index (the index of ``col`` is
        not carried over) whose elements are lists of ``id`` values, one
        list per input cell, in input order.

    Raises:
        ValueError or SyntaxError: If a non-null cell is not a valid Python
            literal.
    """
    # Function-scope import keeps this definition self-contained.
    import ast

    def parse_cell(cell):
        # literal_eval accepts only literals, so text read from the data
        # file can never execute code the way eval() would.
        return [] if pd.isnull(cell) else ast.literal_eval(cell)

    ids_per_row = [[entry['id'] for entry in parse_cell(cell)] for cell in col]

    # dtype=object keeps the result a Series of lists even when it is empty.
    return pd.Series(ids_per_row, dtype=object)

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

def one_hot_encode_json_col(df, col, prefix, cutoff_count=0):
    """Multi-hot encode the ids stored in a stringified list-of-dicts column.

    Args:
        df: DataFrame that contains the column.
        col: Name of the column to encode.
        prefix: Text placed, followed by an underscore, in front of each id
            to form the indicator column name.
        cutoff_count: An indicator column is kept only if its column sum is
            strictly greater than this value.

    Returns:
        DataFrame of indicator columns named ``'{prefix}_{id}'`` that shares
        its index with ``df``.
    """
    id_lists = json_to_list_of_ids(df[col])

    binarizer = MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(id_lists)

    encoded = pd.DataFrame(
        indicator_matrix,
        columns=binarizer.classes_,
        index=df.index,
    ).rename(columns=lambda label: f'{prefix}_{label}')

    # Keep only the ids that occur in more than `cutoff_count` rows.
    is_frequent = encoded.sum(axis=0) > cutoff_count
    return encoded.loc[:, is_frequent]

In [8]:
# Swap the raw 'genres' column for one indicator column per genre id that
# occurs in more than one row.
df = pd.concat(
    (df, one_hot_encode_json_col(df, 'genres', 'genre', cutoff_count=1)),
    axis=1,
).drop(columns='genres')

In [32]:
# 'Keywords' is currently discarded rather than encoded. The commented-out
# line below is the alternative: one-hot encode it, keeping only ids whose
# count exceeds 5.
df = df.drop('Keywords', axis=1)
# df = pd.concat((df, one_hot_encode_json_col(df, 'Keywords', 'kw', cutoff_count=5)), axis=1).drop('Keywords', axis=1)

In [34]:
# 'production_companies' is currently discarded rather than encoded. The
# commented-out line below is the alternative: one-hot encode it, keeping
# only ids whose count exceeds 5.
df = df.drop('production_companies', axis=1)
# df = pd.concat((df, one_hot_encode_json_col(df, 'production_companies', 'pc', cutoff_count=5)), axis=1).drop('production_companies', axis=1)

### Text Data

In [9]:
# Pull the three text columns out of the training set for inspection.
text_data = train[['overview', 'tagline', 'original_title']]

### Release Date

In [10]:
# Release dates of the training rows, exactly as read from the CSV.
release_data = train['release_date']

### Cast

In [11]:
# Raw 'cast' column of the training set, pulled out for inspection.
cast = train['cast']

### Other Categories

#### Original Language

In [12]:
# Raw 'original_language' column of the training set.
original_language = train['original_language']

#### Spoken Languages

In [13]:
# Raw 'spoken_languages' column of the training set.
spoken_languages = train['spoken_languages']

In [14]:
# Raw 'production_countries' column of the training set.
production_countries = train['production_countries']

### Cleanup

In [None]:
# Remove the columns that are not passed to the model. Each label appears
# exactly once in the list.
df = df.drop(
    columns=[
        'production_countries',
        'status',
        'tagline',
        'title',
        'crew',
        'cast',
        'id',
        'original_language',
        'spoken_languages',
        'belongs_to_collection',
        'original_title',
        'homepage',
        'imdb_id',
        'overview',
        'poster_path',
        'release_date',
    ],
)

In [41]:
# Replace every remaining missing value with 0 so the feature matrix has no NaNs.
X = df.fillna(0)

In [49]:
# Temporary baseline: keep only the first three columns (by position) as features.
# NOTE(review): positional selection depends on the column order left by the
# earlier drops -- confirm which columns these are before relying on it.
X = X.iloc[:, :3]

## Validation

In [51]:
from sklearn.metrics import mean_squared_log_error

def evaluate(y_pred, y_true):
    """Return the root mean squared logarithmic error of the predictions.

    Args:
        y_pred: Predicted values.
        y_true: Ground-truth values.

    Returns:
        Square root of ``mean_squared_log_error(y_true, y_pred)``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [52]:
from sklearn.model_selection import train_test_split

# Split the stacked feature matrix back apart by position: the first
# train.shape[0] rows are the training rows, the remainder are the test rows.
X_train = X.iloc[:train.shape[0]]
X_test = X.iloc[train.shape[0]:]

# Hold out 20% of the training rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [53]:
from sklearn.linear_model import LinearRegression

In [54]:
# Linear regression estimator with default settings.
model = LinearRegression()

In [55]:
# Fit on the training portion of the train/validation split.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [56]:
# Predict revenue for the held-out validation rows.
preds = model.predict(X_valid)

# The log-based metric needs non-negative inputs, so clip from below only.
# An upper bound of preds.max() changes nothing in the normal case and, when
# every prediction is negative, would force all values to that negative max.
preds = np.clip(preds, 0, None)

In [57]:
# Score the clipped validation predictions with evaluate() (RMSLE).
evaluate(preds, y_valid)

5.889689697852161

## Prediction

In [61]:
# Predict revenue for the test rows.
preds = model.predict(X_test)

# Keep predictions non-negative by clipping from below only. An upper bound
# of preds.max() changes nothing in the normal case and, when every
# prediction is negative, would force all values to that negative max.
preds = np.clip(preds, 0, None)

In [68]:
# Pair each test id with its predicted revenue, then make 'id' the index.
out = pd.DataFrame({'id': test['id'], 'revenue': preds})
out = out.set_index('id')

In [70]:
# Write the predictions to submission.csv; the 'id' index becomes the first column.
out.to_csv('submission.csv')