In [265]:
import pandas as pd
from ast import literal_eval
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBRegressor
import category_encoders as ce
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.impute import SimpleImputer

### Load data

In [274]:
train = pd.read_csv('../tmdb-box-office-prediction/train.csv')
test = pd.read_csv('../tmdb-box-office-prediction/test.csv')
sample_submission = pd.read_csv('../tmdb-box-office-prediction/sample_submission.csv')

### Initial data exploration

In [3]:
# Not a lot of observations left in the train set. Hopefully it'll be 
# enough to train a decent model.

train.shape, test.shape, sample_submission.shape

((3000, 23), (4398, 22), (4398, 2))

In [4]:
train.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
belongs_to_collection,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...","[{'id': 107674, 'name': 'The Princess Diaries ...",,,
budget,14000000,40000000,3300000,1200000,0
genres,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 18, 'name': 'Drama'}]","[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...","[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam..."
homepage,,,http://sonyclassics.com/whiplash/,http://kahaanithefilm.com/,
imdb_id,tt2637294,tt0368933,tt2582802,tt1821480,tt1380152
original_language,en,en,en,hi,ko
original_title,Hot Tub Time Machine 2,The Princess Diaries 2: Royal Engagement,Whiplash,Kahaani,마린보이
overview,"When Lou, who has become the ""father of the In...",Mia Thermopolis is now a college graduate and ...,"Under the direction of a ruthless instructor, ...",Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,Marine Boy is the story of a former national s...
popularity,6.57539,8.24889,64.3,3.17494,1.14807


In [66]:
train['original_language'].value_counts()

en    2575
fr      78
ru      47
es      43
hi      42
ja      37
it      24
ko      20
cn      20
zh      19
de      18
ta      16
sv       8
nl       6
pt       6
fa       5
da       5
ro       4
tr       3
hu       3
ml       2
pl       2
no       2
fi       2
te       2
bn       1
ur       1
id       1
vi       1
cs       1
sr       1
he       1
ar       1
el       1
mr       1
nb       1
Name: original_language, dtype: int64

In [69]:
train['original_language'].unique()

array(['en', 'hi', 'ko', 'sr', 'fr', 'it', 'nl', 'zh', 'es', 'cs', 'ta',
       'cn', 'ru', 'tr', 'ja', 'fa', 'sv', 'de', 'te', 'pt', 'mr', 'da',
       'fi', 'el', 'ur', 'he', 'no', 'ar', 'nb', 'ro', 'vi', 'pl', 'hu',
       'ml', 'bn', 'id'], dtype=object)

In [5]:
train.describe()

Unnamed: 0,id,budget,popularity,runtime,revenue
count,3000.0,3000.0,3000.0,2998.0,3000.0
mean,1500.5,22531330.0,8.463274,107.856571,66725850.0
std,866.169729,37026090.0,12.104,22.086434,137532300.0
min,1.0,0.0,1e-06,0.0,1.0
25%,750.75,0.0,4.018053,94.0,2379808.0
50%,1500.5,8000000.0,7.374861,104.0,16807070.0
75%,2250.25,29000000.0,10.890983,118.0,68919200.0
max,3000.0,380000000.0,294.337037,338.0,1519558000.0


In [6]:
train.describe(exclude='number')

Unnamed: 0,belongs_to_collection,genres,homepage,imdb_id,original_language,original_title,overview,poster_path,production_companies,production_countries,release_date,spoken_languages,status,tagline,title,Keywords,cast,crew
count,604,2993,946,3000,3000,3000,2992,2999,2844,2945,3000,2980,3000,2403,3000,2724,2987,2984
unique,422,872,941,3000,36,2975,2992,2999,2383,321,2398,401,2,2400,2969,2648,2975,2984
top,"[{'id': 645, 'name': 'James Bond Collection', ...","[{'id': 18, 'name': 'Drama'}]",http://www.transformersmovie.com/,tt1194664,en,Ben-Hur,"A man struggles with memories of his past, inc...",/dlTWhgLHdEgSmCa8nEFgN7Hfu05.jpg,"[{'name': 'Paramount Pictures', 'id': 4}]","[{'iso_3166_1': 'US', 'name': 'United States o...",9/10/15,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Based on a true story.,The Gift,"[{'id': 10183, 'name': 'independent film'}]",[],"[{'credit_id': '5839aed192514170c5005220', 'de..."
freq,16,266,4,1,2575,2,1,1,51,1752,5,1817,2996,3,2,27,13,1


### Wrangle data

In [275]:
# wrangle function
def wrangle(X):
    
    # Convert columns with string representations of lists to lists (of dictionaries)
    # This is a common issue when saving and loading pandas DataFrames as .csv files.
    string_list = ['belongs_to_collection', 'genres', 'production_companies',
                   'production_countries', 'spoken_languages', 'Keywords', 
                   'cast', 'crew']

    for col in string_list:
        X.loc[:,col] = X.loc[:,col].apply(lambda x: literal_eval(x) 
                                          if type(x)==str else np.nan)
        
    # Add feature: 1 if movie belongs to a collection, 0 otherwise
    X['collection'] = X['belongs_to_collection'].notnull().astype(int)
    
    # Add feature: 1 if movie released, 0 otherwise
    X['released'] = X['status'].apply(lambda x: 1 if x=='Released' else 0)
    
    # Add feature: 1 if movie has a homepage, 0 otherwise
    # Might covary with budget
    X['has_homepage'] = X['homepage'].notnull().astype(int)
    
    # Get features for collection_id and collection_name
    X['collection_id'] = X['belongs_to_collection'].apply(lambda x: x[0]['id']
                                                          if type(x)==list else 0)
    X['collection_name'] = X['belongs_to_collection'].apply(lambda x: x[0]['name'] 
                                                            if type(x)==list else np.nan)
    
    # Create a feature that is release_date in datetime format
    # Extract year, month, day. Drop datetime feature
    X['release_date_dt'] = pd.to_datetime(X['release_date'], infer_datetime_format=True)
    X['release_year'] = X['release_date_dt'].dt.year
    X['release_month'] = X['release_date_dt'].dt.month
    X['release_day'] = X['release_date_dt'].dt.day
    X = X.drop('release_date_dt', axis=1)
    
    X['release_year'] = X['release_year'].apply(lambda x: x if x < 2020 else x - 100)
    
    # Get genre list for each movie
    X['list_of_genres'] = list(X['genres'].apply(lambda x: [i['name'] for i in x]
                                                 if type(x) == list
                                                 else []).values)
    # One hot encode genre lists
    mlb = MultiLabelBinarizer()
    X = X.join(pd.DataFrame(mlb.fit_transform(X.pop('list_of_genres')),
                                              columns=mlb.classes_, index=X.index))
    
    top_languages = ['en', 'fr', 'ru', 'es', 'hi', 'ja', 'it', 'ko', 'cn', 'zh', 'de', 'ta']
    
    for i in top_languages:
        X['language_' + i] = X['original_language'].apply(lambda x: 1 if x==i else 0)
    
    return X


In [276]:
train_wrangled = wrangle(train)
test_wrangled = wrangle(test)

In [277]:
# TV Movie has one occurence, only in train
train_wrangled.drop('TV Movie', axis=1, inplace=True)

In [278]:
train_wrangled.columns.values

array(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'Keywords',
       'cast', 'crew', 'revenue', 'collection', 'released',
       'has_homepage', 'collection_id', 'collection_name', 'release_year',
       'release_month', 'release_day', 'Action', 'Adventure', 'Animation',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy',
       'Foreign', 'History', 'Horror', 'Music', 'Mystery', 'Romance',
       'Science Fiction', 'Thriller', 'War', 'Western', 'language_en',
       'language_fr', 'language_ru', 'language_es', 'language_hi',
       'language_ja', 'language_it', 'language_ko', 'language_cn',
       'language_zh', 'language_de', 'language_ta'], dtype=object)

In [279]:
features = ['budget', 'runtime', 'collection', 'released', 'popularity',
            'has_homepage', 'collection_id', 'release_year', 'release_month', 
            'release_day', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 
            'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History', 
            'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 
            'Thriller', 'War', 'Western', 'language_en', 'language_fr', 
            'language_ru', 'language_es', 'language_hi', 'language_ja', 
            'language_it', 'language_ko', 'language_cn', 'language_zh', 
            'language_de', 'language_ta']
target = 'revenue'

## Modeling

### Split into X feature matrix and y target vector

In [280]:
X = train_wrangled[features]
y = train_wrangled[target]
X_test = test_wrangled[features]
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

In [281]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)
X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

((2700, 41), (300, 41), (2700,), (300,), (4398, 41))

### Make pipeline and fit it

In [207]:
# Make pipeline
# commented out the encoder because I believe wrangle() is taking care of it now
pipeline = make_pipeline(
    #ce.OneHotEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestRegressor(n_estimators=5000, random_state=42, n_jobs=-1)
)

In [208]:
# Fit
pipeline.fit(X_train, y_train_log)

Pipeline(memory=None,
     steps=[('onehotencoder', OneHotEncoder(cols=['original_language'], drop_invariant=False,
       handle_missing='value', handle_unknown='value', return_df=True,
       use_cat_names=False, verbose=0)), ('simpleimputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='me...imators=5000, n_jobs=-1,
           oob_score=False, random_state=42, verbose=0, warm_start=False))])

### Validate

In [148]:
# evaluation functions
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [209]:
# Validate
y_pred = pipeline.predict(X_val)
print('Validation Error', rmse(y_val_log, y_pred))

Validation Error 3.220090221572915


### Generate predictions

In [198]:
# prediction function
def generate_submission(estimator, X_test_param, filename):
    y_pred_log = estimator.predict(X_test_param)
    y_pred = np.expm1(y_pred_log)  # Convert from log-dollars to dollars
    submission = pd.read_csv('../tmdb-box-office-prediction/sample_submission.csv')
    submission['revenue'] = y_pred
    submission.to_csv(filename, index=False)

In [199]:
# make predictions
# first Kaggle submission RMSE: 3.24910
generate_submission(pipeline, X_test, 'tmdb_sub_1.csv')

### XG Boost

In [282]:
# X_test has 1 NaN for each dt feature, 4 NaN for runtime
# I should probably just run an imputer on all of these
# what's the deal with simpleimputer returning an array 
# and iterativeimputer being experimental and impossible to import?
X_test = X_test.fillna(X_test.median())
X_test.isnull().sum()

budget             0
runtime            0
collection         0
released           0
popularity         0
has_homepage       0
collection_id      0
release_year       0
release_month      0
release_day        0
Action             0
Adventure          0
Animation          0
Comedy             0
Crime              0
Documentary        0
Drama              0
Family             0
Fantasy            0
Foreign            0
History            0
Horror             0
Music              0
Mystery            0
Romance            0
Science Fiction    0
Thriller           0
War                0
Western            0
language_en        0
language_fr        0
language_ru        0
language_es        0
language_hi        0
language_ja        0
language_it        0
language_ko        0
language_cn        0
language_zh        0
language_de        0
language_ta        0
dtype: int64

In [283]:
X_train = X_train.fillna(X_test.median())
X_train.isnull().sum()

budget             0
runtime            0
collection         0
released           0
popularity         0
has_homepage       0
collection_id      0
release_year       0
release_month      0
release_day        0
Action             0
Adventure          0
Animation          0
Comedy             0
Crime              0
Documentary        0
Drama              0
Family             0
Fantasy            0
Foreign            0
History            0
Horror             0
Music              0
Mystery            0
Romance            0
Science Fiction    0
Thriller           0
War                0
Western            0
language_en        0
language_fr        0
language_ru        0
language_es        0
language_hi        0
language_ja        0
language_it        0
language_ko        0
language_cn        0
language_zh        0
language_de        0
language_ta        0
dtype: int64

In [284]:
X_train.shape, X_val.shape, X_test.shape

((2700, 41), (300, 41), (4398, 41))

In [290]:
eval_set = [(X_train, y_train_log),
            (X_val, y_val_log)]

model = XGBRegressor(n_estimators=10000, n_jobs=-1, eta=0.05, max_depth=2)
model.fit(X_train, y_train_log, eval_set=eval_set, eval_metric='rmse',
         early_stopping_rounds=100)

[0]	validation_0-rmse:14.242	validation_1-rmse:14.3165
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:12.8873	validation_1-rmse:12.9623
[2]	validation_0-rmse:11.6751	validation_1-rmse:11.7551
[3]	validation_0-rmse:10.5919	validation_1-rmse:10.6725
[4]	validation_0-rmse:9.62499	validation_1-rmse:9.7091
[5]	validation_0-rmse:8.76403	validation_1-rmse:8.85487
[6]	validation_0-rmse:7.99841	validation_1-rmse:8.09285
[7]	validation_0-rmse:7.31964	validation_1-rmse:7.41743
[8]	validation_0-rmse:6.71986	validation_1-rmse:6.81768
[9]	validation_0-rmse:6.19024	validation_1-rmse:6.29043
[10]	validation_0-rmse:5.72614	validation_1-rmse:5.82878
[11]	validation_0-rmse:5.32007	validation_1-rmse:5.42343
[12]	validation_0-rmse:4.96591	validation_1-rmse:5.07005
[13]	validation_0-rmse:4.66004	validation_1-rmse:4.76785
[14]	validation_0-rmse:4.39577	validation_1-rmse:4.503

[141]	validation_0-rmse:2.90197	validation_1-rmse:3.15673
[142]	validation_0-rmse:2.90075	validation_1-rmse:3.15659
[143]	validation_0-rmse:2.90018	validation_1-rmse:3.15787
[144]	validation_0-rmse:2.89956	validation_1-rmse:3.15805
[145]	validation_0-rmse:2.89894	validation_1-rmse:3.15847
[146]	validation_0-rmse:2.89808	validation_1-rmse:3.16196
[147]	validation_0-rmse:2.89686	validation_1-rmse:3.16246
[148]	validation_0-rmse:2.89611	validation_1-rmse:3.16202
[149]	validation_0-rmse:2.89556	validation_1-rmse:3.16225
[150]	validation_0-rmse:2.89477	validation_1-rmse:3.16149
[151]	validation_0-rmse:2.89406	validation_1-rmse:3.16135
[152]	validation_0-rmse:2.89388	validation_1-rmse:3.16147
[153]	validation_0-rmse:2.89326	validation_1-rmse:3.1616
Stopping. Best iteration:
[53]	validation_0-rmse:2.97967	validation_1-rmse:3.10426



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eta=0.05, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=10000, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

# Junk Drawer

In [92]:
test_wrangled[test_wrangled['release_year'] > 2019]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,collection,released,has_homepage,collection_id,collection_name,release_year,release_month,release_day


In [38]:
train_copy = train_wrangled.copy()

In [None]:
train_test = train

In [63]:
train_copy['belongs_to_collection'].fillna(0, inplace=True)
train_copy.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue,collection,released,has_homepage,collection_id,collection_name,release_year,release_month,release_day,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,1,1,0,313576.0,Hot Tub Time Machine Collection,2015,2,20,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[{'iso_3166_1': 'US', 'name': 'United States o...",8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435,1,1,0,107674.0,The Princess Diaries Collection,2004,8,6,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0
2,3,0,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,"[{'name': 'Bold Films', 'id': 2266}, {'name': ...","[{'iso_3166_1': 'US', 'name': 'United States o...",10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000,0,1,1,,,2014,10,10,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,0,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,/aTXRaPrWSinhcmCrcfJK17urp3F.jpg,,"[{'iso_3166_1': 'IN', 'name': 'India'}]",3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000,0,1,1,,,2012,3,9,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,5,0,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,/m22s7zvkVFDU9ir56PiiqIEWFdT.jpg,,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970,0,1,0,,,2009,2,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [58]:
type(train_copy.iloc[2]['belongs_to_collection'])

float

In [39]:
# Get genre lists for each movie
train_copy['list_of_genres'] = list(train_wrangled['genres'].
                                    apply(lambda x: [i['name'] for i in x]
                                          if type(x) == list 
                                          else []).values)

In [41]:
mlb = MultiLabelBinarizer()
train_copy = train_copy.join(pd.DataFrame(mlb.fit_transform(train_copy.pop('list_of_genres')),
                          columns=mlb.classes_,
                          index=train_copy.index))

In [75]:
from sklearn.compose import ColumnTransformer

In [None]:
ct = ColumnTransformer(
    )

In [43]:
pd.set_option('display.max_columns', 100)
train_copy.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue,collection,released,has_homepage,collection_id,collection_name,release_year,release_month,release_day,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,1,1,0,313576.0,Hot Tub Time Machine Collection,2015,2,20,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[{'iso_3166_1': 'US', 'name': 'United States o...",8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435,1,1,0,107674.0,The Princess Diaries Collection,2004,8,6,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,"[{'name': 'Bold Films', 'id': 2266}, {'name': ...","[{'iso_3166_1': 'US', 'name': 'United States o...",10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000,0,1,1,,,2014,10,10,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,/aTXRaPrWSinhcmCrcfJK17urp3F.jpg,,"[{'iso_3166_1': 'IN', 'name': 'India'}]",3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000,0,1,1,,,2012,3,9,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,/m22s7zvkVFDU9ir56PiiqIEWFdT.jpg,,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970,0,1,0,,,2009,2,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [202]:
list_of_genres

[['Comedy'],
 ['Comedy', 'Drama', 'Family', 'Romance'],
 ['Drama'],
 ['Thriller', 'Drama'],
 ['Action', 'Thriller'],
 ['Animation', 'Adventure', 'Family'],
 ['Horror', 'Thriller'],
 ['Documentary'],
 ['Action', 'Comedy', 'Music', 'Family', 'Adventure'],
 ['Comedy', 'Music'],
 ['Drama'],
 ['Comedy'],
 ['Drama'],
 ['Comedy', 'Crime'],
 ['Action', 'Thriller', 'Science Fiction', 'Mystery'],
 ['Action', 'Crime', 'Drama'],
 ['Horror', 'Thriller'],
 ['Drama', 'Romance'],
 ['Comedy', 'Romance'],
 ['Action', 'Thriller', 'Crime'],
 ['Adventure', 'Family', 'Science Fiction'],
 ['Horror', 'Thriller'],
 ['Thriller', 'Horror'],
 ['Thriller', 'Mystery', 'Foreign'],
 ['Horror', 'Comedy'],
 ['Comedy', 'Horror', 'Mystery', 'Thriller'],
 ['Crime', 'Drama', 'Mystery', 'Thriller'],
 ['Drama', 'Comedy', 'Romance'],
 ['Animation'],
 ['Action', 'Adventure', 'Crime', 'Thriller'],
 ['Drama', 'Comedy'],
 ['Mystery', 'Drama', 'Thriller'],
 ['Fantasy', 'Action', 'Adventure'],
 ['Horror'],
 ['Action', 'Comedy', 'Cr

In [183]:
train_wrangled['genres'].iloc[0][0]['id']

35

In [None]:
train_wrangled['genres'].iloc[0][0]['id']

In [170]:
# the lists of dictionaries can be converted to pandas DataFrame
# Maybe this will be useful later.
new = train.iloc[0]['crew']
pd.DataFrame(new).head()

Unnamed: 0,credit_id,department,gender,id,job,name,profile_path
0,59ac067c92514107af02c8c8,Directing,0,1449071,First Assistant Director,Kelly Cantley,
1,52fe4ee7c3a36847f82afad7,Directing,2,3227,Director,Steve Pink,/myHOgo8mQSCiCAZNGMRdHVr03jr.jpg
2,5524ed25c3a3687ded000d88,Writing,2,347335,Writer,Josh Heald,/pwXJIenrDMrG7t3zNfLvr8w1RGU.jpg
3,5524ed2d925141720c001128,Writing,2,347335,Characters,Josh Heald,/pwXJIenrDMrG7t3zNfLvr8w1RGU.jpg
4,5524ed3d92514166c1004a5d,Production,2,57822,Producer,Andrew Panay,


In [11]:
# maybe one hot encode the genres?
# check the cadinality of production companies, it's probably very high
# ordinal encode produciton country?
# keywords could potentially be interesting for a stretch goal, check cardinality
# maybe pull a few crew members? At least director? Maybe a few leads from cast?
# could we get is_sequel from collection dictionary?

# Also, popularity may be leakage

### code refactored into wrangle()

In [86]:
# # define helper function to get collection_id. NaN --> 0
# def get_collection_id(row):
#     if type(row['belongs_to_collection'])==str:
#         return ast.literal_eval(row['belongs_to_collection'])[0]['id']
#     else:
#         return 0

In [87]:
# ast.literal_eval(train.iloc[0]['belongs_to_collection'])[0]['id']

313576

In [96]:
# # apply the function to create the collection_id feature
# train['collection_id'] = train.apply(get_collection_id, axis=1)
# train['collection_id']

0       313576
1       107674
2            0
3            0
4            0
5            0
6            0
7            0
8       256377
9            0
10        1575
11       48190
12           0
13       91698
14           0
15           0
16           0
17           0
18           0
19        9518
20           0
21        9735
22      207621
23           0
24           0
25           0
26           0
27           0
28           0
29           0
         ...  
2970         0
2971         0
2972         0
2973         0
2974    149704
2975         0
2976         0
2977         0
2978         0
2979         0
2980         0
2981         0
2982         0
2983         0
2984    221111
2985         0
2986         0
2987         0
2988         0
2989         0
2990         0
2991    107469
2992         0
2993         0
2994         0
2995         0
2996         0
2997         0
2998         0
2999         0
Name: collection_id, Length: 3000, dtype: int64

In [92]:
# # define helper function to get collection_name. NaN --> 0
# def get_collection_name(row):
#     if type(row['belongs_to_collection'])==str:
#         return ast.literal_eval(row['belongs_to_collection'])[0]['name']
#     else:
#         return 0

In [93]:
# # apply the function to create the collection_name feature
# train['collection_name'] = train.apply(get_collection_name, axis=1)
# train['collection_name']

0       Hot Tub Time Machine Collection
1       The Princess Diaries Collection
2                                     0
3                                     0
4                                     0
5                                     0
6                                     0
7                                     0
8                 The Muppet Collection
9                                     0
10                     Rocky Collection
11      Revenge of the Nerds Collection
12                                    0
13              Chili Palmer Collection
14                                    0
15                                    0
16                                    0
17                                    0
18                                    0
19           The Transporter Collection
20                                    0
21           Friday the 13th Collection
22                     V/H/S Collection
23                                    0
24                                    0


In [212]:
# Test code for extracting year, month, and day from release_date converted to datetime
# This works, just needs to be refactored into wrangle()
# Refactored fine, keeping for records

train_test['release_date_dt'] = pd.to_datetime(train_test['release_date'], infer_datetime_format=True)
train_test['release_year'] = train_test['release_date_dt'].dt.year
train_test['release_month'] = train_test['release_date_dt'].dt.month
train_test['release_day'] = train_test['release_date_dt'].dt.day

0       20
1        6
2       10
3        9
4        5
5        6
6       30
7       15
8       16
9       16
10      21
11      10
12      15
13       4
14      20
15       6
16       4
17      25
18       2
19       2
20       3
21      13
22      28
23       2
24       9
25      23
26       8
27       7
28       8
29       7
        ..
2970    25
2971    12
2972    27
2973     9
2974    28
2975    31
2976     3
2977    15
2978    23
2979    23
2980     4
2981     9
2982    12
2983     4
2984     8
2985    17
2986     3
2987    20
2988    20
2989    11
2990    28
2991    12
2992    18
2993    17
2994    18
2995    22
2996    28
2997    11
2998    16
2999    22
Name: release_day, Length: 3000, dtype: int64