In [32]:
# IMPORT
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler, OneHotEncoder

from currency_converter import CurrencyConverter
from CinePred.data.utils import convert, convert_budget_column, convert_to_int, add_director_category, add_sin_features,\
add_cos_features, convert_to_date
from CinePred.data.data import Data
from CinePred.data.genre_ohe import GenreOHE

from sklearn import set_config; set_config(display='diagram') # decoration for the pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# IMPORT DF
# Relative path to the raw IMDb movies export read by the project's Data wrapper.
RAW_MOVIES_CSV = '../raw_data/IMDb movies.csv'

data = Data(RAW_MOVIES_CSV)
# Left as the last expression so the notebook shows what import_data() returns.
data.import_data()

<CinePred.data.data.Data at 0x7effa8394520>

In [34]:
# CLEANING
# The three helpers below are methods of CinePred's Data class; their exact
# behaviour is defined there (presumably: drop NaN rows, parse the income
# strings, parse the publication date) -- confirm against CinePred.data.data.
data.remove_na_rows()
# NOTE: 'worlwide' is the spelling used for this column throughout the
# notebook (it is also the key used to build y) -- do not "correct" it here alone.
data.convert_income(column_name='worlwide_gross_income')
data.convert_to_date(column_name='date_published')

# Rows must be in chronological order: the cross-validation further down uses
# TimeSeriesSplit, which slices folds purely by row position.
data.dataframe.sort_values(by='date_published', inplace=True)

# drop=True discards the pre-sort index instead of inserting it as a new
# 'index' column. Without it, every re-run of this cell adds another column
# ('index', then 'level_0') and the third run raises ValueError.
data.dataframe.reset_index(drop=True, inplace=True)

In [35]:
# DECLARE X & Y
# Feature matrix: the raw columns consumed by the preprocessing pipeline.
X = data.dataframe[['budget', 'genre', 'duration', 'year', 'date_published']]

# Target: base-10 logarithm of the worldwide gross income. np.log10 computes
# this directly instead of np.log(y) / np.log(10), which avoids the extra
# division and its additional rounding step.
y = np.log10(data.dataframe['worlwide_gross_income'])

# Sanity check: both must have the same number of rows.
X.shape, y.shape

((6616, 5), (6616,))

In [36]:
# No date-conversion transformer is built here: 'date_published' is already
# converted in the cleaning cell via data.convert_to_date(...).

# Wrap the project's sin / cos feature helpers so they can be used as
# scikit-learn transformer steps inside the column transformer.
sin_transformer = FunctionTransformer(func=add_sin_features)
cos_transformer = FunctionTransformer(func=add_cos_features)

In [43]:
# Stand-alone check of the genre encoder: fit it on the 'genre' column, then
# encode that same column and display the resulting frame.
ohe_transformer = make_pipeline(GenreOHE())
genres = ohe_transformer.fit(X[['genre']]).transform(X[['genre']])
genres

Unnamed: 0,Family,Sci-Fi,Action,War,Western,Comedy,Adventure,Horror,History,Sport,...,Film-Noir,Fantasy,Musical,Music,Romance,Biography,Animation,Mystery,Crime,Drama
506,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1048,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2454,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1
2827,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77241,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
71690,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
75419,0,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
80369,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [37]:
# PIPELINE
# 'year' and 'duration': run through the project's convert_to_int helper
# (presumably an integer cast -- confirm in CinePred.data.utils), then scaled
# with RobustScaler.
int_transformer = FunctionTransformer(convert_to_int)
time_pipeline = make_pipeline(int_transformer, RobustScaler())

# 'budget' and 'genre' each go through their dedicated project transformer.
budget_transformer = FunctionTransformer(convert_budget_column)
ohe_transformer = make_pipeline(GenreOHE())

# One (transformer, columns) pair per feature group. 'date_published' is fed
# to both the sin and the cos transformer defined in the previous cell.
preproc_basic = make_column_transformer((time_pipeline, ['year', 'duration']),
                                        (budget_transformer, ['budget']),
                                        (sin_transformer, ['date_published']),
                                        (cos_transformer, ['date_published']),
                                        (ohe_transformer, ['genre']))

# random_state pins the regressor's internal randomness so that the scores
# below are reproducible after "Restart & Run All". The step keeps its
# auto-generated name 'gradientboostingregressor', so parameter keys such as
# 'gradientboostingregressor__max_depth' are unaffected.
pipeline = make_pipeline(preproc_basic, GradientBoostingRegressor(random_state=42))
pipeline

In [38]:
# FIT & PREDICT
def baseline(pipeline, X, y, n_splits=5, verbose=True):
    """Score ``pipeline`` with time-series cross-validation.

    The data is split with ``TimeSeriesSplit``; for every fold the pipeline is
    fitted on the training rows and evaluated on the test rows that follow.
    Folds are taken by row position, so ``X`` and ``y`` must already be
    aligned with each other and sorted chronologically.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place (no clone is made), so after the call it holds the fit from the
        last fold.
    X : pandas.DataFrame
        Feature rows; indexed positionally through ``.iloc``.
    y : pandas.Series
        Target values; indexed positionally through ``.iloc``.
    n_splits : int, default 5
        Number of folds handed to ``TimeSeriesSplit``.
    verbose : bool, default True
        When true, print the train / test indices of every fold.

    Returns
    -------
    list of float
        One mean absolute error per fold (``n_splits`` values), in fold order.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    mae = []
    for train_index, test_index in tscv.split(X):
        if verbose:
            print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Refit from scratch on this fold's training window, then score on
        # the block of rows that comes right after it.
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        mae.append(mean_absolute_error(y_test, y_pred))
    return mae

In [39]:
# Per-fold MAE of the untuned pipeline (the target is log10 income).
baseline(pipeline=pipeline, X=X, y=y)

TRAIN: [   0    1    2 ... 1103 1104 1105] TEST: [1106 1107 1108 ... 2205 2206 2207]
TRAIN: [   0    1    2 ... 2205 2206 2207] TEST: [2208 2209 2210 ... 3307 3308 3309]
TRAIN: [   0    1    2 ... 3307 3308 3309] TEST: [3310 3311 3312 ... 4409 4410 4411]
TRAIN: [   0    1    2 ... 4409 4410 4411] TEST: [4412 4413 4414 ... 5511 5512 5513]
TRAIN: [   0    1    2 ... 5511 5512 5513] TEST: [5514 5515 5516 ... 6613 6614 6615]


[0.5685347765085719,
 0.5820784345017139,
 0.5295404046852776,
 0.5833934884599798,
 0.5448240624958174]

In [21]:
# List every tunable parameter, including those of nested steps; the keys
# shown here are the names a grid search can address.
pipeline.get_params(deep=True)

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('pipeline',
                                    Pipeline(steps=[('functiontransformer',
                                                     FunctionTransformer(func=<function convert_to_int at 0x7effb06d8670>)),
                                                    ('robustscaler',
                                                     RobustScaler())]),
                                    ['year', 'duration']),
                                   ('functiontransformer-1',
                                    FunctionTransformer(func=<function convert_budget_column at 0x7effb06d8700>),
                                    ['budget']),
                                   ('functiontransformer-2',
                                    FunctionTransformer(func=<function add_sin_features at 0x7effb06d8820>),
                                    ['date_published']),
                                   ('functiontrans

In [22]:
# NOTE(review): this import would normally live with the others in the first cell.
from sklearn.model_selection import GridSearchCV

# Candidate values for the regressor step. Keys follow the
# '<step name>__<parameter>' convention; the available names can be read from
# pipeline.get_params().
gbr_param_grid = {
    'gradientboostingregressor__learning_rate': [0.001, 0.01, 0.1],
    'gradientboostingregressor__n_estimators': [10, 100, 200, 500],
    'gradientboostingregressor__max_depth': [2, 3, 4],
}

# Instantiate the grid search: every combination is scored by negated MAE on
# five time-ordered folds.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=gbr_param_grid,
    cv=TimeSeriesSplit(n_splits=5),
    scoring="neg_mean_absolute_error",
)

grid_search.fit(X, y)
grid_search.best_params_

{'gradientboostingregressor__learning_rate': 0.1,
 'gradientboostingregressor__max_depth': 2,
 'gradientboostingregressor__n_estimators': 100}