In [150]:
# IMPORT
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler, OneHotEncoder

from currency_converter import CurrencyConverter
from CinePred.data.utils import convert, convert_budget_column, convert_to_int, add_director_category, add_sin_features,\
add_cos_features, convert_to_date
from CinePred.data.data import Data

from sklearn import set_config; set_config(display='diagram') # decoration for the pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [138]:
# LOAD RAW DATA
# The path is relative, so it resolves against the notebook's working directory.
raw_csv_path = '../raw_data/IMDb movies.csv'
data = Data(raw_csv_path)
# Kept as the last expression so the cell still displays the call's return value.
data.import_data()

<CinePred.data.data.Data at 0x7f9d2e516580>

In [139]:
# CLEANING
# NOTE(review): the return values are not reassigned, so both calls presumably
# modify `data` in place -- re-running this cell may not be idempotent; confirm
# in CinePred.data.data.
income_column = 'worlwide_gross_income'  # spelling (sic) must match the dataframe column
data.remove_na_rows()
data.convert_income(column_name=income_column)

<CinePred.data.data.Data at 0x7f9d2e516580>

In [154]:
# DECLARE X & Y
feature_columns = ['budget', 'genre', 'duration', 'year', 'date_published']
# Explicit copy so X is clearly independent of data.dataframe.
X = data.dataframe[feature_columns].copy()

income = data.dataframe['worlwide_gross_income']
# log10 is only defined for strictly positive values; fail here with a clear
# message instead of letting -inf/NaN reach the model.
assert (income > 0).all(), "worlwide_gross_income must be > 0 before taking log10"

# Target is the base-10 logarithm of the income. np.log10 replaces the manual
# np.log(y)/np.log(10); keeping `income` separate makes the cell re-runnable.
y = np.log10(income)
X.shape, y.shape

((6616, 5), (6616,))

In [155]:
# Function wrappers around the CinePred date helpers.
# NOTE(review): add_sin_features / add_cos_features presumably produce a cyclical
# encoding of the date -- confirm in CinePred.data.utils.
to_date_transformer = FunctionTransformer(convert_to_date)
sin_transformer = FunctionTransformer(add_sin_features)
cos_transformer = FunctionTransformer(add_cos_features)

# Each branch first applies convert_to_date, then its own feature function.
sin_pipe = make_pipeline(to_date_transformer, sin_transformer)
cos_pipe = make_pipeline(to_date_transformer, cos_transformer)

# Both branches read the same input column; their outputs are concatenated.
time_pipe = make_column_transformer(
    (sin_pipe, ['date_published']),
    (cos_pipe, ['date_published']),
)

In [156]:
# PIPELINE
# NOTE: despite its name, `time_pipeline` handles the 'year' and 'duration'
# columns (convert_to_int followed by RobustScaler); the features derived from
# 'date_published' come from `time_pipe`.
int_transformer = FunctionTransformer(convert_to_int)
time_pipeline = make_pipeline(int_transformer, RobustScaler())

budget_transformer = FunctionTransformer(convert_budget_column)

# TODO: no transformer is registered for 'genre'. With remainder='drop' (the
# default, made explicit here) every column not listed below is discarded.
preproc_basic = make_column_transformer(
    (time_pipeline, ['year', 'duration']),
    (budget_transformer, ['budget']),
    (time_pipe, ['date_published']),
    remainder='drop',
)

# random_state is fixed so that repeated runs give the same scores.
pipeline = make_pipeline(
    preproc_basic,
    GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    ),
)
pipeline

In [157]:
# FIT & PREDICT
def baseline(pipeline, X, y, n_splits=5, verbose=True):
    """Score `pipeline` with time-series cross-validation.

    For every fold produced by ``TimeSeriesSplit(n_splits=n_splits)`` the
    pipeline is fitted on the training rows and its predictions on the test
    rows are scored with the mean absolute error.

    NOTE(review): TimeSeriesSplit splits by row position, so the result is only
    a valid forward-in-time evaluation if X and y are already ordered
    chronologically -- nothing in this function sorts them; verify upstream.

    Parameters
    ----------
    pipeline : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on every fold; after the call it holds the fit of the
        last fold.
    X : object supporting ``.iloc`` row selection (e.g. pandas DataFrame)
        Feature rows.
    y : object supporting ``.iloc`` row selection (e.g. pandas Series)
        Target values, aligned with ``X`` by position.
    n_splits : int, default 5
        Number of folds passed to TimeSeriesSplit.
    verbose : bool, default True
        When True, print the train/test indices of each fold.

    Returns
    -------
    list
        One MAE score per fold, in fold order (``n_splits`` entries).
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    mae = []
    for train_index, test_index in tscv.split(X):
        if verbose:
            print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        mae.append(mean_absolute_error(y_test, y_pred))
    return mae

In [158]:
# Keep the per-fold MAE scores under a name instead of relying on Out[N].
mae_scores = baseline(pipeline, X, y)
mae_scores

TRAIN: [   0    1    2 ... 1103 1104 1105] TEST: [1106 1107 1108 ... 2205 2206 2207]
TRAIN: [   0    1    2 ... 2205 2206 2207] TEST: [2208 2209 2210 ... 3307 3308 3309]
TRAIN: [   0    1    2 ... 3307 3308 3309] TEST: [3310 3311 3312 ... 4409 4410 4411]
TRAIN: [   0    1    2 ... 4409 4410 4411] TEST: [4412 4413 4414 ... 5511 5512 5513]
TRAIN: [   0    1    2 ... 5511 5512 5513] TEST: [5514 5515 5516 ... 6613 6614 6615]


[0.5320805391777051,
 0.654871810355997,
 0.5517125609602167,
 0.5910773970443713,
 0.6183197889437944]