In [12]:
# IMPORT
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler, OneHotEncoder

from currency_converter import CurrencyConverter
from CinePred.data.utils import convert, convert_budget_column, convert_to_int, add_director_category, add_sin_features, add_cos_features
from CinePred.data.data import Data

from sklearn import set_config; set_config(display='diagram') # decoration for the pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# IMPORT DF
# Build the project's Data wrapper around the raw IMDb movies CSV, then load it.
raw_csv_path = '../raw_data/IMDb movies.csv'
data = Data(raw_csv_path)
data.import_data()

<CinePred.data.data.Data at 0x7f9d2a2fad30>

In [14]:
# CLEANING
# NOTE(review): both calls presumably update `data` in place (the cell output shows
# the same Data object coming back), so re-running this cell may not be idempotent.
# The spelling 'worlwide' is the column name used when selecting the target from
# data.dataframe, so it must be kept as-is.
income_column = 'worlwide_gross_income'
data.remove_na_rows()
data.convert_income(column_name=income_column)

<CinePred.data.data.Data at 0x7f9d2a2fad30>

In [15]:
# DECLARE X & Y
# Feature columns fed to the model and the regression target column.
feature_columns = ['budget', 'genre', 'duration', 'year', 'director', 'date_published']
target_column = 'worlwide_gross_income'

X = data.dataframe[feature_columns]
y = data.dataframe[target_column]

# Sanity check: both must have the same number of rows.
(X.shape, y.shape)

((6616, 6), (6616,))

In [16]:
# PIPELINE
# 'year' and 'duration': project helper convert_to_int first, then robust scaling.
int_transformer = FunctionTransformer(convert_to_int)
time_pipeline = make_pipeline(int_transformer, RobustScaler())

budget_transformer = FunctionTransformer(convert_budget_column)

# handle_unknown='ignore': under time-series cross-validation a genre value can show
# up in a test fold without ever appearing in the training fold; without this option
# the encoder raises at transform time. Unknown values are encoded as all zeros.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the
# old name was removed in 1.4 -- switch the keyword when upgrading scikit-learn.
ohe_transformer = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))

# Project helpers applied to 'date_published'
# (presumably cyclical sin/cos encodings -- confirm in CinePred.data.utils).
sin_transformer = FunctionTransformer(add_sin_features)
cos_transformer = FunctionTransformer(add_cos_features)

# Any input column not listed below is dropped (ColumnTransformer's default
# remainder='drop').
preproc_basic = make_column_transformer(
    (time_pipeline, ['year', 'duration']),
    (ohe_transformer, ['genre']),
    (budget_transformer, ['budget']),
    (sin_transformer, ['date_published']),
    (cos_transformer, ['date_published']),
)

# random_state is fixed so that repeated runs of the notebook give the same model.
pipeline = make_pipeline(
    preproc_basic,
    GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    ),
)
pipeline

In [None]:
# FIT & PREDICT
def baseline(pipeline, X, y, n_splits=5, verbose=True):
    """Score `pipeline` with forward-chaining time-series cross-validation.

    For each split produced by ``TimeSeriesSplit`` the pipeline is refit on the
    training rows and scored on the test rows that follow them.

    Parameters
    ----------
    pipeline : estimator exposing ``fit`` and ``score``
        Refit in place on every fold; after the call it is left fitted on the
        last training fold.
    X : object supporting positional ``.iloc`` indexing (e.g. a DataFrame)
        Features. NOTE(review): TimeSeriesSplit splits by row position, so the
        rows are assumed to be in chronological order -- confirm upstream.
    y : object supporting positional ``.iloc`` indexing (e.g. a Series)
        Target values aligned with ``X``.
    n_splits : int, default 5
        Number of folds, hence the number of scores returned.
    verbose : bool, default True
        When true, print the train/test indices of each fold.

    Returns
    -------
    list
        One ``pipeline.score`` value per fold, in fold order (R^2 for
        scikit-learn regressors).
    """
    r2 = []
    tscv = TimeSeriesSplit(n_splits=n_splits)
    for train_index, test_index in tscv.split(X):
        if verbose:
            print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        pipeline.fit(X_train, y_train)
        # score() predicts on X_test itself, so no separate predict() call is needed.
        r2.append(pipeline.score(X_test, y_test))
    return r2

In [None]:
# Run the cross-validation baseline; the trailing expression displays the per-fold scores.
fold_scores = baseline(pipeline, X, y)
fold_scores