In [83]:
# IMPORT
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler, OneHotEncoder

from currency_converter import CurrencyConverter
from CinePred.data.utils import convert, convert_budget_column, convert_to_int, add_director_category, add_sin_features,\
add_cos_features, convert_to_date
from CinePred.data.data import Data
from CinePred.data.genre_ohe import GenreOHE
from CinePred.data.production_company_ohe import ProdCompOHE

from sklearn import set_config; set_config(display='diagram') # decoration for the pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer

%load_ext autoreload
%autosave 120
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Autosaving every 120 seconds


In [84]:
# IMPORT DF
data = Data('../raw_data/IMDb movies.csv')
data.import_data()

<CinePred.data.data.Data at 0x7effa2fa0b20>

In [85]:
# CLEANING
data.remove_na_rows()
data.convert_income(column_name='worlwide_gross_income')
data.convert_to_date(column_name='date_published')
data.dataframe.sort_values(by='date_published', inplace=True)
data.dataframe.reset_index(inplace=True)

In [86]:
data.dataframe['production_company']

0              Universum Film (UFA)
1       Charles Chaplin Productions
2       Charles Chaplin Productions
3                        Paris Film
4           Walt Disney Productions
                   ...             
6611          Passage Pictures (II)
6612           Rumble Riot Pictures
6613           20th Century Studios
6614             Annapurna Pictures
6615                    Burek Films
Name: production_company, Length: 6616, dtype: object

In [87]:
# DECLARE X & Y
X = data.dataframe[['budget', 'genre', 'duration', 'year', 'date_published', 'production_company']]
y = data.dataframe['worlwide_gross_income']
y = np.log(y)/np.log(10)
X.shape, y.shape

((6616, 6), (6616,))

In [88]:
sin_transformer = FunctionTransformer(add_sin_features)
cos_transformer = FunctionTransformer(add_cos_features)

In [89]:
# PIPELINE
int_transformer = FunctionTransformer(convert_to_int)
time_pipeline = make_pipeline(int_transformer, RobustScaler())

budget_transformer = FunctionTransformer(convert_budget_column)
genre_transformer = make_pipeline(GenreOHE())
comp_transformer = FunctionTransformer(lambda x: # ma fonction qui recoit une serie
                                       x.apply(lambda y: x))

preproc_basic = make_column_transformer((time_pipeline, ['year', 'duration']),
                                        (budget_transformer, ['budget']),
                                        (sin_transformer, ['date_published']),
                                        (cos_transformer, ['date_published']),
                                        (genre_transformer, ['genre']),
                                        (comp_transformer, ['production_company']))

pipeline = make_pipeline(preproc_basic, GradientBoostingRegressor())
pipeline

In [90]:
# FIT & PREDICT
def baseline(pipeline, X, y):
    """ Returns a list of 5 mae scores"""
    mae = []
    tscv = TimeSeriesSplit(n_splits=5)
    for train_index, test_index in tscv.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        mae.append(mean_absolute_error(y_test, y_pred))
    return mae

In [91]:
baseline(pipeline, X, y)

TRAIN: [   0    1    2 ... 1103 1104 1105] TEST: [1106 1107 1108 ... 2205 2206 2207]


ValueError: If using all scalar values, you must pass an index

In [None]:
pipeline.get_params()

In [22]:
from sklearn.model_selection import GridSearchCV

# Inspect all pipe components parameters to find the one you want to gridsearch


# Instanciate grid search
grid_search = GridSearchCV(
    pipeline, 
    param_grid={
        # Access any component of the pipeline, as far back as you want
        'gradientboostingregressor__learning_rate': [0.001, 0.01, 0.1],
        'gradientboostingregressor__n_estimators': [10,100,200,500],
        'gradientboostingregressor__max_depth': [2,3,4]},
    cv=TimeSeriesSplit(n_splits=5),
    scoring="neg_mean_absolute_error")

grid_search.fit(X, y)
grid_search.best_params_

{'gradientboostingregressor__learning_rate': 0.1,
 'gradientboostingregressor__max_depth': 2,
 'gradientboostingregressor__n_estimators': 100}