In [6]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler, OneHotEncoder

from currency_converter import CurrencyConverter
from CinePred.data.utils import convert, convert_budget_column, convert_to_int
from CinePred.data.data import Data

from sklearn import set_config; set_config(display='diagram') # decoration for the pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# IMPORT
data = Data('../raw_data/IMDb movies.csv')
data.import_data()

<CinePred.data.data.Data at 0x7f20282a6910>

In [15]:
data.remove_na_rows()
data.convert_income(column_name='worlwide_gross_income')

<CinePred.data.data.Data at 0x7f20282a6910>

In [31]:
int_transformer = FunctionTransformer(convert_to_int)
time_pipeline = make_pipeline(int_transformer, RobustScaler())

budget_transformer = FunctionTransformer(convert_budget_column)
ohe_transformer = make_pipeline(OneHotEncoder(sparse=False))

preproc_basic = make_column_transformer((time_pipeline, ['year', 'duration']),
                                        (ohe_transformer, ['genre']),
                                        (budget_transformer, ['budget']))

pipeline = make_pipeline(preproc_basic, GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3))
pipeline

In [32]:
X = data.dataframe[['budget', 'genre', 'duration', 'year']]
y = data.dataframe['worlwide_gross_income']
X.shape, y.shape

((6616, 4), (6616,))

In [33]:
def baseline(pipeline, X, y):
    """ Returns a list of 5 r2 scores"""
    r2 = []
    tscv = TimeSeriesSplit(n_splits=5)
    for train_index, test_index in tscv.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        pipeline.fit(X_train, y_train)
        pipeline.predict(X_test)
        r2.append(pipeline.score(X_test, y_test))
    return r2

In [34]:
baseline(pipeline, X, y)

TRAIN: [   0    1    2 ... 1103 1104 1105] TEST: [1106 1107 1108 ... 2205 2206 2207]


ValueError: Found unknown categories ['Action, Crime, Romance', 'Drama, Music, Mystery', 'Drama, Comedy', 'Mystery, Sci-Fi, Thriller', 'Comedy, Drama, Mystery', 'Drama, Family, Fantasy', 'Drama, Romance, Western', 'Action, War', 'Crime, Romance, Thriller', 'Action, Mystery, Thriller', 'Biography, Comedy, Romance', 'Animation, Drama, Family', 'Fantasy, Sci-Fi', 'Drama, History, Horror', 'Crime, Drama, Fantasy', 'Romance, Comedy, Musical', 'Action, Horror, Thriller', 'Drama, Family, Romance', 'Comedy, Romance, Thriller', 'Comedy, Action, Sci-Fi', 'Action, Mystery, Sci-Fi', 'Crime, Fantasy, Horror', 'Adventure, Drama, Sci-Fi', 'Action, Fantasy, Romance', 'Drama, History', 'Adventure, Sci-Fi, Thriller', 'Action, Adventure, Mystery', 'Action, Romance, Thriller', 'Family, Drama', 'Comedy, Family, Music', 'Drama, Fantasy, Sport', 'Drama, Comedy, Crime', 'Action, Drama, Fantasy', 'Comedy, Family, Romance', 'Adventure, Biography, Crime', 'Comedy, Romance, Drama', 'Comedy, Mystery, Romance', 'Drama, Family, History', 'Comedy, Drama, History', 'Family, Comedy', 'Crime, Drama, Action', 'Romance, Drama', 'Biography, Drama, Fantasy', 'Adventure, Drama, Musical', 'Sci-Fi', 'Animation, Action, Crime', 'Crime, Thriller, Drama', 'Action, Comedy, Thriller', 'Action, Adventure, History', 'Comedy, Family, Sport', 'Adventure, Comedy, Horror', 'Action, Comedy, Sci-Fi', 'Crime, Horror', 'Action, Adventure, Horror', 'Comedy, Crime, Sport', 'Drama, Romance, Sci-Fi', 'Drama, Thriller, War', 'Biography, Drama, Thriller'] in column 0 during transform