In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score

from sklearn import set_config; set_config(display='diagram') # decoration for the pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer


from currency_converter import CurrencyConverter
from CinePred.data.utils import convert, convert_budget_column, convert_to_int, convert_to_date
from CinePred.data.data import Data

%load_ext autoreload
%autoreload 2

### Baseline

#### Baseline (linear regression)

In [9]:
# Declaring X and y
reg_df = df[df.currency == '$'].copy()
reg_df['year'] = reg_df['year'].astype('int64')
reg_df.sort_values(by=['year'], inplace=True)
reg_df.reset_index(inplace=True)

X = reg_df[['budget', 'avg_vote', 'duration', 'year']]
y = reg_df['worlwide_gross_income']
# X.avg_vote.hist()
X.shape, y.shape

((9025, 4), (9025,))

In [10]:
n_splits = 5
tscv = TimeSeriesSplit(n_splits)
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)


In [11]:
# Scaling avg_vote
scaler = RobustScaler()
X['avg_vote'] = scaler.fit_transform(X[['avg_vote']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['avg_vote'] = scaler.fit_transform(X[['avg_vote']])


In [12]:
X

Unnamed: 0,budget,avg_vote,duration,year
0,18000,1.285714,76,1920
1,800000,0.642857,150,1921
2,250000,1.428571,68,1921
3,351000,0.500000,82,1923
4,923000,1.357143,95,1925
...,...,...,...,...
9020,18000000,0.285714,123,2020
9021,85000000,0.142857,99,2020
9022,8000000,-0.428571,88,2020
9023,1000000,0.571429,76,2020


In [13]:
def baseline(model, X, y):
    """ Returns a list of 5 r2 scores"""
    r2 = []
    tscv = TimeSeriesSplit(n_splits=5)
    for train_index, test_index in tscv.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2.append(r2_score(y_test, y_pred))
    return r2

print(baseline(LinearRegression(), X, y))

TRAIN: [   0    1    2 ... 1502 1503 1504] TEST: [1505 1506 1507 ... 3006 3007 3008]
TRAIN: [   0    1    2 ... 3006 3007 3008] TEST: [3009 3010 3011 ... 4510 4511 4512]
TRAIN: [   0    1    2 ... 4510 4511 4512] TEST: [4513 4514 4515 ... 6014 6015 6016]
TRAIN: [   0    1    2 ... 6014 6015 6016] TEST: [6017 6018 6019 ... 7518 7519 7520]
TRAIN: [   0    1    2 ... 7518 7519 7520] TEST: [7521 7522 7523 ... 9022 9023 9024]
[0.3513339173194543, 0.5435098614324834, 0.5882116752439108, 0.6643727664645329, 0.6012721203225608]


#### Baseline (GradientBoostingRegressor)

In [14]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(
    n_estimators=100, 
    learning_rate=0.1,
    max_depth=3
)
baseline(model, X, y)

TRAIN: [   0    1    2 ... 1502 1503 1504] TEST: [1505 1506 1507 ... 3006 3007 3008]
TRAIN: [   0    1    2 ... 3006 3007 3008] TEST: [3009 3010 3011 ... 4510 4511 4512]
TRAIN: [   0    1    2 ... 4510 4511 4512] TEST: [4513 4514 4515 ... 6014 6015 6016]
TRAIN: [   0    1    2 ... 6014 6015 6016] TEST: [6017 6018 6019 ... 7518 7519 7520]
TRAIN: [   0    1    2 ... 7518 7519 7520] TEST: [7521 7522 7523 ... 9022 9023 9024]


[0.27786931255389513,
 0.3487515161996345,
 0.664575538082975,
 0.6565889689490769,
 0.6566061866124366]

### Pipeline

In [17]:
print('----- init Data -----')
data = Data('../raw_data/IMDb movies.csv')

print('----- import Data -----')
data.import_data()

print('----- remove na rows -----')
data.remove_na_rows()

# print('----- convert to int -----')
# data.convert_to_int('year')
# data.convert_to_int('duration')

#print('----- convert budget -----')
#data.convert_budget_column(column_name='budget',min_rows=45, out_currency='USD')

print('----- convert to date -----')
data.convert_to_date('date_published')
    
print('----- convert income column -----')
data.convert_income(column_name='worlwide_gross_income')

print('----- keep columns -----')
# data.keep_columns(columns_names=[
#     'imdb_title_id', 'title', 'year', 'date_published', 'genre',
#     'duration', 'country', 'director', 'writer', 'production_company',
#     'actors', 'budget', 'worlwide_gross_income'
# ])
data.keep_columns(columns_names=[
    'imdb_title_id', 'title', 'year', 'date_published', 'genre',
    'duration', 'production_company',
    'budget', 'worlwide_gross_income'
])

print('----- seasonality Sin/Cos -----')
data.add_sin_cos_features('Month_published')
    
print('----- reset index -----')
data.reset_index()

X = data.dataframe.drop(columns='worlwide_gross_income')
y = data.dataframe['worlwide_gross_income']

X.head()

----- init Data -----
----- import Data -----
----- remove na rows -----
----- convert to date -----
----- convert income column -----
----- keep columns -----
----- seasonality Sin/Cos -----
----- reset index -----


Unnamed: 0,imdb_title_id,title,year,date_published,genre,duration,production_company,budget,Month_published,sin_MoPub,cos_MoPub
0,tt0017136,Metropolis,1927,1928-10-01,"Drama, Sci-Fi",153,Universum Film (UFA),DEM 6000000,10,-0.866025,0.5
1,tt0021749,Luci della città,1931,1931-04-02,"Comedy, Drama, Romance",87,Charles Chaplin Productions,$ 1500000,4,0.866025,-0.5
2,tt0027977,Tempi moderni,1936,1937-03-12,"Comedy, Drama, Family",87,Charles Chaplin Productions,$ 1500000,3,1.0,6.123234000000001e-17
3,tt0029453,Il bandito della Casbah,1937,1937-10-22,"Crime, Drama, Romance",94,Paris Film,$ 60000,10,-0.866025,0.5
4,tt0029583,Biancaneve e i sette nani,1937,1938-11-30,"Animation, Family, Fantasy",83,Walt Disney Productions,$ 1499000,11,-0.5,0.8660254


In [11]:
int_transformer = FunctionTransformer(convert_to_int)
time_pipeline = make_pipeline(int_transformer, RobustScaler())

budget_transformer = FunctionTransformer(convert_budget_column)
ohe_transformer = make_pipeline(OneHotEncoder(sparse=False))

preproc_basic = make_column_transformer((time_pipeline, ['year', 'duration']),
                                        (ohe_transformer, ['genre']),
                                        (budget_transformer, ['budget']))

pipeline = make_pipeline(preproc_basic, GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3))

In [12]:
pipeline.fit(X, y)

In [126]:
X_test = pd.DataFrame({'year': [2000],
              'genre': ['Comedy, Drama'],
              'duration': [90],
              'budget': ['$ 1500000']
             })
X_test

Unnamed: 0,year,genre,duration,budget
0,2000,"Comedy, Drama",90,$ 1500000


In [127]:
y_pred = pipeline.predict(X_test)
print(round(y_pred[0]), "$")

5388785 $
