In [2]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel (mode 2).
%load_ext autoreload
%autoreload 2

In [73]:
from CinePred.new_model import *
from CinePred.data.importing import import_data

from sklearn.inspection import permutation_importance

In [5]:
# Load the preprocessed dataset through the project's import_data helper.
# The path is relative to the notebook's working directory.
df = import_data('../raw_data/preprocessed.csv')

In [79]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(11875, 45)

In [80]:
# Load the second CSV (cat_acteur.csv) with the same helper and show its (rows, columns).
acteurs_df = import_data('../raw_data/cat_acteur.csv')
acteurs_df.shape

(37101, 9)

In [81]:
# Preview the first rows to check which columns the file provides.
acteurs_df.head()

Unnamed: 0,worlwide_gross_income,year,duration,budget,production_company,director,writer,shifted,nb_movies_actor1,nb_movies_actor2,...,Horror,Family,Music,Sci-Fi,Animation,Comedy,Nb_actuals_movie_production_company,Nb_actuals_movie_directors,Nb_actuals_movie_writers,title
0,4.476463,2012,95,2.304875,Elixir Entretenimento,Lúcia Murat,"Tatiana Salem Levy, Lúcia Murat",0,1,1.0,...,0,0,0,0,0,0,1,2,1,Memories They Told Me
1,4.908817,2018,132,2.761312,Caos Calmo Filmes,"Sérgio Graciano, Manuel Pureza","Ricardo Oliveira, Manuel Pureza",0,1,1.0,...,0,0,0,0,0,1,1,1,1,Blood Lines
2,5.059217,2018,137,2.840493,Moby Dick Films,Jean-Paul Civeyrac,Jean-Paul Civeyrac,0,1,1.0,...,0,0,0,0,0,0,2,1,1,A Paris Education
3,4.336624,2018,90,2.875255,Propeler Film,Antonio Nuic,Antonio Nuic,0,1,1.0,...,0,0,0,0,0,0,1,1,1,Kid
4,5.825793,2006,87,2.935861,Bianca Film,Alessandro Angelini,"Alessandro Angelini, Angelo Carbone",0,1,1.0,...,0,0,0,0,0,0,2,1,1,Salty Air


### Cleaning

In [5]:
# Restrict the frame to the columns used below, then pass it through
# remove_na_rows (presumably drops rows with missing values -- confirm in the helper).
selected_columns = [
    'year', 'date_published', 'genre', 'duration',
    'budget', 'worlwide_gross_income',
]
df = remove_na_rows(keep_columns(df, column_names=selected_columns))

In [6]:
# Feature cleaning. Every helper used here comes from the project package and is
# not visible in this notebook, so the notes on what each one does are assumptions
# to confirm against its source.

# date_published: parse the raw column (presumably into datetimes -- confirm).
df['date_published'] = convert_to_date(df[['date_published']])

# Derive sin/cos features from the publication date (original note: "day of the
# year"; presumably a cyclical encoding -- confirm), then drop the raw date column.
df['date_sin'] = add_sin_features(df[['date_published']])
df['date_cos'] = add_cos_features(df[['date_published']])
df.drop(columns='date_published', inplace=True)

# year, duration: convert both columns with convert_to_int.
df['year'] = convert_to_int(df[['year']])
df['duration'] = convert_to_int(df[['duration']])

# genre: fit the project's GenreOHE encoder on the frame and replace df with its output.
ohe = GenreOHE()
ohe.fit(df) # the 'genre' column is specified inside the class
df = ohe.transform(df)

# budget: convert the raw column, discard rows whose converted budget equals 0,
# apply add_inflation (presumably an inflation adjustment -- confirm), then
# log-transform the result.
df['budget'] = convert_budget_column(df[['budget']])
df = df[df['budget'] != 0]
df = add_inflation(df, 'budget')
df['budget'] = log_transformation(df[['budget']])

In [7]:
# Remove pandas' row-display limit. This is a global option: every frame
# displayed after this cell is rendered in full.
pd.options.display.max_rows = None

In [8]:
# Target column: convert the raw income values, apply add_inflation, then log-transform
# (the same three-step sequence used for 'budget').
# NOTE(review): the saved output of this cell is
#   "TypeError: Years can only be converted to other years. Months only to other months."
# The traceback is not preserved, so the failing call is unknown. The df.head() output
# saved right after shows a 'year_2' column and income values that are not
# log-transformed, which suggests the failure happens inside add_inflation -- confirm
# and fix in the helper, then re-run from a fresh kernel.
df['worlwide_gross_income'] = convert_income(df[['worlwide_gross_income']])

df = add_inflation(df, 'worlwide_gross_income')

df['worlwide_gross_income'] = log_transformation(df[['worlwide_gross_income']])

TypeError: Years can only be converted to other years. Months only to other months.

In [9]:
# Inspect the cleaned frame (first rows) to verify the transformations above.
df.head()

Unnamed: 0,year,duration,budget,worlwide_gross_income,date_sin,date_cos,Action,Biography,Thriller,Crime,...,Sport,History,War,Horror,Family,Music,Sci-Fi,Animation,Comedy,year_2
0,1920,76,5.367225,8811,0.8660254,0.5,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1920
1,1921,107,5.605531,4272,-2.449294e-16,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1921
2,1921,150,7.06322,9183673,0.8660254,-0.5,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1921
3,1921,68,6.55807,26916,-0.5,0.866025,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1921
4,1923,82,6.725294,11233,1.224647e-16,-1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1923


### Fit & score

In [7]:
# Split the rows by position into two independent halves:
# df1 receives the first len(df) // 2 rows, df2 the remainder.
mid = len(df) // 2
df1, df2 = df.iloc[:mid].copy(), df.iloc[mid:].copy()

In [50]:
# Cross-validated mean absolute error of an XGBoost regressor on the first half (df1).

# The target plus the columns that are excluded from the feature set.
non_feature_columns = [
    'worlwide_gross_income', 'actors', 'production_company', 'director',
    'writer', 'imdb_title_id', 'description', 'title', 'avg_vote', 'country',
]
X1 = df1.drop(columns=non_feature_columns)
y1 = df1['worlwide_gross_income']

model1 = XGBRegressor(n_estimators=100, max_depth=2, learning_rate=0.1)

# The scorer returns negated MAE per fold; take absolute values before averaging.
cv1 = cross_val_score(model1, X1, y1, scoring='neg_mean_absolute_error', cv=5)

score1 = np.abs(cv1).mean()
score1

0.8031469292154787

In [68]:
# Cross-validated mean absolute error of an XGBoost regressor on the second half (df2).

# The target plus the columns that are excluded from the feature set.
non_feature_columns = [
    'worlwide_gross_income', 'actors', 'production_company', 'director',
    'writer', 'imdb_title_id', 'description', 'title', 'avg_vote', 'country',
]
X2 = df2.drop(columns=non_feature_columns)
y2 = df2['worlwide_gross_income']

model2 = XGBRegressor(n_estimators=200, max_depth=3, learning_rate=0.1)

# The scorer returns negated MAE per fold; take absolute values before averaging.
cv2 = cross_val_score(model2, X2, y2, scoring='neg_mean_absolute_error', cv=5)
score2 = np.abs(cv2).mean()
score2

0.5544040864726767

In [69]:
# Unweighted average of the two per-half cross-validated MAE values.
0.5 * (score1 + score2)

0.6787755078440777

In [37]:
# Cross-validated mean absolute error of a single XGBoost regressor on the whole frame.

# The target plus the columns that are excluded from the feature set.
non_feature_columns = [
    'worlwide_gross_income', 'actors', 'production_company', 'director',
    'writer', 'imdb_title_id', 'description', 'title', 'avg_vote', 'country',
]
X = df.drop(columns=non_feature_columns)
y = df['worlwide_gross_income']

model = XGBRegressor(n_estimators=100, max_depth=2, learning_rate=0.3)

# The scorer returns negated MAE per fold; take absolute values before averaging.
cv = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=5)

score = np.abs(cv).mean()
score

0.7022640447828474

### Feature importance

In [77]:
# Fit model1 on all of X1/y1 so it can be inspected below
# (cross_val_score works on clones and leaves the estimator itself unfitted).
model1.fit(X1, y1)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=2, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [78]:
# Permutation importance of model1, measured on the same data passed to fit (X1, y1).
# random_state pins the shuffles so the ranking is reproducible across re-runs.
permutation_score = permutation_importance(model1, X1, y1, n_repeats=10, random_state=0)

# Build the table column-wise so 'score decrease' stays a float column;
# stacking names and scores into a single array coerces both to object dtype.
importance_df = pd.DataFrame({
    'feature': X1.columns,
    'score decrease': permutation_score.importances_mean,
})

importance_df.sort_values(by='score decrease', ascending=False)  # most important first

Unnamed: 0,feature,score decrease
2,budget,0.2354
0,year,0.06128
32,Nb_actuals_movie_production_company,0.059933
1,duration,0.028504
31,Comedy,0.021986
33,Nb_actuals_movie_directors,0.010756
7,last income,0.010053
26,Horror,0.008832
22,Mystery,0.005671
30,Animation,0.00438


In [75]:
# Fit model2 on all of X2/y2 so it can be inspected below
# (cross_val_score works on clones and leaves the estimator itself unfitted).
model2.fit(X2, y2)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [76]:
# Permutation importance of model2, measured on the same data passed to fit (X2, y2).
# random_state pins the shuffles so the ranking is reproducible across re-runs.
permutation_score = permutation_importance(model2, X2, y2, n_repeats=10, random_state=0)

# Build the table column-wise so 'score decrease' stays a float column;
# stacking names and scores into a single array coerces both to object dtype.
importance_df = pd.DataFrame({
    'feature': X2.columns,
    'score decrease': permutation_score.importances_mean,
})

importance_df.sort_values(by='score decrease', ascending=False)  # most important first

Unnamed: 0,feature,score decrease
2,budget,0.39714
0,year,0.155756
32,Nb_actuals_movie_production_company,0.11544
1,duration,0.081286
7,last income,0.054643
31,Comedy,0.024186
6,nb_movies_actor3,0.022598
30,Animation,0.015886
27,Family,0.014152
26,Horror,0.012907


In [70]:
# Fit the full-data model on all of X/y so it can be inspected below
# (cross_val_score works on clones and leaves the estimator itself unfitted).
model.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.3, max_delta_step=0,
             max_depth=2, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [74]:
# Permutation importance of the full-data model, measured on the same data passed to fit (X, y).
# random_state pins the shuffles so the ranking is reproducible across re-runs.
permutation_score = permutation_importance(model, X, y, n_repeats=10, random_state=0)

# Build the table column-wise so 'score decrease' stays a float column;
# stacking names and scores into a single array coerces both to object dtype.
importance_df = pd.DataFrame({
    'feature': X.columns,
    'score decrease': permutation_score.importances_mean,
})

importance_df.sort_values(by='score decrease', ascending=False)  # most important first

Unnamed: 0,feature,score decrease
2,budget,0.575209
7,last income,0.110929
32,Nb_actuals_movie_production_company,0.065577
4,nb_movies_actor1,0.059297
0,year,0.052339
1,duration,0.028128
31,Comedy,0.014369
33,Nb_actuals_movie_directors,0.007284
26,Horror,0.006846
6,nb_movies_actor3,0.005091
