# Importação da informação

## Importar pacotes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker

In [2]:
# Train Split Test
from sklearn.model_selection import train_test_split

In [3]:
# modelos lineares
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# modelos baseados em árvore
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# funcionalidades para o pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from feature_engine.imputation import ArbitraryNumberImputer, MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper

# Validação cruzada
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# Importando o objeto que separa os nossos dados de acordo com o KFold
from sklearn.model_selection import KFold

# Cross Validation Score
from sklearn.model_selection import cross_val_score

In [4]:
import shap

In [5]:
# Métricas de modelo
from sklearn.metrics import(
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

In [6]:
# GridSearchCV
from sklearn.model_selection import GridSearchCV

In [7]:
# Mostrar diagramas no GridSearchCV
from sklearn import set_config
set_config(display='diagram')

## Importar base salva

In [115]:
# Onde salvar .csv
export_path = 'export_tmdb_database'
export_image_path = 'export_image'

In [116]:
df_movies_abt = pd.read_csv(f"{export_path}\df_movies_abt.csv")
df_movies_abt.head()

Unnamed: 0,movie_id,adult,budget,original_language,popularity,revenue,runtime,vote_average,vote_count,nmonth_release,...,genre Horror,genre Music,genre Mystery,genre Romance,genre Science Fiction,genre TV Movie,genre Thriller,genre War,genre Western,financial_success
0,1107083,0,182685,hi,0.694,2557594,150,10.0,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1107828,0,130000,mr,0.6,560000,125,10.0,1,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,1108726,0,90000000,en,1.4,161600000,90,0.0,0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1111182,0,6000000,en,0.6,66623990,120,7.0,1,8,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,1008660,0,9205858,sw,0.6,500028,1,0.0,0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


## Machine Learning - Classificação - Filme de Sucesso

### Preparação dos parâmetros

In [117]:
# Função de classificação automática de tipo de variáveis de forma mais avançada de um dataframe
def get_type_dataframe(df, classify_bool_to_cat=False):

    #-----------------------------------------------------------------------
    # Types of columns

    numeric_cols = pd.DataFrame(df.head(1)._get_numeric_data().columns, columns=['columns'])

    bool_cat_cols = []
    if classify_bool_to_cat:
        for col in list(numeric_cols['columns'].values):
            if (list(df[col].sort_values().unique()) == [0,1]) | (list(df[col].sort_values().unique()) == [0]) | (list(df[col].sort_values().unique()) == [1]):
                bool_cat_cols.append(col)

    df_dtypes = (
        pd.DataFrame(df.dtypes, columns=['data_type']).reset_index().rename({'index':'columns'}, axis='columns')

        # Get generic_data_type
        .merge(numeric_cols, on='columns', how='left', indicator=True)
        .assign(generic_data_type = lambda df : df['_merge'].apply(lambda y : 'numeric' if y == 'both' else 'categoric'))
        .drop(['_merge'], axis=1)

        # Update generic_data_type with boolean [0,1]
        .merge(pd.DataFrame(bool_cat_cols, columns=['columns']), on='columns', how='left', indicator=True)
        .assign(generic_data_type = lambda df : np.where(df['_merge'] == 'both', 'categoric', df['generic_data_type']))
        .drop(['_merge'], axis=1)

        # Update index
        .set_index(['columns'])
    )

    del numeric_cols
    del bool_cat_cols

    df_col_types_T = df_dtypes.T


    #-----------------------------------------------------------------------
    # List of numeric (non categorical) and categoric fields
    list_non_cat = list(df_dtypes[df_dtypes['generic_data_type'] == 'numeric'].index)
    list_cat = list(df_dtypes[df_dtypes['generic_data_type'] == 'categoric'].index)

    return list_non_cat, list_cat

In [119]:
# Convertendo o tipo da variável df_movies_abt['nmonth_release'] para object
df_movies_abt['nmonth_release'] = [str(x) for x in list(df_movies_abt['nmonth_release'])]

In [120]:
# Separando variáveis categóricas e não categóricas automaticamente
list_non_cat, list_cat = get_type_dataframe(df_movies_abt)

In [137]:
# Variáveis
key_vars = ['movie_id']
target = ['financial_success']
ignore_columns = ['revenue','nquarter_release','popularity']
num_vars = list(set(list_non_cat) - set(key_vars) - set(target) - set(ignore_columns)) # Classificando todas as variáveis numéricas e eliminando as key_vars, lembrando que bool (0,1) estão já tratadas
cat_vars = list(set(list_cat) - set(key_vars) - set(target) - set(ignore_columns)) # Classificando todas as variáveis categóricas e eliminando as key_vars

features = cat_vars + num_vars

X = df_movies_abt[features]
y = df_movies_abt[target]

# Random state
#var_random_state=42

# display(cat_vars)
# display(num_vars)

In [122]:
# Separar dados em treino e em teste
X_train, X_oot, y_train, y_oot = train_test_split(X, y, test_size=0.30, shuffle=True)
y_train = np.array(y_train).flatten()
y_oot = np.array(y_oot).flatten()

In [123]:
### PIPELINE
# Criando a pipeline do modelo - árvores para variáveis numéricas
steps_modelos_arvores_num_vars = [
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('numeric_scaler', SklearnTransformerWrapper(transformer=MinMaxScaler(), variables=num_vars))
]

# Criando a pipeline do modelo - árvores para variáveis categóricas
steps_modelos_arvores_cat_vars = [
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, imputation_method='missing', fill_value='Missing')),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars))
]

# Criando a pipeline do lineares - árvores para variáveis numéricas
steps_modelos_lineares_num_vars = [
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('numeric_scaler', SklearnTransformerWrapper(transformer=MinMaxScaler(), variables=num_vars))
]

# Criando a pipeline do modelo - lineares para variáveis categóricas
steps_modelos_lineares_cat_vars = [
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, fill_value='missing')),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars))
]



# Modelo de ML - Lineares
modelos_lineares = [
    ('logistic_regression', LogisticRegression(max_iter=1000,random_state=None)),
    ('svm', SVC(random_state=None, probability=True))
]

# Modelo de ML - Árvores
modelos_arvores = [
    ('decision_tree', DecisionTreeClassifier(random_state=None)),
    ('random_forest', RandomForestClassifier(random_state=None)),
    ('gb', GradientBoostingClassifier(random_state=None)),
    ('xgb', XGBClassifier(random_state=None)),
    ('lgbm', LGBMClassifier(random_state=None)),
    ('catboost', CatBoostClassifier(random_state=None, logging_level='Silent'))
]


# Criando a pipeline dos passos de pré-tratamento de modelos lineares
steps_modelos_arvores = []
steps_modelos_lineares = []
if len(num_vars) > 0:
    steps_modelos_arvores = steps_modelos_arvores + steps_modelos_arvores_num_vars
    steps_modelos_lineares = steps_modelos_lineares + steps_modelos_lineares_num_vars
if len(cat_vars) > 0:
    steps_modelos_arvores = steps_modelos_arvores + steps_modelos_arvores_cat_vars
    steps_modelos_lineares = steps_modelos_lineares + steps_modelos_lineares_cat_vars

In [126]:
# Função para automatizar o processo
def treinar_modelo(modelo, model_type, steps, X_train, y_train, cv, random_state=None, n_jobs=-1, do_real_train=False, do_real_test=False, X_oot=None, y_oot=None):
        pipeline = Pipeline(steps=steps + [modelo])
        resultado_cv = cross_validate(
                estimator=pipeline, 
                X=X_train,
                y=y_train,
                scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
                cv=cv,
                n_jobs=n_jobs)
        
        # List of return of variables
        df_aux = pd.DataFrame(resultado_cv)
        dfar = df_aux.mean()
        list_return = [[
                modelo[0]
                ,model_type
                ,'Cross Validation'
                ,dfar.loc['test_accuracy']
                ,dfar.loc['test_precision'] 
                ,dfar.loc['test_recall']
                ,dfar.loc['test_f1']
                ,dfar.loc['test_roc_auc']
        ]]

        # Real train or real test
        if (do_real_train == True) or (do_real_test == True and len(X_oot) == len(y_oot)):
                real_model = pipeline.fit(X_train, y_train)

        # Real train or real test
        if (do_real_train == True):
                _y_pred_train = real_model.predict(X_train)
                metrics_train = [
                        modelo[0]
                        ,model_type
                        ,'Vs Training'
                        ,accuracy_score(y_train, _y_pred_train)
                        ,precision_score(y_train, _y_pred_train)
                        ,recall_score(y_train, _y_pred_train)
                        ,f1_score(y_train, _y_pred_train)
                        ,roc_auc_score(y_train, _y_pred_train)
                ]
                #display(metrics_train)
                list_return.append(metrics_train)


        # Real train or real test
        if (do_real_test == True and len(X_oot) == len(y_oot)):
                _y_pred_oot = real_model.predict(X_oot)
                metrics_test = [
                        modelo[0]
                        ,model_type
                        ,'Vs Test'
                        ,accuracy_score(y_oot, _y_pred_oot)
                        ,precision_score(y_oot, _y_pred_oot)
                        ,recall_score(y_oot, _y_pred_oot)
                        ,f1_score(y_oot, _y_pred_oot)
                        ,roc_auc_score(y_oot, _y_pred_oot)
                ]
                #display(metrics_test)
                list_return.append(metrics_test)

        return list_return

### Executar Machine Learning com modelos de classificação (Cross Validation)

In [127]:
# Fazendo o treino em vários modelos de machine learning com o StratifiedKFold para verificar o melhor modelo
#%%time

# Layout de retorno em branco
df_results = pd.DataFrame(columns=['model_name','model_type','type_return','accuracy', 'precision', 'recall', 'f1', 'roc_auc'])

# Tipos de treinamento
var_do_real_train = False
var_do_real_test = True

# Listar tipos de treinamentos retornados
list_type_return = ['cross_validation']
if var_do_real_train: list_type_return.append('vs_train')
if var_do_real_test: list_type_return.append('vs_test')


# Modelo de cross validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)

# Loop modelos lineares
for model in modelos_lineares:
    model_name = model[0]
    print(f'Treinando {model_name} ...', end=' ')
    aux = treinar_modelo(model, 'linear',steps_modelos_lineares, X_train, y_train, skf, do_real_train=var_do_real_train, do_real_test=var_do_real_test, X_oot=X_oot, y_oot=y_oot)

    for i, type_return in enumerate(list_type_return):
        df_results.loc[f'{model_name} ({type_return})'] = aux[i]
        #display(df_results.loc[f'{model_name} ({type_return})','acuracidade'])
    print('OK')


# Loop modelos árvores
for model in modelos_arvores:
    model_name = model[0]
    print(f'Treinando {model_name} ...', end=' ')
    aux = treinar_modelo(model, 'tree', steps_modelos_arvores, X_train, y_train, skf, do_real_train=var_do_real_train, do_real_test=var_do_real_test, X_oot=X_oot, y_oot=y_oot)

    for i, type_return in enumerate(list_type_return):
        df_results.loc[f'{model_name} ({type_return})'] = aux[i]
        #df_results.loc[f'{model_name} ({type_return})','model_name'] = model_name

    print('OK')

Treinando logistic_regression ... OK
Treinando svm ... OK
Treinando decision_tree ... OK
Treinando random_forest ... OK
Treinando gb ... OK
Treinando xgb ... OK
Treinando lgbm ... OK
Treinando catboost ... OK


### Fazer a análise do Cross-Validation

In [128]:
metrica_ref = 'precision'

In [129]:
df_results.sort_values(by=['type_return',metrica_ref], ascending=[True,False])

Unnamed: 0,model_name,model_type,type_return,accuracy,precision,recall,f1,roc_auc
gb (cross_validation),gb,tree,Cross Validation,0.74588,0.750977,0.716687,0.733365,0.82451
catboost (cross_validation),catboost,tree,Cross Validation,0.747182,0.747985,0.727335,0.737284,0.829413
lgbm (cross_validation),lgbm,tree,Cross Validation,0.742199,0.74236,0.722465,0.732088,0.823973
random_forest (cross_validation),random_forest,tree,Cross Validation,0.735485,0.737618,0.711372,0.724047,0.811764
logistic_regression (cross_validation),logistic_regression,linear,Cross Validation,0.711437,0.7322,0.645213,0.68582,0.782745
xgb (cross_validation),xgb,tree,Cross Validation,0.727906,0.723809,0.715357,0.719424,0.805974
svm (cross_validation),svm,linear,Cross Validation,0.672659,0.70652,0.564373,0.627079,0.747878
decision_tree (cross_validation),decision_tree,tree,Cross Validation,0.681323,0.673805,0.674946,0.673807,0.681161
catboost (vs_test),catboost,tree,Vs Test,0.747852,0.742038,0.731937,0.736953,0.747316
gb (vs_test),gb,tree,Vs Test,0.744315,0.740622,0.72356,0.731992,0.743616


### Executar melhor modelo selecionado

In [130]:
# Usar o melhor modelo para treinar a base inteira
row_best_model = df_results[df_results['type_return'] == 'Cross Validation'].sort_values(by=metrica_ref, ascending=False).head(1)
model_type_best_model = row_best_model['model_type'][0]
name_best_model = row_best_model['model_name'][0]

#name_best_model = 'logistic_regression'

for model in modelos_lineares:
    if model[0] == name_best_model:
        print(f'Treinando no melhor modelo: {model[0]} ...', end=' ')
        best_model = Pipeline(steps=steps_modelos_lineares + [model])
for model in modelos_arvores:
    if model[0] == name_best_model:
        print(f'Treinando no melhor modelo: {model[0]} ...', end=' ')
        best_model = Pipeline(steps=steps_modelos_arvores + [model])

best_model.fit(X_train, y_train)
print('OK')

Treinando no melhor modelo: gb ... OK


### Verificar o modelo criado

In [131]:
# Metricas - vs_train
y_proba_train = best_model.predict_proba(X_train)[:, 1]
display(y_proba_train)

results_train_best_model = pd.DataFrame({'y_true' : y_train})
results_train_best_model['y_prob'] = y_proba_train
results_train_best_model['y_pred'] = np.round(y_proba_train).astype('int64')
results_train_best_model['y_hit'] = results_train_best_model['y_true'] == results_train_best_model['y_pred']

y_train_comp, y_pred_comp = (np.array(y_train).flatten(), np.round(y_proba_train).astype('int64'))
metricas_best_model = pd.DataFrame(
    data=[
        accuracy_score(y_train_comp, y_pred_comp)
        ,precision_score(y_train_comp, y_pred_comp)
        ,recall_score(y_train_comp, y_pred_comp)
        ,f1_score(y_train_comp, y_pred_comp)
        ,roc_auc_score(y_train_comp, y_pred_comp)
    ]
    ,columns=['vs_train']
    ,index=['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
)

metricas_best_model


array([0.1991103 , 0.39212821, 0.26342224, ..., 0.6984257 , 0.43670355,
       0.75355994])

Unnamed: 0,vs_train
accuracy,0.790945
precision,0.795862
recall,0.76865
f1,0.782019
roc_auc,0.790416


### Usar a base de testes do treinamento

In [132]:
# Fazer teste
y_pred_oot = best_model.predict(X_oot)
y_proba_oot = best_model.predict_proba(X_oot)[:, 1]
y_pred_oot

array([1, 0, 1, ..., 0, 1, 0], dtype=int64)

In [133]:
metricas_best_model['vs_test'] = [
    accuracy_score(y_oot, y_pred_oot)
    ,precision_score(y_oot, y_pred_oot)
    ,recall_score(y_oot, y_pred_oot)
    ,f1_score(y_oot, y_pred_oot)
    ,roc_auc_score(y_oot, y_pred_oot)
]

metricas_best_model

Unnamed: 0,vs_train,vs_test
accuracy,0.790945,0.744315
precision,0.795862,0.740622
recall,0.76865,0.72356
f1,0.782019,0.731992
roc_auc,0.790416,0.743616


In [134]:
y_oot_comp, y_proba_oot_comp = (np.array(y_oot).flatten(), np.round(y_proba_oot).astype('int64'))
results_oot_best_model = pd.DataFrame({'y_true' : y_oot_comp})
results_oot_best_model['y_prob'] = y_proba_oot_comp
results_oot_best_model['y_pred'] = np.round(y_proba_oot_comp).astype('int64')
results_oot_best_model['y_hit'] = results_oot_best_model['y_true'] == results_oot_best_model['y_pred']

### Importância das variáveis

In [139]:
X_oot_transformado = best_model[:-1].transform(X_oot)
X_oot_transformado

Unnamed: 0,runtime,genre History,vote_average,genre Horror,genre War,has_collection,genre Family,genre Music,genre Adventure,genre TV Movie,...,nmonth_release_12,nmonth_release_5,nmonth_release_9,nmonth_release_3,nmonth_release_8,nmonth_release_4,nmonth_release_6,nmonth_release_2,nmonth_release_1,nmonth_release_11
5456,0.370079,0.0,0.7540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
5020,0.413386,0.0,0.5200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
4994,0.271654,0.0,0.4283,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2969,0.385827,0.0,0.6558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3867,0.318898,0.0,0.6336,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2633,0.472441,0.0,0.7166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
62,0.314961,0.0,0.7821,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
842,0.543307,0.0,0.5527,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,0,0,0
1766,0.358268,0.0,0.6862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [178]:
#len(X_oot_transformado.columns)

#len(np.round(best_model[-1].feature_importances_ ,1))

pd.set_option('display.max_rows',100)
importance = pd.DataFrame({'feature':X_oot_transformado.columns,'importance':np.round(best_model[-1].feature_importances_ ,2)}).sort_values(by=['importance'], ascending=[False])
display(importance)
pd.set_option('display.max_rows',20)


Unnamed: 0,feature,importance
20,vote_count,0.43
25,budget,0.23
5,has_collection,0.1
2,vote_average,0.09
0,runtime,0.03
13,genre Romance,0.01
29,original_language_fr,0.01
24,genre Comedy,0.01
18,genre Science Fiction,0.01
16,genre Drama,0.01
