##### Criando modelo Baseline

##### Importando bibliotecas para o projeto

In [None]:
! pip install pycaret

In [None]:
import numpy             as np
import matplotlib.pyplot as plt
import pandas            as pd
import seaborn           as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, auc
from sklearn.linear_model import LogisticRegression

##### Funções que iremos utilizar para metrificar os modelos

In [None]:
# Métricas do modelo

def calculate_metrics(nm_modelo, model, X_train, y_train, X_test, y_test):
    # Fazendo predições
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculando as métricas para o conjunto de treino
    accuracy_train = accuracy_score(y_train, y_train_pred)
    precision_train = precision_score(y_train, y_train_pred)
    recall_train = recall_score(y_train, y_train_pred)
    auc_roc_train = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])

    # Calculando o Índice Gini e Estatística KS para o conjunto de treino
    probabilities_train = model.predict_proba(X_train)[:, 1]
    df_train = pd.DataFrame({'true_labels': y_train, 'predicted_probs': probabilities_train})
    df_train = df_train.sort_values(by='predicted_probs', ascending=False)
    df_train['cumulative_true'] = df_train['true_labels'].cumsum() / df_train['true_labels'].sum()
    df_train['cumulative_false'] = (1 - df_train['true_labels']).cumsum() / (1 - df_train['true_labels']).sum()
    ks_statistic_train = max(abs(df_train['cumulative_true'] - df_train['cumulative_false']))
    gini_index_train = 2 * auc_roc_train - 1

    # Calculando as métricas para o conjunto de teste
    accuracy_test = accuracy_score(y_test, y_test_pred)
    precision_test = precision_score(y_test, y_test_pred)
    recall_test = recall_score(y_test, y_test_pred)
    auc_roc_test = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    # Calculando o Índice Gini e Estatística KS para o conjunto de teste
    probabilities_test = model.predict_proba(X_test)[:, 1]
    df_test = pd.DataFrame({'true_labels': y_test, 'predicted_probs': probabilities_test})
    df_test = df_test.sort_values(by='predicted_probs', ascending=False)
    df_test['cumulative_true'] = df_test['true_labels'].cumsum() / df_test['true_labels'].sum()
    df_test['cumulative_false'] = (1 - df_test['true_labels']).cumsum() / (1 - df_test['true_labels']).sum()
    ks_statistic_test = max(abs(df_test['cumulative_true'] - df_test['cumulative_false']))
    gini_index_test = 2 * auc_roc_test - 1

    # Criando o DataFrame com as métricas calculadas
    metrics_df = pd.DataFrame({
        'Algoritmo': [nm_modelo, nm_modelo],
        'Conjunto': ['Treino', 'Teste'],
        'Acuracia': [accuracy_train, accuracy_test],
        'Precisao': [precision_train, precision_test],
        'Recall': [recall_train, recall_test],
        'AUC_ROC': [auc_roc_train, auc_roc_test],
        'GINI': [gini_index_train, gini_index_test],
        'KS': [ks_statistic_train, ks_statistic_test]
    })
    return metrics_df

In [None]:
def plot_metrics(model, X_train, y_train, X_test, y_test, features):
    plt.figure(figsize=(15, 18))

    # Gráfico de Barras para Taxa do Evento e Importância das Variáveis
    plt.subplot(4, 2, 1)
    event_rate = y_train.mean()
    non_event_rate = 1 - event_rate
    plt.barh(['Não Evento', 'Evento'], [non_event_rate, event_rate], color=['lightsteelblue', 'skyblue'])
    plt.xlabel('Taxa')
    plt.title('Taxa do Evento')

    plt.subplot(4, 2, 2)
    importances = model.feature_importances_
    indices = np.argsort(importances)  # Ordem ascendente
    indices = indices[-10:]  # Considerando apenas as 10 mais importantes
    plt.barh(features[indices], importances[indices], color='skyblue')
    plt.xlabel('Importância')
    plt.title('Importância das Variáveis')

    # Curva ROC para Treino e Teste
    plt.subplot(4, 2, 3)
    for X, y, label in [(X_train, y_train, 'Treino'), (X_test, y_test, 'Teste')]:
        fpr, tpr, _ = roc_curve(y, model.predict_proba(X)[:, 1])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Curva ROC')
    plt.legend(loc='lower right')

    # Gráfico de Barras para Gini e KS
    plt.subplot(4, 2, 4)
    metrics = ['Índice Gini', 'Estatística KS']
    for X, y, label in [(X_train, y_train, 'Treino'), (X_test, y_test, 'Teste')]:
        probs = model.predict_proba(X)[:, 1]
        df = pd.DataFrame({'true_labels': y, 'predicted_probs': probs})
        df = df.sort_values(by='predicted_probs', ascending=False)
        df['cumulative_true'] = df['true_labels'].cumsum() / df['true_labels'].sum()
        df['cumulative_false'] = (1 - df['true_labels']).cumsum() / (1 - df['true_labels']).sum()
        ks = max(abs(df['cumulative_true'] - df['cumulative_false']))
        gini = 2 * auc(*roc_curve(y, probs)[:2]) - 1
        plt.bar([f'{label}\n{metric}' for metric in metrics], [gini, ks], color='skyblue')
    plt.title('Índice Gini e Estatística KS')

    # Matrizes de Confusão para Treino e Teste
    for i, (X, y, label) in enumerate([(X_train, y_train, 'Treino'), (X_test, y_test, 'Teste')], 5):
        plt.subplot(4, 2, i)
        conf_matrix = confusion_matrix(y, model.predict(X))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar_kws={'label': 'Count'})
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title(f'Matriz de Confusão - {label}')

    # Histograma do Score para Treino e Teste
    plt.subplot(4, 2, 7)
    plt.hist(model.predict_proba(X_train)[:, 1], bins=30, color='skyblue', edgecolor='black', alpha=0.7, label='Treino')
    plt.xlabel('Score')
    plt.ylabel('Frequência')
    plt.title('Histograma do Score - Treino')
    plt.legend(loc='upper right')

    plt.subplot(4, 2, 8)
    plt.hist(model.predict_proba(X_test)[:, 1], bins=30, color='lightsteelblue', edgecolor='black', alpha=0.7, label='Teste')
    plt.xlabel('Score')
    plt.ylabel('Frequência')
    plt.title('Histograma do Score - Teste')
    plt.legend(loc='upper right')

    plt.tight_layout()
    plt.show()

##### Leitura dos Dados

In [None]:
df_train = pd.read_csv('PoD Bank/application_train.csv')
df_test = pd.read_csv('application_test.csv')


##### Criando modelo Baseline - AutoML

In [None]:
from pycaret.classification import *

clf_setup = setup(data=df_train, target='TARGET')


Unnamed: 0,Description,Value
0,Session id,6018
1,Target,TARGET
2,Target type,Binary
3,Original data shape,"(215257, 172)"
4,Transformed data shape,"(215257, 235)"
5,Transformed train set shape,"(150679, 235)"
6,Transformed test set shape,"(64578, 235)"
7,Numeric features,155
8,Categorical features,16
9,Rows with missing values,97.2%


In [None]:
best_model = compare_models()


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9192,0.7518,0.0149,0.5292,0.0289,0.0246,0.0783,115.006
lr,Logistic Regression,0.9191,0.6257,0.0,0.0,0.0,-0.0,-0.0005,19.377
nb,Naive Bayes,0.9191,0.612,0.0,0.0,0.0,-0.0,-0.0002,1.245
ridge,Ridge Classifier,0.9191,0.7461,0.0002,0.3,0.0005,0.0004,0.0078,1.231
rf,Random Forest Classifier,0.9191,0.6976,0.0,0.0,0.0,-0.0,-0.0002,32.762
et,Extra Trees Classifier,0.9191,0.6961,0.0,0.0,0.0,-0.0,-0.0002,15.553
dummy,Dummy Classifier,0.9191,0.5,0.0,0.0,0.0,0.0,0.0,1.11
ada,Ada Boost Classifier,0.919,0.745,0.0244,0.4865,0.0464,0.039,0.0947,23.798
lda,Linear Discriminant Analysis,0.9187,0.7461,0.0235,0.4491,0.0446,0.0368,0.0879,2.82
xgboost,Extreme Gradient Boosting,0.9174,0.7266,0.0373,0.3869,0.0681,0.0546,0.0995,4.533


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
evaluate_model(best_model)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

##### Criando DF para submissão e avaliação do modelo

In [None]:
y_predict_proba_test = predict_model(best_model, data=df_test)['prediction_score']

In [None]:
# Criar DataFrame de submissão com IDs do teste
df_submission = pd.DataFrame()
df_submission['ID'] = df_test['SK_ID_CURR']  # Adicionar os IDs do teste como coluna 'ID'
df_submission['TARGET'] = 1 - y_predict_proba_test

# Salvar submissão em CSV
df_submission.to_csv('submission_baseline_automl_v1.csv', index=False)


In [None]:
df_submission.head()

Unnamed: 0,ID,TARGET
0,384575,0.1764
1,214010,0.0425
2,142232,0.1603
3,389171,0.0221
4,283617,0.0666


##### Salvando artefato do Modelo

In [None]:
save_model(best_model, 'baseline_automl_v1')


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['SK_ID_CURR', 'CNT_CHILDREN',
                                              'AMT_INCOME_TOTAL', 'AMT_CREDIT',
                                              'AMT_ANNUITY', 'AMT_GOODS_PRICE',
                                              'REGION_POPULATION_RELATIVE',
                                              'DAYS_BIRTH', 'DAYS_EMPLOYED',
                                              'DAYS_REGISTRATION',
                                              'DAYS_ID_PUBLISH', 'OWN_CAR_AGE',
                                              'FLAG_MOBIL', 'FLAG_EMP_PHONE',
                                              'FLAG_WORK_...
                                             criterion='friedman_mse', init=None,
                                             learning_rate=0.1, loss='log_loss',
                                

##### Criando modelo Baseline - Tradicional

In [None]:
# Dropar TARGET e manter SK_ID_CURR separadamente
X = df_train.drop(columns=['TARGET'])
y = df_train['TARGET']

# Dividir dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Armazenar os IDs separadamente
X_train_ids = X_train['SK_ID_CURR']
X_test_ids = X_test['SK_ID_CURR']

# Remover SK_ID_CURR antes do preprocessamento
X_train = X_train.drop(columns=['SK_ID_CURR'])
X_test = X_test.drop(columns=['SK_ID_CURR'])

# Identificar atributos categóricos e numéricos
cat_attributes = X_train.select_dtypes(include='object').columns
num_attributes = X_train.select_dtypes(exclude='object').columns

# Pipeline de pré-processamento para atributos categóricos
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', TargetEncoder())
])

# Pipeline de pré-processamento para atributos numéricos
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Combinar pipelines de pré-processamento
preprocessor = ColumnTransformer([
    ('cat', cat_pipe, cat_attributes),
    ('num', num_pipe, num_attributes)
])

# Pipeline completo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Preprocessar os dados de treino
X_train_processed = pipeline.fit_transform(X_train, y_train)
X_train_processed = pd.DataFrame(X_train_processed, columns=list(cat_attributes) + list(num_attributes))

# Adicionar SK_ID_CURR de volta aos dados de treino processados
X_train_processed['SK_ID_CURR'] = X_train_ids.reset_index(drop=True)

# Preprocessar os dados de teste
X_test_processed = pipeline.transform(X_test)
X_test_processed = pd.DataFrame(X_test_processed, columns=list(cat_attributes) + list(num_attributes))

# Adicionar SK_ID_CURR de volta aos dados de teste processados
X_test_processed['SK_ID_CURR'] = X_test_ids.reset_index(drop=True)

# Concatenar dados de treino e teste processados (opcional)
# X_processed = pd.concat([X_train_processed, X_test_processed], axis=0)

# Preprocessar os dados de produção (df_test)
df_test_ids = df_test['SK_ID_CURR']
X_prod_processed = pipeline.transform(df_test.drop(columns=['SK_ID_CURR']))
X_prod_processed = pd.DataFrame(X_prod_processed, columns=list(cat_attributes) + list(num_attributes))

# Adicionar SK_ID_CURR de volta aos dados de produção processados
X_prod_processed['SK_ID_CURR'] = df_test_ids.reset_index(drop=True)


##### Algoritmos usados para criação do modelo Baseline - Tradicional

In [None]:
# Testando modelos

algoritmos = [DecisionTreeClassifier(criterion='gini',random_state=0),
              LogisticRegression(solver='liblinear',random_state=0),
              RandomForestClassifier(random_state=0),
              GradientBoostingClassifier(random_state=0),
              XGBClassifier(random_state=0),
              lgb.LGBMClassifier(random_state=0)]
# Dropando os ID'S da modelagem
X_train_processed = X_train_processed.drop(columns=['SK_ID_CURR'])
X_test_processed = X_test_processed.drop(columns=['SK_ID_CURR'])


for algoritmo in algoritmos:

    nome_algoritmo = str(algoritmo)[:str(algoritmo).find("(")]

    # Treino do modelo
    algoritmo.fit(X_train_processed,y_train)

    # Avaliar modelo
    metricas = calculate_metrics(nome_algoritmo,algoritmo, X_train_processed, y_train, X_test_processed, y_test)
    display(metricas)

Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,DecisionTreeClassifier,Treino,1.0,1.0,1.0,1.0,1.0,1.0
1,DecisionTreeClassifier,Teste,0.84981,0.135056,0.159562,0.53496,0.069921,0.071497


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,LogisticRegression,Treino,0.919079,0.509225,0.0099,0.746402,0.492804,0.366559
1,LogisticRegression,Teste,0.919655,0.606557,0.010657,0.747001,0.494003,0.369076


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,RandomForestClassifier,Treino,0.999959,1.0,0.999498,1.0,1.0,1.0
1,RandomForestClassifier,Teste,0.919353,0.5,0.000288,0.699459,0.398917,0.302276


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,GradientBoostingClassifier,Treino,0.919892,0.725857,0.016714,0.764322,0.528644,0.393744
1,GradientBoostingClassifier,Teste,0.919957,0.63,0.018145,0.753215,0.506431,0.376989


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,XGBClassifier,Treino,0.929799,0.931871,0.143257,0.91603,0.832061,0.66343
1,XGBClassifier,Teste,0.918564,0.446875,0.041187,0.731804,0.463608,0.346875


[LightGBM] [Info] Number of positive: 13940, number of negative: 158265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044010 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23837
[LightGBM] [Info] Number of data points in the train set: 172205, number of used features: 165
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080950 -> initscore=-2.429508
[LightGBM] [Info] Start training from score -2.429508


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,LGBMClassifier,Treino,0.921036,0.835294,0.03056,0.825882,0.651764,0.49539
1,LGBMClassifier,Teste,0.919609,0.552381,0.016705,0.751443,0.502886,0.372422


In [None]:
#Os Algoritmos GradientBoosting e LightGBM apresentam resultados bem semelhantes no conjunto de Treino
#Desta forma vamos fazer um pequeno tunning nos dois modelos para decidir o algoritmo campeão

from sklearn.model_selection import GridSearchCV

# Definir os algoritmos e os parâmetros para o GridSearchCV
algoritmos = {
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=0),
    'LGBMClassifier': lgb.LGBMClassifier(random_state=0)
}

parametros = {
    'GradientBoostingClassifier': {
        'n_estimators': [50, 150]

    },
    'LGBMClassifier': {
        'n_estimators': [50, 150]
    }
}

# Dicionário para armazenar os melhores modelos e suas métricas
best_models = {}
metricas = {}

for nome_algoritmo, algoritmo in algoritmos.items():
    print(f"Realizando GridSearchCV para {nome_algoritmo}...")
    grid_search = GridSearchCV(estimator=algoritmo, param_grid=parametros[nome_algoritmo], scoring='roc_auc', cv=3, n_jobs= -1)
    grid_search.fit(X_train_processed, y_train)
    best_model = grid_search.best_estimator_
    best_models[nome_algoritmo] = best_model

    # Avaliar modelo
    metricas[nome_algoritmo] = calculate_metrics(nome_algoritmo, best_model, X_train_processed, y_train, X_test_processed, y_test)

# Exibir as métricas de todos os modelos
for nome_algoritmo, metricas_algoritmo in metricas.items():
    print(f"Desempenho do modelo: {nome_algoritmo}")
    display(metricas_algoritmo)


Realizando GridSearchCV para GradientBoostingClassifier...
Realizando GridSearchCV para LGBMClassifier...
[LightGBM] [Info] Number of positive: 13940, number of negative: 158265
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.146477 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23837
[LightGBM] [Info] Number of data points in the train set: 172205, number of used features: 165
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080950 -> initscore=-2.429508
[LightGBM] [Info] Start training from score -2.429508
Desempenho do modelo: GradientBoostingClassifier


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,GradientBoostingClassifier,Treino,0.920118,0.694915,0.023529,0.771443,0.542887,0.405564
1,GradientBoostingClassifier,Teste,0.919864,0.58871,0.021025,0.754918,0.509836,0.379996


Desempenho do modelo: LGBMClassifier


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,LGBMClassifier,Treino,0.919927,0.770609,0.015423,0.788675,0.577351,0.434189
1,LGBMClassifier,Teste,0.919771,0.640625,0.011809,0.751788,0.503576,0.374439


##### Algoritmo Campeão

In [None]:
best_models

{'GradientBoostingClassifier': GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='log_loss', max_depth=3,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_samples_leaf=1,
                            min_samples_split=2, min_weight_fraction_leaf=0.0,
                            n_estimators=150, n_iter_no_change=None,
                            random_state=0, subsample=1.0, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 'LGBMClassifier': LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.1, max_depth=-1,
                min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                n_estimators=50, n_jobs=None, num_leaves=31, objective=None,
          

In [None]:
algoritmos = [GradientBoostingClassifier(random_state=0, n_estimators=150)]

for algoritmo in algoritmos:

    nome_algoritmo = str(algoritmo)[:str(algoritmo).find("(")]
    # Treino do modelo
    algoritmo.fit(X_train_processed,y_train)

    # Avaliar modelo
    metricas = calculate_metrics(nome_algoritmo,algoritmo, X_train_processed, y_train, X_test_processed, y_test)
    display(metricas)

Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,GradientBoostingClassifier,Treino,0.920118,0.694915,0.023529,0.771443,0.542887,0.405564
1,GradientBoostingClassifier,Teste,0.919864,0.58871,0.021025,0.754918,0.509836,0.379996


##### Criando DF para submissão e avaliação do modelo

In [None]:
# Fazer previsões no conjunto de produção (ou teste)
y_predict_proba_prod = algoritmo.predict_proba(X_prod_processed.drop(columns=['SK_ID_CURR']))[:, 1]

# Criar DataFrame de submissão com IDs do conjunto de produção (ou teste)
df_submission = pd.DataFrame()
df_submission['ID'] = X_prod_processed['SK_ID_CURR']  # Adicionar os IDs do conjunto de produção como coluna 'ID'
df_submission['TARGET'] = y_predict_proba_prod

In [None]:
df_submission.head()

Unnamed: 0,ID,TARGET
0,384575,0.174191
1,214010,0.041693
2,142232,0.154501
3,389171,0.020068
4,283617,0.067265


In [None]:
# Salvar submissão em CSV
df_submission.to_csv('submission_baseline_traditional_v1.csv', index=False)


##### Salvando artefato do Modelo

In [None]:
import pickle

with open('baseline_traditional_v1.pkl', 'wb') as file:
  pickle.dump(algoritmo, file)