In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import optuna
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('C:\\Users\\souza\\OneDrive\\Área de Trabalho\\Risk Nubank\\data\\data_tratado.csv')
df = df.drop('Unnamed: 0', axis=1)
df = df.drop('score_rating_enc', axis=1)
x = df.drop('target_default', axis=1)
y = df['target_default']

In [3]:
# split 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
# seleçao de atributos 
cat_atribs = ['score_rating', 'situation']
num_atribs = ['score_3', 'risk_rate', 'credit_limit', 'income', 'n_defaulted_loans', ' n_issues', 'ok_since',
            'n_bankruptcies']

In [5]:
column_transf = ColumnTransformer([('onehot', OneHotEncoder(handle_unknown='ignore'), cat_atribs)
                                ])
pipe_tree = Pipeline(steps=[('transformer', column_transf),
                    ('model', DecisionTreeClassifier())
                    ])

In [None]:
# estudo decision tree
def objective(trial):
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'max_depth': trial.suggest_int('max_depth', 2, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 50),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2,50),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2,50)
    }
    model = pipe_tree.set_params(model=DecisionTreeClassifier(**params, random_state=42)).fit(x_train, y_train)
    y_pred = model.predict(x_test)
    recall = recall_score(y_test, y_pred)
    
    return recall
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print(f'melhores parametros arvores de decisao: {study.best_params}')

[I 2025-08-06 16:10:15,978] A new study created in memory with name: no-name-cc015095-89de-4043-97f4-ad1ab82c0848
[I 2025-08-06 16:10:16,010] Trial 0 finished with value: 1.0 and parameters: {'criterion': 'log_loss', 'max_depth': 20, 'min_samples_split': 46, 'min_samples_leaf': 2, 'max_leaf_nodes': 47}. Best is trial 0 with value: 1.0.
[I 2025-08-06 16:10:16,040] Trial 1 finished with value: 0.9985207100591716 and parameters: {'criterion': 'log_loss', 'max_depth': 16, 'min_samples_split': 14, 'min_samples_leaf': 42, 'max_leaf_nodes': 21}. Best is trial 0 with value: 1.0.
[I 2025-08-06 16:10:16,066] Trial 2 finished with value: 0.9985207100591716 and parameters: {'criterion': 'entropy', 'max_depth': 31, 'min_samples_split': 46, 'min_samples_leaf': 7, 'max_leaf_nodes': 19}. Best is trial 0 with value: 1.0.
[I 2025-08-06 16:10:16,094] Trial 3 finished with value: 0.9985207100591716 and parameters: {'criterion': 'log_loss', 'max_depth': 16, 'min_samples_split': 33, 'min_samples_leaf': 50, 

melhores parametros arvores de decisao: {'criterion': 'log_loss', 'max_depth': 20, 'min_samples_split': 46, 'min_samples_leaf': 2, 'max_leaf_nodes': 47}


In [9]:
decision_tree = pipe_tree.set_params(model=DecisionTreeClassifier(criterion='log_loss', max_depth=10,
                                                                min_samples_split=46,
                                                                min_samples_leaf=2,
                                                                max_leaf_nodes=47,
                                                                random_state=42 
                                                                )).fit(x_train, y_train)
y_pred = decision_tree.predict(x_test)
print(f'recall: {recall_score(y_test, y_pred)}')
print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'precision: {precision_score(y_test, y_pred)}')

recall: 1.0
acc: 1.0
precision: 1.0
