In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import optuna
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('C:\\Users\\souza\\OneDrive\\Área de Trabalho\\Risk Nubank\\data\\data_tratado.csv')
df = df.drop('Unnamed: 0', axis=1)
df = df.drop('score_rating_enc', axis=1)
x = df.drop('target_default', axis=1)
y = df['target_default']

In [3]:
# split 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
# seleçao de atributos 
cat_atribs = ['score_rating', 'situation']
num_atribs = ['score_3', 'risk_rate', 'credit_limit', 'income', 'n_defaulted_loans', ' n_issues', 'ok_since',
            'n_bankruptcies']

In [5]:
column_transf = ColumnTransformer([('onehot', OneHotEncoder(handle_unknown='ignore'), cat_atribs)
                                ])
pipe_log = Pipeline(steps=[('transformer', column_transf),
                    ('model', LogisticRegression())
                    ])

In [None]:
# criando estudo com optuna
def objective(trial):
    params = {
        'C': trial.suggest_float('C', 0,1),
        'max_iter': trial.suggest_int('max_iter', 0,100),
        'penalty': trial.suggest_categorical('penalty', ['l2'])
    }
    model = pipe_log.set_params(model=LogisticRegression(**params, random_state=42)).fit(x_train, y_train)
    y_pred = model.predict(x_test)
    recall = recall_score(y_test, y_pred)
    
    return recall
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2025-08-06 16:09:33,947] A new study created in memory with name: no-name-537ed9e0-b4f8-4bb1-afb5-38029c1a44a7
[I 2025-08-06 16:09:34,012] Trial 0 finished with value: 0.9992753623188406 and parameters: {'C': 0.5139292173030473, 'max_iter': 18, 'penalty': 'l2'}. Best is trial 0 with value: 0.9992753623188406.
[I 2025-08-06 16:09:34,060] Trial 1 finished with value: 0.9992753623188406 and parameters: {'C': 0.42613765423266337, 'max_iter': 66, 'penalty': 'l2'}. Best is trial 0 with value: 0.9992753623188406.
[I 2025-08-06 16:09:34,100] Trial 2 finished with value: 0.9992753623188406 and parameters: {'C': 0.6336170959599446, 'max_iter': 73, 'penalty': 'l2'}. Best is trial 0 with value: 0.9992753623188406.
[I 2025-08-06 16:09:34,142] Trial 3 finished with value: 0.9992753623188406 and parameters: {'C': 0.7019821697135405, 'max_iter': 47, 'penalty': 'l2'}. Best is trial 0 with value: 0.9992753623188406.
[I 2025-08-06 16:09:34,186] Trial 4 finished with value: 0.9992753623188406 and param

In [7]:
print(f'melhores parametros modelo regressao logistica: {study.best_params}')

melhores parametros modelo regressao logistica: {'C': 0.5139292173030473, 'max_iter': 18, 'penalty': 'l2'}


In [9]:
logistic_reg = pipe_log.set_params(model=LogisticRegression(C= 0.5139292173030473, max_iter=18, penalty='l2', random_state=42)).fit(x_train, y_train)
y_pred = logistic_reg.predict(x_test)
print(f'recall: {recall_score(y_test, y_pred)}')
print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'precision: {precision_score(y_test, y_pred)}')

recall: 0.9992753623188406
acc: 0.9998798365777457
precision: 1.0
