# Imports

In [1]:
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

# Load Data

In [2]:
# Read the raw training set and preview its first rows.
raw_train_path = '../data/raw/train.csv'
df = pd.read_csv(raw_train_path)
df.head()

Unnamed: 0.1,Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


# Data Description

In [3]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps the cell re-runnable: once the column is gone,
# running it again is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Print column dtypes and non-null counts to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   target                                      150000 non-null  int64  
 1   TaxaDeUtilizacaoDeLinhasNaoGarantidas       150000 non-null  float64
 2   Idade                                       150000 non-null  int64  
 3   NumeroDeVezes30-59DiasAtrasoNaoPior         150000 non-null  int64  
 4   TaxaDeEndividamento                         150000 non-null  float64
 5   RendaMensal                                 120269 non-null  float64
 6   NumeroDeLinhasDeCreditoEEmprestimosAbertos  150000 non-null  int64  
 7   NumeroDeVezes90DiasAtraso                   150000 non-null  int64  
 8   NumeroDeEmprestimosOuLinhasImobiliarias     150000 non-null  int64  
 9   NumeroDeVezes60-89DiasAtrasoNaoPior         150000 non-null  int64  
 

In [5]:
# Count missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

target                                            0
TaxaDeUtilizacaoDeLinhasNaoGarantidas             0
Idade                                             0
NumeroDeVezes30-59DiasAtrasoNaoPior               0
TaxaDeEndividamento                               0
RendaMensal                                   29731
NumeroDeLinhasDeCreditoEEmprestimosAbertos        0
NumeroDeVezes90DiasAtraso                         0
NumeroDeEmprestimosOuLinhasImobiliarias           0
NumeroDeVezes60-89DiasAtrasoNaoPior               0
NumeroDeDependentes                            3924
dtype: int64

In [6]:
# Share of each class in the target (relative frequencies rather than counts).
target_share = df['target'].value_counts(normalize=True)
target_share

target
0    0.93316
1    0.06684
Name: proportion, dtype: float64

# Train Test Split

In [7]:
# Separate the label from the predictors.
y = df['target']
X = df.drop(columns='target')

In [8]:
# Hold out 25% of the rows for validation, stratified on the target so both
# splits keep the same class proportions; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Data Transformation

In [9]:
# The two columns that df.isna().sum() reported as having missing values.
cols_to_impute = ['RendaMensal', 'NumeroDeDependentes']
# Learn the column means from the training split only, so no statistic
# from the validation rows leaks into the transformation.
imp_mean = SimpleImputer(strategy='mean')
imp_mean.fit(X_train[cols_to_impute])

In [10]:
# Fill the missing values in both splits with the means learned on the training split.
for split_frame in (X_train, X_val):
    split_frame[cols_to_impute] = imp_mean.transform(split_frame[cols_to_impute])

# Feature Selection

In [11]:
# Keep the top 50% of features as ranked by the selector's default scoring
# function; the selector is fitted on the training split only.
select = SelectPercentile(percentile=50)
X_train_selected = select.fit_transform(X_train, y_train)

# Compare the feature count before and after selection.
print('X_train shape: ', X_train.shape)
print('X_train_selected shape: ', X_train_selected.shape)

X_train shape:  (112500, 10)
X_train_selected shape:  (112500, 5)


In [12]:
# Names of the features retained by the fitted selector, as a plain list.
selected_feature_names = select.get_feature_names_out()
cols_selected = selected_feature_names.tolist()

# Baseline Model

In [13]:
# Reduce the validation split to the same features the selector kept on the training split.
X_val_selected = select.transform(X=X_val)

In [14]:
# Baseline: logistic regression on every feature, scored by ROC AUC on the validation split.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of the positive class.
y_preds = lr.predict_proba(X_val)[:, 1]
auc_all_features = roc_auc_score(y_val, y_preds)
print('Score da RegLog com todas as features: {}'.format(auc_all_features))

Score da RegLog com todas as features: 0.7062874442824334


In [15]:
# Same estimator, refitted on the reduced feature set for a like-for-like comparison.
# Note that this overwrites the fit of `lr` on the full feature set.
lr.fit(X_train_selected, y_train)

y_preds = lr.predict_proba(X_val_selected)[:, 1]
auc_selected_features = roc_auc_score(y_val, y_preds)
print('Score da RegLog com feature selection: {}'.format(auc_selected_features))

Score da RegLog com feature selection: 0.7062333818849621


# Hyperparameter fine-tuning

In [16]:
# Compare solvers for an unpenalised logistic regression, holding everything
# else fixed. random_state pins the stochastic solvers (sag / saga) so the
# table is reproducible; it matches the seed used for the train/validation split.
clf = [
    LogisticRegression(solver=solver, penalty=None, max_iter=1000, random_state=42)
    for solver in ('newton-cg', 'lbfgs', 'sag', 'saga')
]

# Collect one record per model and build the table once. Filling an empty
# DataFrame cell-by-cell is slow and upcasts dtypes (max_iter would become float).
records = []
for lrs in clf:
    # Column 1 of predict_proba holds the probability of the positive class.
    y_preds = lrs.fit(X_train, y_train).predict_proba(X_val)[:, 1]
    records.append({
        'Modelo': type(lrs).__name__,
        'max_iter': lrs.max_iter,
        'solver': lrs.solver,
        'penalty': lrs.penalty,
        'class_weight': lrs.class_weight,
        'AUC': roc_auc_score(y_val, y_preds),
    })

# NOTE(review): in the recorded run lbfgs scored ~0.706 while the other three
# solvers scored ~0.577 on the same unpenalised objective. A gap that large
# suggests at least one solver did not converge on these unscaled features —
# check for convergence warnings before trusting this ranking.
clf_compare = pd.DataFrame(records).sort_values(by='AUC', ascending=False)
clf_compare

Unnamed: 0,Modelo,max_iter,solver,penalty,class_weight,AUC
1,LogisticRegression,1000.0,lbfgs,,,0.706413
2,LogisticRegression,1000.0,sag,,,0.577133
0,LogisticRegression,1000.0,newton-cg,,,0.577108
3,LogisticRegression,1000.0,saga,,,0.577068


In [17]:
# Model deployment

In [18]:
# Refit the best configuration from the solver comparison (lbfgs, no penalty)
# on the training split, using all features.
final_model = LogisticRegression(
    solver='lbfgs',
    penalty=None,
    max_iter=1000,
)
final_model.fit(X_train, y_train)

In [19]:
# Persist the fitted estimator (joblib is imported with the other dependencies
# in the imports cell). joblib.dump returns the list of written paths, which is
# shown as the cell output.
# NOTE(review): only the classifier is saved — the imputer fitted earlier
# (imp_mean) is not, so whatever loads this file must fill missing
# RendaMensal / NumeroDeDependentes values the same way. Confirm with the consumer.
joblib.dump(final_model, '../src/final_model_reglog.pkl')

['../src/final_model_reglog.pkl']