In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# O que falta fazer

- Extrair colunas das variáveis categóricas
- Hyperparameters tuning
- Defininr random seed
- Analisar tipo de erro melhor pro problema
- Matriz de confusão

## Importando dados

In [2]:
df = pd.read_csv('../Dados/bankloans.csv')
df.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1.0
1,27,1,10,6,31,17.3,1.362202,4.000798,0.0
2,40,1,15,14,55,5.5,0.856075,2.168925,0.0
3,41,1,15,14,120,2.9,2.65872,0.82128,0.0
4,24,2,2,0,28,17.3,1.787436,3.056564,1.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1150 entries, 0 to 1149
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1150 non-null   int64  
 1   ed        1150 non-null   int64  
 2   employ    1150 non-null   int64  
 3   address   1150 non-null   int64  
 4   income    1150 non-null   int64  
 5   debtinc   1150 non-null   float64
 6   creddebt  1150 non-null   float64
 7   othdebt   1150 non-null   float64
 8   default   700 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 81.0 KB


## Pré processamento

### Tratamento de dados nulos

In [4]:
df.dropna(how='any', inplace=True)

### Balanceamento da feature 'default'

In [5]:
df['default'].value_counts(normalize=True)

0.0    0.738571
1.0    0.261429
Name: default, dtype: float64

In [6]:
# Undersampling nos dados default == 0:
#    tamanho do df default == 0 se reduz a um pouco menos que o dobro (1.75) do tamanho do df default == 1

i_zero = np.random.choice(df.query('default == 0').index.to_list(),
                          int(1.75*len(df.query('default == 1').index.to_list())))

df = pd.concat([df.query('default == 0').loc[i_zero], df.query('default == 1')], axis=0)

### Feature Engineering

In [7]:
df['ed'].value_counts()

1    271
2    138
3     62
4     26
5      6
Name: ed, dtype: int64

In [8]:
# Extraindo colunas da feature categórica 'ed'

df['ed_1'] = df['ed'].apply(lambda x:1 if x==1 else 0)
#df['ed_2'] = df['ed'].apply(lambda x:1 if x==2 else 0)
df.drop('ed', axis=1, inplace=True)

In [9]:
df['address'].value_counts()

count    503.000000
mean       7.874751
std        6.743613
min        0.000000
25%        2.000000
50%        6.000000
75%       11.000000
max       34.000000
Name: address, dtype: float64

In [20]:
df['employ'].value_counts()

0     53
5     36
1     35
7     33
2     32
6     32
4     30
9     30
3     27
11    23
10    22
12    21
13    21
8     16
22    15
19    11
16    11
15    10
14     9
18     8
17     7
20     5
21     3
30     3
23     3
25     2
26     1
27     1
24     1
29     1
31     1
Name: employ, dtype: int64

## Processamento

### Analisando correlações para restringir colunas se necessário

In [10]:
df.corr()['default'].apply(lambda x:x if x>=0 else -x).sort_values(ascending=False).to_frame()

Unnamed: 0,default
default,1.0
debtinc,0.405096
employ,0.32546
creddebt,0.240544
address,0.166278
ed_1,0.162432
othdebt,0.142945
age,0.131931
income,0.094028


### Separação e normalização dos dados

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop(columns={'default'})
y = df['default']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=13)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Modelagem

In [12]:
results = {}

#### Usando Regressão Logística

In [13]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()

lr_model.fit(X_train, y_train)

y_hat = lr_model.predict(X_test)

results['LR'] = y_hat

#### Usando Support Vector Classifier

In [14]:
from sklearn.svm import SVC

svc_model = SVC()

svc_model.fit(X_train, y_train)

y_hat = svc_model.predict(X_test)

results['SVC'] = y_hat

#### Usando árvore de decisão

In [15]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()

tree_model.fit(X_train, y_train)

y_hat = tree_model.predict(X_test)

results['TREE'] = y_hat

#### Usando K Vizinhos Próximos (K nearest neighbors)

In [16]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()

knn_model.fit(X_train, y_train)

y_hat = knn_model.predict(X_test)

results['KNN'] = y_hat

#### Usando SGDClassifier

In [17]:
from sklearn.linear_model import SGDClassifier

sgdc_model = SGDClassifier()

sgdc_model.fit(X_train, y_train)

y_hat = sgdc_model.predict(X_test)

results['SGDC'] = y_hat

### Evaluation

In [18]:
from sklearn.metrics import precision_score

print('Precisão de treino\n')
print('Logistic Regression: ', precision_score(y_true=y_train, y_pred=lr_model.predict(X_train)))
print('Support Vector Class.: ', precision_score(y_true=y_train, y_pred=svc_model.predict(X_train)))
print('Decision Tree Class.: ', precision_score(y_true=y_train, y_pred=tree_model.predict(X_train)))
print('K Nearest Neighbors: ', precision_score(y_true=y_train, y_pred=knn_model.predict(X_train)))
print('SGD Classifier: ', precision_score(y_true=y_train, y_pred=sgdc_model.predict(X_train)))

Precisão de treino

Logistic Regression:  0.7450980392156863
Support Vector Class.:  0.8461538461538461
Decision Tree Class.:  1.0
K Nearest Neighbors:  0.8043478260869565
SGD Classifier:  0.6397058823529411


In [19]:
print('Precisão de teste\n')
print('Logistic Regression: ', precision_score(y_true=y_test, y_pred=results['LR']))
print('Support Vector Class.: ', precision_score(y_true=y_test, y_pred=results['SVC']))
print('Decision Tree Class.: ', precision_score(y_true=y_test, y_pred=results['TREE']))
print('K Nearest Neighbors: ', precision_score(y_true=y_test, y_pred=results['KNN']))
print('SGD Classifier: ', precision_score(y_true=y_test, y_pred=results['SGDC']))

Precisão de teste

Logistic Regression:  0.7368421052631579
Support Vector Class.:  0.782608695652174
Decision Tree Class.:  0.6545454545454545
K Nearest Neighbors:  0.7586206896551724
SGD Classifier:  0.6533333333333333


Precisão de teste

Logistic Regression:  0.7213114754098361
Support Vector Class.:  0.72
Decision Tree Class.:  0.6666666666666666
K Nearest Neighbors:  0.6885245901639344
SGD Classifier:  0.6923076923076923