In [1]:
import pandas as pd

In [2]:
# Importando database
base = pd.read_csv('credit_data.csv')

In [3]:
# Estatísticas do database
base.describe()

Unnamed: 0,i#clientid,income,age,loan,c#default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [4]:
# Amostra dos dados
base.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [5]:
# Verificando dados com idade negativa
base.loc[base['age'] < 0]

Unnamed: 0,i#clientid,income,age,loan,c#default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [6]:
### Maneiras de contornar o problema das idades menores que zero

## 1) Apagar a coluna por inteiro (não recomendada, neste caso)
# base.drop('age', 1, inplace=True)

## 2) Apagar apenas os registros, por completo, que possuem essa incoerência
# base.drop(base[base.age < 0].index, inplace=True)

## 3) Preencher os valores com a média da coluna, apenas dos valores maiores que zero
media = base['age'][base.age > 0].mean()
base.loc[base.age < 0, 'age'] = media

In [7]:
# Verificando valores nulos
base.loc[pd.isnull(base['age'])]

Unnamed: 0,i#clientid,income,age,loan,c#default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [8]:
# Divisão do dataset entre variáveis preditoras e target
previsores = base.iloc[:, 1:4].values
classe = base.iloc[:, 4].values

In [9]:
# Substituindo os valores missing pela média de cada coluna
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer.fit(previsores[:, 0:3])

previsores[:, 0:3] = imputer.transform(previsores[:, 0:3])

In [10]:
## Fazendo o escalonamento (normalização) dos atributos
from sklearn.preprocessing import StandardScaler

# Padronização
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

# Normalização
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# previsores = scaler.fit_transform(previsores)

In [11]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

  from numpy.core.umath_tests import inner1d


In [12]:
# modelo SVM
classificadorSVM = SVC(kernel='rbf', C=2.0)
classificadorSVM.fit(previsores, classe)

SVC(C=2.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [13]:
# modelo Random Forest
classificadorRandomForest = RandomForestClassifier(n_estimators=40, criterion='entropy')
classificadorRandomForest.fit(previsores, classe)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
# modelo Redes Neurais
classificadorMLP = MLPClassifier(verbose=True, max_iter=500, tol=0.00001, solver='adam', hidden_layer_sizes=(100),
                                activation='relu', batch_size=200, learning_rate_init=0.001)
classificadorMLP.fit(previsores, classe)

Iteration 1, loss = 0.72035849
Iteration 2, loss = 0.63602671
Iteration 3, loss = 0.56394337
Iteration 4, loss = 0.50313629
Iteration 5, loss = 0.45188175
Iteration 6, loss = 0.40813349
Iteration 7, loss = 0.37093489
Iteration 8, loss = 0.33856815
Iteration 9, loss = 0.31082451
Iteration 10, loss = 0.28691854
Iteration 11, loss = 0.26596966
Iteration 12, loss = 0.24776331
Iteration 13, loss = 0.23159972
Iteration 14, loss = 0.21740847
Iteration 15, loss = 0.20489192
Iteration 16, loss = 0.19388192
Iteration 17, loss = 0.18405473
Iteration 18, loss = 0.17530849
Iteration 19, loss = 0.16746365
Iteration 20, loss = 0.16046641
Iteration 21, loss = 0.15392325
Iteration 22, loss = 0.14811347
Iteration 23, loss = 0.14276336
Iteration 24, loss = 0.13779041
Iteration 25, loss = 0.13329146
Iteration 26, loss = 0.12918808
Iteration 27, loss = 0.12536366
Iteration 28, loss = 0.12186515
Iteration 29, loss = 0.11868634
Iteration 30, loss = 0.11567650
Iteration 31, loss = 0.11295215
Iteration 32, los

Iteration 261, loss = 0.01921049
Iteration 262, loss = 0.01915991
Iteration 263, loss = 0.01913728
Iteration 264, loss = 0.01910620
Iteration 265, loss = 0.01892180
Iteration 266, loss = 0.01888669
Iteration 267, loss = 0.01881706
Iteration 268, loss = 0.01875969
Iteration 269, loss = 0.01864634
Iteration 270, loss = 0.01866750
Iteration 271, loss = 0.01857147
Iteration 272, loss = 0.01848208
Iteration 273, loss = 0.01837847
Iteration 274, loss = 0.01838340
Iteration 275, loss = 0.01839270
Iteration 276, loss = 0.01838769
Training loss did not improve more than tol=0.000010 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.0001, batch_size=200, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=100, learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=1e-05, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [15]:
# Salvando modelos já treinados para aplicações comerciais
import pickle

In [16]:
pickle.dump(classificadorSVM, open('svm_finalizado.sav', 'wb'))

In [17]:
pickle.dump(classificadorRandomForest, open('random_forest_finalizado.sav', 'wb'))

In [18]:
pickle.dump(classificadorMLP, open('mlp_finalizado.sav', 'wb'))