In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

try:
    # Legacy imputer: deprecated in scikit-learn 0.20 and removed in 0.22.
    # Kept (guarded) so the import cell does not fail on current releases.
    from sklearn.preprocessing import Imputer
except ImportError:
    pass

In [2]:
# Load the credit dataset from the CSV file and preview its first rows.
caminho_base = 'credit-data.csv'
df = pd.read_csv(caminho_base)
df.head()

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [3]:
# Fix invalid (negative) ages by overwriting them with a fixed value.
# NOTE(review): 40.92 is presumably the mean of the valid ages - confirm.
idade_substituta = 40.92
idade_negativa = df['age'] < 0
df.loc[idade_negativa, 'age'] = idade_substituta

In [4]:
# Split the dataset into predictors and class: predictors are the columns
# at positions 1 to 3 (income, age and loan in the preview shown earlier).
colunas_previsoras = slice(1, 4)
previsores = df.iloc[:, colunas_previsoras].values
previsores

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [5]:
# Split the dataset into predictors and class: the class is the column at
# position 4 (`default` in the preview shown earlier).
coluna_classe = df.columns[4]
classes = df[coluna_classe].values
classes

array([0, 0, 0, ..., 1, 0, 0])

In [6]:
# Handle missing / unknown values by replacing each NaN with the mean of
# its column.
#
# `SimpleImputer` replaces the legacy `sklearn.preprocessing.Imputer`
# (removed in scikit-learn 0.22). It always works column-wise, so the old
# `axis=0` argument is gone, and NaN is its default `missing_values`.
#
# Every predictor column is imputed. The previous slice `[:, 1:4]` only
# reached columns 1 and 2 of this 3-column array, so missing values in
# column 0 (income) would have been left untouched.
imputer = SimpleImputer(strategy='mean')
previsores = imputer.fit_transform(previsores)
previsores



array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [7]:
# Put all numeric attributes on the same scale (standardization).
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split done in a later cell, so test-set statistics influence
# the scaling of the training data - consider fitting on the training
# portion only.
scaler = StandardScaler()
scaler.fit(previsores)
previsores = scaler.transform(previsores)
previsores

array([[ 1.45393393,  1.36538093,  1.20281942],
       [-0.76217555,  0.5426602 ,  0.69642695],
       [ 0.83682073,  1.67417189,  1.17471147],
       ...,
       [-0.07122592, -0.97448519,  0.35420081],
       [-0.11000289,  1.73936739, -0.92675625],
       [ 1.682986  ,  1.14917639,  0.96381038]])

In [8]:
# Split the dataset into training and test sets: 25% of the rows are held
# out for testing, with a fixed random_state so the split is reproducible.
particoes = train_test_split(previsores, classes, test_size=0.25, random_state=0)
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = particoes

In [9]:
# Show the four partitions, one after the other.
print(
    previsores_treinamento,
    previsores_teste,
    classe_treinamento,
    classe_teste,
    sep='\n',
)

[[-1.3754462   0.50631087  0.10980934]
 [ 1.45826409 -1.6489393  -1.21501497]
 [-0.79356829  0.22531191 -0.43370226]
 ...
 [ 0.21738243 -0.14704404  1.40872498]
 [ 0.58716195  0.66435493  0.67948086]
 [ 0.68315357  0.04084946  1.91819744]]
[[ 1.59301567 -1.35435846  2.58262733]
 [ 0.99769755  0.99806572  0.84418709]
 [-0.42485257  0.55812622 -1.15785286]
 ...
 [ 1.37445674 -1.05746281 -1.12564819]
 [-1.57087737 -0.63488173 -0.36981671]
 [-1.03572293 -0.93978122  0.04244312]]
[0 0 0 ... 0 0 0]
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0
 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1

In [10]:
# Train a Gaussian Naive Bayes classifier on the training partition.
# `fit` returns the estimator itself, so the call can be chained; the bare
# name on the last line displays the fitted estimator.
classificador = GaussianNB().fit(previsores_treinamento, classe_treinamento)
classificador

GaussianNB(priors=None, var_smoothing=1e-09)

In [11]:
# Predict the class of every row in the test partition.
previsoes = classificador.predict(X=previsores_teste)

In [14]:
# Fraction of test rows whose predicted class matches the true class,
# reported as a percentage.
modelo_mensagem = 'Precisao de: {}%'
precisao = accuracy_score(y_true=classe_teste, y_pred=previsoes)
print(modelo_mensagem.format(precisao * 100))

Precisao de: 93.8%


In [26]:
# Confusion matrix: rows are the true classes, columns the predicted ones.
matriz = confusion_matrix(classe_teste, previsoes)

# Tab-separated layout with a header line and one line per true class.
modelo_matriz = (
    'Matriz confusão:'
    '\n\t0\t1\n0:\t{}\t{}'
    '\n1:\t{}\t{}'
)
linha_0, linha_1 = matriz[0], matriz[1]
print(modelo_matriz.format(linha_0[0], linha_0[1], linha_1[0], linha_1[1]))

Matriz confusão:
	0	1
0:	428	8
1:	23	41
