In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# NOTE: Imputer was deprecated in scikit-learn 0.20 and removed in 0.22;
# drop it from the import below when upgrading.
from sklearn.preprocessing import Imputer, StandardScaler

In [2]:
# Read the credit dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv('credit-data.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [3]:
# Fix negative values
# Negative ages are invalid, so they are replaced with the mean of the valid
# (non-negative) ages only. Averaging the whole column would let the invalid
# negative entries pull the fill value down.
# The `>= 0` comparison is False for NaN, so missing ages are neither used in
# the mean nor modified here.
invalid_age = df['age'] < 0
valid_age_mean = df.loc[df['age'] >= 0, 'age'].mean()
df.loc[invalid_age, 'age'] = valid_age_mean
# Sanity check: this should display an empty Series once the fix is applied
df.loc[df['age'] < 0, 'age']

Series([], Name: age, dtype: float64)

In [4]:
# Split dataset into forecasters (features) and classes (labels).
# Features are the three columns at positions 1, 2 and 3 -- the slice end (4)
# is exclusive, so the label column at position 4 is left out.
feature_frame = df.iloc[:, 1:4]
forecasters = feature_frame.values
forecasters

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [5]:
# Label vector: the column at position 4 holds the class of each row
label_column = df.iloc[:, 4]
classes = label_column.values
classes

array([0, 0, 0, ..., 1, 0, 0])

In [6]:
# Handle missing values
# Replace every NaN with the mean of its column. The imputer is applied to the
# whole feature matrix: `forecasters` has exactly three columns (0, 1, 2), so a
# 1:4 slice would silently skip column 0 and leave any NaN there untouched.
# SimpleImputer always works column-wise and treats np.nan as the missing
# marker by default, so no `axis` or `missing_values` argument is needed.
imputer = SimpleImputer(strategy='mean')
forecasters = imputer.fit_transform(forecasters)
forecasters



array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [7]:
# Put all numeric attributes on the same scale by standardizing each column
# NOTE(review): the scaler is fit on every row before the train/test split
# below, so held-out rows influence the scaling statistics -- consider fitting
# on the training part only.
scaler = StandardScaler().fit(forecasters)
forecasters = scaler.transform(forecasters)
forecasters

array([[ 1.45393393,  1.36539359,  1.20281942],
       [-0.76217555,  0.54267291,  0.69642695],
       [ 0.83682073,  1.67418453,  1.17471147],
       ...,
       [-0.07122592, -0.97447238,  0.35420081],
       [-0.11000289,  1.73938004, -0.92675625],
       [ 1.682986  ,  1.14918907,  0.96381038]])

In [8]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs
split_parts = train_test_split(
    forecasters,
    classes,
    test_size=0.25,
    random_state=0,
)
forecasters_train, forecasters_test, classes_train, classes_test = split_parts

# Report the row count of each part, in the order they were returned:
# train features, test features, train labels, test labels
for part in split_parts:
    print(len(part))

1500
500
1500
500


In [9]:
# Print each part of the split for a quick visual sanity check
for split in (forecasters_train, forecasters_test, classes_train, classes_test):
    print(split)

[[-1.3754462   0.50632359  0.10980934]
 [ 1.45826409 -1.64892645 -1.21501497]
 [-0.79356829  0.22532465 -0.43370226]
 ...
 [ 0.21738243 -0.14703128  1.40872498]
 [ 0.58716195  0.66436763  0.67948086]
 [ 0.68315357  0.04086221  1.91819744]]
[[ 1.59301567 -1.35434563  2.58262733]
 [ 0.99769755  0.99807841  0.84418709]
 [-0.42485257  0.55813894 -1.15785286]
 ...
 [ 1.37445674 -1.05745    -1.12564819]
 [-1.57087737 -0.63486894 -0.36981671]
 [-1.03572293 -0.93976841  0.04244312]]
[0 0 0 ... 0 0 0]
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0
 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1

In [10]:
# Train a Gaussian Naive Bayes classifier on the training part only.
# fit() returns the estimator itself, so the call can be chained.
estimator = GaussianNB().fit(forecasters_train, classes_train)
estimator

GaussianNB(priors=None, var_smoothing=1e-09)

In [11]:
# Predict a class label for every held-out test row
predictions = estimator.predict(X=forecasters_test)
predictions

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [12]:
# accuracy_score returns the fraction of test rows whose predicted label
# matches the true label. That is accuracy, not precision (which would be
# TP / (TP + FP) for the positive class), so it is named and printed as such.
accuracy = accuracy_score(classes_test, predictions)
# Legacy alias so anything that still reads `precision` keeps working
precision = accuracy
# Fixed two-decimal formatting avoids float artifacts such as 93.80000000000001
print('Accuracy: {:.2f}%'.format(accuracy * 100))

Precision: 93.8%


In [13]:
# Confusion matrix: rows are the true classes, columns the predicted classes
matrix = confusion_matrix(classes_test, predictions)

# Same tab-separated layout as a 2x2 table with a header row of class labels
report_template = ('Confusion Matrix:'
                   '\n\t0\t1\n0:\t{}\t{}'
                   '\n1:\t{}\t{}')

# Unpack the two rows of the 2x2 matrix into their four cells
(true0_pred0, true0_pred1), (true1_pred0, true1_pred1) = matrix
print(report_template.format(true0_pred0, true0_pred1, true1_pred0, true1_pred1))

Confusion Matrix:
	0	1
0:	428	8
1:	23	41
