In [37]:
import pandas as pd
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [25]:
df = pd.read_csv('credit-data.csv')
df.head()

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [26]:
df.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [27]:
df.loc[df.age < 0] = df['age'].mean()
df.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.528711,45258.668036,40.92752,4435.522591,0.202711
std,577.446347,14424.139657,13.261826,3047.973701,1.612325
min,1.0,40.807559,18.055189,1.37763,0.0
25%,500.75,32751.700392,29.072097,1932.453823,0.0
50%,1000.5,45776.207139,41.317159,3964.990626,0.0
75%,1500.25,57771.032285,52.58704,6428.192991,0.0
max,2000.0,69995.685578,63.971796,13766.051239,40.807559


In [28]:
forecasters = df.iloc[:, 1:4].values
forecasters

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [29]:
classes = df.iloc[:, 4].values
classes

array([0., 0., 0., ..., 1., 0., 0.])

In [30]:
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(forecasters[:, 1:4])
forecasters[:, 1:4] = imputer.transform(forecasters[:, 1:4])
forecasters

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [31]:
scaler = StandardScaler()
forecasters = scaler.fit_transform(forecasters)
forecasters

array([[ 1.44913207,  1.36539359,  1.20471106],
       [-0.75194959,  0.54267291,  0.69874452],
       [ 0.83620362,  1.67418453,  1.17662675],
       ...,
       [-0.06568542, -0.97447238,  0.35680624],
       [-0.10419944,  1.73938004, -0.9230734 ],
       [ 1.6766309 ,  1.14918907,  0.96590306]])

In [32]:
forecasters_train, forecasters_test, classes_train, classes_test = train_test_split(forecasters
                                                                                   , classes
                                                                                   , test_size=0.25
                                                                                   , random_state=0)
print(len(forecasters_train))
print(len(forecasters_test))
print(len(classes_train))
print(len(classes_test))
print(forecasters_train)
print(forecasters_test)
print(classes_train)
print(classes_test)

1500
500
1500
500
[[-1.36106156  0.50632359  0.11262032]
 [ 1.45343286 -1.64892645 -1.21108967]
 [-0.78312946  0.22532465 -0.43043412]
 ...
 [ 0.22096583 -0.14703128  1.41044344]
 [ 0.58823782  0.66436763  0.68181269]
 [ 0.6835785   0.04086221  1.91948737]]
[[ 1.58727067 -1.35434563  2.58335841]
 [ 0.9959895   0.99807841  0.84638038]
 [-0.41691407  0.55813894 -1.15397563]
 ...
 [ 1.37019382 -1.05745    -1.12179805]
 [-1.55516747 -0.63486894 -0.36660231]
 [-1.02364201 -0.93976841  0.04531077]]
[0. 0. 0. ... 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 

In [38]:
label_encoder = LabelEncoder()
classes_encoded = label_encoder.fit_transform(classes_train)
classes_encoded

array([0, 0, 0, ..., 0, 0, 0])

In [1]:
estimator = RandomForestClassifier(n_estimators=40, criterion='entropy', random_state=0)
estimator.fit(forecasters_train, classes_encoded)
predictions = estimator.predict(forecasters_test)
predictions

NameError: name 'RandomForestClassifier' is not defined

In [64]:
precision = accuracy_score(classes_test, predictions)
print('Precision: {}%'.format(precision * 100))

Precision: 98.4%


In [44]:
matrix = confusion_matrix(classes_test, predictions)
print('Confusion Matrix:'
  '\n\t0\t1\n0:\t{}\t{}'
  '\n1:\t{}\t{}'.format(matrix[0][0], matrix[0][1], matrix[1][0], matrix[1][1]))

Confusion Matrix:
	0	1
0:	433	3
1:	5	59
