In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, Imputer, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import collections

In [2]:
df = pd.read_csv('credit-data.csv')
df.head()

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [3]:
df.loc[df.age < 0] = df['age'].mean()
df.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.528711,45258.668036,40.92752,4435.522591,0.202711
std,577.446347,14424.139657,13.261826,3047.973701,1.612325
min,1.0,40.807559,18.055189,1.37763,0.0
25%,500.75,32751.700392,29.072097,1932.453823,0.0
50%,1000.5,45776.207139,41.317159,3964.990626,0.0
75%,1500.25,57771.032285,52.58704,6428.192991,0.0
max,2000.0,69995.685578,63.971796,13766.051239,40.807559


In [4]:
forecasters = df.iloc[:, 1:4].values
forecasters

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [5]:
classes = df.iloc[:, 4].values
classes

array([0., 0., 0., ..., 1., 0., 0.])

In [14]:
label_encoder = LabelEncoder()
classes = label_encoder.fit_transform(classes)
classes

array([0, 0, 0, ..., 1, 0, 0])

In [15]:
imputer = Imputer(missing_values='NaN', strategy="mean", axis=0)
imputer = imputer.fit(forecasters[:, 1:4])
forecasters[:, 1:4] = imputer.transform(forecasters[:, 1:4])



In [16]:
scaler = StandardScaler()
forecasters = scaler.fit_transform(forecasters)
forecasters

array([[ 1.44913207,  1.36539359,  1.20471106],
       [-0.75194959,  0.54267291,  0.69874452],
       [ 0.83620362,  1.67418453,  1.17662675],
       ...,
       [-0.06568542, -0.97447238,  0.35680624],
       [-0.10419944,  1.73938004, -0.9230734 ],
       [ 1.6766309 ,  1.14918907,  0.96590306]])

In [17]:
forecasters_train, forecasters_test, classes_train, classes_test = train_test_split(forecasters
                                                             , classes
                                                             , test_size=0.25
                                                             , random_state=0)

In [19]:
estimator = LogisticRegression(random_state=1)
estimator.fit(forecasters_train, classes_train)

predictions = estimator.predict(forecasters_test)



In [20]:
precision = accuracy_score(classes_test, predictions)
print('Precision: {}%'.format(precision * 100))

Precision: 94.39999999999999%


In [21]:
matrix = confusion_matrix(classes_test, predictions)
print('Confusion Matrix:'
  '\n\t0\t1\n0:\t{}\t{}'
  '\n1:\t{}\t{}'.format(matrix[0][0], matrix[0][1], matrix[1][0], matrix[1][1]))

Confusion Matrix:
	0	1
0:	422	14
1:	14	50
