In [2]:
import functions as fx
import log_reg_functions as lrf
import loaddata as ld
import NeuralNetwork as nn

import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, f1_score, confusion_matrix, roc_auc_score

# A ) 
Here we will load the dataset, then split it into train, validation and test data. 
As the Credit Card data is biased, we also create a downzised dataset.

In [3]:
# Reading in credit card dataset.
x,y = ld.load_data(scaler='minmax')

# Splitting data in train and test data
xtrain,xtest,ytrain,ytest = train_test_split(x,y, test_size=0.25, random_state= 0, shuffle=True, stratify=y)
# Splitting in train and validation
xtrain,xval,ytrain,yval = train_test_split(xtrain,ytrain, test_size=0.25, random_state= 0, shuffle=True, stratify=ytrain)

# Extracting some useful shapes.
nx_train, ny_train = xtrain.shape
nx_test, ny_test = xtest.shape

In [107]:
# Extracting all datapoints where the output is 1 (i.e paying credit card dept).
indices = np.where(y == 1)
indices_zero = np.where(y == 0)
# Randomly extracting equal amount of datapoints where the output is 0 (i.e defaulting).
datapoints = np.random.choice(indices_zero[0], size=y[indices[0]].shape[0], replace=False)

#Stacking the two datasets
x_new = np.vstack((x[indices[0],:],x[datapoints,:]))
y_new = np.vstack((y[indices[0]],y[datapoints]))

# Renaming data
xtrain = x_new
ytrain = y_new

# Extracting some useful shapes.
nx_train, ny_train = xtrain.shape
nx_test, ny_test = xtest.shape


# B) 
Egen logistic regression med gradient descent.


In [108]:
eta = 0.1 # learning rate
doplot = True
doprint = False
Niteration = 250
beta = np.random.randn(x.shape[1],1)
costvec=[]
costvec_val=[]
loglvec=[]
xaxis=[]

%matplotlib qt
#plt.axis([0, Niteration, 0, 13])

start = time.time()
for iter in range(Niteration):
    
    sig = lrf.sigmoid(xtrain@beta)
    gradient = lrf.gradient_ols(xtrain,ytrain,sig)
    beta -= eta*gradient
    
    #Cost function
    cost = lrf.cost_log_ols(xtrain@beta,ytrain.T)
    cost_val = lrf.cost_log_ols(xval@beta,yval.T) # do this for testdata at the end. 
    #Log Loss function from sklearn
    logloss=log_loss(ytrain, np.round(xtrain@beta), eps=1e-16, normalize=True)
    if doprint:
        print('Cost', cost,'&','Log loss', logloss,'&','Cost test', cost_val)
    if doplot:
        costvec.append(cost.ravel())
        costvec_val.append(cost_val.ravel())
        loglvec.append(logloss)
        xaxis.append(iter+1)
        plt.plot(xaxis, costvec, 'b')
        plt.plot(xaxis, costvec_val, 'r')
        #plt.plot(xaxis, loglvec, 'g')
        plt.pause(1e-12)
plt.show()    
end = time.time()
print(end - start)
   
# et relevant plot er iterativt plot. 

18.107311010360718


In [110]:
# making confusion matrix to check observed data with model predictions. 

predictions = xtest@beta
sig_val = lrf.sigmoid(predictions)
sig_val = np.round(sig_val)

cm = confusion_matrix(ytest , sig_val.astype(int))
print(cm)

[[4272 1569]
 [ 773  886]]


# Accuracy. 
Både egen kode og tester med scikit. 

In [111]:

activation =lrf.sigmoid(xtrain@beta) 
classes = np.zeros([len(activation)])
classes=np.round(activation)
print(100*np.sum(classes==ytrain)/len(activation),'% Training Accuracy')
print(f1_score(ytrain, classes))

activation =lrf.sigmoid(xval@beta) 
classes = np.zeros([len(activation)])
classes=np.round(activation)
print(100*np.sum(classes==yval)/len(activation),'% Validation Accuracy')
print(f1_score(yval, classes))


activation =lrf.sigmoid(xtest@beta) 
classes = np.zeros([len(activation)])
classes=np.round(activation)
print(100*np.sum(classes==ytest)/len(activation),'% Test Accuracy')
print(f1_score(ytest, classes))



64.04460518384569 % Training Accuracy
0.6024658447184272
69.04888888888888 % Validation Accuracy
0.4349237260629666
68.77333333333333 % Test Accuracy
0.4307243558580457


# Egen logistisk regresjon med stokastisk gradient descent

In [112]:
eta = 0.1 # learning rate
doplot = True
doprint = False
Niteration = 250
batch_size = 3000
batch_size_held_out = nx_train-batch_size

beta = np.random.randn(x.shape[1],1)
costvec=[]
costvec_test=[]
loglvec=[]
xaxis=[]

%matplotlib qt
#plt.axis([0, Niteration, 0, 13])

indexes = np.arange(nx_train)


start = time.time()
for iter in range(Niteration):
    datapoints = np.random.choice(indexes, size=batch_size, replace=False)
    batch_x = xtrain[datapoints,:]
    batch_y = ytrain[datapoints]
    
    batch_x_held_out = np.delete(xtrain, datapoints, axis=0)
    batch_y_held_out = np.delete(ytrain, datapoints).reshape([batch_size_held_out,1])
    
    sig = lrf.sigmoid(batch_x@beta)
    gradient = lrf.gradient_ols(batch_x,batch_y,sig)
    beta -= eta*gradient
    
    #Cost function
    cost = lrf.cost_log_ols(batch_x@beta,batch_y.T)
    cost_test = lrf.cost_log_ols(batch_x_held_out@beta,batch_y_held_out.T)
    #Log Loss function from sklearn
    if doprint:
        logloss=log_loss(batch_y, np.round(batch_x@beta), eps=1e-16, normalize=True)
        print('Cost', cost,'&','Log loss', logloss,'&','Cost test', cost_test)
    if doplot:
        costvec.append(cost.ravel())
        costvec_test.append(cost_test.ravel())
        xaxis.append(iter+1)
        plt.plot(xaxis, costvec, 'b')
        plt.plot(xaxis, costvec_test, 'r')
        plt.pause(1e-12)
plt.show()

end = time.time()
print(end - start)
 

27.082622289657593


In [113]:
# making confusion matrix to check observed data with model predictions. 

predictions = xtest@beta
sig_val = lrf.sigmoid(predictions)
sig_val = np.round(sig_val)

cm = confusion_matrix(ytest , sig_val.astype(int))
print(cm)

[[4358 1483]
 [ 745  914]]


In [114]:
activation =lrf.sigmoid(xtrain@beta) 
classes = np.zeros([len(activation)])
classes=np.round(activation)
print(100*np.sum(classes==ytrain)/len(activation),'% Training Accuracy')
print(f1_score(ytrain, classes))

activation =lrf.sigmoid(xval@beta) 
classes = np.zeros([len(activation)])
classes=np.round(activation)
print(100*np.sum(classes==yval)/len(activation),'% Validation Accuracy')
print(f1_score(yval, classes))


activation =lrf.sigmoid(xtest@beta) 
classes = np.zeros([len(activation)])
classes=np.round(activation)
print(100*np.sum(classes==ytest)/len(activation),'% Test Accuracy')
print(f1_score(ytest, classes))


64.7302591922845 % Training Accuracy
0.6101440826184725
70.54222222222222 % Validation Accuracy
0.454395785314455
70.29333333333334 % Test Accuracy
0.4506903353057199


In [12]:

model = LogisticRegression()
model.fit(xtrain, ytrain)
predicted_classes = model.predict(xtrain)
accuracy = accuracy_score(ytrain.flatten(),predicted_classes)
accuracy = accuracy * 100
parameters = model.coef_
log_loss(ytrain, predicted_classes)

print(accuracy, '% Training Accuracy')

predicted_classes = model.predict(xtest)
accuracy = accuracy_score(ytest.flatten(),predicted_classes)
accuracy = accuracy * 100
parameters = model.coef_
log_loss(ytest, predicted_classes)

print(accuracy, '% Test Accuracy')

  y = column_or_1d(y, warn=True)


71.26280892103676 % Training Accuracy
77.36 % Test Accuracy


# C) 
Neural Network. 

In [12]:
import NeuralNetwork as nn

neural_net = nn.ANN(lmb=1e-4, bias=0, eta=0.1, mode = 'classification')            
neural_net.add_layers(n_features=[91,50,50,20], n_neurons = [50,50,20,1] , n_layers=4)

x=xtrain
y=ytrain

activation = [lrf.relu, lrf.relu, lrf.sigmoid, lrf.sigmoid]
derivative = [lrf.relu_deriv, lrf.relu_deriv, lrf.sigmoid_deriv, lrf.sigmoid_deriv]

start = time.time()


epochs=200
batch_size=500
neural_net.train(epochs, batch_size, x,y,activation,derivative , verbose=True)

end = time.time()
print(end - start)


Epoch 0 loss [[0.74389165]]
Epoch 1 loss [[0.74820935]]
Epoch 2 loss [[0.74319287]]
Epoch 3 loss [[0.74508343]]
Epoch 4 loss [[0.74160689]]
Epoch 5 loss [[0.74036037]]
Epoch 6 loss [[0.74323033]]
Epoch 7 loss [[0.73881604]]
Epoch 8 loss [[0.73919635]]
Epoch 9 loss [[0.74067675]]
Epoch 10 loss [[0.73893483]]
Epoch 11 loss [[0.74136704]]
Epoch 12 loss [[0.73798119]]
Epoch 13 loss [[0.73946547]]
Epoch 14 loss [[0.73838323]]
Epoch 15 loss [[0.73843125]]
Epoch 16 loss [[0.74110105]]
Epoch 17 loss [[0.73922071]]
Epoch 18 loss [[0.74165447]]
Epoch 19 loss [[0.73770987]]
Epoch 20 loss [[0.73902928]]
Epoch 21 loss [[0.73760269]]
Epoch 22 loss [[0.73873874]]
Epoch 23 loss [[0.73771431]]
Epoch 24 loss [[0.73823531]]
Epoch 25 loss [[0.73643734]]
Epoch 26 loss [[0.73740886]]
Epoch 27 loss [[0.73793273]]
Epoch 28 loss [[0.7397342]]
Epoch 29 loss [[0.73776263]]
Epoch 30 loss [[0.73744477]]
Epoch 31 loss [[0.74002301]]
Epoch 32 loss [[0.73916486]]
Epoch 33 loss [[0.73607902]]
Epoch 34 loss [[0.7357175

In [7]:
activ=neural_net.feed_out(x, activation)['3']
print(activ)

classes=np.round(activ)
print(classes)

print(100*np.sum(classes==y)/len(activ),'% Training Accuracy')
#print(f1_score(ytrain, classes)) 

cm = confusion_matrix(y , classes)
print(cm)

[[0.27563218]
 [0.15236726]
 [0.05616373]
 ...
 [0.2334659 ]
 [0.5102972 ]
 [0.08060647]]
[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [1.]
 [0.]]
79.5674074074074 % Training Accuracy
[[12578   564]
 [ 2884   849]]


In [9]:
activ=neural_net.feed_out(xtest, activation)['3']
print(activ)

classes=np.round(activ)
print(classes)

print(100*np.sum(classes==ytest)/len(activ),'% Test Accuracy')
#print(f1_score(ytrain, classes)) 

cm = confusion_matrix(ytest , classes)
print(cm)

[[0.18238223]
 [0.24991717]
 [0.12917734]
 ...
 [0.13491384]
 [0.23389111]
 [0.29355627]]
[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]
79.32 % Test Accuracy
[[5602  239]
 [1312  347]]


In [11]:
cm= confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0])
print(cm)

[[0 2]
 [1 1]]


In [76]:
from sklearn.neural_network import MLPClassifier


mlp = MLPClassifier(hidden_layer_sizes=(50,50,20,), max_iter=200, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.1)

mlp.fit(x, y.ravel())
print("Training set score: %f" % mlp.score(x, y))
print("Test set score: %f" % mlp.score(xtest, ytest))



Iteration 1, loss = 0.47058199
Iteration 2, loss = 0.43925224
Iteration 3, loss = 0.43579302
Iteration 4, loss = 0.43459617
Iteration 5, loss = 0.43174006
Iteration 6, loss = 0.43196502
Iteration 7, loss = 0.43020106
Iteration 8, loss = 0.43028145
Iteration 9, loss = 0.42806578
Iteration 10, loss = 0.42829283
Iteration 11, loss = 0.42715236
Iteration 12, loss = 0.42587892
Iteration 13, loss = 0.42510128
Iteration 14, loss = 0.42576809
Iteration 15, loss = 0.42460461
Iteration 16, loss = 0.42338827
Iteration 17, loss = 0.42229345
Iteration 18, loss = 0.42162485
Iteration 19, loss = 0.41993946
Iteration 20, loss = 0.42177044
Iteration 21, loss = 0.42035700
Iteration 22, loss = 0.41894649
Iteration 23, loss = 0.41750074
Iteration 24, loss = 0.41702136
Iteration 25, loss = 0.41689325
Iteration 26, loss = 0.41679972
Iteration 27, loss = 0.41538672
Iteration 28, loss = 0.41519998
Iteration 29, loss = 0.41547898
Iteration 30, loss = 0.41315813
Iteration 31, loss = 0.41298859
Iteration 32, los



In [77]:
pred = mlp.predict(x)
accuracy = accuracy_score(y.flatten(),pred)
accuracy = accuracy * 100
print(accuracy)

cm = confusion_matrix(y , pred)
print(cm)

87.78666666666666
[[12813   329]
 [ 1732  2001]]


In [78]:
pred = mlp.predict(xtest)
accuracy = accuracy_score(ytest.flatten(),pred)
accuracy = accuracy * 100
print(accuracy)

cm = confusion_matrix(ytest , pred)
print(cm)

79.33333333333333
[[5424  417]
 [1133  526]]
