In [1]:
def sigmoid(prediction):
    '''
    Sigmoid activation function, from logistic regression slides. 
    '''
    return 1. / (1. + np.exp(-prediction))
    #return np.exp(prediction) / (1. + np.exp(prediction))


In [2]:
def relu_(prediction):
    '''
    Relu activation function
    '''
    return prediction

In [3]:
def cost_mse_ols(design, data, beta):
    '''
    Mean squared error
    '''
    return (data - design.dot(beta)).T*(data - design.dot(beta))

In [4]:
def cost_grad_ols(design, data, beta):
    '''
    Calculates the first derivative of MSE w.r.t beta.
    '''
    return (2/len(data))*design.T.dot(design.dot(beta)-data) #logistic regression slides

In [98]:
def cost_log_ols(prediction, data):
    '''
    Logisitic regression cost function
    '''
    calc = -data.dot(np.log(sigmoid(prediction)+ 1e-16)) - ((1 - data).dot(np.log(1 - sigmoid(prediction) + 1e-16)))
    norm = calc/data.shape[1]
    return norm
#return -np.mean(data.dot(prediction.T)-np.log(1+np.exp(prediction)))

In [6]:
def cost_grad_log_ols(design, data, p):
    '''
    Gradient w.r.t log
    '''
    return (1/len(data))*design.T.dot(data-p)

In [7]:
def gradient_solver(N, eta, design, data, beta=None):
    M=len(data)
    if beta != None:
        beta = beta
    else:
        beta = np.random.randn(design.shape[1])
     
    for i in range(N):
        gradients = cost_grad_ols(design,frank,beta)
        beta -= eta*gradients
    return beta


In [8]:
import functions_class as fx
import classx as cl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

In [9]:
n_x         = 50
x           = np.linspace(0, 1, n_x)
y           = np.linspace(0, 1, n_x)

x_mesh, y_mesh  = np.meshgrid(x,y)
noise_level     = 0.01
frank           = fx.FrankeFunction(x_mesh, y_mesh, noise_level)

frank = np.ravel(frank)



In [10]:
design = fx.DesignDesign(x,y,10)
data = frank.reshape([n_x*n_x,1])
np.random.seed(2018)
M=len(data)
N=10000
eta=0.1

beta = gradient_solver(N, eta, design, data)


prediction = design @ beta
pred = np.reshape(prediction,[n_x,n_x])



In [11]:

# Trying to set the seed
np.random.seed(0)
import random
random.seed(0)

# Reading file into data frame
directory = os.getcwd()
filename = directory + '/cred_card.xls'
nanDict = {} # fjerner NaN 
dataframe = pd.read_excel(filename, header=1, skiprows=0, index_col=0, na_values=nanDict)


dataframe.rename(index=str, columns={"default payment next month": "defaultPaymentNextMonth"}, inplace=True)

# Features and targets 
X = dataframe.loc[:, dataframe.columns != 'defaultPaymentNextMonth'].values
y = dataframe.loc[:, dataframe.columns == 'defaultPaymentNextMonth'].values

# Categorical variables to one-hot's
onehotencoder = OneHotEncoder(categories="auto")

X = ColumnTransformer(
    [("", onehotencoder, [3]),],
    remainder="passthrough"
).fit_transform(X)



# Train-test split
trainingShare = 0.5 
seed  = 1
XTrain, XTest, yTrain, yTest=train_test_split(X, y, train_size=trainingShare, \
                                              test_size = 1-trainingShare,
                                             random_state=seed)

# Input Scaling
sc = StandardScaler()
XTrain = sc.fit_transform(XTrain)
XTest = sc.transform(XTest)

# One-hot's of the target vector
Y_train_onehot, Y_test_onehot = onehotencoder.fit_transform(yTrain), onehotencoder.fit_transform(yTest)


# Remove instances with zeros only for past bill statements or paid amounts
'''
dataframe = dataframe.drop(dataframe[(dataframe.BILL_AMT1 == 0) &
                (dataframe.BILL_AMT2 == 0) &
                (dataframe.BILL_AMT3 == 0) &
                (dataframe.BILL_AMT4 == 0) &
                (dataframe.BILL_AMT5 == 0) &
                (dataframe.BILL_AMT6 == 0) &
                (dataframe.PAY_AMT1 == 0) &
                (dataframe.PAY_AMT2 == 0) &
                (dataframe.PAY_AMT3 == 0) &
                (dataframe.PAY_AMT4 == 0) &
                (dataframe.PAY_AMT5 == 0) &
                (dataframe.PAY_AMT6 == 0)].index)
'''
dataframe = dataframe.drop(dataframe[(dataframe.BILL_AMT1 == 0) &
                (dataframe.BILL_AMT2 == 0) &
                (dataframe.BILL_AMT3 == 0) &
                (dataframe.BILL_AMT4 == 0) &
                (dataframe.BILL_AMT5 == 0) &
                (dataframe.BILL_AMT6 == 0)].index)

dataframe = dataframe.drop(dataframe[(dataframe.PAY_AMT1 == 0) &
                (dataframe.PAY_AMT2 == 0) &
                (dataframe.PAY_AMT3 == 0) &
                (dataframe.PAY_AMT4 == 0) &
                (dataframe.PAY_AMT5 == 0) &
                (dataframe.PAY_AMT6 == 0)].index)


In [16]:
dataframe

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,defaultPaymentNextMonth
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
6,50000,1,1,2,37,0,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
7,500000,1,1,2,29,0,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
8,100000,2,2,2,23,0,-1,-1,0,0,...,221,-159,567,380,601,0,581,1687,1542,0
9,140000,2,3,1,28,0,0,2,0,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0
10,20000,1,3,2,35,-2,-2,-2,-2,-1,...,0,13007,13912,0,0,0,13007,1122,0,0


In [17]:
print(np.unique(X[:,0]))
print(np.unique(X[:,1]))
print(np.unique(X[:,2]))
print(np.unique(X[:,3]))
print(np.unique(X[:,4]))
print(np.unique(X[:,5]))
print(np.unique(X[:,6]))
print(np.unique(X[:,7]))
print(np.unique(X[:,8]))
print(np.unique(X[:,9]))
print(np.unique(X[:,10]))
print(np.unique(X[:,11]))
print(np.unique(X[:,12]))
print(np.unique(X[:,13]))
print(np.unique(X[:,14]))
print(np.unique(X[:,15]))
print(np.unique(X[:,16]))
print(np.unique(X[:,17]))
print(np.unique(X[:,18]))
print(np.unique(X[:,19]))
print(np.unique(X[:,20]))
print(np.unique(X[:,21]))
print(np.unique(X[:,22]))
print(np.unique(X[:,23]))
print(np.unique(X[:,24]))
print(np.unique(X[:,25]))


[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[  10000.   16000.   20000.   30000.   40000.   50000.   60000.   70000.
   80000.   90000.  100000.  110000.  120000.  130000.  140000.  150000.
  160000.  170000.  180000.  190000.  200000.  210000.  220000.  230000.
  240000.  250000.  260000.  270000.  280000.  290000.  300000.  310000.
  320000.  327680.  330000.  340000.  350000.  360000.  370000.  380000.
  390000.  400000.  410000.  420000.  430000.  440000.  450000.  460000.
  470000.  480000.  490000.  500000.  510000.  520000.  530000.  540000.
  550000.  560000.  570000.  580000.  590000.  600000.  610000.  620000.
  630000.  640000.  650000.  660000.  670000.  680000.  690000.  700000.
  710000.  720000.  730000.  740000.  750000.  760000.  780000.  800000.
 1000000.]
[1. 2.]
[0. 1. 2. 3. 4. 5. 6.]
[21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36. 37. 38.
 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52. 53. 54. 55. 56.
 57. 58. 59. 60. 61. 62. 63. 64. 65. 66. 67. 68. 6

In [225]:
# Set up datasets

data_new = dataframe.rename({'defaultPaymentNextMonth':'y'}, axis='columns')

payment = data_new['y']

# SEX

sex = dataframe['SEX']


# Education

edu = dataframe['EDUCATION']


# Marriage

mar = dataframe['MARRIAGE']


# Make new dataframe with chosen columns

merged_data = pd.concat([sex, edu, mar, payment], axis=1)

merged_data


Unnamed: 0_level_0,SEX,EDUCATION,MARRIAGE,y
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,2,1,1
2,2,2,2,1
3,2,2,2,0
4,2,2,1,0
5,1,2,1,0
6,1,1,2,0
7,1,1,2,0
8,2,2,2,0
9,2,3,1,0
10,1,3,2,0


# Egen logistic regression.


In [160]:

eta = 0.0001 # This is out eta
#m = 10

Niteration = 5
#beta = np.random.randn(26,1)
#
beta = parameters.reshape([26,1])

for iter in range(Niteration):
    
    sig = sigmoid(XTrain@beta)
    gradients = -(np.transpose(XTrain)@(yTrain-sig))
    beta -= eta*gradients
  
    #Cost function
    cost = cost_log_ols(XTrain@beta,yTrain)
 
    print('cost is', cost)

cost is 169513727.15771538
cost is 173592565.08923486
cost is 169513736.66230506
cost is 173592551.10736522
cost is 169513745.3224298


In [102]:
eta = 0.1 # This is out eta
#m = 10

Niteration = 1000
beta = np.random.randn(26,1)
#
#beta = parameters.reshape([26,1])

for iter in range(Niteration):
    
    sig = sigmoid(XTrain@beta)
    #gradients = -(np.transpose(XTrain)@(yTrain-sig))
    gradient = np.dot(XTrain.T, (sig - yTrain)) / yTrain.shape[0]
    beta -= eta*gradient
    #beta , pred  = fx.OridinaryLeastSquares(XTrain, yTrain,XTrain)
    #Cost function
    cost = cost_log_ols(XTrain@beta,yTrain.T)
    logloss=log_loss(yTrain, np.round(XTrain@beta), eps=1e-16, normalize=True)
    #cost = (-yTrain * np.log(sig) - (1 - yTrain) * np.log(1 - sig)).mean()
    print('cost is', cost)
    print('logl is', logloss)


cost is [[1.25778758]]
logl is 15.56226834824427
cost is [[1.24498301]]
logl is 15.552437014452948
cost is [[1.2327393]]
logl is 15.523005749629915
cost is [[1.22095145]]
logl is 15.481349796800636
cost is [[1.20959387]]
logl is 15.451925502705485
cost is [[1.19864234]]
logl is 15.42248029642669
cost is [[1.18807385]]
logl is 15.39548421018587
cost is [[1.17783162]]
logl is 15.329246437514344
cost is [[1.16796501]]
logl is 15.290018692539396
cost is [[1.15839678]]
logl is 15.255654334000996
cost is [[1.14915429]]
logl is 15.23353557565249
cost is [[1.14018055]]
logl is 15.20165519079148
cost is [[1.13147989]]
logl is 15.169760864474704
cost is [[1.12303607]]
logl is 15.150119109075703
cost is [[1.11483438]]
logl is 15.118238724214692
cost is [[1.1068613]]
logl is 15.071649677670045
cost is [[1.09909769]]
logl is 15.027495809707617
cost is [[1.09154048]]
logl is 15.032373137599928
cost is [[1.08417845]]
logl is 14.983300117377894
cost is [[1.07700142]]
logl is 14.9661353649284
cost is [

cost is [[0.7067491]]
logl is 9.317913869847375
cost is [[0.70605008]]
logl is 9.300770029581525
cost is [[0.70535943]]
logl is 9.2860962215373
cost is [[0.70467706]]
logl is 9.26404019973973
cost is [[0.70400281]]
logl is 9.195443926492686
cost is [[0.70333661]]
logl is 9.180735264809051
cost is [[0.70267834]]
logl is 9.15134582435331
cost is [[0.70202789]]
logl is 9.134201984087461
cost is [[0.70138516]]
logl is 9.144026347150902
cost is [[0.70075007]]
logl is 9.117079056005258
cost is [[0.70012249]]
logl is 9.117099968188905
cost is [[0.69950234]]
logl is 9.085240495511536
cost is [[0.69888952]]
logl is 9.046033662720236
cost is [[0.69828394]]
logl is 9.031359854676012
cost is [[0.69768549]]
logl is 9.009317774334205
cost is [[0.6970941]]
logl is 8.997079144872194
cost is [[0.69650966]]
logl is 8.965233613650591
cost is [[0.69593209]]
logl is 8.930945933118894
cost is [[0.6953613]]
logl is 8.921163394422743
cost is [[0.6947972]]
logl is 8.899114343353054
cost is [[0.69423972]]
logl 

cost is [[0.65293854]]
logl is 7.33231185528408
cost is [[0.65287485]]
logl is 7.327413615208123
cost is [[0.65281182]]
logl is 7.320080196549951
cost is [[0.65274942]]
logl is 7.325006319537436
cost is [[0.65268765]]
logl is 7.325020260993199
cost is [[0.65262652]]
logl is 7.32011505018936
cost is [[0.652566]]
logl is 7.320122020917243
cost is [[0.65250609]]
logl is 7.315223780841285
cost is [[0.65244679]]
logl is 7.310325540765328
cost is [[0.65238809]]
logl is 7.310325540765328
cost is [[0.65232998]]
logl is 7.310339482221092
cost is [[0.65227246]]
logl is 7.307890362183113
cost is [[0.65221552]]
logl is 7.302999092835038
cost is [[0.65215915]]
logl is 7.298107823486964
cost is [[0.65210336]]
logl is 7.295665674176867
cost is [[0.65204812]]
logl is 7.29076743410091
cost is [[0.65199345]]
logl is 7.295665674176867
cost is [[0.65193933]]
logl is 7.29322352486677
cost is [[0.65188575]]
logl is 7.293230495594653
cost is [[0.65183271]]
logl is 7.288332255518696
cost is [[0.65178021]]
log

cost is [[0.64765148]]
logl is 7.065838751368273
cost is [[0.6476443]]
logl is 7.0633896313302955
cost is [[0.64763719]]
logl is 7.0633896313302955
cost is [[0.64763017]]
logl is 7.065838751368273
cost is [[0.64762322]]
logl is 7.065838751368273
cost is [[0.64761636]]
logl is 7.065838751368273
cost is [[0.64760957]]
logl is 7.068287871406253
cost is [[0.64760285]]
logl is 7.065845722096156
cost is [[0.64759621]]
logl is 7.065845722096156
cost is [[0.64758965]]
logl is 7.065838751368273
cost is [[0.64758316]]
logl is 7.065838751368273
cost is [[0.64757674]]
logl is 7.0633896313302955
cost is [[0.64757039]]
logl is 7.0633896313302955
cost is [[0.64756412]]
logl is 7.063396602058177
cost is [[0.64755791]]
logl is 7.065852692824038
cost is [[0.64755177]]
logl is 7.065852692824038
cost is [[0.64754569]]
logl is 7.065852692824038
cost is [[0.64753969]]
logl is 7.068301812862015
cost is [[0.64753375]]
logl is 7.065852692824038
cost is [[0.64752787]]
logl is 7.063403572786058
cost is [[0.64752

cost is [[0.64697546]]
logl is 7.019340324286092
cost is [[0.64697375]]
logl is 7.019340324286092
cost is [[0.64697205]]
logl is 7.019340324286092
cost is [[0.64697035]]
logl is 7.014435113482253
cost is [[0.64696866]]
logl is 7.014435113482253
cost is [[0.64696697]]
logl is 7.014435113482253
cost is [[0.64696529]]
logl is 7.014435113482253
cost is [[0.64696361]]
logl is 7.011985993444275
cost is [[0.64696194]]
logl is 7.009536873406296
cost is [[0.64696027]]
logl is 7.009536873406296
cost is [[0.64695861]]
logl is 7.009536873406296
cost is [[0.64695695]]
logl is 7.007087753368318
cost is [[0.6469553]]
logl is 7.004631662602458
cost is [[0.64695365]]
logl is 7.002182542564479
cost is [[0.64695201]]
logl is 7.002182542564479
cost is [[0.64695037]]
logl is 6.999733422526501
cost is [[0.64694874]]
logl is 6.999733422526501
cost is [[0.64694711]]
logl is 6.999733422526501
cost is [[0.64694549]]
logl is 6.999733422526501
cost is [[0.64694387]]
logl is 6.999733422526501
cost is [[0.64694226]

cost is [[0.64672483]]
logl is 6.977712254368339
cost is [[0.64672371]]
logl is 6.977712254368339
cost is [[0.6467226]]
logl is 6.977712254368339
cost is [[0.64672149]]
logl is 6.977712254368339
cost is [[0.64672038]]
logl is 6.977712254368339
cost is [[0.64671927]]
logl is 6.977712254368339
cost is [[0.64671816]]
logl is 6.977712254368339
cost is [[0.64671706]]
logl is 6.977712254368339
cost is [[0.64671595]]
logl is 6.977712254368339
cost is [[0.64671485]]
logl is 6.977712254368339
cost is [[0.64671375]]
logl is 6.977712254368339
cost is [[0.64671266]]
logl is 6.977712254368339
cost is [[0.64671156]]
logl is 6.977712254368339
cost is [[0.64671047]]
logl is 6.977712254368339
cost is [[0.64670938]]
logl is 6.977712254368339
cost is [[0.6467083]]
logl is 6.977712254368339
cost is [[0.64670721]]
logl is 6.977712254368339
cost is [[0.64670613]]
logl is 6.977712254368339
cost is [[0.64670504]]
logl is 6.9801683451342
cost is [[0.64670396]]
logl is 6.9801683451342
cost is [[0.64670289]]
log

In [104]:
#pred_classes = sigmoid()

#print(np.array_equal(pred_classes, np.round(pred_classes)))
#print(pred_classes)

#beta = parameters.reshape([26,1])
activation =sigmoid(XTrain@beta) 
classes = np.zeros([len(activation)])



classes=np.round(activation)
print(np.sum(classes==yTrain)/len(activation))
log_loss(yTrain, classes)
accuracy = accuracy_score(yTrain.flatten(),classes)
print(accuracy)


0.6016
0.6016


# Accuracy. 
Både egen kode og tester med scikit. 

In [144]:
activation =sigmoid(X@beta) 
classes = np.zeros([len(activation)])

for i in range (len(activation)):
    if activation[i]>=0.5:
        classes[i] = 1 
    else:
        classes[1] = 0
        


In [147]:
print(classes)
print(activation)
print(np.array_equal(classes,activation))

[0. 0. 0. ... 0. 1. 0.]
[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [1.]
 [0.]]
False


In [50]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import log_loss

model = LogisticRegression()
model.fit(XTrain, yTrain)
predicted_classes = model.predict(XTrain)
accuracy = accuracy_score(yTrain.flatten(),predicted_classes)
accuracy = accuracy * 100
parameters = model.coef_
log_loss(yTrain, predicted_classes)



  y = column_or_1d(y, warn=True)


6.357454020076612

In [51]:
#print(parameters)
print(accuracy, '%')

81.59333333333333 %


# Tips fra gruppelærer: 
Lage et enklere dataset som har x med 1000 elementer og første 500 verdiene = 0 og de siste etter = 1, slik at y fra 0 - 500 = 0 osv. Sjekker man logisitic regression på dette så vil accuracy være 100% med scikit. Kan bruke cost-funksjon med log for OLS. Ikke nødvenig å kjøre for Ridge og Lasso, dette er tidskrevende for oppgaven. Han tror ikke vi har tid til dette. Var inne på god tanke med loopen. 

# Eget dataset: 

In [47]:
x_test = np.zeros([999, 1])
y_test = np.zeros([999, 1])

y_test[500:999, 0] = 1

x_test[500:999, 0] = 1

In [51]:
#print(x_test)
#print(y_test)

In [49]:
model = LogisticRegression()
model.fit(x_test, y_test)
predicted_classes = model.predict(x_test)
accuracy = accuracy_score(y_test.flatten(),predicted_classes)
accuracy = accuracy * 100
parameters = model.coef_

  y = column_or_1d(y, warn=True)


In [50]:
#print(parameters)
print(accuracy, '%')

100.0 %


In [52]:
beta = np.random.randn(26,1)

In [53]:
print(beta)

[[ 1.53277921]
 [ 1.46935877]
 [ 0.15494743]
 [ 0.37816252]
 [-0.88778575]
 [-1.98079647]
 [-0.34791215]
 [ 0.15634897]
 [ 1.23029068]
 [ 1.20237985]
 [-0.38732682]
 [-0.30230275]
 [-1.04855297]
 [-1.42001794]
 [-1.70627019]
 [ 1.9507754 ]
 [-0.50965218]
 [-0.4380743 ]
 [-1.25279536]
 [ 0.77749036]
 [-1.61389785]
 [-0.21274028]
 [-0.89546656]
 [ 0.3869025 ]
 [-0.51080514]
 [-1.18063218]]
