In [1]:
def sigmoid(prediction):
    '''
    Sigmoid activation function, from logistic regression slides. 
    '''
    return 1. / (1. + np.exp(-prediction))
    #return np.exp(prediction) / (1. + np.exp(prediction))


In [2]:
def relu_(prediction):
    '''
    Relu activation function
    '''
    return prediction

In [3]:
def cost_mse_ols(design, data, beta):
    '''
    Mean squared error
    '''
    return (data - design.dot(beta)).T*(data - design.dot(beta))

In [4]:
def cost_grad_ols(design, data, beta):
    '''
    Calculates the first derivative of MSE w.r.t beta.
    '''
    return (2/len(data))*design.T.dot(design.dot(beta)-data) #logistic regression slides

In [98]:
def cost_log_ols(prediction, data):
    '''
    Logisitic regression cost function
    '''
    calc = -data.dot(np.log(sigmoid(prediction)+ 1e-16)) - ((1 - data).dot(np.log(1 - sigmoid(prediction) + 1e-16)))
    norm = calc/data.shape[1]
    return norm
#return -np.mean(data.dot(prediction.T)-np.log(1+np.exp(prediction)))

In [6]:
def cost_grad_log_ols(design, data, p):
    '''
    Gradient w.r.t log
    '''
    return (1/len(data))*design.T.dot(data-p)

In [7]:
def gradient_solver(N, eta, design, data, beta=None):
    M=len(data)
    if beta != None:
        beta = beta
    else:
        beta = np.random.randn(design.shape[1])
     
    for i in range(N):
        gradients = cost_grad_ols(design,frank,beta)
        beta -= eta*gradients
    return beta


In [8]:
import functions_class as fx
import classx as cl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

In [9]:
n_x         = 50
x           = np.linspace(0, 1, n_x)
y           = np.linspace(0, 1, n_x)

x_mesh, y_mesh  = np.meshgrid(x,y)
noise_level     = 0.01
frank           = fx.FrankeFunction(x_mesh, y_mesh, noise_level)

frank = np.ravel(frank)



In [10]:
design = fx.DesignDesign(x,y,10)
data = frank.reshape([n_x*n_x,1])
np.random.seed(2018)
M=len(data)
N=10000
eta=0.1

beta = gradient_solver(N, eta, design, data)


prediction = design @ beta
pred = np.reshape(prediction,[n_x,n_x])



In [11]:

# Trying to set the seed
np.random.seed(0)
import random
random.seed(0)

# Reading file into data frame
directory = os.getcwd()
filename = directory + '/cred_card.xls'
nanDict = {} # fjerner NaN 
dataframe = pd.read_excel(filename, header=1, skiprows=0, index_col=0, na_values=nanDict)


dataframe.rename(index=str, columns={"default payment next month": "defaultPaymentNextMonth"}, inplace=True)

# Features and targets 
X = dataframe.loc[:, dataframe.columns != 'defaultPaymentNextMonth'].values
y = dataframe.loc[:, dataframe.columns == 'defaultPaymentNextMonth'].values

# Categorical variables to one-hot's
onehotencoder = OneHotEncoder(categories="auto")

X = ColumnTransformer(
    [("", onehotencoder, [3]),],
    remainder="passthrough"
).fit_transform(X)



# Train-test split
trainingShare = 0.5 
seed  = 1
XTrain, XTest, yTrain, yTest=train_test_split(X, y, train_size=trainingShare, \
                                              test_size = 1-trainingShare,
                                             random_state=seed)

# Input Scaling
sc = StandardScaler()
XTrain = sc.fit_transform(XTrain)
XTest = sc.transform(XTest)

# One-hot's of the target vector
Y_train_onehot, Y_test_onehot = onehotencoder.fit_transform(yTrain), onehotencoder.fit_transform(yTest)


# Remove instances with zeros only for past bill statements or paid amounts
'''
dataframe = dataframe.drop(dataframe[(dataframe.BILL_AMT1 == 0) &
                (dataframe.BILL_AMT2 == 0) &
                (dataframe.BILL_AMT3 == 0) &
                (dataframe.BILL_AMT4 == 0) &
                (dataframe.BILL_AMT5 == 0) &
                (dataframe.BILL_AMT6 == 0) &
                (dataframe.PAY_AMT1 == 0) &
                (dataframe.PAY_AMT2 == 0) &
                (dataframe.PAY_AMT3 == 0) &
                (dataframe.PAY_AMT4 == 0) &
                (dataframe.PAY_AMT5 == 0) &
                (dataframe.PAY_AMT6 == 0)].index)
'''
dataframe = dataframe.drop(dataframe[(dataframe.BILL_AMT1 == 0) &
                (dataframe.BILL_AMT2 == 0) &
                (dataframe.BILL_AMT3 == 0) &
                (dataframe.BILL_AMT4 == 0) &
                (dataframe.BILL_AMT5 == 0) &
                (dataframe.BILL_AMT6 == 0)].index)

dataframe = dataframe.drop(dataframe[(dataframe.PAY_AMT1 == 0) &
                (dataframe.PAY_AMT2 == 0) &
                (dataframe.PAY_AMT3 == 0) &
                (dataframe.PAY_AMT4 == 0) &
                (dataframe.PAY_AMT5 == 0) &
                (dataframe.PAY_AMT6 == 0)].index)


In [16]:
dataframe

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,defaultPaymentNextMonth
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
6,50000,1,1,2,37,0,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
7,500000,1,1,2,29,0,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
8,100000,2,2,2,23,0,-1,-1,0,0,...,221,-159,567,380,601,0,581,1687,1542,0
9,140000,2,3,1,28,0,0,2,0,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0
10,20000,1,3,2,35,-2,-2,-2,-2,-1,...,0,13007,13912,0,0,0,13007,1122,0,0


In [None]:
# Set up datasets

data_new = dataframe.rename({'defaultPaymentNextMonth':'y'}, axis='columns')

payment = data_new['y']

# SEX

sex = dataframe['SEX']


# Education

edu = dataframe['EDUCATION']


# Marriage

mar = dataframe['MARRIAGE']


# Make new dataframe with chosen columns

merged_data = pd.concat([sex, edu, mar, payment], axis=1)

merged_data


In [17]:
print(np.unique(X[:,0]))
print(np.unique(X[:,1]))
print(np.unique(X[:,2]))
print(np.unique(X[:,3]))
print(np.unique(X[:,4]))
print(np.unique(X[:,5]))
print(np.unique(X[:,6]))
print(np.unique(X[:,7]))
print(np.unique(X[:,8]))
print(np.unique(X[:,9]))
print(np.unique(X[:,10]))
print(np.unique(X[:,11]))
print(np.unique(X[:,12]))
print(np.unique(X[:,13]))
print(np.unique(X[:,14]))
print(np.unique(X[:,15]))
print(np.unique(X[:,16]))
print(np.unique(X[:,17]))
print(np.unique(X[:,18]))
print(np.unique(X[:,19]))
print(np.unique(X[:,20]))
print(np.unique(X[:,21]))
print(np.unique(X[:,22]))
print(np.unique(X[:,23]))
print(np.unique(X[:,24]))
print(np.unique(X[:,25]))


[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[  10000.   16000.   20000.   30000.   40000.   50000.   60000.   70000.
   80000.   90000.  100000.  110000.  120000.  130000.  140000.  150000.
  160000.  170000.  180000.  190000.  200000.  210000.  220000.  230000.
  240000.  250000.  260000.  270000.  280000.  290000.  300000.  310000.
  320000.  327680.  330000.  340000.  350000.  360000.  370000.  380000.
  390000.  400000.  410000.  420000.  430000.  440000.  450000.  460000.
  470000.  480000.  490000.  500000.  510000.  520000.  530000.  540000.
  550000.  560000.  570000.  580000.  590000.  600000.  610000.  620000.
  630000.  640000.  650000.  660000.  670000.  680000.  690000.  700000.
  710000.  720000.  730000.  740000.  750000.  760000.  780000.  800000.
 1000000.]
[1. 2.]
[0. 1. 2. 3. 4. 5. 6.]
[21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36. 37. 38.
 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52. 53. 54. 55. 56.
 57. 58. 59. 60. 61. 62. 63. 64. 65. 66. 67. 68. 6

In [116]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
#print(scaler.mean_)
#print(scaler.transform(X))
X = scaler.transform(X)
print(np.unique(X[:,0]))
print(np.unique(X[:,1]))
print(np.unique(X[:,2]))
print(np.unique(X[:,3]))
print(np.unique(X[:,4]))
print(np.unique(X[:,5]))
print(np.unique(X[:,6]))
print(np.unique(X[:,7]))
print(np.unique(X[:,8]))
print(np.unique(X[:,9]))
print(np.unique(X[:,10]))
print(np.unique(X[:,11]))
print(np.unique(X[:,12]))
print(np.unique(X[:,13]))
print(np.unique(X[:,14]))
print(np.unique(X[:,15]))
print(np.unique(X[:,16]))
print(np.unique(X[:,17]))
print(np.unique(X[:,18]))
print(np.unique(X[:,19]))
print(np.unique(X[:,20]))
print(np.unique(X[:,21]))
print(np.unique(X[:,22]))
print(np.unique(X[:,23]))
print(np.unique(X[:,24]))
print(np.unique(X[:,25]))


[-0.04246464 23.54900328]
[-0.91426088  1.09377971]
[-1.06647132  0.93767172]
[-0.10432569  9.58536681]
[-1.21379411 -1.16754973 -1.13672015 -1.05964618 -0.98257222 -0.90549825
 -0.82842429 -0.75135032 -0.67427636 -0.59720239 -0.52012843 -0.44305446
 -0.3659805  -0.28890654 -0.21183257 -0.13475861 -0.05768464  0.01938932
  0.09646329  0.17353725  0.25061122  0.32768518  0.40475915  0.48183311
  0.55890707  0.63598104  0.713055    0.79012897  0.86720293  0.9442769
  1.02135086  1.09842483  1.17549879  1.2346916   1.25257276  1.32964672
  1.40672068  1.48379465  1.56086861  1.63794258  1.71501654  1.79209051
  1.86916447  1.94623844  2.0233124   2.10038637  2.17746033  2.2545343
  2.33160826  2.40868222  2.48575619  2.56283015  2.63990412  2.71697808
  2.79405205  2.87112601  2.94819998  3.02527394  3.10234791  3.17942187
  3.25649583  3.3335698   3.41064376  3.48771773  3.56479169  3.64186566
  3.71893962  3.79601359  3.87308755  3.95016152  4.02723548  4.10430944
  4.18138341  4.258457

# Egen logistic regression.


In [160]:

eta = 0.0001 # This is out eta
#m = 10

Niteration = 5
#beta = np.random.randn(26,1)
#
beta = parameters.reshape([26,1])

for iter in range(Niteration):
    
    sig = sigmoid(XTrain@beta)
    gradients = -(np.transpose(XTrain)@(yTrain-sig))
    beta -= eta*gradients
  
    #Cost function
    cost = cost_log_ols(XTrain@beta,yTrain)
 
    print('cost is', cost)

cost is 169513727.15771538
cost is 173592565.08923486
cost is 169513736.66230506
cost is 173592551.10736522
cost is 169513745.3224298


In [106]:
eta = 0.1 # This is out eta

Niteration = 100
beta = np.random.randn(26,1)
#
#beta = parameters.reshape([26,1])

for iter in range(Niteration):
    
    sig = sigmoid(XTrain@beta)
    #gradients = -(np.transpose(XTrain)@(yTrain-sig))
    gradient = np.dot(XTrain.T, (sig - yTrain)) / yTrain.shape[0]
    beta -= eta*gradient
    #beta , pred  = fx.OridinaryLeastSquares(XTrain, yTrain,XTrain)
    #Cost function
    cost = cost_log_ols(XTrain@beta,yTrain.T)
    logloss=log_loss(yTrain, np.round(XTrain@beta), eps=1e-16, normalize=True)
    #cost = (-yTrain * np.log(sig) - (1 - yTrain) * np.log(1 - sig)).mean()
    print('cost is', cost)
    print('logl is', logloss)


cost is [[1.61336638]]
logl is 17.31939576120805
cost is [[1.57053352]]
logl is 17.177207384447662
cost is [[1.52988973]]
logl is 17.008127482364685
cost is [[1.49140793]]
logl is 16.839019697370176
cost is [[1.45488033]]
logl is 16.694479790762156
cost is [[1.42057713]]
logl is 16.527807184349864
cost is [[1.38828566]]
logl is 16.390586754944252
cost is [[1.35793141]]
logl is 16.23131727446895
cost is [[1.32942312]]
logl is 16.045177180854708
cost is [[1.30261549]]
logl is 15.85164093203135
cost is [[1.27755597]]
logl is 15.584624111340762
cost is [[1.2540694]]
logl is 15.4082526149669
cost is [[1.2320799]]
logl is 15.21706789599118
cost is [[1.21154033]]
logl is 15.045594639693277
cost is [[1.19232856]]
logl is 14.834844843325255
cost is [[1.17440124]]
logl is 14.665792824153804
cost is [[1.15766474]]
logl is 14.506579109501557
cost is [[1.14202951]]
logl is 14.325253607228685
cost is [[1.12743826]]
logl is 14.168482041886538
cost is [[1.11381904]]
logl is 13.992047808961741
cost is 

In [111]:
#pred_classes = sigmoid()

#print(np.array_equal(pred_classes, np.round(pred_classes)))
#print(pred_classes)

#beta = parameters.reshape([26,1])
activation =sigmoid(XTrain@beta) 
classes = np.zeros([len(activation)])



classes=np.round(activation)
print(100*np.sum(classes==yTrain)/len(activation),'%')


62.96 %


# Accuracy. 
Både egen kode og tester med scikit. 

In [144]:
activation =sigmoid(X@beta) 
classes = np.zeros([len(activation)])

for i in range (len(activation)):
    if activation[i]>=0.5:
        classes[i] = 1 
    else:
        classes[1] = 0
        


In [147]:
print(classes)
print(activation)
print(np.array_equal(classes,activation))

[0. 0. 0. ... 0. 1. 0.]
[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [1.]
 [0.]]
False


In [50]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import log_loss

model = LogisticRegression()
model.fit(XTrain, yTrain)
predicted_classes = model.predict(XTrain)
accuracy = accuracy_score(yTrain.flatten(),predicted_classes)
accuracy = accuracy * 100
parameters = model.coef_
log_loss(yTrain, predicted_classes)



  y = column_or_1d(y, warn=True)


6.357454020076612

In [51]:
#print(parameters)
print(accuracy, '%')

81.59333333333333 %


# Tips fra gruppelærer: 
Lage et enklere dataset som har x med 1000 elementer og første 500 verdiene = 0 og de siste etter = 1, slik at y fra 0 - 500 = 0 osv. Sjekker man logisitic regression på dette så vil accuracy være 100% med scikit. Kan bruke cost-funksjon med log for OLS. Ikke nødvenig å kjøre for Ridge og Lasso, dette er tidskrevende for oppgaven. Han tror ikke vi har tid til dette. Var inne på god tanke med loopen. 

# Eget dataset: 

In [47]:
x_test = np.zeros([999, 1])
y_test = np.zeros([999, 1])

y_test[500:999, 0] = 1

x_test[500:999, 0] = 1

In [51]:
#print(x_test)
#print(y_test)

In [49]:
model = LogisticRegression()
model.fit(x_test, y_test)
predicted_classes = model.predict(x_test)
accuracy = accuracy_score(y_test.flatten(),predicted_classes)
accuracy = accuracy * 100
parameters = model.coef_

  y = column_or_1d(y, warn=True)


In [50]:
#print(parameters)
print(accuracy, '%')

100.0 %


In [52]:
beta = np.random.randn(26,1)

In [53]:
print(beta)

[[ 1.53277921]
 [ 1.46935877]
 [ 0.15494743]
 [ 0.37816252]
 [-0.88778575]
 [-1.98079647]
 [-0.34791215]
 [ 0.15634897]
 [ 1.23029068]
 [ 1.20237985]
 [-0.38732682]
 [-0.30230275]
 [-1.04855297]
 [-1.42001794]
 [-1.70627019]
 [ 1.9507754 ]
 [-0.50965218]
 [-0.4380743 ]
 [-1.25279536]
 [ 0.77749036]
 [-1.61389785]
 [-0.21274028]
 [-0.89546656]
 [ 0.3869025 ]
 [-0.51080514]
 [-1.18063218]]
