In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time

In [20]:
def read_data():
    df = pd.read_csv('bankrupcy.csv')
    Q1 = df.quantile(0.05)
    Q3 = df.quantile(0.95)
    IQR = Q3 - Q1
    dataset_out = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
    return dataset_out.to_numpy()
#convert to numpy make it faster

In [21]:
# data set to 0 - 1 impact remains the same
def normalize(data):
    minimums = np.min(data,axis=0)
    maximums =np.max(data,axis=0)
    rng= maximums-minimums
    normalize_data = 1-((maximums-data)/rng)
    return normalize_data

In [22]:
#  1/(1+e^-a)
def logistc_function(thetas,X):
    #sygmoid
    return 1.0/(1+np.exp(-np.dot(X,thetas.T)))
    

In [23]:
def cost_func(thetas, X, y):
    log_func_value = logistc_function(thetas, X)
    step1 = y* np.log(log_func_value)
    step2 = (1 - y) * np.log(1 - log_func_value)
    return np.mean(-step1 - step2)

In [24]:
# return direction
def log_gradient(thetas, X, y): 
    calc_1 = logistc_function(thetas, X) - y.reshape(X.shape[0],1) 
    calc_f = np.dot(calc_1.T, X) 
    return calc_f

In [25]:
#thetas is random but will go to the global minima
def gradient_descent(X, y, thetas, learning_rate, convergance_criteria):
    cost = cost_func(thetas , X, y)
    change = 1
    iter_count = 1
    
    while(change > convergance_criteria):
        old_cost = cost
        thetas = thetas - (learning_rate * log_gradient(thetas, X, y))
        cost = cost_func(thetas, X, y)
        change = old_cost - cost
        iter_count += 1
        
    return thetas , iter_count
    

In [26]:
def cross_validation(train_X,train_Y,w,learning_rate,epsilon, length):
    Err_A=0
    lowest=1000;
    bestw=w;
    initw=w;
    for k in range(1,11):
        low= length*(k-1)
        up = length*k - 1
        w=initw
        validate_x=train_X[low:up,:]
        t_x=np.append(train_X[0:low,:],train_X[up:(length*10-1),:],axis=0)
        validate_y=train_Y[low:up,:]
        validate_y = validate_y[:,-1]
        t_y=np.append(train_Y[0:low,:],train_Y[up:(length*10-1),:],axis=0)
        t_y = t_y[:,-1]
        w ,count= gradient_descent(t_x,t_y,w,learning_rate,0.001)
        Xw = np.dot(validate_x,w.T)
        a = np.dot(validate_y.T,validate_y)
        b = 2 * np.dot(validate_y.T,Xw)
        c = np.dot(validate_x,w.T)
        d = np.dot(c.T,Xw)
        err = a - b + d
        # print("Err =" + str(err))
        y_pred = predict(w,validate_x)
        # y_pred = y_pred[:,None]
        score = Accu_eval(y_pred, validate_y, length)
        print("iteration: ",k)     
        print("Accuracy: " + str(score) + "%")
        Err_A += err
        if(err<lowest):
           lowest=err
           bestw=w
 
    Err_A = Err_A / 10
    return (Err_A,w)

In [27]:
def predict(thetas, X):
    prob = logistc_function(thetas,X)
    #dicision boundary
    predicted_value = np.where(prob > 0.5, 1, 0)
    return np.squeeze(predicted_value)

In [28]:
def Accu_eval(y_pred, y, length):
    corrects = np.sum(y == y_pred) / length * 100
    return corrects

In [54]:
#main
start_time = time.time()
learning_rate =0.001
covergance_criteria = 0.001
dataset = read_data()

trainNum = dataset.shape[0]
featureNum = dataset.shape[1]-1

while(trainNum % 10 != 0):
    trainNum = trainNum -1
    
print(trainNum)
testNum = int(trainNum/10)

normalized = normalize(dataset)
print("dataset shape", normalized.shape)

# copy all the rows apart from last column
X = normalized[:trainNum,:-1]
test_X = normalized[-testNum:,:-1]
test_X = np.hstack((np.matrix(np.ones(test_X.shape[0])).T,test_X))
X = np.hstack((np.matrix(np.ones(X.shape[0])).T,X))
y = normalized[:trainNum,-1]
test_y = normalized[-testNum:,-1]


print("main: " + str(X.shape))
#init thetas just zeros
thetas = np.matrix(np.zeros(X.shape[1]))

#training the thetas
(final_thetas, iter_count) = gradient_descent(X,y,thetas,learning_rate,covergance_criteria)
# print(final_thetas, iter_count)
y_pred = predict(final_thetas,test_X)
corrects = np.sum(test_y == y_pred)
total = test_y.__len__()
print("fit function(Gradient descent): ") 

print("Accuracy without cross validation ", corrects/total * 100)
print("--- %s seconds ---" % (time.time() - start_time))
newTime = time.time()

print("10 folder cross validation:")
y=y[:,None]
err, final_thetas = cross_validation(X,y,thetas,learning_rate,covergance_criteria,testNum)
print("--- %s seconds ---" % (time.time() - newTime))


330
dataset shape (334, 65)
main: (330, 65)
fit function(Gradient descent): 
Accuracy without cross validation  93.93939393939394
--- 1.4480409622192383 seconds ---
10 folder cross validation:
iteration:  1
Accuracy: 72.72727272727273%
iteration:  2
Accuracy: 84.84848484848484%
iteration:  3
Accuracy: 69.6969696969697%
iteration:  4
Accuracy: 81.81818181818183%
iteration:  5
Accuracy: 78.78787878787878%
iteration:  6
Accuracy: 78.78787878787878%
iteration:  7
Accuracy: 78.78787878787878%
iteration:  8
Accuracy: 69.6969696969697%
iteration:  9
Accuracy: 72.72727272727273%
iteration:  10
Accuracy: 81.81818181818183%
--- 19.890338897705078 seconds ---


In [51]:
# df = pd.read_csv('bankrupcy.csv')
# X = df.iloc[:,0:-1]
# y = df.iloc[:,-1:]
# print("sas",X.shape,y.shape)
# cor_selector(y,X,452)