In [28]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import clear_output, display
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [154]:
def generate_dataset(size, n_features, random_state=None):
    """Build a synthetic two-class dataset with a leading bias column.

    Parameters
    ----------
    size : int
        Number of samples to generate.
    n_features : int
        Number of feature columns requested from ``make_classification``.
        Four of them are informative and none are redundant, so this must
        be at least 4.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``make_classification``. Pass a fixed integer to get the
        same dataset on every run; the default ``None`` keeps the draw random.

    Returns
    -------
    (a1, a2) : tuple of ndarray
        ``a1`` has shape ``(size, n_features + 1)`` with a column of ones in
        front of the features; ``a2`` holds the class labels as a
        ``(size, 1)`` column.
    """
    a1, a2 = make_classification(
        n_samples=size,
        n_features=n_features,
        n_classes=2,
        n_informative=4,
        n_redundant=0,
        random_state=random_state,
    )
    a2 = a2.reshape(-1,1)  # column vector so it lines up with the m x 1 hypothesis output
    a1 = np.hstack((np.ones((a1.shape[0],1)), a1)) # adding bias column to dataset
    print('dataset shape x: {} y: {}'.format(a1.shape, a2.shape))
    return a1, a2

In [196]:
def hypothesis(weights, x):
    """Logistic (sigmoid) hypothesis ``1 / (1 + exp(-x . weights))``.

    Parameters
    ----------
    weights : ndarray
        Weight column, n x 1 as used in this notebook.
    x : ndarray
        Design matrix, m x n as used in this notebook.

    Returns
    -------
    ndarray
        Element-wise sigmoid of ``np.dot(x, weights)`` (m x 1 for the shapes
        above), each value in [0, 1].

    Notes
    -----
    The exponential is only ever taken of a non-positive number, so scores
    of very large magnitude cannot overflow ``np.exp``.
    """
    # x: m x n
    # weights: n x 1
    z = np.dot(x, weights)  # m x 1 linear scores
    # exp(-|z|) always lies in (0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z)      z < 0: e^z / (1 + e^z)
    # Both are the same sigmoid, arranged so that only exp(-|z|) is needed.
    h = np.where(z >= 0, 1.0, decay) / (1.0 + decay)
    return h # m x 1

In [197]:
def cost(weights, x, y):
    """Mean binary cross-entropy of the logistic hypothesis on ``(x, y)``.

    Parameters
    ----------
    weights : ndarray
        Weight column, n x 1.
    x : ndarray
        Design matrix, m x n.
    y : ndarray
        0/1 labels, m x 1.

    Returns
    -------
    ndarray
        The cost as a 1 x 1 array (result of the two ``np.dot`` products
        divided by ``m``). The value is also printed.
    """
    # x: m x n
    # y: m x 1
    m = y.shape[0]
    h = hypothesis(weights, x) # m x 1
    # Keep h strictly inside (0, 1): a saturated prediction would otherwise
    # give log(0) = -inf, and 0 * -inf = nan inside the dot products below.
    eps = np.finfo(float).eps
    h = np.clip(h, eps, 1 - eps)
    j = -(np.dot(np.transpose(y), np.log(h)) + np.dot(np.transpose(1 - y), np.log(1 - h)))/m
    print('cost : {}'.format(j))
    return j

In [203]:
def update_weights(weights, x, y, k):
    """Apply one gradient step and return the resulting weights.

    Parameters
    ----------
    weights : ndarray
        Current weight column, n x 1.
    x : ndarray
        Design matrix for this step, m x n.
    y : ndarray
        0/1 labels for this step, m x 1.
    k : float
        Step multiplier applied to ``x^T (h - y)``.

    Returns
    -------
    ndarray
        A new n x 1 array; the ``weights`` argument is not modified in place.
        The new weights are also printed.
    """
    # Prediction error for every row in the step (m x 1).
    error = hypothesis(weights, x) - y
    # Un-normalised gradient of the cost with respect to the weights (n x 1).
    gradient = np.dot(np.transpose(x), error)
    stepped = weights - k * gradient
    print('updated weights {}'.format(stepped))
    return stepped

In [204]:
def gradient_descent(xtrain, ytrain, lr, weights, batch_size, max_epochs=None):
    """Fit logistic-regression weights by mini-batch gradient descent.

    The training rows are walked in order in slices of ``batch_size`` rows.
    After each slice the weights are updated, the cost on that slice is
    displayed, and training stops as soon as that cost differs from the
    previous slice's cost by less than 1e-5.

    Parameters
    ----------
    xtrain : ndarray
        Training design matrix, m x n.
    ytrain : ndarray
        Training labels, m x 1.
    lr : float
        Learning rate. Each step uses ``lr`` divided by the number of rows
        in the current slice.
    weights : ndarray
        Initial weight column, n x 1.
    batch_size : int
        Number of rows per slice; the last slice of a pass may be shorter.
    max_epochs : int or None, optional
        Upper bound on full passes over the data. ``None`` (the default)
        runs until the convergence test is met.

    Returns
    -------
    ndarray
        The final weights. The ``weights`` argument is returned as-is when
        there are no training rows or ``max_epochs`` is not positive.
    """
    total_sample = xtrain.shape[0]
    if total_sample == 0:
        # Nothing to learn from. Without this guard the epoch loop would spin
        # forever, because no slice would ever reach the convergence test.
        return weights

    j_prev = 100000000  # large sentinel standing in for "no previous cost yet"
    t = 0  # completed passes over the data
    while max_epochs is None or t < max_epochs:
        for i in range(0, total_sample, batch_size):
            x_data = xtrain[i:i+batch_size, :]
            y_data = ytrain[i:i+batch_size, :]
            # Normalise by the rows actually present in this slice, so a short
            # final slice (or batch_size > total_sample) is averaged over the
            # same row count that cost() divides by.
            k = lr / x_data.shape[0]
            weights = update_weights(weights, x_data, y_data, k)
            j = cost(weights, x_data, y_data)
            display('Iteration '+str(t) + ' sample '+ str(i) + '-----j : {}'.format(j)+ '-----j_prev : {}'.format(j_prev))
            time.sleep(0.1)  # keep the progress line visible briefly before clearing it
            clear_output(wait=True)
            if abs(j_prev - j) < 0.00001:
                # Converged: stop both the slice loop and the epoch loop.
                return weights
            j_prev = j
        t += 1
    return weights

In [200]:
def predict(weights, xinput):
    """Classify each row of ``xinput`` as 1 or 0.

    A row is labelled 1 when its hypothesis value is at least 0.5 and 0
    otherwise. The result has the same shape as the hypothesis output.
    """
    probabilities = hypothesis(weights, xinput)
    is_positive = probabilities >= 0.5
    return np.where(is_positive, 1, 0)

In [213]:
def accuracy_score(ytest, pred):
    """Compare predicted labels with the true labels.

    Parameters
    ----------
    ytest : array-like
        True labels, as a column (m x 1) or a flat vector of length m.
    pred : array-like
        Predicted labels, same number of elements as ``ytest``.

    Returns
    -------
    (accuracy, wrong_count) : tuple
        ``accuracy`` is the fraction of positions where the two agree.
        ``wrong_count`` is the sum of absolute differences between the two,
        which equals the number of misclassified samples when both hold
        0/1 labels.

    Raises
    ------
    ValueError
        If ``ytest`` and ``pred`` do not contain the same number of elements.
    """
    # Flatten both so an (m, 1) column and an (m,) vector are compared
    # element-wise instead of broadcasting to an (m, m) matrix.
    actual = np.asarray(ytest).ravel()
    predicted = np.asarray(pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            'ytest and pred must have the same number of elements, got {} and {}'.format(
                actual.shape[0], predicted.shape[0]))
    # Uses the `pred` argument (not any notebook-level variable) for both metrics.
    return np.sum(actual == predicted)/actual.shape[0], np.sum(np.abs(predicted - actual))

In [205]:
# Generate the full synthetic dataset, then hold out 30% of it for testing.
x_all, y_all = generate_dataset(10000, 4)
xtrain, xtest, ytrain, ytest = train_test_split(x_all, y_all, test_size=0.3)
# One weight per column of xtrain (bias column included), so this stays
# correct if the feature count passed to generate_dataset changes.
initial_weights = np.zeros((xtrain.shape[1], 1))
learning_rate = 0.1
batch_size = 10000  # larger than the training split, so each pass is a single full batch

dataset shape x: (10000, 5) y: (10000, 1)


In [206]:
# Train starting from initial_weights and show the learned coefficients.
weights = gradient_descent(
    xtrain,
    ytrain,
    lr=learning_rate,
    weights=initial_weights,
    batch_size=batch_size,
)
print(weights)

[[ 1.201605  ]
 [ 0.99087968]
 [-0.6729127 ]
 [ 1.29780923]
 [ 0.95925107]]


In [207]:
# Hard 0/1 class labels for the held-out test rows.
pred_val = predict(weights=weights, xinput=xtest)

In [208]:
# Columns: hypothesis value | thresholded prediction | true label.
side_by_side = np.hstack((hypothesis(weights, xtest), pred_val, ytest))
print(side_by_side)

[[0.9860943  1.         1.        ]
 [0.00889801 0.         0.        ]
 [0.24136294 0.         0.        ]
 ...
 [0.8552246  1.         1.        ]
 [0.99942553 1.         1.        ]
 [0.88742154 1.         1.        ]]


In [214]:
# Score the test-set predictions and print both returned values, one per line.
accuracy, wrong_counts = accuracy_score(ytest, pred_val)
print(accuracy, wrong_counts, sep='\n')

0.896
312


In [211]:
# Cross-check the hand-written accuracy against scikit-learn's implementation
# (`metrics` is imported in the notebook's import cell).
print(metrics.accuracy_score(ytest, pred_val))

0.896
