In [2]:
import numpy as np
import pandas as pd
import math

# logistic regression using gradient descent

In [3]:
def normalise(x, mean, std):
    return (x - mean) / std

In [4]:
def predict_row(row, coefficients):
    temp = 0
    for i in range(len(coefficients)):
        temp += coefficients[i] * row[i]
    return 1.0 / (1.0 + math.exp(-temp))

In [5]:
def cost_fn(x, y, coef):
    n = x_train.shape[1]
    m = len(y)
    h = np.zeros([m])
    cost = 0
    for i in range(m):
        h[i] = predict_row(x[i], coef)
        cost += y[i] * math.log(h[i]) + (1 - y[i]) * math.log(1 - h[i])
    cost = (-1.0) * cost / m
    return cost

In [6]:
def set_coeff(x, y, l_rate, n_epoch):
    n = x_train.shape[1]
    m = len(y)
    coeff = np.zeros([n])
    temp = np.zeros([n])
    h = np.zeros([m])
    for epoch in range(n_epoch):
        for i in range(n):
            error = 0
            for j in range(m):
                error += (predict_row(x[j], coeff) - y[j]) * x[j][i]
            temp[i] = l_rate * error
        coeff = np.subtract(coeff, temp)
#         if epoch % 100 == 0:
#             print(cost_fn(x, y, coeff))
    return coeff

In [7]:
df = pd.read_csv('diabetes.csv', header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [16]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values
for i in range(X.shape[1]):
    me = X.T[i].mean()
    std = np.std(X.T[i])
    for j in range(X.shape[0]):
        X[j][i] = normalise(X[j][i], me, std)    
X

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [9]:
x_train = X[:600]
y_train = Y[:600]
x_test = X[600:]
y_test = Y[600:]

In [10]:
temp = np.ones([len(y_train)])
temp = temp.reshape(600, 1)
x_train = np.append(temp, x_train, axis = 1)

In [11]:
temp = np.ones([len(y_test)])
temp = temp.reshape(168, 1)
x_test = np.append(temp, x_test, axis = 1)

In [12]:
l_rate = 0.001
epoch = 1000
theta = set_coeff(x_train, y_train, l_rate, epoch)

In [14]:
temp = np.ones((9, 1))
cost_fn(x_train, y_train, temp)

1.1176328614489954

In [15]:
cost_fn(x_train, y_train, theta)
theta

array([-0.87034447,  0.40202834,  1.03354693, -0.19523664, -0.03650942,
       -0.10104102,  0.80082497,  0.34019414,  0.09908342])

In [212]:
x_train.shape

(600, 9)

In [287]:
cnt = 0
for i in range(x_test.shape[0]):
    res = 0
    if predict_row(x_test[i], theta) >= 0.45:
        res = 1
    if y_test[i] == res:
        cnt += 1
print(cnt * 100 / 168)

79.16666666666667


# Logistic regression using normalised equation

In [213]:
temp = np.linalg.inv(np.matmul(x_train.T, x_train))

In [214]:
temp = np.matmul(temp, x_train.T)

In [215]:
result = np.matmul(temp, y_train)

In [216]:
result

array([-8.23788820e-01,  2.00750001e-02,  5.44472539e-03, -1.90712175e-03,
       -4.26278477e-04, -1.20939764e-04,  1.46925613e-02,  1.58155586e-01,
        1.32756030e-03])

In [217]:
cost_fn(x_train, y_train, result)

0.7029108467626587

In [245]:
cnt = 0
for i in range(x_test.shape[0]):
    res = 0
    if predict_row(x_test[i], result) > 0.615:
        res = 1
#     print(y_test[i], res)
    if y_test[i] == res:
        cnt += 1
print(cnt * 100 / 168)

79.76190476190476


# Logistic regression using sklearn

In [234]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
X_train = X[:600]
Y_train = Y[:600]
X_test = X[600:]
Y_test = Y[600:]
lr_model.fit(X_train, Y_train)
y_pred_sk = lr_model.predict(X_test)
print(f"Accuracy = {lr_model.score(X_test, Y_test)}")

Accuracy = 0.7559523809523809


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
