In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [5]:
# Load the raw dataset; the next cell expects an "Outcome" column holding the
# labels, with every remaining column treated as a feature.
df = pd.read_csv("diabetes.csv")

In [6]:
# Separate the feature matrix from the "Outcome" label column.
X = df.drop(columns=["Outcome"]).to_numpy(dtype=float)
y = df['Outcome'].values

# Min-max scale every feature column into [0, 1].
# NOTE(review): the min/max are computed on the full dataset, i.e. before any
# train/validation/test split, so held-out rows influence the scaling.
X_min = X.min(axis=0)
X_max = X.max(axis=0)
# A constant column has zero range; dividing by 1 instead maps it to 0 rather
# than producing NaN from 0/0.
X_range = np.where(X_max > X_min, X_max - X_min, 1.0)
X = (X - X_min) / X_range

# Prepend a column of ones so the first coefficient acts as the intercept.
X = np.hstack([np.ones((X.shape[0], 1)), X])

In [7]:
def split_data(X, y, train_ratio, val_ratio, test_ratio, seed=None):
    """Shuffle the samples and split them into train / validation / test parts.

    Args:
        X: Array of samples, indexed along its first axis.
        y: Array of labels with the same length as ``X``.
        train_ratio: Fraction of the samples assigned to the training part.
        val_ratio: Fraction of the samples assigned to the validation part.
        test_ratio: Fraction of the samples assigned to the test part. When
            the three ratios sum to 1 the test part takes every remaining
            sample, so integer truncation never drops rows; otherwise it is
            limited to ``int(test_ratio * n)`` samples.
        seed: Optional seed for a private random generator. With the default
            ``None`` the shuffle draws from NumPy's global random state.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, a ratio is negative,
            or the ratios sum to more than 1.
    """
    n = len(X)
    if len(y) != n:
        raise ValueError(f"X and y must have the same length, got {n} and {len(y)}")

    ratios = (train_ratio, val_ratio, test_ratio)
    total = sum(ratios)
    if min(ratios) < 0 or total > 1 + 1e-9:
        raise ValueError(f"ratios must be non-negative and sum to at most 1, got {ratios}")

    if seed is None:
        idx = np.random.permutation(n)
    else:
        idx = np.random.default_rng(seed).permutation(n)
    X, y = X[idx], y[idx]

    train_end = int(train_ratio * n)
    val_end = train_end + int(val_ratio * n)
    if np.isclose(total, 1.0):
        # Give the rounding remainder to the test part instead of discarding it.
        test_end = n
    else:
        test_end = min(n, val_end + int(test_ratio * n))

    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:test_end], y[val_end:test_end]

    return X_train, y_train, X_val, y_val, X_test, y_test


In [8]:
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` without overflow.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        Values in [0, 1] with the same shape as ``z`` (a NumPy scalar for
        scalar input).
    """
    z = np.asarray(z, dtype=float)
    # exp() is only ever applied to a non-positive number, so it cannot
    # overflow the way exp(-z) does for large negative z.
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple turns a 0-d array back into a scalar and
    # leaves higher-dimensional arrays unchanged.
    return result[()]

def predict(X, theta):
    """Return the sigmoid of the linear scores ``np.dot(X, theta)``."""
    linear_scores = np.dot(X, theta)
    return sigmoid(linear_scores)

def cutoff(y_pred_probs, threshold=0.5):
    """Convert probabilities into hard 0/1 labels.

    Args:
        y_pred_probs: Array-like of predicted probabilities.
        threshold: Decision threshold; values greater than or equal to it map
            to 1, everything else to 0. Defaults to 0.5.

    Returns:
        Integer NumPy array of 0s and 1s with the shape of ``y_pred_probs``.
    """
    return (np.asarray(y_pred_probs) >= threshold).astype(int)

def accuracy(y_true, y_pred_probs):
    """Return the fraction of thresholded predictions that equal ``y_true``."""
    is_correct = cutoff(y_pred_probs) == y_true
    return np.mean(is_correct)

def rss(y_true, y_pred):
    """Return the mean of the squared residuals ``y_true - y_pred``.

    Despite the name, the squared residuals are averaged rather than summed.
    """
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))

def log_loss(y_true, y_pred_probs, eps=1e-15):
    """Return the mean binary cross-entropy of the predicted probabilities.

    Probabilities are clipped to ``[eps, 1 - eps]`` first so that neither
    logarithm is evaluated at 0.
    """
    clipped = np.clip(y_pred_probs, eps, 1 - eps)
    per_sample = y_true * np.log(clipped) + (1 - y_true) * np.log(1 - clipped)
    return -np.mean(per_sample)


In [9]:
def logistic_regression_bgd(X, y, alpha=0.1, rho=1e-6, max_epochs=1000):
    """Fit logistic-regression weights with full-batch gradient descent.

    Each epoch takes one gradient step computed from all rows of ``X``. The
    loss recorded for an epoch is evaluated on the predictions made before
    that epoch's step. Training stops early once two consecutive recorded
    losses differ by less than ``rho``.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y: Array of targets, one per row of ``X``.
        alpha: Step size.
        rho: Early-stopping tolerance on the change in loss.
        max_epochs: Upper bound on the number of epochs.

    Returns:
        ``(theta, loss_history)``: the weight vector and the list of per-epoch
        log-loss values.
    """
    n_samples, n_features = X.shape
    theta = np.zeros(n_features)
    loss_history = []
    last_loss = float("inf")

    for _ in range(max_epochs):
        probs = predict(X, theta)

        # Loss of the current weights, taken before they are updated below.
        current_loss = log_loss(y, probs)
        loss_history.append(current_loss)

        gradient = (1/n_samples) * X.T.dot(probs - y)
        theta -= alpha * gradient

        if abs(last_loss - current_loss) < rho:
            break
        last_loss = current_loss

    return theta, loss_history


def logistic_regression_sgd(X, y, alpha=0.01, rho=1e-6, max_epochs=1000):
    """Fit logistic-regression weights with stochastic gradient descent.

    Every epoch visits the samples in a fresh random order (drawn from NumPy's
    global random state) and updates the weights after each single sample.
    The log loss over the whole of ``X`` is recorded at the end of each epoch,
    and training stops early once two consecutive recorded losses differ by
    less than ``rho``.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y: Array of targets, one per row of ``X``.
        alpha: Step size.
        rho: Early-stopping tolerance on the change in loss.
        max_epochs: Upper bound on the number of epochs.

    Returns:
        ``(theta, loss_history)``: the weight vector and the list of per-epoch
        log-loss values.
    """
    n_samples, n_features = X.shape
    theta = np.zeros(n_features)
    loss_history = []
    last_loss = float("inf")

    for _ in range(max_epochs):
        order = np.random.permutation(n_samples)

        for row, target in zip(X[order], y[order]):
            sample = row.reshape(1, -1)  # keep the row 2-D for predict()
            error = predict(sample, theta) - target
            theta -= alpha * (sample.T * error).flatten()

        epoch_loss = log_loss(y, predict(X, theta))
        loss_history.append(epoch_loss)

        if abs(last_loss - epoch_loss) < rho:
            break
        last_loss = epoch_loss

    return theta, loss_history


In [10]:
import os

# Seed NumPy's global random state so the shuffled splits and the SGD sample
# order (both drawn via np.random.permutation) repeat on a fresh run.
SEED = 42
np.random.seed(SEED)

# Search grid: (train, validation, test) ratio triples and the
# hyperparameter values tried for each split.
splits = [(0.5, 0.1, 0.4), (0.6, 0.1, 0.3), (0.7, 0.1, 0.2), (0.8, 0.1, 0.1)]
alphas = [0.01, 0.1]
rhos = [1e-6, 1e-5]
epochs_list = [500, 1000]

# Output directories for the training-curve images and the result CSV files.
os.makedirs("plots", exist_ok=True)
os.makedirs("results", exist_ok=True)


In [11]:
def train_model(X, y, method="bgd"):
    """Grid-search logistic regression and write plots and CSV reports.

    For every ratio triple in the module-level ``splits`` list the data is
    split with ``split_data``; each (alpha, rho, max_epochs) combination from
    ``alphas``, ``rhos`` and ``epochs_list`` is trained on the training part
    and the combination with the lowest validation ``rss`` is kept. The kept
    model is then scored for accuracy on the training and test parts.

    Side effects:
        * saves one loss curve per split to ``plots/{method}_training_curve_*.png``
        * writes ``results/{method}_final_results.csv`` and
          ``results/{method}_train_test_predictions.csv``
        * prints a completion message and the best hyperparameters overall

        The ``plots`` and ``results`` directories are not created here.

    Args:
        X: 2-D array of samples by features.
        y: Array of labels, one per row of ``X``.
        method: ``"bgd"`` for batch gradient descent or ``"sgd"`` for
            stochastic gradient descent.

    Raises:
        ValueError: If ``method`` is not ``"bgd"`` or ``"sgd"``, or if no
            hyperparameter combination produced a comparable validation score
            for a split (for example because the validation part is empty).
    """
    trainers = {"bgd": logistic_regression_bgd, "sgd": logistic_regression_sgd}
    if method not in trainers:
        # Previously any unknown name silently trained with SGD.
        raise ValueError(f"method must be 'bgd' or 'sgd', got {method!r}")
    fit = trainers[method]

    results = []
    output_results = []
    best_params_final = None
    overall_best_val_rss = float("inf")

    for split in splits:
        X_train, y_train, X_val, y_val, X_test, y_test = split_data(X, y, *split)
        best_val_rss = float("inf")
        best_theta = None
        best_history = None
        best_params = None

        for alpha in alphas:
            for rho in rhos:
                for max_epoch in epochs_list:
                    theta, history = fit(X_train, y_train, alpha, rho, max_epoch)

                    val_rss = rss(y_val, predict(X_val, theta))

                    if val_rss < best_val_rss:
                        best_val_rss = val_rss
                        best_theta = theta
                        best_history = history
                        best_params = (alpha, rho, max_epoch)

        if best_theta is None:
            # A NaN score never compares below the running best, which would
            # otherwise surface later as an opaque failure inside predict().
            raise ValueError(
                f"no hyperparameter combination gave a comparable validation "
                f"score for split {split}; is the validation part empty?"
            )

        if best_val_rss < overall_best_val_rss:
            overall_best_val_rss = best_val_rss
            best_params_final = best_params

        y_train_pred = predict(X_train, best_theta)
        y_test_pred = predict(X_test, best_theta)
        train_acc = accuracy(y_train, y_train_pred)
        test_acc = accuracy(y_test, y_test_pred)

        results.append([split, best_params, train_acc, test_acc])

        output_results.append([split, cutoff(y_train_pred).tolist(), y_train.tolist(),
                               cutoff(y_test_pred).tolist(), y_test.tolist()
                               ])

        # Draw on an explicit figure so an already-open pyplot figure is never
        # reused or closed by accident.
        split_str = f"{split[0]}_{split[1]}_{split[2]}"
        fig, ax = plt.subplots()
        ax.plot(best_history)
        ax.set_xlabel("Epochs")
        ax.set_ylabel("Log Loss")
        ax.set_title(f"{method.upper()} Training Curve Split={split}")
        fig.savefig(f"plots/{method}_training_curve_{split_str}.png")
        plt.close(fig)

    df_results = pd.DataFrame(results, columns=["Split", "Best Params (alpha,rho,epochs)", "Train Accuracy", "Test Accuracy"])
    df_results.to_csv(f"results/{method}_final_results.csv", index=False)

    df_outputs = pd.DataFrame(output_results, columns=["Split", "y_train_pred", "y_train", "y_test_pred", "y_test"])
    df_outputs.to_csv(f"results/{method}_train_test_predictions.csv", index=False)

    print(f"{method.upper()} training complete!")
    print("Overall best hyperparameters:", best_params_final)

In [12]:
# Run the grid search once per optimizer, batch gradient descent first.
for training_method in ("bgd", "sgd"):
    train_model(X, y, method=training_method)

BGD training complete!
Overall best hyperparameters: (0.1, 1e-06, 1000)
SGD training complete!
Overall best hyperparameters: (0.01, 1e-05, 1000)
