In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Seed NumPy's legacy global RNG so later np.random.* calls are repeatable between runs.
np.random.seed(123456789)

In [None]:
# loading the diabetes dataset
# The path is relative to the notebook's working directory. Later cells expect an
# "Outcome" label column; every other column is treated as a feature.
df = pd.read_csv("diabetes.csv")

In [None]:
# data analysis
# Split the frame into a float feature matrix and an integer label column.
feature_names = [c for c in df.columns if c != "Outcome"]
X_raw = df[feature_names].values.astype(float)
# Labels are kept as an (n, 1) column so they line up with (n, 1) model outputs.
y = df["Outcome"].values.astype(int).reshape(-1, 1)

# analysis of dataset
print("X_raw shape:", X_raw.shape)
print("y shape    :", y.shape)
print("Features   :", feature_names)
# NOTE: a bare `np.mean(y)` used to sit here. An expression in the middle of a cell
# is never displayed, and the same number is printed below as the "Diabetes" share.

# target variable distribution, want to see how many patients have diabetes vs not
plt.figure(figsize=(5,4))
sns.countplot(x=y.reshape(-1))
plt.title("Distribution of Diabetes Outcome")
plt.xlabel("Outcome (0 = No diabetes, 1 = Diabetes)")
plt.ylabel("Number of patients")
plt.show()

print("Class proportions:")
print("No diabetes:", np.mean(y==0))
print("Diabetes   :", np.mean(y==1))

# feature distributions, seeing what medical variables look like
# The plots need zero-imputed features, but `X_clean` is only created by a later
# cell, so referencing it here raises NameError on Restart & Run All. Build the
# imputed frame locally with the same rule instead: in the columns where a zero
# stands for a missing measurement, replace zeros by the mean of the non-zero values.
zero_as_missing_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df_clean = df[feature_names].astype(float)
for col in zero_as_missing_cols:
    is_zero = df_clean[col] == 0
    df_clean.loc[is_zero, col] = df_clean.loc[~is_zero, col].mean()

df_clean.hist(figsize=(14,10), bins=20)
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()

# feature distributions for diabetic vs non-diabetic
df_plot = df_clean.copy()
# y is an (n, 1) column; flatten it so it is stored as an ordinary 1-D column.
df_plot["Outcome"] = y.reshape(-1)
features_to_plot = ["Glucose", "BMI", "Age", "Insulin"]

for f in features_to_plot:
    plt.figure(figsize=(5,4))
    sns.histplot(data=df_plot, x=f, hue="Outcome", kde=True, bins=20)
    plt.title(f"{f} distribution by diabetes outcome")
    plt.show()

# correlation matrix
plt.figure(figsize=(10,8))
sns.heatmap(df_plot.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

In [None]:
# replacing "zero" in data with the mean of nonzero elements for specific columns
def replace_zeros_with_nonzero_mean(X, feature_names, columns_to_fix):
    """Return a float copy of ``X`` with zeros in selected columns mean-imputed.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix; it is never modified.
    feature_names : list of str
        Column names of ``X`` in order; ``feature_names.index(col)`` locates a column.
    columns_to_fix : iterable of str
        Names of the columns in which a zero is replaced.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``X``.

    Raises
    ------
    ValueError
        If a name in ``columns_to_fix`` is not in ``feature_names``.

    One summary line is printed per processed column.
    """
    # Always work on a float copy: with an integer input the fractional part of the
    # imputed mean would otherwise be silently truncated on assignment.
    X = np.array(X, dtype=float)
    for col in columns_to_fix:
        j = feature_names.index(col)
        mask_zero = (X[:, j] == 0)
        n_zero = int(np.sum(mask_zero))
        if n_zero == mask_zero.size:
            # No non-zero entry to average over; the mean would be NaN, so leave
            # the column as it is rather than poisoning it.
            print(f"{col:15s}: no non-zero values, column left unchanged")
            continue
        mean_val = np.mean(X[~mask_zero, j])
        X[mask_zero, j] = mean_val
        print(f"{col:15s}: replaced {n_zero} zeros with mean(nonzero) = {mean_val:.3f}")
    return X

# Columns in which a recorded 0 is treated as a missing measurement and imputed.
columns_to_fix = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]
# X_raw itself is left untouched; the imputed copy is what the split below consumes.
X_clean = replace_zeros_with_nonzero_mean(X_raw, feature_names, columns_to_fix)

In [None]:
# functions for standardise, data matrix and split
def standardise(data_matrix, row_of_means=None, row_of_stds=None):
    """Centre and scale the columns of ``data_matrix``.

    Parameters
    ----------
    data_matrix : numpy.ndarray
        Data with one sample per row.
    row_of_means, row_of_stds : numpy.ndarray, optional
        Statistics to reuse (for example those of the training set). If either one
        is ``None`` both are recomputed from ``data_matrix``.

    Returns
    -------
    tuple or numpy.ndarray
        ``(standardised, means, stds)`` when the statistics are computed here,
        otherwise only the standardised array.

    Notes
    -----
    A constant column has a standard deviation of zero, which would turn the
    division into NaN/inf. Such scales are replaced by 1.0, so the column maps to
    zeros, and the adjusted scale is what gets returned.
    """
    if row_of_means is None or row_of_stds is None:
        row_of_means = np.mean(data_matrix, axis=0)
        centered = data_matrix - row_of_means
        row_of_stds = np.asarray(np.std(centered, axis=0), dtype=float)
        row_of_stds = np.where(row_of_stds == 0, 1.0, row_of_stds)
        return centered / row_of_stds, row_of_means, row_of_stds
    # Apply the same zero-scale guard to statistics supplied by the caller.
    safe_stds = np.asarray(row_of_stds, dtype=float)
    safe_stds = np.where(safe_stds == 0, 1.0, safe_stds)
    return (data_matrix - row_of_means) / safe_stds

def linear_regression_data(data_inputs):
    """Build a design matrix by prepending a column of ones to the inputs.

    The leading ones column multiplies the bias (intercept) weight.
    """
    n_rows = len(data_inputs)
    # column_stack turns the 1-D ones vector into the first column of the result.
    return np.column_stack((np.ones(n_rows), data_inputs))

def train_test_split(X, y, test_ratio=0.2, seed=123):
    """Randomly split paired arrays into a training and a test part.

    Parameters
    ----------
    X, y : numpy.ndarray
        Samples and labels with the same number of rows.
    test_ratio : float
        Fraction of rows placed in the test part (rounded to the nearest row).
    seed : int
        Seed of the shuffle; the same seed always gives the same split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` have different numbers of rows.
    """
    n = len(X)
    if len(y) != n:
        raise ValueError(f"X has {n} rows but y has {len(y)}")
    # A private RandomState yields the same permutation as seeding the global
    # generator, but leaves the notebook-wide np.random state untouched.
    rng = np.random.RandomState(seed)
    idx = rng.permutation(n)
    test_n = int(round(n * test_ratio))
    test_idx = idx[:test_n]
    train_idx = idx[test_n:]
    return X[train_idx], y[train_idx], X[test_idx], y[test_idx]

def classification_accuracy(predicted_labels, true_labels):
    """Fraction of positions at which the predicted and true labels agree.

    Both arrays are flattened first, so (n,) and (n, 1) inputs can be mixed.
    """
    flat_predicted = predicted_labels.reshape(-1)
    flat_true = true_labels.reshape(-1)
    is_correct = flat_predicted == flat_true
    return np.mean(is_correct)

# Hold out 20% of the zero-imputed data as the test set; the "_raw" suffix marks
# features that have not been standardised yet.
X_train_raw, y_train, X_test_raw, y_test = train_test_split(X_clean, y, test_ratio=0.2, seed=123)

print("Train shapes:", X_train_raw.shape, y_train.shape)
print("Test shapes :", X_test_raw.shape, y_test.shape)

In [None]:
# standardise using training data only
# The means/stds come from the training split and are reused on the test split, so
# no test-set statistics leak into preprocessing.
X_train, mu_X, std_X = standardise(X_train_raw)
X_test = standardise(X_test_raw, mu_X, std_X)

# Design matrices: a leading column of ones is prepended for the bias weight.
Phi_train = linear_regression_data(X_train)
Phi_test = linear_regression_data(X_test)

print("Phi_train shape:", Phi_train.shape)
print("Phi_test shape :", Phi_test.shape)

In [None]:
# using logistic regression for binary data and gradient descent
def logistic_function(inputs):
    """Numerically stable logistic sigmoid ``1 / (1 + exp(-inputs))``.

    Parameters
    ----------
    inputs : scalar or array-like
        Real-valued scores.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Values in [0, 1] with the shape of ``inputs``.

    Notes
    -----
    Evaluating ``exp(-inputs)`` directly overflows for large negative scores.
    Only ``exp(-|inputs|)`` is evaluated here, which lies in (0, 1]. For
    non-negative scores the arithmetic matches the naive formula exactly.
    """
    z = np.asarray(inputs, dtype=float)
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and is a no-op for arrays.
    return result[()]

def model_function(data_matrix, weights):
    """Linear model output: the matrix product of the data matrix and the weights."""
    linear_scores = data_matrix @ weights
    return linear_scores

def binary_logistic_regression_cost_function(data_matrix, data_labels, weights):
    """Mean logistic (cross-entropy) loss of a linear model.

    Parameters
    ----------
    data_matrix : numpy.ndarray
        Design matrix, one sample per row.
    data_labels : array-like
        Binary labels (0/1), one per sample.
    weights : numpy.ndarray
        Model weights.

    Returns
    -------
    numpy.float64
        ``mean(log(1 + exp(z)) - labels * z)`` with ``z = model_function(...)``.
    """
    z = model_function(data_matrix, weights)
    labels = np.asarray(data_labels)
    # A flat (n,) label vector against (n, 1) scores (or the reverse) would
    # silently broadcast to an (n, n) matrix and give a wrong loss. Align the
    # labels with the scores whenever only the shape, not the size, differs.
    if labels.shape != np.shape(z) and labels.size == np.size(z):
        labels = labels.reshape(np.shape(z))
    # using logaddexp for numerical stability
    return np.mean(np.logaddexp(0, z) - labels * z)

def binary_logistic_regression_gradient(data_matrix, data_labels, weights):
    """Gradient of the mean logistic loss with respect to the weights.

    Parameters
    ----------
    data_matrix : numpy.ndarray
        Design matrix, one sample per row.
    data_labels : array-like
        Binary labels (0/1), one per sample.
    weights : numpy.ndarray
        Model weights.

    Returns
    -------
    numpy.ndarray
        ``data_matrix.T @ (p - labels) / n``, where ``p`` are the predicted
        probabilities and ``n`` is the number of rows of ``data_matrix``.
    """
    p = logistic_function(model_function(data_matrix, weights))
    labels = np.asarray(data_labels)
    # Guard against (n,) vs (n, 1) broadcasting, which would turn the residual
    # into an (n, n) matrix and produce a gradient of the wrong shape.
    if labels.shape != np.shape(p) and labels.size == np.size(p):
        labels = labels.reshape(np.shape(p))
    return data_matrix.T @ (p - labels) / len(data_matrix)

def binary_prediction_labels(data_matrix, weights, threshold=0.5):
    """Hard 0/1 predictions from the logistic model.

    A sample is labelled 1 when its predicted probability is strictly greater
    than ``threshold``, and 0 otherwise.
    """
    linear_scores = model_function(data_matrix, weights)
    probabilities = logistic_function(linear_scores)
    above_threshold = probabilities > threshold
    return above_threshold.astype(int)

def gradient_descent(objective, gradient, initial_weights, step_size=1.0, no_of_iterations=1000, print_output=200):
    """Minimise ``objective`` with fixed-step gradient descent.

    Parameters
    ----------
    objective : callable
        Maps weights to the objective value.
    gradient : callable
        Maps weights to the gradient of the objective.
    initial_weights : array-like
        Starting point; it is copied and never modified.
    step_size : float
        Step length applied to every update.
    no_of_iterations : int
        Number of updates to perform.
    print_output : int
        Print progress every ``print_output`` iterations; 0 disables the
        periodic progress lines.

    Returns
    -------
    tuple
        ``(weights, objective_values)``, where ``objective_values`` holds the
        objective at the start and after every iteration.
    """
    weights = np.array(initial_weights)
    # The in-place update below needs an inexact dtype; integer start values
    # would otherwise raise a casting error.
    if not np.issubdtype(weights.dtype, np.inexact):
        weights = weights.astype(float)
    objective_values = [objective(weights)]
    for k in range(no_of_iterations):
        weights -= step_size * gradient(weights)
        objective_values.append(objective(weights))
        if print_output and (k + 1) % print_output == 0:
            # Report the objective reached after this update (the newest entry),
            # not the value from before the update.
            print("Iteration {k}/{m}, objective = {o}.".format(k=k+1, m=no_of_iterations, o=objective_values[-1]))
    print("Iteration completed after {k}/{m}, objective = {o}.".format(k=no_of_iterations, m=no_of_iterations, o=objective_values[-1]))
    return weights, objective_values

In [None]:
# training the baseline logistic model, using step size: tau = 3.9 * s / ||Phi||^2
# Here s is the number of training rows and ||Phi|| is np.linalg.norm(Phi_train),
# i.e. the Frobenius norm of the design matrix.

# Start from all-zero weights, one per column of the design matrix (bias first).
w0 = np.zeros((Phi_train.shape[1], 1))
step_size = 3.9 * len(Phi_train) / (np.linalg.norm(Phi_train) ** 2)

# Bind the training data so the optimiser only sees functions of the weights.
objective = lambda w: binary_logistic_regression_cost_function(Phi_train, y_train, w)
gradient = lambda w: binary_logistic_regression_gradient(Phi_train, y_train, w)

w_log, objective_vals = gradient_descent(objective, gradient, w0, step_size=step_size, no_of_iterations=2000, print_output=200)

# Hard 0/1 predictions at the default 0.5 probability threshold.
pred_train = binary_prediction_labels(Phi_train, w_log)
pred_test = binary_prediction_labels(Phi_test, w_log)

acc_train = classification_accuracy(pred_train, y_train)
acc_test = classification_accuracy(pred_test, y_test)

print("Train accuracy:", acc_train)
print("Test accuracy :", acc_test)

# Convergence check: objective value at the start and after every iteration.
plt.plot(objective_vals)
plt.xlabel("Iteration")
plt.ylabel("Cost")
plt.title("Training objective values")
plt.show()


In [None]:
# creating confusion matrix, ROC graph and AUC
def confusion_matrix_binary(y_true, y_pred):
    """Count the 2x2 confusion matrix for 0/1 labels.

    Rows are the true class and columns the predicted class, so the result is
    ``[[TN, FP], [FN, TP]]``. Both inputs are flattened before counting.
    """
    actual = y_true.reshape(-1)
    predicted = y_pred.reshape(-1)
    return np.array([
        [np.sum((actual == true_class) & (predicted == pred_class)) for pred_class in (0, 1)]
        for true_class in (0, 1)
    ])

def roc_curve_points(data_matrix, weights, y_true, delta=0.01):
    """Sweep the decision threshold and collect ROC points.

    Thresholds run over ``np.arange(0, 1 + delta, delta)``. For each one the
    samples with predicted probability strictly above it are labelled 1.

    Returns a list of ``(fpr, tpr, threshold)`` tuples in threshold order. A rate
    whose denominator is zero (no positives or no negatives) is reported as 0.0.
    """
    labels = y_true.reshape(-1)
    scores = logistic_function(model_function(data_matrix, weights)).reshape(-1)

    def rates_at(threshold):
        # One confusion matrix per threshold, unpacked row by row.
        hard_labels = (scores > threshold).astype(int)
        (tn, fp), (fn, tp) = confusion_matrix_binary(labels, hard_labels)
        n_positive = tp + fn
        n_negative = fp + tn
        tpr = tp / n_positive if n_positive > 0 else 0.0
        fpr = fp / n_negative if n_negative > 0 else 0.0
        return (fpr, tpr, threshold)

    return [rates_at(t) for t in np.arange(0, 1 + delta, delta)]

def auc_from_roc_points(roc_pts):
    """Area under the ROC curve by the trapezoidal rule.

    Parameters
    ----------
    roc_pts : iterable of tuple
        Points whose first two entries are ``(fpr, tpr)``; further entries
        (such as the threshold) are ignored.

    Returns
    -------
    float
        The integral of TPR over FPR.
    """
    # Sort by FPR and break ties by TPR. Points sharing an FPR form a vertical
    # segment of the curve that has to be walked bottom-up. Sorting on FPR alone
    # keeps them in input order, and when that order is top-down the next
    # trapezoid starts from the lowest TPR and the area is under-counted
    # (a perfect classifier would score 0.5).
    ordered = sorted(roc_pts, key=lambda p: (p[0], p[1]))
    fpr = np.array([p[0] for p in ordered])
    tpr = np.array([p[1] for p in ordered])
    # NumPy 2.0 renamed trapz to trapezoid and deprecated the old name.
    integrate = getattr(np, "trapezoid", None)
    if integrate is None:
        integrate = np.trapz
    return integrate(tpr, fpr)

# Evaluate the baseline model on the held-out test set.
cm_log = confusion_matrix_binary(y_test, pred_test)
print("Confusion matrix [[TN, FP],[FN, TP]]:\n", cm_log)

# ROC points are (fpr, tpr, threshold) tuples, one per threshold in steps of delta.
roc_pts_log = roc_curve_points(Phi_test, w_log, y_test, delta=0.01)
auc_log = auc_from_roc_points(roc_pts_log)
print("Test AUC (baseline logistic):", auc_log)

# x = false positive rates, y_ = true positive rates (y_ avoids clobbering the labels y).
x = [p[0] for p in roc_pts_log]
y_ = [p[1] for p in roc_pts_log]
plt.plot(x, y_)
# Dashed diagonal for reference.
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curve (baseline logistic) - test set")
plt.show()

In [1]:
# K-fold split and grid search
def KFold_split(data_size, K, seed=123456789):
    """Shuffle ``range(data_size)`` and cut it into ``K`` nearly equal folds.

    Parameters
    ----------
    data_size : int
        Number of samples to split.
    K : int
        Number of folds.
    seed : int
        Seed of the shuffle; the same seed always gives the same folds.

    Returns
    -------
    list of numpy.ndarray
        ``K`` disjoint index arrays covering every index once. The first
        ``data_size % K`` folds hold one extra element.
    """
    # A private RandomState gives the same permutation as seeding the global
    # generator, without resetting the notebook-wide np.random state.
    rng = np.random.RandomState(seed)
    indexes = rng.permutation(data_size)
    m, r = divmod(data_size, K)
    # Fold i starts after i full folds plus the extras handed to earlier folds.
    indexes_split = [indexes[i*m + min(i,r):(i+1)*m + min(i+1,r)] for i in range(K)]
    return indexes_split

def grid_search(objective, grid):
    """Return the grid entry with the smallest objective value.

    ``objective`` is evaluated once per entry; on ties the earliest entry wins.
    """
    scores = np.array(list(map(objective, grid)))
    best_position = np.argmin(scores)
    return grid[best_position]

In [None]:
# L1-regularised logistic regression using proximal gradient
def soft_thresholding(argument, threshold):
    """Shrink every entry towards zero by ``threshold``.

    Entries whose magnitude does not exceed ``threshold`` become zero.
    """
    shrunk_magnitude = np.clip(np.abs(argument) - threshold, 0, None)
    return np.sign(argument) * shrunk_magnitude

def logistic_cost_L1(data_matrix, labels, weights, alpha, penalise_bias=False):
    """Logistic loss plus an L1 penalty scaled by ``alpha``.

    With ``penalise_bias=False`` (the default) the first weight, the bias, is
    left out of the penalty; otherwise every weight is penalised.
    """
    data_fit = binary_logistic_regression_cost_function(data_matrix, labels, weights)
    penalised_weights = weights if penalise_bias else weights[1:]
    return data_fit + alpha * np.sum(np.abs(penalised_weights))

def logistic_grad_smooth_part(data_matrix, labels, weights):
    """Gradient of the smooth part of the L1 objective.

    The L1 term is handled by the proximal map, so only the plain logistic-loss
    gradient is returned here.
    """
    smooth_gradient = binary_logistic_regression_gradient(data_matrix, labels, weights)
    return smooth_gradient

def proximal_gradient_descent(objective, gradient, proximal_map, initial_weights, step_size=1.0, no_of_iterations=1000, print_output=200):
    """Minimise a composite objective with fixed-step proximal gradient descent.

    Each iteration takes a gradient step on the smooth part and then applies
    ``proximal_map`` to the result.

    Parameters
    ----------
    objective : callable
        Maps weights to the full objective value (used for logging only).
    gradient : callable
        Maps weights to the gradient of the smooth part.
    proximal_map : callable
        Maps the gradient-step result to the next iterate.
    initial_weights : array-like
        Starting point; it is copied and never modified.
    step_size : float
        Step length of the gradient step.
    no_of_iterations : int
        Number of iterations to perform.
    print_output : int
        Print progress every ``print_output`` iterations; 0 disables the
        periodic progress lines.

    Returns
    -------
    tuple
        ``(weights, objective_values)``, where ``objective_values`` holds the
        objective at the start and after every iteration.
    """
    weights = np.copy(initial_weights)
    objective_values = [objective(weights)]
    for k in range(no_of_iterations):
        weights = proximal_map(weights - step_size * gradient(weights))
        objective_values.append(objective(weights))
        if print_output and (k + 1) % print_output == 0:
            # Report the objective reached after this iteration (the newest
            # entry), not the value from before the update.
            print("Iteration {k}/{m}, objective = {o}.".format(k=k+1, m=no_of_iterations, o=objective_values[-1]))
    print("Iteration completed after {k}/{m}, objective = {o}.".format(k=no_of_iterations, m=no_of_iterations, o=objective_values[-1]))
    return weights, objective_values

def train_logistic_L1_prox(Phi, y, alpha, iters=3000, print_output=500):
    """Fit L1-regularised logistic regression by proximal gradient descent.

    Training starts from all-zero weights with step ``0.9 * n / ||Phi||^2``. The
    bias (first weight) is neither penalised in the objective nor thresholded
    by the proximal map.

    Returns the ``(weights, objective_values)`` pair produced by
    ``proximal_gradient_descent``.
    """
    n_samples, n_weights = len(Phi), Phi.shape[1]
    step = 0.9 * n_samples / (np.linalg.norm(Phi) ** 2)

    def objective(w):
        return logistic_cost_L1(Phi, y, w, alpha, penalise_bias=False)

    def smooth_gradient(w):
        return logistic_grad_smooth_part(Phi, y, w)

    def prox(w):
        # Soft-threshold every weight except the bias at index 0.
        shrunk = np.copy(w)
        shrunk[1:] = soft_thresholding(shrunk[1:], step * alpha)
        return shrunk

    start = np.zeros((n_weights, 1))
    return proximal_gradient_descent(
        objective,
        smooth_gradient,
        prox,
        start,
        step_size=step,
        no_of_iterations=iters,
        print_output=print_output,
    )

def kfold_cv_L1(Phi, y, K, alpha, iters=2000):
    """K-fold cross-validated error of the L1 logistic model for one ``alpha``.

    Each fold is held out once while the model is trained on the other folds.
    The result is ``1 - mean(validation accuracy)`` over the K folds.
    """
    folds = KFold_split(len(Phi), K)
    fold_accuracies = []
    for held_out in range(K):
        val_idx = folds[held_out]
        tr_idx = np.concatenate([folds[j] for j in range(K) if j != held_out])

        # A huge print interval silences the periodic progress lines.
        w_hat, _ = train_logistic_L1_prox(
            Phi[tr_idx], y[tr_idx], alpha, iters=iters, print_output=10**9
        )
        pred_val = binary_prediction_labels(Phi[val_idx], w_hat)
        fold_accuracies.append(classification_accuracy(pred_val, y[val_idx]))
    # return validation error
    mean_accuracy = np.mean(fold_accuracies)
    return 1.0 - mean_accuracy

In [None]:
# cross-validation curve of validation error vs alpha (L1)
# Candidate penalty strengths; alpha = 0.0 corresponds to no L1 penalty.
alpha_grid = np.array([0.0, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2])
K = 5

# One K-fold run per alpha, so K models are trained for every grid entry.
cv_errors = np.array([kfold_cv_L1(Phi_train, y_train, K, a, iters=2000) for a in alpha_grid])

# np.argmin returns the first minimum, so ties resolve to the smallest alpha.
best_alpha = alpha_grid[np.argmin(cv_errors)]
best_error = np.min(cv_errors)

print("Best alpha (L1) =", best_alpha)
print("Best CV error   =", best_error)

plt.plot(alpha_grid, cv_errors, marker='o')
plt.xlabel("alpha (L1)")
plt.ylabel("K-fold validation error (1 - accuracy)")
plt.title("L1 logistic regression: CV error vs alpha")
plt.show()

In [None]:
# training final L1 model with the best alpha
# Refit on the whole training set (4000 iterations; the CV runs used 2000).
w_l1, obj_vals_l1 = train_logistic_L1_prox(Phi_train, y_train, best_alpha,iters=4000, print_output=500)

pred_test_l1 = binary_prediction_labels(Phi_test, w_l1)
acc_test_l1 = classification_accuracy(pred_test_l1, y_test)

print("Test accuracy (L1):", acc_test_l1)

# Convergence check: objective value at the start and after every iteration.
plt.plot(obj_vals_l1)
plt.xlabel("Iteration")
plt.ylabel("Objective")
plt.title("L1 logistic regression training objective")
plt.show()

# Weights in design-matrix column order: the bias first, then one per feature.
print("L1 weights (bias then features):")
for name, val in zip(["BIAS"] + feature_names, w_l1.reshape(-1)):
    print(f"{name:25s} {val: .4f}")

In [None]:
# confusion matrix, ROC and AUC for L1 model
# Same evaluation steps as for the baseline model, applied to the L1 weights.
cm_l1 = confusion_matrix_binary(y_test, pred_test_l1)
print("Confusion matrix [[TN, FP],[FN, TP]]:\n", cm_l1)

roc_pts_l1 = roc_curve_points(Phi_test, w_l1, y_test, delta=0.01)
auc_l1 = auc_from_roc_points(roc_pts_l1)
print("Test AUC (L1):", auc_l1)

# x = false positive rates, y_ = true positive rates (y_ avoids clobbering the labels y).
x = [p[0] for p in roc_pts_l1]
y_ = [p[1] for p in roc_pts_l1]
plt.plot(x, y_)
# Dashed diagonal for reference.
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curve (L1 model) - test set")
plt.show()

In [None]:
# comparisons
# Side-by-side test-set accuracy and AUC of the two models trained above.
print("Summary:")
print("Baseline logistic: test acc =", acc_test,  ", test AUC =", auc_log)
print("L1 logistic      : test acc =", acc_test_l1, ", test AUC =", auc_l1)