In [4]:
import random
import math
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold

<h1>Logistic Regression From Scratch</h1>
<h3>Start with Loading in Data</h3>

In [5]:
def load_data(file_path):
    data = []
    labels = []

    with open(file_path, 'r') as file:
        for line in file:
            # Split the line into tokens based on whitespace separation
            tokens = line.strip().split()

            # Convert "Present" or "Absent" to binary. Assuming this is always the 5th column in the data.
            binary_feature = 1 if tokens[4] == "Present" else 0
            
            # Replace the "Present"/"Absent" with its binary representation
            tokens[4] = binary_feature

            # All tokens except the last are features (convert all to floats)
            features = [float(token) for token in tokens[:-1]]

            # The last token is the label (convert to int)
            label = int(tokens[-1])

            # Add the feature vector and label to the data lists
            data.append(features)
            labels.append(label)

    return data, labels


dataset1 = './project3_dataset1.txt'
dataset2 = './project3_dataset2.txt'

data1, labels1 = load_data(dataset1)
data2, labels2 = load_data(dataset2)

print(data1[0])
print(data2[0])

[20.13, 28.25, 131.2, 1261.0, 0.0, 0.1034, 0.144, 0.09791, 0.1752, 0.05533, 0.7655, 2.463, 5.203, 99.04, 0.005769, 0.02423, 0.0395, 0.01678, 0.01898, 0.002498, 23.69, 38.25, 155.0, 1731.0, 0.1166, 0.1922, 0.3215, 0.1628, 0.2572, 0.06637]
[132.0, 6.2, 6.47, 36.21, 1.0, 62.0, 30.77, 14.14, 45.0]


<h3>Add Intercept Term and Initialize Weights</h3>

In [10]:
# Create Intercepts
data1_with_intercept = [[1] + row for row in data1]
data2_with_intercept = [[1] + row for row in data2]

# Convert your data to numpy arrays
data1_with_intercept = np.array(data1_with_intercept)
labels1 = np.array(labels1)
data2_with_intercept = np.array(data2_with_intercept)
labels2 = np.array(labels2)

# Initialize Weights
weights1 = [random.uniform(-0.01, 0.01) for _ in range(len(data1_with_intercept[0]))]
weights2 = [random.uniform(-0.01, 0.01) for _ in range(len(data2_with_intercept[0]))]

<h3>Model Training</h3>

In [11]:
def sigmoid(z):
    if z < 0:
        return math.exp(z) / (1 + math.exp(z))
    else:
        return 1 / (1 + math.exp(-z))

def compute_gradient(X, y, theta):
    m = len(y)
    gradient = [0 for _ in range(len(theta))]
    
    for i in range(m):
        xi = X[i]
        yi = y[i]
        prediction = sigmoid(sum(w * x for w, x in zip(theta, xi)))
        for j in range(len(theta)):
            gradient[j] += (prediction - yi) * xi[j]
    
    gradient = [g / m for g in gradient]
    return gradient

def gradient_descent(X, y, theta, alpha, iterations, lambda_reg):
    for _ in range(iterations):
        gradient = compute_gradient(X, y, theta)
        # Apply L2 regularization
        regularization = [lambda_reg * t for t in theta]
        regularization[0] = 0  # Do not regularize the bias term
        theta = [t - alpha * (g + r) for t, g, r in zip(theta, gradient, regularization)]
    return theta

# Predict Function
def predict(X, theta):
    return [1 if sigmoid(sum(w * x for w, x in zip(theta, xi))) >= 0.5 else 0 for xi in X]


# Hyperparameters
alpha = 0.01
iterations = 1000


In [15]:
def train_and_evaluate(X, y, folds, alphas, iterations, lambda_regs):
    results = []

    kf = KFold(n_splits=folds)
    for alpha in alphas:
        print(f"Current Alpha: {alpha}")
        for lambda_reg in lambda_regs:
            print(f"Current Lambda: {lambda_reg}")
            metrics = {
                "Alpha": alpha, 
                "Lambda": lambda_reg, 
                "Accuracy": [], 
                "Precision": [], 
                "Recall": [], 
                "F1": [], 
                "AUC": []
            }

            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                theta = [0 for _ in range(X_train.shape[1])]
                theta = gradient_descent(X_train, y_train, theta, alpha, iterations, lambda_reg)

                predictions = predict(X_test, theta)

                # Calculate and store metrics
                metrics["Accuracy"].append(accuracy_score(y_test, predictions))
                metrics["Precision"].append(precision_score(y_test, predictions, zero_division=0))
                metrics["Recall"].append(recall_score(y_test, predictions))
                metrics["F1"].append(f1_score(y_test, predictions))
                metrics["AUC"].append(roc_auc_score(y_test, predictions))

            # Compute average of each metric and store it
            averaged_metrics = {metric: np.mean(values) for metric, values in metrics.items() if metric not in ["Alpha", "Lambda"]}
            results.append({**{"Alpha": alpha, "Lambda": lambda_reg}, **averaged_metrics})

    return results


# Hyperparameters to test
alphas = [0.01, 0.1, 1]  # Learning rates
lambda_regs = [0, 0.1, 1]  # Regularization strengths

# Conduct analysis
print("Starting Results Dataset 1")
results_dataset1 = train_and_evaluate(data1_with_intercept, labels1, 10, alphas, iterations, lambda_regs)
print("Starting Results Dataset 2")
results_dataset2 = train_and_evaluate(data2_with_intercept, labels2, 10, alphas, iterations, lambda_regs)

# Print or analyze the results
for result in results_dataset1:
    print(result)

for result in results_dataset2:
    print(result)

Starting Results Dataset 1
Current Alpha: 0.01
Current Lambda: 0
Current Lambda: 0.1
Current Lambda: 1
Current Alpha: 0.1
Current Lambda: 0
Current Lambda: 0.1
Current Lambda: 1
Current Alpha: 1
Current Lambda: 0
Current Lambda: 0.1
Current Lambda: 1
Starting Results Dataset 2
Current Alpha: 0.01
Current Lambda: 0
Current Lambda: 0.1
Current Lambda: 1
Current Alpha: 0.1
Current Lambda: 0
Current Lambda: 0.1
Current Lambda: 1
Current Alpha: 1
Current Lambda: 0
Current Lambda: 0.1
Current Lambda: 1
{'Alpha': 0.01, 'Lambda': 0, 'Accuracy': 0.9139097744360903, 'Precision': 0.8874317719790865, 'Recall': 0.8664889883588837, 'F1': 0.8739105376919184, 'AUC': 0.9038377780045999}
{'Alpha': 0.01, 'Lambda': 0.1, 'Accuracy': 0.8821428571428571, 'Precision': 0.8628014074682155, 'Recall': 0.8255052040791666, 'F1': 0.8251243065586384, 'AUC': 0.8719732136593057}
{'Alpha': 0.01, 'Lambda': 1, 'Accuracy': 0.4608082706766917, 'Precision': 0.6035087719298244, 'Recall': 0.6618000279290601, 'F1': 0.4045088282