In [23]:
# Import necessary libraries
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

In [26]:
# Class for X values
class X_values:
    def __init__(self, dataset):
        self.feature_dim = dataset.shape[1]
        self.data_size = dataset.shape[0]
        self.tensor = torch.tensor(dataset, dtype=torch.float32)
        print(f"X_values created with dimensions {self.tensor.shape}")

class Y_values:
    def __init__(self, targets):
        self.feature_dim = targets.shape[0]
        self.tensor = torch.tensor(targets, dtype=torch.float32).view(-1, 1)
        print(f"Y_values created with dimensions {self.tensor.shape}")

class Dataset:
    def __init__(self, x_values, y_values):
        self.x_tensor = x_values.tensor
        self.y_tensor = y_values.tensor

        if self.x_tensor.shape[0] != self.y_tensor.shape[0]:
            raise ValueError("Mismatch between X and Y dimensions!")
        
        self.feature_dim = x_values.feature_dim
        self.data_size = x_values.data_size
        print(f"Dataset created with X: {self.x_tensor.shape}, Y: {self.y_tensor.shape}")

def split_dataset(dataset, train_rate, val_rate, test_rate):
    if abs(train_rate + val_rate + test_rate - 1.0) > 1e-6:
        raise ValueError("Split rates must sum to 1.")

    total_size = dataset.x_tensor.shape[0]
    train_size = int(total_size * train_rate)
    val_size = int(total_size * val_rate)

    indices = torch.randperm(total_size)
    train_indices = indices[:train_size]
    val_indices = indices[train_size:train_size + val_size]
    test_indices = indices[train_size + val_size:]

    x_train, y_train = dataset.x_tensor[train_indices], dataset.y_tensor[train_indices]
    x_val, y_val = dataset.x_tensor[val_indices], dataset.y_tensor[val_indices]
    x_test, y_test = dataset.x_tensor[test_indices], dataset.y_tensor[test_indices]

    train_dataset = Dataset(X_values(x_train.numpy()), Y_values(y_train.numpy()))
    val_dataset = Dataset(X_values(x_val.numpy()), Y_values(y_val.numpy()))
    test_dataset = Dataset(X_values(x_test.numpy()), Y_values(y_test.numpy()))

    return train_dataset, val_dataset, test_dataset

# Fix calculation functions
def calculate_loss(x_tensor, y_tensor, weights):
    logits = torch.matmul(x_tensor, weights)
    loss = torch.mean(torch.log(1 + torch.exp(-y_tensor * logits)))
    return loss

def calculate_gradient(x_tensor, y_tensor, weights):
    logits = torch.matmul(x_tensor, weights)
    sigmoid = 1 / (1 + torch.exp(-logits))  # Sigmoid of the logits
    gradient = (1 / x_tensor.shape[0]) * torch.matmul(x_tensor.T, sigmoid - y_tensor)
    return gradient




    
# Logistic regression implementation
def logistic_regression_train(train_dataset, val_dataset, iter_num, lr):
    # Initialize weights randomly
    weights = torch.randn((train_dataset.x_tensor.shape[1], 1), requires_grad=True)
    
    for i in range(iter_num):
        # Training step
        train_loss = calculate_loss(train_dataset.x_tensor, train_dataset.y_tensor, weights)
        train_gradient = calculate_gradient(train_dataset.x_tensor, train_dataset.y_tensor, weights)
        
        # Update weights
        with torch.no_grad():
            weights -= lr * train_gradient
        
        # Validation step (no gradient computation needed)
        with torch.no_grad():
            val_loss = calculate_loss(val_dataset.x_tensor, val_dataset.y_tensor, weights)
        
        # Print iteration details
        if ((i+1) % 10 == 0):
            print(f"Iteration {i + 1}: Train Loss: {train_loss.item():.4f}, "f"Val Loss: {val_loss.item():.4f}, "f"Weights: {[f'{w:.4f}' for w in weights.view(-1).tolist()]}")

    return weights

def logistic_regression_test(test_dataset, weights):
    data_size = test_dataset.data_size
    print(f"Test Loss: {calculate_loss(test_dataset.x_tensor, test_dataset.y_tensor, weights)}")

    
def logistic_regression_sklearn(train_dataset, val_dataset):
    # Initialize the scikit-learn logistic regression model
    model = LogisticRegression(solver='lbfgs', max_iter=1000)

    # Convert tensors to numpy arrays for scikit-learn compatibility
    x_train = train_dataset.x_tensor.numpy()
    y_train = train_dataset.y_tensor.numpy().ravel()

    x_val = val_dataset.x_tensor.numpy()
    y_val = val_dataset.y_tensor.numpy().ravel()

    # Fit the model
    model.fit(x_train, y_train)

    # Evaluate the model on the validation set using log loss
    val_preds = model.predict_proba(x_val)[:, 1]
    val_loss = log_loss(y_val, val_preds)
    print(f"Validation Log Loss: {val_loss:.4f}")
    print()
    
    return model

    
def compare_models(scratch_weights, sklearn_model, val_dataset):
    # Predictions from scratch model
    scratch_logits = torch.matmul(val_dataset.x_tensor, scratch_weights)
    scratch_probs = 1 / (1 + torch.exp(-scratch_logits))
    
    # Predictions from Scikit-learn model
    sklearn_probs = sklearn_model.predict_proba(val_dataset.x_tensor.numpy())[:, 1]  # Get probabilities for class 1

    # Compute mean squared error (MSE) between predictions
    mse = mean_squared_error(scratch_probs.detach().numpy(), sklearn_probs)
    print(f"Mean Squared Error (MSE) between predictions: {mse:.4f}")

    # Optionally, compare validation losses
    scratch_loss = calculate_loss(val_dataset.x_tensor, val_dataset.y_tensor, scratch_weights).item()
    sklearn_loss = -sklearn_model.score(val_dataset.x_tensor.numpy(), val_dataset.y_tensor.numpy())  # Negative accuracy
    print(f"Scratch Model Loss: {scratch_loss:.4f}")
    print(f"Scikit-learn Loss (negative accuracy): {sklearn_loss:.4f}")

def compare_test_results(scratch_weights, sklearn_model, test_dataset):
    # Scratch model test loss
    scratch_loss = calculate_loss(test_dataset.x_tensor, test_dataset.y_tensor, scratch_weights).item()
    print(f"Scratch Model Test Loss: {scratch_loss:.4f}")

    # Predictions from scratch model
    scratch_logits = torch.matmul(test_dataset.x_tensor, scratch_weights)
    scratch_probs = 1 / (1 + torch.exp(-scratch_logits))

    # Predictions from Scikit-learn model
    sklearn_probs = sklearn_model.predict_proba(test_dataset.x_tensor.numpy())[:, 1]  # Get probabilities for class 1

    # Compute mean squared error (MSE) between predictions
    mse = mean_squared_error(scratch_probs.detach().numpy(), sklearn_probs)
    print(f"Mean Squared Error (MSE) between test predictions: {mse:.4f}")

    # Optionally, compare accuracies
    scratch_preds = (scratch_probs > 0.5).float()
    sklearn_preds = (sklearn_probs > 0.5).astype(np.float32)
    
    scratch_accuracy = (scratch_preds == test_dataset.y_tensor).float().mean().item()
    sklearn_accuracy = (sklearn_preds == test_dataset.y_tensor.numpy()).mean()

    print(f"Scratch Model Test Accuracy: {scratch_accuracy:.4f}")
    print(f"Scikit-learn Model Test Accuracy: {sklearn_accuracy:.4f}")


    
def generate_synthetic_data(num_samples=300, num_features=10, seed=42):
    np.random.seed(seed)  # For reproducibility
    torch.manual_seed(seed)

    # Generate random feature matrix (Gaussian distribution)
    X = np.random.randn(num_samples, num_features)

    # Define true weights and bias
    true_weights = np.random.randn(num_features)
    true_bias = np.random.randn()

    # Compute logits (linear combination of features and weights)
    logits = X @ true_weights + true_bias

    # Apply sigmoid function to get probabilities
    probs = 1 / (1 + np.exp(-logits))

    # Generate binary targets based on probabilities
    y = (probs > 0.5).astype(np.float32)  # Threshold at 0.5

    return X, y


# Generate synthetic dataset
X, y = generate_synthetic_data()
x_values = X_values(X)
y_values = Y_values(y)
dataset = Dataset(x_values, y_values)

# Split dataset
train_dataset, val_dataset, test_dataset = split_dataset(dataset, train_rate=0.7, val_rate=0.2, test_rate=0.1)

# Train scratch model
scratch_weights = logistic_regression_train(train_dataset, val_dataset, iter_num=200, lr=0.01)

# Test scratch model
#logistic_regression_test(test_dataset, scratch_weights)

# Train scikit-learn model
sklearn_model = logistic_regression_sklearn(train_dataset, val_dataset)

# Test scikit-learn model

# Compare the two models
#compare_models(scratch_weights, sklearn_model, val_dataset)
compare_test_results(scratch_weights, sklearn_model, test_dataset)


        

X_values created with dimensions torch.Size([300, 10])
Y_values created with dimensions torch.Size([300, 1])
Dataset created with X: torch.Size([300, 10]), Y: torch.Size([300, 1])
X_values created with dimensions torch.Size([210, 10])
Y_values created with dimensions torch.Size([210, 1])
Dataset created with X: torch.Size([210, 10]), Y: torch.Size([210, 1])
X_values created with dimensions torch.Size([60, 10])
Y_values created with dimensions torch.Size([60, 1])
Dataset created with X: torch.Size([60, 10]), Y: torch.Size([60, 1])
X_values created with dimensions torch.Size([30, 10])
Y_values created with dimensions torch.Size([30, 1])
Dataset created with X: torch.Size([30, 10]), Y: torch.Size([30, 1])
Iteration 10: Train Loss: 0.7182, Val Loss: 0.7797, Weights: ['-0.1707', '-0.5499', '0.7918', '0.4878', '0.8143', '0.3025', '0.7296', '-1.1190', '2.2569', '0.7613']
Iteration 20: Train Loss: 0.7121, Val Loss: 0.7717, Weights: ['-0.1845', '-0.5485', '0.7783', '0.5006', '0.8041', '0.2792',