# Cyclic Coordinate Descent for Logistic Regression with Lasso regularization

This notebook presents an implementation of the Cyclic Coordinate Descent (CCD) algorithm for parameter 
estimation in regularized logistic regression with an $\ell_1$ (lasso) penalty and compares it with a standard 
logistic regression model without regularization. 

## Imports & Consts

TODO: Add information about reproducibility

In [1]:
import os
from typing import Callable, List, Optional

import numpy as np
import pandas as pd
from scipy.io import arff

In [2]:
# Folder that is scanned for the input ARFF dataset files.
CONST_DATASET_DIRECTORY_PATH = "./datasets"

# Folder intended for generated results.
CONST_RESULTS_DIRECTORY_PATH = "./results"
# Backward-compatible alias: the constant was originally declared with a
# casing typo. Prefer CONST_RESULTS_DIRECTORY_PATH in new code.
CONST_RESuLTS_DIRECTORY_PATH = CONST_RESULTS_DIRECTORY_PATH

## Data preprocessing

TODO: Find 4 datasets

In [10]:
def load_datasets(directory: Optional[str] = None) -> List[dict[str, object]]:
    """Load every ARFF dataset found in a directory.

    Args:
        directory: Folder to scan for ``*.arff`` files. When omitted, the
            folder named by ``CONST_DATASET_DIRECTORY_PATH`` is used.

    Returns:
        One dictionary per ARFF file, ordered alphabetically by file name.
        Each dictionary has two keys: ``"name"`` (the file name without its
        ``.arff`` extension) and ``"data"`` (the records as a pandas
        DataFrame).
    """
    if directory is None:
        directory = CONST_DATASET_DIRECTORY_PATH

    datasets = []
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs process the datasets in the same sequence.
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".arff"):
            continue
        records, _meta = arff.loadarff(os.path.join(directory, file))
        # str.strip(".arff") would remove any of the characters '.', 'a', 'r',
        # 'f' from both ends (e.g. "far.arff" -> ""), so split off the
        # extension explicitly instead.
        name = os.path.splitext(file)[0]
        datasets.append({"name": name, "data": pd.DataFrame(records)})
    return datasets


def load_dataset(path: str) -> pd.DataFrame:
    """Read a single ARFF file and return its records as a pandas DataFrame.

    Args:
        path: Location of the ARFF file to read.

    Returns:
        A DataFrame built from the data records of the file; the ARFF
        metadata returned by the parser is discarded.
    """
    records, _metadata = arff.loadarff(path)
    return pd.DataFrame(records)

In [4]:
class Dataset:
    """A named binary-classification dataset split into features and target.

    The last column of the input frame is treated as the target and every
    preceding column as a feature.

    Attributes:
        name: Human-readable name of the dataset.
        preprocessing_steps: The callables applied, in order, to the features.
        X: Preprocessed features as a numpy array.
        y: Target encoded as a numpy integer array of 0s and 1s.
        class_names: The two original target labels; ``class_names[0]`` is
            encoded as 0 and ``class_names[1]`` as 1.
    """

    def __init__(
        self,
        name: str,
        df: pd.DataFrame,
        preprocessing_steps: Optional[
            List[Callable[[pd.DataFrame], pd.DataFrame]]
        ] = None,
    ):
        """Split ``df`` into features and target and preprocess the features.

        Args:
            name: Name of the dataset.
            df: Raw data; the last column is the target.
            preprocessing_steps: Callables applied one after another to the
                feature frame, each receiving and returning a DataFrame.
                ``None`` (the default) means no preprocessing.

        Raises:
            ValueError: If the target column does not contain exactly two
                distinct values.
        """
        self.name = name
        # Treat a missing step list as "no preprocessing" instead of failing
        # when iterating over None.
        self.preprocessing_steps = (
            [] if preprocessing_steps is None else preprocessing_steps
        )

        features = df[df.columns[:-1]]
        target = df[df.columns[-1]]

        for step in self.preprocessing_steps:
            features = step(features)

        # Downstream model code works on numpy arrays
        self.X = features.to_numpy()

        # Labels in order of first appearance
        self.class_names = target.unique()
        if len(self.class_names) != 2:
            raise ValueError(
                f"Dataset '{name}' must have exactly 2 target classes, "
                f"found {len(self.class_names)}"
            )

        # Encode with a single comparison against the original labels.
        # Relabelling in place in two passes corrupts the target when the
        # labels are already 0/1 (e.g. [1, 0] would end up as all ones).
        self.y = (target == self.class_names[1]).to_numpy().astype(int)

    @staticmethod
    def fill_missing_values(df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` with missing values replaced by their column mean."""
        return df.fillna(df.mean())

    @staticmethod
    def remove_colinear_features(
        df: pd.DataFrame, threshold: float = 0.8
    ) -> pd.DataFrame:
        """Drop features that are strongly correlated with an earlier feature.

        Args:
            df: Feature frame.
            threshold: A column is dropped when its absolute correlation with
                any column to its left exceeds this value.

        Returns:
            A copy of ``df`` without the dropped columns.
        """
        corr_matrix = df.corr().abs()

        # Keep only the strict upper triangle so each pair is inspected once
        # and a column is never compared with itself.
        upper_tri = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )

        to_drop = [
            column
            for column in upper_tri.columns
            if (upper_tri[column] > threshold).any()
        ]

        return df.drop(columns=to_drop)

    @staticmethod
    def normalize(df: pd.DataFrame) -> pd.DataFrame:
        """Rescale every feature to the range [0, 1] (min-max normalization).

        Constant columns, whose range is zero, are mapped to 0 instead of NaN.
        """
        minimum = df.min()
        # A zero range would cause a 0/0 division; dividing by 1 keeps the
        # (already zero) numerator unchanged.
        span = (df.max() - minimum).replace(0, 1)

        return (df - minimum) / span


class APBreastKidney(Dataset):
    """The APBreastKidney dataset wrapped as a ``Dataset``.

    source: https://www.openml.org/search?type=data&sort=runs&id=1158&status=active
    """

    # Location of the ARFF file inside the datasets folder
    dataset_path = f"{CONST_DATASET_DIRECTORY_PATH}/AP_Breast_Kidney.arff"

    def __init__(self):
        """Load the ARFF file at ``dataset_path`` and preprocess its features.

        The features go through missing-value filling, removal of colinear
        features and normalization, in that order.
        """
        pipeline = [
            Dataset.fill_missing_values,
            Dataset.remove_colinear_features,
            Dataset.normalize,
        ]
        frame = load_dataset(APBreastKidney.dataset_path)

        super().__init__("APBreastKidney", frame, pipeline)

In [None]:
%%time
# Note: removing colinear features from a dataset with a few thousand
# features is relatively time-consuming.

datasets = [APBreastKidney()]

In [14]:
%%time

# Alternative: load every ARFF dataset found in the datasets folder.
# Afterwards `datasets` holds a list of Dataset objects.

# Steps applied, in this order, to the features of each dataset
preprocessing_steps = [
    Dataset.fill_missing_values,
    Dataset.remove_colinear_features,
    Dataset.normalize,
]

datasets = load_datasets()

# Replace each raw {"name", "data"} entry with its preprocessed Dataset
for i, raw in enumerate(datasets):
    datasets[i] = Dataset(raw["name"], raw["data"], preprocessing_steps)

## LogRegCCD

Implementation of regularized Logistic Regression with Cyclic Coordinate Descent, based on the [publication](https://www.jstatsoft.org/article/view/v033i01).

TODO: Add high-level overview of the algorithm

In [None]:
# TODO: Implement it.


class LogRegCCD:
    """Logistic Regression with Cyclic Coordinate Descent and Lasso Regularization.

    NOTE: every method below is still a placeholder: it accepts the documented
    arguments, does nothing and returns ``None``.
    """

    def __init__(self) -> None:
        """Initialize the LogRegCCD model."""
        pass

    def fit(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
        """Fit the Logistic Regression model on the provided training features and labels."""
        pass

    def validate(self, X_valid: np.ndarray, y_valid: np.ndarray, measure: str) -> float:
        """Compute the provided measure based on the validation features and labels."""
        pass

    def predict_proba(self, X_test: np.ndarray) -> np.ndarray:
        """Predict the probabilities of the classes for the test features."""
        pass

    def plot(self, measure: str) -> None:
        """Plot the evaluation measure over different values of lambda."""
        pass

    def plot_coefficients(self) -> None:
        """Plot the coefficients of the model over different values of lambda."""
        pass

## Performance & Comparison

In [None]:
# TODO: Performance and Comparison