# Classification


In [21]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from collections import Counter


seed = 42
np.random.seed(seed)
random.seed(seed)

# Load the body dataset
df = pd.read_csv("data.csv")

# Map famhist to numeric
if "famhist" in df.columns:
    df["famhist"] = df["famhist"].map({"Absent": 0, "Present": 1})

# Fix skewed variables
for col in ["tobacco", "alcohol"]:
    if col in df.columns:
        df[col] = np.log1p(df[col])

# Split the data frame into features and labels
X = df.drop(columns=["chd"])
y = df["chd"]

#parameter ranges
lambda_range = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
C_range = [1.0 / lam for lam in lambda_range]  # sklearn uses C = 1/Î»
k_range = [1, 3, 5, 7, 9, 11, 13, 15]

param_grid_log = {"clf__C": C_range}
param_grid_knn = {"clf__n_neighbors": k_range}

# 2-layer CV
def two_layer_cv(X, y, outer_folds=10, inner_folds=10, random_state=42):
    CV_outer = StratifiedKFold(n_splits=outer_folds, shuffle=True, random_state=random_state)

    fold_results = []
    for i, (train_outer_idx, test_outer_idx) in enumerate(CV_outer.split(X, y)):
        # Split X and y into training/testing data for this outer fold
        # Name them X_train_outer, X_test_outer, y_train_outer, y_test_outer
        X_train_outer, X_test_outer = X.iloc[train_outer_idx], X.iloc[test_outer_idx]
        y_train_outer, y_test_outer = y.iloc[train_outer_idx], y.iloc[test_outer_idx]

        # baseline
        maj_class = Counter(y_train_outer).most_common(1)[0][0]
        y_test_pred_base = np.full_like(y_test_outer, fill_value=maj_class)
        Etest_base = np.mean(y_test_outer != y_test_pred_base)

        #logistic regression (inner loop)
        inner_CV = StratifiedKFold(n_splits=inner_folds, shuffle=True, random_state=random_state)

        val_error = []
        for C in C_range:
            inner_err = []
            for tr_in_idx, te_in_idx in inner_CV.split(X_train_outer, y_train_outer):
                X_train_inner, X_test_inner = X_train_outer.iloc[tr_in_idx], X_train_outer.iloc[te_in_idx]
                y_train_inner, y_test_inner = y_train_outer.iloc[tr_in_idx], y_train_outer.iloc[te_in_idx]

                model = make_pipeline(StandardScaler(), LogisticRegression(C=C, max_iter=1000))
                model.fit(X_train_inner, y_train_inner)
                y_pred_inner = model.predict(X_test_inner)
                inner_err.append(np.mean(y_pred_inner != y_test_inner))
            val_error.append(np.mean(inner_err))
        best_idx = np.argmin(val_error)
        C_star = C_range[best_idx]
        lambda_star = lambda_range[best_idx]

        # Retrain on full outer-train with best C, test on outer-test
        model_log = make_pipeline(StandardScaler(), LogisticRegression(C=C_star, max_iter=1000))
        model_log.fit(X_train_outer, y_train_outer)
        y_pred_outer_log = model_log.predict(X_test_outer)
        Etest_log = np.mean(y_pred_outer_log != y_test_outer)

        # knn (inner loop)
        val_error_knn = []
        for k in k_range:
            inner_err = []
            for tr_in_idx, te_in_idx in inner_CV.split(X_train_outer, y_train_outer):
                X_train_inner, X_test_inner = X_train_outer.iloc[tr_in_idx], X_train_outer.iloc[te_in_idx]
                y_train_inner, y_test_inner = y_train_outer.iloc[tr_in_idx], y_train_outer.iloc[te_in_idx]

                model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
                model.fit(X_train_inner, y_train_inner)
                y_pred_inner = model.predict(X_test_inner)
                inner_err.append(np.mean(y_pred_inner != y_test_inner))
            val_error_knn.append(np.mean(inner_err))
        best_idx_knn = np.argmin(val_error_knn)
        k_star = k_range[best_idx_knn]

        # Retrain on full outer-train with best k, test on outer-test
        model_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k_star))
        model_knn.fit(X_train_outer, y_train_outer)
        y_pred_outer_knn = model_knn.predict(X_test_outer)
        Etest_knn = np.mean(y_pred_outer_knn != y_test_outer)

        #save fold results
        fold_results.append({
            "": i + 1,
            "lambda* (log)": lambda_star,
            "Etest_log": Etest_log,
            "k* (knn)": k_star,
            "Etest_knn": Etest_knn,
            "Etest_base": Etest_base
        })

    return fold_results

#run two-level CV
table = two_layer_cv(X, y, outer_folds=10, inner_folds=10, random_state=1)
table = pd.DataFrame(table)
#display results
summary = pd.DataFrame({
    "": ["logistic", "knn", "baseline"],
    "Etest_mean": [
        table["Etest_log"].mean(),
        table["Etest_knn"].mean(),
        table["Etest_base"].mean()
    ],
    "Etest_sd": [
        table["Etest_log"].std(ddof=1),
        table["Etest_knn"].std(ddof=1),
        table["Etest_base"].std(ddof=1)
    ]
})

# Print the results
print("2-layer cross-validation results per fold")
print(table.round(4).to_string(index=False))
print("\nSummary")
print(summary.round(4).to_string(index=False))

2-layer cross-validation results per fold
    lambda* (log)  Etest_log  k* (knn)  Etest_knn  Etest_base
 1        10.0000     0.2979        15     0.2553      0.3404
 2       100.0000     0.2979        11     0.2128      0.3404
 3        10.0000     0.2391        15     0.2609      0.3478
 4        10.0000     0.3043        13     0.3261      0.3478
 5        10.0000     0.2609        15     0.2609      0.3478
 6         0.0001     0.3478        15     0.3913      0.3478
 7        10.0000     0.2826        15     0.3478      0.3478
 8        10.0000     0.2391        15     0.3043      0.3478
 9        10.0000     0.2826        11     0.3043      0.3478
10        10.0000     0.3043        15     0.2826      0.3478

Summary
          Etest_mean  Etest_sd
logistic      0.2857    0.0330
     knn      0.2946    0.0517
baseline      0.3463    0.0031


# Logistic Regression

In [2]:
def regularize_data(file):
    df = pd.read_csv(file)
    df['famhist'] = df['famhist'].map({'Present': 1, 'Absent': 0})
    df = (df-df.mean()) / df.std()
    return df

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

def logistic_regression_classification():
    threshold = 0.5

    df = regularize_data("data.csv")
    y = (df["chd"]>threshold).astype(int)
    X = df.drop(columns=["chd", "row.names"])


    lambda_val = 0.1 #pode ter que ser mudado porque depende do ponto 4


    C_val = 1 / lambda_val
    logreg_model = LogisticRegression(
        penalty="l2",
        C=100,
        solver="lbfgs",
        max_iter=1000,
        random_state=42
    )

    logreg_model.fit(X, y)

    print("Bias:", logreg_model.intercept_)
    print("Features:", X.columns.tolist())
    print("Coefficients for each feature:", logreg_model.coef_)


    y_pred = logreg_model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    # mse
    mse = mean_squared_error(y, y_pred)

    print(f"Logistic Regression Accuracy: {accuracy:.4f}")
    print(f"Logistic Regression MSE: {mse:.4f}")

logistic_regression_classification()


Bias: [-0.87863012]
Features: ['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age']
Coefficients for each feature: [[ 0.13321451  0.36438477  0.36031357  0.14263364  0.45636594  0.38864065
  -0.26353865  0.00313647  0.66182163]]
Logistic Regression Accuracy: 0.7338
Logistic Regression MSE: 0.2662
