In [4]:
import pandas as pd, numpy as np, random
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import clone
from collections import Counter

seed = 42
np.random.seed(seed); random.seed(seed)

df = pd.read_csv("data.csv")
if "famhist" in df.columns:
    df["famhist"] = df["famhist"].map({"Absent":0,"Present":1}).astype(int)

for col in ["tobacco","alcohol"]:
    if col in df.columns:
        df[col] = np.log1p(df[col])

y = df["chd"].astype(int)
X = df.drop(columns=["chd"])

def error_rate(y_true, y_pred):
    return np.mean(y_true != y_pred)

logistic = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000))
])
knn = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", KNeighborsClassifier())
])

lambda_range = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
C_range = [1.0/l for l in lambda_range]
k_range = [1, 3, 5, 7, 9, 11, 13, 15]

K=10
outer = StratifiedKFold(K, shuffle=True, random_state=seed)
outer_splits = list(outer.split(X, y))

rows = []
y_true_all, yhat_log_all, yhat_knn_all, yhat_base_all = [], [], [], []

for fold_idx, (train_index, test_index) in tqdm(
    enumerate(outer_splits, start=1),
    total=len(outer_splits),
    desc="Outer CV fold"
):
    # Outer split
    X_train, y_train = X.iloc[train_index,:], y.iloc[train_index]
    X_test,  y_test  = X.iloc[test_index,:],  y.iloc[test_index]

    # Baseline
    maj = Counter(y_train).most_common(1)[0][0]
    yhat_base = np.full_like(y_test, fill_value=maj)

    # Inner CV Log. Regression
    inner = StratifiedKFold(K, shuffle=True, random_state=123)
    best_C, best_score = None, -np.inf

    for C in C_range:
        fold_scores = []
        for tr_in, va_in in inner.split(X_train, y_train):
            Xi_tr, yi_tr = X_train.iloc[tr_in,:], y_train.iloc[tr_in]
            Xi_va, yi_va = X_train.iloc[va_in,:], y_train.iloc[va_in]

            model = clone(logistic)
            model.set_params(clf__C=C)
            model.fit(Xi_tr, yi_tr)
            y_pred_va = model.predict(Xi_va)
            fold_scores.append(accuracy_score(yi_va, y_pred_va))
        mean_acc = np.mean(fold_scores)
        if mean_acc > best_score:
            best_score = mean_acc
            best_C = C

    final_log = clone(logistic).set_params(clf__C=best_C).fit(X_train, y_train)
    yhat_log = final_log.predict(X_test)
    Etest_log = error_rate(y_test, yhat_log)
    lambda_star = 1.0 / best_C

    # Inner CV KNN
    best_k, best_score = None, -np.inf

    for k in k_range:
        fold_scores = []
        for tr_in, va_in in inner.split(X_train, y_train):
            Xi_tr, yi_tr = X_train.iloc[tr_in,:], y_train.iloc[tr_in]
            Xi_va, yi_va = X_train.iloc[va_in,:], y_train.iloc[va_in]

            model = clone(knn)
            model.set_params(clf__n_neighbors=k)
            model.fit(Xi_tr, yi_tr)
            y_pred_va = model.predict(Xi_va)
            fold_scores.append(accuracy_score(yi_va, y_pred_va))
        mean_acc = np.mean(fold_scores)
        if mean_acc > best_score:
            best_score = mean_acc
            best_k = k

    final_knn = clone(knn).set_params(clf__n_neighbors=best_k).fit(X_train, y_train)
    yhat_knn = final_knn.predict(X_test)
    Etest_knn = error_rate(y_test, yhat_knn)

    Etest_base = error_rate(y_test, yhat_base)

    rows.append({
        "fold": fold_idx,
        "lambda* (log)": lambda_star,
        "Etest_log": Etest_log,
        "k* (knn)": best_k,
        "Etest_knn": Etest_knn,
        "Etest_base": Etest_base
    })

    y_true_all.append(y_test.values)
    yhat_log_all.append(yhat_log)
    yhat_knn_all.append(yhat_knn)
    yhat_base_all.append(yhat_base)

table_cls = pd.DataFrame(rows)

summary = pd.DataFrame({
    "method": ["logistic", "knn", "baseline"],
    "Etest_mean": [
        table_cls["Etest_log"].mean(),
        table_cls["Etest_knn"].mean(),
        table_cls["Etest_base"].mean()
    ],
    "Etest_sd": [
        table_cls["Etest_log"].std(ddof=1),
        table_cls["Etest_knn"].std(ddof=1),
        table_cls["Etest_base"].std(ddof=1)
    ]
})

print("\nTable 2 (classification; two-level CV) ")
print(table_cls.round(4).to_string(index=False))

print("\nSummary")
print(summary.round(4).to_string(index=False))

y_true_all  = np.concatenate(y_true_all)
yhat_log_all  = np.concatenate(yhat_log_all)
yhat_knn_all  = np.concatenate(yhat_knn_all)
yhat_base_all = np.concatenate(yhat_base_all)


Outer CV fold: 100%|██████████| 10/10 [00:20<00:00,  2.09s/it]


Table 2 (classification; two-level CV) 
 fold  lambda* (log)  Etest_log  k* (knn)  Etest_knn  Etest_base
    1       100.0000     0.2979        15     0.3617      0.3404
    2         1.0000     0.3617        15     0.3617      0.3404
    3         0.0001     0.3913        15     0.3261      0.3478
    4       100.0000     0.3043        15     0.3261      0.3478
    5        10.0000     0.1957        11     0.3043      0.3478
    6         1.0000     0.2609        13     0.3261      0.3478
    7        10.0000     0.2174        13     0.2174      0.3478
    8        10.0000     0.2826        15     0.3043      0.3478
    9        10.0000     0.3261        13     0.3261      0.3478
   10         0.0001     0.1957        11     0.2391      0.3478

Summary
  method  Etest_mean  Etest_sd
logistic      0.2833    0.0671
     knn      0.3093    0.0472
baseline      0.3463    0.0031





In [21]:
import pandas as pd, numpy as np, random
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler   # (import mantido, não usado)
from sklearn.pipeline import Pipeline              # (import mantido, não usado)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import clone
from collections import Counter

seed = 42
np.random.seed(seed); random.seed(seed)

df = pd.read_csv("data.csv")
if "famhist" in df.columns:
    df["famhist"] = df["famhist"].map({"Absent":0,"Present":1}).astype(int)
for col in ["tobacco","alcohol"]:
    if col in df.columns:
        df[col] = np.log1p(df[col])

y = df["chd"].astype(int)
X = df.drop(columns=["chd"])

# ==== trocado: sem Pipeline/StandardScaler; modelos "puros"
logistic = LogisticRegression(penalty="l2", solver="lbfgs", max_iter=1000)
knn = KNeighborsClassifier()

lambda_range = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
C_range = [1.0/l for l in lambda_range]
k_range = [1, 3, 5, 7, 9, 11, 13, 15]

outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
outer_splits = list(outer.split(X, y))

rows = []
y_true_all, yhat_log_all, yhat_knn_all, yhat_base_all = [], [], [], []

# helper: normalização manual com stats do treino
def standardize_train_test(X_tr, X_te):
    # aceita DataFrame ou ndarray
    Xtr = X_tr.to_numpy() if hasattr(X_tr, "to_numpy") else np.asarray(X_tr)
    Xte = X_te.to_numpy() if hasattr(X_te, "to_numpy") else np.asarray(X_te)
    mu = np.mean(Xtr, axis=0)
    sd = np.std(Xtr, axis=0, ddof=0)
    sd_safe = sd.copy()
    sd_safe[sd_safe == 0] = 1.0
    return (Xtr - mu)/sd_safe, (Xte - mu)/sd_safe

for fold_idx, (train_index, test_index) in tqdm(
    enumerate(outer_splits, start=1),
    total=len(outer_splits),
    desc="Outer CV fold"
):
    # Outer split
    X_train, y_train = X.iloc[train_index,:], y.iloc[train_index]
    X_test,  y_test  = X.iloc[test_index,:],  y.iloc[test_index]

    # Baseline
    maj = Counter(y_train).most_common(1)[0][0]
    yhat_base = np.full_like(y_test, fill_value=maj)
    Etest_base = 1 - accuracy_score(y_test, yhat_base)

    # ===== Inner CV Log. Regression (seleção de C) =====
    inner = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)
    best_C, best_score = None, -np.inf

    for C in C_range:
        fold_scores = []
        for tr_in, va_in in inner.split(X_train, y_train):
            Xi_tr, yi_tr = X_train.iloc[tr_in,:], y_train.iloc[tr_in]
            Xi_va, yi_va = X_train.iloc[va_in,:], y_train.iloc[va_in]

            # normalize dentro do inner fold
            Xi_tr_s, Xi_va_s = standardize_train_test(Xi_tr, Xi_va)

            model = clone(logistic)
            model.set_params(C=C)
            model.fit(Xi_tr_s, yi_tr)
            y_pred_va = model.predict(Xi_va_s)
            fold_scores.append(accuracy_score(yi_va, y_pred_va))
        mean_acc = np.mean(fold_scores)
        if mean_acc > best_score:
            best_score = mean_acc
            best_C = C

    # Refit final LR no outer (normalização com stats do outer-train)
    Xtr_s_outer, Xte_s_outer = standardize_train_test(X_train, X_test)
    final_log = clone(logistic).set_params(C=best_C).fit(Xtr_s_outer, y_train)
    yhat_log = final_log.predict(Xte_s_outer)
    Etest_log = error_rate(y_test, yhat_log)
    lambda_star = 1.0 / best_C

    # ===== Inner CV KNN =====
    best_k, best_score = None, -np.inf

    for k in k_range:
        fold_scores = []
        for tr_in, va_in in inner.split(X_train, y_train):
            Xi_tr, yi_tr = X_train.iloc[tr_in,:], y_train.iloc[tr_in]
            Xi_va, yi_va = X_train.iloc[va_in,:], y_train.iloc[va_in]

            Xi_tr_s, Xi_va_s = standardize_train_test(Xi_tr, Xi_va)

            model = clone(knn)
            model.set_params(n_neighbors=k)
            model.fit(Xi_tr_s, yi_tr)
            y_pred_va = model.predict(Xi_va_s)
            fold_scores.append(accuracy_score(yi_va, y_pred_va))
        mean_acc = np.mean(fold_scores)
        if mean_acc > best_score:
            best_score = mean_acc
            best_k = k

    # Refit final KNN no outer (com a mesma normalização do outer)
    final_knn = clone(knn).set_params(n_neighbors=best_k).fit(Xtr_s_outer, y_train)
    yhat_knn = final_knn.predict(Xte_s_outer)
    Etest_knn = 1 - accuracy_score(y_test, yhat_knn)

#----------------------------------------------

    rows.append({
        "fold": fold_idx,
        "lambda* (log)": lambda_star,
        "Etest_log": Etest_log,
        "k* (knn)": best_k,
        "Etest_knn": Etest_knn,
        "Etest_base": Etest_base
    })

    y_true_all.append(y_test.values)
    yhat_log_all.append(yhat_log)
    yhat_knn_all.append(yhat_knn)
    yhat_base_all.append(yhat_base)


table_cls = pd.DataFrame(rows)

summary = pd.DataFrame({
    "method": ["logistic", "knn", "baseline"],
    "Etest_mean": [
        table_cls["Etest_log"].mean(),
        table_cls["Etest_knn"].mean(),
        table_cls["Etest_base"].mean()
    ],
    "Etest_sd": [
        table_cls["Etest_log"].std(ddof=1),
        table_cls["Etest_knn"].std(ddof=1),
        table_cls["Etest_base"].std(ddof=1)
    ]
})
#----------------------------------------------

print("\nTable 2 (classification; two-level CV) ")
print(table_cls.round(4).to_string(index=False))

print("\nSummary")
print(summary.round(4).to_string(index=False))

y_true_all  = np.concatenate(y_true_all)
yhat_log_all  = np.concatenate(yhat_log_all)
yhat_knn_all  = np.concatenate(yhat_knn_all)
yhat_base_all = np.concatenate(yhat_base_all)


Outer CV fold: 100%|██████████| 10/10 [00:05<00:00,  1.85it/s]


Table 2 (classification; two-level CV) 
 fold  lambda* (log)  Etest_log  k* (knn)  Etest_knn  Etest_base
    1       100.0000     0.2979        15     0.3617      0.3404
    2         1.0000     0.3617        15     0.3617      0.3404
    3         0.0001     0.3913        15     0.3261      0.3478
    4       100.0000     0.3043        15     0.3261      0.3478
    5        10.0000     0.1957        11     0.3043      0.3478
    6         1.0000     0.2609        13     0.3261      0.3478
    7        10.0000     0.2174        13     0.2174      0.3478
    8        10.0000     0.2826        15     0.3043      0.3478
    9        10.0000     0.3261        13     0.3261      0.3478
   10         0.0001     0.1957        11     0.2391      0.3478

Summary
  method  Etest_mean  Etest_sd
logistic      0.2833    0.0671
     knn      0.3093    0.0472
baseline      0.3463    0.0031





In [78]:
#COM PIPELINE

import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from collections import Counter

# ----------------------------
# Setup
# ----------------------------
seed = 42
np.random.seed(seed)
random.seed(seed)

# ----------------------------
# Load and preprocess data
# ----------------------------
df = pd.read_csv("data.csv")

# Map famhist to numeric
if "famhist" in df.columns:
    df["famhist"] = df["famhist"].map({"Absent": 0, "Present": 1}).astype(int)

# Fix skewed variables
for col in ["tobacco", "alcohol"]:
    if col in df.columns:
        df[col] = np.log1p(df[col])

# Define features and target
y = df["chd"].astype(int)
X = df.drop(columns=["chd"])

# ----------------------------
# Pipelines (scaling inside CV)
# ----------------------------
logistic = Pipeline([("scaler", StandardScaler()),("clf", LogisticRegression(max_iter=1000))])

knn = Pipeline([("scaler", StandardScaler()),("clf", KNeighborsClassifier())])

# ----------------------------
# Parameter grids
# ----------------------------
lambda_range = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
C_range = [1.0 / lam for lam in lambda_range]  # sklearn uses C = 1/λ
k_range = [1, 3, 5, 7, 9, 11, 13, 15]

param_grid_log = {"clf__C": C_range}
param_grid_knn = {"clf__n_neighbors": k_range}

# ----------------------------
# Two-level CV (outer + inner)
# ----------------------------
outer = StratifiedKFold(n_splits=10, shuffle=True)
#outer_splits = list(outer.split(X, y))

rows = []
y_true_all = []
yhat_log_all = []
yhat_knn_all = []
yhat_base_all = []

for fold_idx, (train_index, test_index) in tqdm(enumerate(outer.split(X,y)),total=outer.get_n_splits(X,y),desc="Cross-validation fold"
):

    X_train = X.iloc[train_index, :]
    y_train = y[train_index]
    X_test = X.iloc[test_index, :]
    y_test = y[test_index]

    # --- Baseline ---
    maj = Counter(y_train).most_common(1)[0][0]
    ypredict_base = np.array([maj] * len(y_test))
    Etest_base = sum(y_test != ypredict_base) / len(ypredict_base)

    # --- Logistic Regression (inner CV) ---
    inner = StratifiedKFold(n_splits=10, shuffle=True,random_state=123)

    g_log = GridSearchCV(
        estimator=logistic,
        param_grid=param_grid_log,
        cv=inner,
        scoring="accuracy",
        n_jobs=-1
    )

    g_log.fit(X_train, y_train)
    ypredict_log = g_log.predict(X_test)
    Etest_log = sum(y_test != ypredict_log) / len(ypredict_log)
    lambda_star = 1.0 / g_log.best_params_["clf__C"]

    # --- KNN (inner CV) ---
    g_knn = GridSearchCV(
        estimator=knn,
        param_grid=param_grid_knn,
        cv=inner,
        scoring="accuracy",
        n_jobs=-1
    )
    g_knn.fit(X_train, y_train)
    ypredict_knn = g_knn.predict(X_test)
    Etest_knn = sum(y_test != ypredict_knn) / len(ypredict_knn)
    k_star = g_knn.best_params_["clf__n_neighbors"]

    # --- Save results ---
    rows.append({
        "fold": fold_idx,
        "lambda* (log)": lambda_star,
        "Etest_log": Etest_log,
        "k* (knn)": k_star,
        "Etest_knn": Etest_knn,
        "Etest_base": Etest_base
    })

    # Save predictions for future tests (McNemar, etc.)
    y_true_all.append(y_test.values)
    yhat_log_all.append(yhat_log)
    yhat_knn_all.append(yhat_knn)
    yhat_base_all.append(yhat_base)

# ----------------------------
# Results summary
# ----------------------------
table = pd.DataFrame(rows)

summary = pd.DataFrame({
    "method": ["logistic", "knn", "baseline"],
    "Etest_mean": [
        table["Etest_log"].mean(),
        table["Etest_knn"].mean(),
        table["Etest_base"].mean()
    ],
    "Etest_sd": [
        table["Etest_log"].std(ddof=1),
        table["Etest_knn"].std(ddof=1),
        table["Etest_base"].std(ddof=1)
    ]
})

print("\nTable 2")
print(table.round(4).to_string(index=False))

print("\nSummary ")
print(summary.round(4).to_string(index=False))

# Prepare concatenated arrays for next section
y_true_all = np.concatenate(y_true_all)
yhat_log_all = np.concatenate(yhat_log_all)
yhat_knn_all = np.concatenate(yhat_knn_all)
yhat_base_all = np.concatenate(yhat_base_all)



Cross-validation fold: 100%|██████████| 10/10 [00:02<00:00,  4.07it/s]


Table 2
 fold  lambda* (log)  Etest_log  k* (knn)  Etest_knn  Etest_base
    0       100.0000     0.2979        15     0.3617      0.3404
    1         1.0000     0.3617        15     0.3617      0.3404
    2         0.0001     0.3913        15     0.3261      0.3478
    3       100.0000     0.3043        15     0.3261      0.3478
    4        10.0000     0.1957        11     0.3043      0.3478
    5         1.0000     0.2609        13     0.3261      0.3478
    6        10.0000     0.2174        13     0.2174      0.3478
    7        10.0000     0.2826        15     0.3043      0.3478
    8        10.0000     0.3261        13     0.3261      0.3478
    9         0.0001     0.1957        11     0.2391      0.3478

Summary 
  method  Etest_mean  Etest_sd
logistic      0.2833    0.0671
     knn      0.3093    0.0472
baseline      0.3463    0.0031





In [86]:
#SEM PIPELINE
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter


SEED = 42
SEED_2 = 123
np.random.seed(SEED)
random.seed(SEED)

#absent/present to 0/1
df = pd.read_csv("data.csv")
if "famhist" in df.columns:
    df["famhist"] = df["famhist"].map({"Absent": 0, "Present": 1}).astype(int)

#skewed attributes
for col in ["tobacco", "alcohol"]:
    if col in df.columns:
        df[col] = np.log1p(df[col])

y = df["chd"].astype(int)
X = df.drop(columns=["chd"])

#parameters to test
lambda_range = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
C_range = [1.0 / lam for lam in lambda_range]
k_range = [1, 3, 5, 7, 9, 11, 13, 15]

#for gridsearch
param_grid_log = {"C": C_range}
param_grid_knn = {"n_neighbors": k_range}

# two-level cross-validation
outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
scaler = StandardScaler()

rows = []
y_true_all = []
ypredict_log_all = []
ypredict_knn_all = []
ypredict_base_all = []

#outer
for fold_idx, (train_index, test_index) in tqdm(
    enumerate(outer.split(X, y)),
    total=outer.get_n_splits(X, y),
    desc="Cross-validation fold"
):

    X_train = X.iloc[train_index, :]
    y_train = y[train_index]
    X_test = X.iloc[test_index, :]
    y_test = y[test_index]

    #normalize data
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # baseline
    maj = Counter(y_train).most_common(1)[0][0]
    ypredict_base = np.array([maj] * len(y_test))
    Etest_base = sum(y_test != ypredict_base) / len(ypredict_base)

    # inner logistic regression
    inner = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED_2)

    grid_log = GridSearchCV(
        estimator=LogisticRegression(max_iter=1000),
        param_grid=param_grid_log,
        cv=inner,
        scoring="accuracy",
        n_jobs=-1
    )
    grid_log.fit(X_train_scaled, y_train)
    ypredict_log = grid_log.predict(X_test_scaled)
    Etest_log = sum(y_test != ypredict_log) / len(ypredict_log)
    lambda_star = 1.0 / grid_log.best_params_["C"]

    # inner knn
    grid_knn = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=param_grid_knn,
        cv=inner,
        scoring="accuracy",
        n_jobs=-1
    )
    grid_knn.fit(X_train_scaled, y_train)
    ypredict_knn = grid_knn.predict(X_test_scaled)
    Etest_knn = sum(y_test != ypredict_knn) / len(ypredict_knn)
    k_star = grid_knn.best_params_["n_neighbors"]


    rows.append({"fold": fold_idx,
        "lambda* (log)": lambda_star,
        "Etest_log": Etest_log,
        "k* (knn)": k_star,
        "Etest_knn": Etest_knn,
        "Etest_base": Etest_base
    })

    #for next step
    y_true_all.append(y_test.values)
    ypredict_log_all.append(ypredict_log)
    ypredict_knn_all.append(ypredict_knn)
    ypredict_base_all.append(ypredict_base)

#results
overview = []
for nome, coluna in [("logistic", "Etest_log"), ("knn", "Etest_knn"), ("baseline", "Etest_base")]:
    vals = table[coluna].values
    overview.append({"method": nome, "Etest_mean": float(np.mean(vals)), "Etest_sd": float(np.std(vals, ddof=1))})

summary = pd.DataFrame(overview)

print("\nTable per fold")
print(table.round(4))

print("\nSummary")
print(summary.round(4))

#for next step
y_true_all = np.concatenate(y_true_all)
ypredict_log_all = np.concatenate(ypredict_log_all)
ypredict_knn_all = np.concatenate(ypredict_knn_all)
ypredict_base_all = np.concatenate(ypredict_base_all)




Cross-validation fold: 100%|██████████| 10/10 [00:01<00:00,  5.47it/s]


Table per fold
   fold  lambda* (log)  Etest_log  k* (knn)  Etest_knn  Etest_base
0     0       100.0000     0.2979        11     0.3404      0.3404
1     1         1.0000     0.3617        15     0.3617      0.3404
2     2         0.0001     0.3913        11     0.3696      0.3478
3     3       100.0000     0.3043        15     0.3261      0.3478
4     4        10.0000     0.1957        15     0.2391      0.3478
5     5        10.0000     0.2609        13     0.3261      0.3478
6     6        10.0000     0.2174        15     0.2609      0.3478
7     7        10.0000     0.2826        11     0.2826      0.3478
8     8        10.0000     0.3261        15     0.3043      0.3478
9     9         0.0001     0.1957        13     0.2391      0.3478

Summary
     method  Etest_mean  Etest_sd
0  logistic      0.2833    0.0671
1       knn      0.3050    0.0479
2  baseline      0.3463    0.0031





# Logistic Regression

In [2]:
def regularize_data(file):
    df = pd.read_csv(file)
    df['famhist'] = df['famhist'].map({'Present': 1, 'Absent': 0})
    df = (df-df.mean()) / df.std()
    return df

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

def logistic_regression_classification():
    threshold = 0.5

    df = regularize_data("data.csv")
    y = (df["chd"]>threshold).astype(int)
    X = df.drop(columns=["chd", "row.names"])


    lambda_val = 0.1 #pode ter que ser mudado porque depende do ponto 4


    C_val = 1 / lambda_val
    logreg_model = LogisticRegression(
        penalty="l2",
        C=100,
        solver="lbfgs",
        max_iter=1000,
        random_state=42
    )

    logreg_model.fit(X, y)

    print("Bias:", logreg_model.intercept_)
    print("Features:", X.columns.tolist())
    print("Coefficients for each feature:", logreg_model.coef_)


    y_pred = logreg_model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    # mse
    mse = mean_squared_error(y, y_pred)

    print(f"Logistic Regression Accuracy: {accuracy:.4f}")
    print(f"Logistic Regression MSE: {mse:.4f}")

logistic_regression_classification()


Bias: [-0.87863012]
Features: ['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age']
Coefficients for each feature: [[ 0.13321451  0.36438477  0.36031357  0.14263364  0.45636594  0.38864065
  -0.26353865  0.00313647  0.66182163]]
Logistic Regression Accuracy: 0.7338
Logistic Regression MSE: 0.2662
