In [11]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# =========================
# LOAD DATA
# =========================
DATA_PATH = r"C:\Users\msi\Desktop\Model Training\KOA_Dataset_optionA.csv"
TARGET = "KOA"

df = pd.read_csv(DATA_PATH)
df = df.drop(columns=["Timestamp", "Column 1", "Patient Identification Number"], errors="ignore")
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]

X = df.drop(columns=[TARGET], errors="ignore")
y = df[TARGET]

# Encode target if categorical
if y.dtype == "object":
    y = y.astype("category").cat.codes
else:
    y = y.astype(int)

# =========================
# COLUMN GROUPS
# =========================
num_cols = X.select_dtypes(include=["int64", "float64", "int32", "float32"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# =========================
# PREPROCESSING (WITH IMPUTER ✅)
# =========================
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),   # ✅ FIX
    ("scaler", StandardScaler())
])

categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),  # ✅ FIX
    ("ohe", OneHotEncoder(handle_unknown="ignore", drop="first"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop"
)

# =========================
# SPLIT DATA
# =========================
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.20, stratify=y_trainval, random_state=42
)

# =========================
# LOGISTIC REGRESSION MODEL
# =========================
lr = LogisticRegression(
    C=0.01,                    
    penalty="elasticnet",     
    l1_ratio=1,          
    solver="saga",
    multi_class="multinomial",
    class_weight=None,        
    max_iter=6000,
    random_state=42
)

pipe = Pipeline([
    ("preprocess", preprocess),
    ("model", lr)
])

# =========================
# TRAIN & EVALUATE
# =========================
pipe.fit(X_train, y_train)

print("Logistic Regression Accuracy")
print("Train:", accuracy_score(y_train, pipe.predict(X_train)))
print("Val  :", accuracy_score(y_val, pipe.predict(X_val)))
print("Test :", accuracy_score(y_test, pipe.predict(X_test)))


Logistic Regression Accuracy
Train: 0.821875
Val  : 0.8625
Test : 0.85
