In [44]:

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")


In [45]:
DATA_FILES = ["project1_dataset1.txt", "project1_dataset2.txt"]  # update paths if needed
SAVE_RESULTS = True
OUT_DIR = "classification_outputs"
RANDOM_STATE = 42
N_SPLITS = 10

In [46]:
# Models definitions (you can tune these hyperparams)
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "DecisionTree": DecisionTreeClassifier(max_depth=None, random_state=RANDOM_STATE),
    "NaiveBayes": GaussianNB(),
    "SVM": SVC(kernel="rbf", C=1.0, probability=True, class_weight="balanced", random_state=RANDOM_STATE),
    # AdaBoost will be created per fold with DecisionTree base
    "NeuralNet": MLPClassifier(hidden_layer_sizes=(100, ), activation="relu",
                               solver="adam", alpha=1e-4, max_iter=400, random_state=RANDOM_STATE)
}
# AdaBoost base
adaboost_base = DecisionTreeClassifier(max_depth=1, random_state=RANDOM_STATE)
adaboost = AdaBoostClassifier(base_estimator=adaboost_base, n_estimators=50, random_state=RANDOM_STATE)


In [47]:
# Metric functions
def compute_metrics(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0)
    }

In [48]:
def load_dataset(path):
    if not os.path.exists(path):
        print(f"⚠️ File not found: {path}")
        return None

    try:
        # Try both tab and comma separators automatically
        df = pd.read_csv(path, sep=None, engine="python")
        print(f"Loaded: {path} with shape {df.shape}")
        return df
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return None
    

In [49]:
def prepare_dataframes(df):
    """
    df: pandas DataFrame loaded without header. Assumes last column is label.
    Returns: X DataFrame (original column names preserved as ints), y Series
    """
    df = df.copy()
    # Drop rows with all NaN
    df.dropna(how="all", inplace=True)
    # Last column is label
    y = df.iloc[:, -1].astype(int)
    X = df.iloc[:, :-1]
    # Detect columns that are numeric vs object/nominal
    # Some numeric columns may be parsed as object if they contain strings; we'll try coercion
    numeric_mask = []
    for col in X.columns:
        # attempt to coerce to numeric
        coerced = pd.to_numeric(X[col], errors="coerce")
        non_null_fraction = coerced.notna().mean()
        # if most entries can be coerced to numeric, treat as numeric
        if non_null_fraction > 0.9:
            X[col] = coerced
            numeric_mask.append(True)
        else:
            # leave as object/categorical
            X[col] = X[col].astype(str)
            numeric_mask.append(False)
    numeric_mask = np.array(numeric_mask)
    return X, y, numeric_mask

In [50]:
def make_preprocessor(X, numeric_mask):
    """
    Builds a ColumnTransformer that imputes & scales numeric columns and imputes & one-hot encodes categorical columns.
    """
    numeric_cols = [c for c, m in zip(X.columns, numeric_mask) if m]
    cat_cols = [c for c, m in zip(X.columns, numeric_mask) if not m]

    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())
    ])
    cat_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", cat_transformer, cat_cols)
    ], remainder="drop")
    return preprocessor, numeric_cols, cat_cols



In [51]:
# ---------- Main experiment function ----------
def run_experiments(path):
    print(f"\n--- Processing file: {path} ---")
    df = load_dataset(path)
    print(f"Loaded shape: {df.shape}")
    X_raw, y, numeric_mask = prepare_dataframes(df)
    print(f"Detected {numeric_mask.sum()} numeric columns and {len(numeric_mask)-numeric_mask.sum()} nominal columns.")

    preprocessor, numeric_cols, cat_cols = make_preprocessor(X_raw, numeric_mask)

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

    # Prepare structures to store per-model fold metrics
    results = {name: [] for name in list(models.keys()) + ["AdaBoost"]}
    fold_idx = 0

    # optionally store per-fold predictions
    per_fold_records = []

    for train_idx, test_idx in skf.split(X_raw, y):
        fold_idx += 1
        X_train_raw = X_raw.iloc[train_idx]
        X_test_raw = X_raw.iloc[test_idx]
        y_train = y.iloc[train_idx].values
        y_test = y.iloc[test_idx].values

        # Fit preprocessor on training data and transform both
        preprocessor.fit(X_train_raw)
        X_train = preprocessor.transform(X_train_raw)
        X_test = preprocessor.transform(X_test_raw)

        # For each model, fit and evaluate
        for name, clf in models.items():
            model = clone(clf)
            # For tree and NB, scaling is okay but not required. For KNN & SVM scaling matters; we've already scaled numerics.
            # Fit
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            met = compute_metrics(y_test, y_pred)
            results[name].append(met)
            # optionally save per-fold preds
            for i_true, i_pred, idx in zip(y_test, y_pred, test_idx):
                per_fold_records.append({
                    "file": os.path.basename(path),
                    "fold": fold_idx,
                    "index": int(idx),
                    "method": name,
                    "y_true": int(i_true),
                    "y_pred": int(i_pred)
                })

        # AdaBoost using DecisionTree base
        ada = clone(adaboost)
        ada.fit(X_train, y_train)
        y_pred = ada.predict(X_test)
        met = compute_metrics(y_test, y_pred)
        results["AdaBoost"].append(met)
        for i_true, i_pred, idx in zip(y_test, y_pred, test_idx):
            per_fold_records.append({
                "file": os.path.basename(path),
                "fold": fold_idx,
                "index": int(idx),
                "method": "AdaBoost",
                "y_true": int(i_true),
                "y_pred": int(i_pred)
            })

    # Summarize: compute mean ± std for each metric
    summary_rows = []
    for name, mets in results.items():
        # mets is list of dicts (one per fold)
        accs = [m["accuracy"] for m in mets]
        precs = [m["precision"] for m in mets]
        recs = [m["recall"] for m in mets]
        f1s = [m["f1"] for m in mets]
        summary_rows.append({
            "file": os.path.basename(path),
            "method": name,
            "accuracy_mean": np.mean(accs),
            "accuracy_std": np.std(accs),
            "precision_mean": np.mean(precs),
            "precision_std": np.std(precs),
            "recall_mean": np.mean(recs),
            "recall_std": np.std(recs),
            "f1_mean": np.mean(f1s),
            "f1_std": np.std(f1s)
        })

    summary_df = pd.DataFrame(summary_rows).sort_values(["file", "f1_mean"], ascending=[True, False])
    per_fold_df = pd.DataFrame(per_fold_records)

    if SAVE_RESULTS:
        os.makedirs(OUT_DIR, exist_ok=True)
        summary_df.to_csv(os.path.join(OUT_DIR, f"results_summary_{os.path.basename(path)}.csv"), index=False)
        per_fold_df.to_csv(os.path.join(OUT_DIR, f"per_fold_predictions_{os.path.basename(path)}.csv"), index=False)
        print(f"Saved summary and per-fold predictions to {OUT_DIR}/")

    # Print nicely
    pd.set_option("display.float_format", "{:.4f}".format)
    print("\nSummary (means ± std):")
    for _, row in summary_df.iterrows():
        print(f"{row['method']:12s} | Acc {row['accuracy_mean']:.4f} ± {row['accuracy_std']:.4f} | "
              f"Prec {row['precision_mean']:.4f} ± {row['precision_std']:.4f} | "
              f"Rec {row['recall_mean']:.4f} ± {row['recall_std']:.4f} | "
              f"F1 {row['f1_mean']:.4f} ± {row['f1_std']:.4f}")
    return summary_df, per_fold_df

# Run on each file
all_summaries = []
all_per_fold = []
for fpath in DATA_FILES:
    if not os.path.exists(fpath):
        print(f"Warning: file not found: {fpath}. Skipping. (Please place it in the working dir or update DATA_FILES.)")
        continue
    s, p = run_experiments(fpath)
    all_summaries.append(s)
    all_per_fold.append(p)

# Combine and save overall summary
if SAVE_RESULTS and all_summaries:
    combined = pd.concat(all_summaries, ignore_index=True)
    combined.to_csv(os.path.join(OUT_DIR, "combined_results_summary.csv"), index=False)
    print(f"\nCombined summary saved to {OUT_DIR}/combined_results_summary.csv")


--- Processing file: project1_dataset1.txt ---
Loaded: project1_dataset1.txt with shape (568, 31)
Loaded shape: (568, 31)
Detected 30 numeric columns and 0 nominal columns.
Saved summary and per-fold predictions to classification_outputs/

Summary (means ± std):
SVM          | Acc 0.9736 ± 0.0212 | Prec 0.9643 ± 0.0422 | Rec 0.9671 ± 0.0296 | F1 0.9651 ± 0.0272
NeuralNet    | Acc 0.9736 ± 0.0117 | Prec 0.9772 ± 0.0300 | Rec 0.9526 ± 0.0301 | F1 0.9641 ± 0.0161
KNN          | Acc 0.9684 ± 0.0172 | Prec 0.9855 ± 0.0222 | Rec 0.9290 ± 0.0381 | F1 0.9559 ± 0.0240
AdaBoost     | Acc 0.9560 ± 0.0252 | Prec 0.9569 ± 0.0328 | Rec 0.9242 ± 0.0529 | F1 0.9394 ± 0.0350
NaiveBayes   | Acc 0.9279 ± 0.0362 | Prec 0.9156 ± 0.0774 | Rec 0.8959 ± 0.0508 | F1 0.9032 ± 0.0456
DecisionTree | Acc 0.9245 ± 0.0359 | Prec 0.8988 ± 0.0592 | Rec 0.9004 ± 0.0582 | F1 0.8984 ± 0.0486

--- Processing file: project1_dataset2.txt ---
Loaded: project1_dataset2.txt with shape (461, 10)
Loaded shape: (461, 10)
Detecte