In [1]:
# Reviewer note: minimal mount logic; avoids duplicate if already mounted.

import os
from google.colab import drive

# Mount Google Drive at /content/drive unless something is already mounted there;
# force_remount=False asks drive.mount not to force a remount.
if not os.path.ismount('/content/drive'):
    drive.mount('/content/drive', force_remount=False)
else:
    print("OK: /content/drive is already mounted.")


Mounted at /content/drive


In [2]:
# Reviewer note: paths updated to Attempt_1_4_and_5_combined_version_2; outputs go to Features_ReducedPlus.
# Reviewer note: logic preserved — base Reduced features retained, plus per-sensor deltas and AUC from log_mean_rel.

import pandas as pd
import re, os
from pathlib import Path

# Columns that identify a row or carry its label, and the ambient-context columns
# that are kept as features.
ID_COLS = ["group_id","spice","target"]
CTX_COLS = ["temp_mean","rh_mean","pressure_mean"]

# Inputs: wide feature tables produced by Step-5
TRAIN5 = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features/train/train_features.csv"
TEST5  = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features/test/test_features.csv"

# Output: ReducedPlus directory, created (with parents) if it does not exist yet
OUT_RP = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_ReducedPlus"
os.makedirs(OUT_RP, exist_ok=True)

def select_reduced_base(df):
    """Return the names of the Reduced base feature columns of ``df``, in column order.

    A column is selected when it is not an ID column and either
    * it is one of the context columns in ``CTX_COLS``, or
    * its name does not end in ``_n`` and ends in ``_log_slope_per_s``,
      ``_log_std`` or ``_rel``.
    """
    reduced_suffixes = ("_log_slope_per_s", "_log_std", "_rel")
    return [
        name
        for name in df.columns
        if name not in ID_COLS
        and (
            name in CTX_COLS
            or (not name.endswith("_n") and name.endswith(reduced_suffixes))
        )
    ]

def collect_rel_by_sensor(df):
    """Group the ``S<sensor>_H<step>_log_mean_rel`` columns of ``df`` by sensor.

    Returns a dict mapping the integer sensor id to a list of
    ``(heater_step, column_name)`` tuples sorted by heater step. Sensors appear
    in the order their first matching column occurs in ``df.columns``; columns
    that do not match the naming pattern are ignored.
    """
    rel_name = re.compile(r"^S(\d+)_H(\d+)_log_mean_rel$")
    grouped = {}
    for name in df.columns:
        hit = rel_name.match(name)
        if hit is None:
            continue
        sensor_id, heater_step = (int(part) for part in hit.groups())
        grouped.setdefault(sensor_id, []).append((heater_step, name))
    # Sort on the step number only, so ties keep their original column order.
    return {
        sensor_id: sorted(pairs, key=lambda pair: pair[0])
        for sensor_id, pairs in grouped.items()
    }

def build_reduced_plus(src_csv, dst_csv):
    """Build the ReducedPlus feature table from a wide feature CSV and write it out.

    The output table holds, in this column order:

    1. the ID columns (``ID_COLS``), copied unchanged;
    2. the Reduced base features chosen by ``select_reduced_base``;
    3. for every sensor returned by ``collect_rel_by_sensor``:
       * ``S{s}_AUC_rel_mean``    - sum of the sensor's ``log_mean_rel`` values
         with the first entry in heater-step order left out. Missing values are
         skipped; the value is 0.0 when the sensor has fewer than two steps.
       * ``S{s}_H{k}_d_rel_mean`` - the k-th value minus the (k-1)-th value in
         heater-step order, for k = 1 .. number_of_steps - 1.

    Every derived value uses only columns of the same row, so no statistic is
    shared across rows or between the train and test tables.

    NOTE(review): ``k`` is the position in the sorted step sequence. It equals
    the heater step number only if the steps are numbered contiguously from 0 -
    confirm against the Step-5 column naming.

    Parameters
    ----------
    src_csv : str or path-like
        Wide feature CSV to read.
    dst_csv : str or path-like
        Destination CSV; written without the index column.

    Raises
    ------
    KeyError
        If one of the ID columns is missing from ``src_csv``.
    """
    df = pd.read_csv(src_csv)

    # Fail before doing any work, with a message that names the missing columns.
    missing_ids = [c for c in ID_COLS if c not in df.columns]
    if missing_ids:
        raise KeyError(f"{src_csv} is missing ID column(s): {missing_ids}")

    base_cols = select_reduced_base(df)
    per_sensor = collect_rel_by_sensor(df)

    # Coerce every input feature to numeric BEFORE any arithmetic, so a stray
    # non-numeric cell becomes NaN instead of raising in the middle of the build.
    rel_cols = [c for seq in per_sensor.values() for _, c in seq]
    numeric = pd.DataFrame(
        {
            c: pd.to_numeric(df[c], errors="coerce")
            for c in dict.fromkeys(base_cols + rel_cols)
        },
        index=df.index,
    )

    # Column-wise construction; the dict preserves the intended output order.
    feats = {c: numeric[c] for c in base_cols}
    for s, seq in per_sensor.items():
        cols = [c for _, c in seq]

        # "AUC": plain sum over all steps after the first one (NaN skipped).
        if len(cols) >= 2:
            auc = numeric[cols[1:]].sum(axis=1).astype(float)
        else:
            auc = pd.Series(0.0, index=df.index)
        feats[f"S{s}_AUC_rel_mean"] = auc

        # Step-to-step differences between consecutive entries of the sequence.
        for k in range(1, len(cols)):
            feats[f"S{s}_H{k}_d_rel_mean"] = (
                numeric[cols[k]] - numeric[cols[k - 1]]
            ).astype(float)

    # Building from a dict with an explicit index keeps the schema even for an
    # input with zero rows.
    new_df = pd.DataFrame(feats, index=df.index)
    out = pd.concat([df[ID_COLS], new_df], axis=1)

    out.to_csv(dst_csv, index=False)
    print(f"[OK] Wrote ReducedPlus: {dst_csv} | feature cols={out.shape[1]-len(ID_COLS)}")

# Produce the ReducedPlus table for each split: train first, then test
for split_src, split_name in ((TRAIN5, "train"), (TEST5, "test")):
    build_reduced_plus(split_src, f"{OUT_RP}/{split_name}_reduced_plus.csv")

[OK] Wrote ReducedPlus: /content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_ReducedPlus/train_reduced_plus.csv | feature cols=563
[OK] Wrote ReducedPlus: /content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_ReducedPlus/test_reduced_plus.csv | feature cols=563


In [3]:
# Reviewer note: Random Forest on Features_ReducedPlus; no cross-validation.
# Reviewer note: outputs saved under Features_ReducedPlus/outputs/RandomForest; accuracy printed as percentage.
# Reviewer note: per-cycle predictions are printed in full to console and saved to CSV.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Train a Random Forest on the ReducedPlus train table, evaluate it once on the
# ReducedPlus test table, and save the model, metrics and predictions to disk.

# Base paths
OUT_RP = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_ReducedPlus"
TRAIN_RP = f"{OUT_RP}/train_reduced_plus.csv"
TEST_RP  = f"{OUT_RP}/test_reduced_plus.csv"

# Output directories (created up front so the writes below cannot fail on a missing folder)
OUT_ROOT = Path(OUT_RP) / "outputs"
OUT_RF   = OUT_ROOT / "RandomForest"
OUT_RF.mkdir(parents=True, exist_ok=True)

ID_COLS = ["group_id","spice","target"]
TARGET_COL = "target"

# Load data
train = pd.read_csv(TRAIN_RP)
test  = pd.read_csv(TEST_RP)

# Features are every column except the ID/label columns; labels are cast to int.
X_train = train.drop(columns=ID_COLS, errors="ignore")
y_train = train[TARGET_COL].astype(int).values
X_test  = test.drop(columns=ID_COLS, errors="ignore")
y_test  = test[TARGET_COL].astype(int).values

# Identification columns carried into the per-cycle prediction table (if present).
meta_cols = [c for c in ["group_id","spice"] if c in test.columns]
meta = test[meta_cols].copy() if meta_cols else pd.DataFrame(index=test.index)

# Build and fit model
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features="sqrt",
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42
)
rf.fit(X_train, y_train)

# Persist model
joblib.dump(rf, OUT_RF / "model.joblib")

# Predict and evaluate
y_pred = rf.predict(X_test)
# Label set shared by every metric below: all classes seen in train or test.
labels = np.unique(np.concatenate([y_train, y_test], axis=0))

acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
# zero_division=0 everywhere: a class that is never predicted scores 0 without
# emitting UndefinedMetricWarning, matching precision_recall_fscore_support below.
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "random_forest_reduced_plus",
    "accuracy": acc,
    "f1": {
        # labels=labels keeps the averages over the same classes as "per_class".
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}

with open(OUT_RF / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_RF / "classification_report.txt", "w") as f:
    f.write(rep)

# Confusion matrix with rows = true class, columns = predicted class.
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_RF / "confusion_matrix.csv", index=True)

# Per-cycle predictions
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_RF / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[ReducedPlus RF] test accuracy: {acc_pct:.2f}%")
print("\n[ReducedPlus RF] classification report:\n", rep)
print("\n[ReducedPlus RF] confusion matrix:\n", cm_df)
print("\n[ReducedPlus RF] per-cycle predictions (full):")
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 0, "display.max_colwidth", None):
    print(preds.to_string(index=False))
print("\n[ReducedPlus RF] per-cycle predictions shape:", preds.shape)
print("\n[ReducedPlus RF] outputs saved to:", OUT_RF.resolve())


[ReducedPlus RF] test accuracy: 50.00%

[ReducedPlus RF] classification report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         5
           1     0.0000    0.0000    0.0000         5
           2     0.5556    1.0000    0.7143         5
           3     1.0000    1.0000    1.0000         5

    accuracy                         0.5000        20
   macro avg     0.3889    0.5000    0.4286        20
weighted avg     0.3889    0.5000    0.4286        20


[ReducedPlus RF] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       0       5       0       0
true_1       1       0       4       0
true_2       0       0       5       0
true_3       0       0       0       5

[ReducedPlus RF] per-cycle predictions (full):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       1
   Anise_cycle_2    Anise       0       1
   Anise_cycle_3    Anise       0       1
   Anise_cycle_4    Anise       0   

In [4]:
# Reviewer note: Logistic Regression on Features_ReducedPlus; no cross-validation.
# Reviewer note: outputs saved under Features_ReducedPlus/outputs/LogisticRegression; accuracy printed as percentage.
# Reviewer note: per-cycle predictions are printed in full to console and saved to CSV.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Train a standardised Logistic Regression on the ReducedPlus train table,
# evaluate it once on the ReducedPlus test table, and save the model, metrics
# and predictions to disk.

# Base paths
OUT_RP = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_ReducedPlus"
TRAIN_RP = f"{OUT_RP}/train_reduced_plus.csv"
TEST_RP  = f"{OUT_RP}/test_reduced_plus.csv"

# Output directories (created up front so the writes below cannot fail on a missing folder)
OUT_ROOT = Path(OUT_RP) / "outputs"
OUT_LR   = OUT_ROOT / "LogisticRegression"
OUT_LR.mkdir(parents=True, exist_ok=True)

ID_COLS = ["group_id","spice","target"]
TARGET_COL = "target"

# Load data
train = pd.read_csv(TRAIN_RP)
test  = pd.read_csv(TEST_RP)

# Features are every column except the ID/label columns; labels are cast to int.
X_train = train.drop(columns=ID_COLS, errors="ignore")
y_train = train[TARGET_COL].astype(int).values
X_test  = test.drop(columns=ID_COLS, errors="ignore")
y_test  = test[TARGET_COL].astype(int).values

# Identification columns carried into the per-cycle prediction table (if present).
meta_cols = [c for c in ["group_id","spice"] if c in test.columns]
meta = test[meta_cols].copy() if meta_cols else pd.DataFrame(index=test.index)

# Build and fit model. The scaler sits inside the pipeline, so it is fitted on
# the training data only. multi_class is left at its default ("auto"): passing
# it explicitly is deprecated in recent scikit-learn and changes nothing.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=8000, random_state=42))
])
pipe.fit(X_train, y_train)

# Persist model
joblib.dump(pipe, OUT_LR / "model.joblib")

# Predict and evaluate
y_pred = pipe.predict(X_test)
# Label set shared by every metric below: all classes seen in train or test.
labels = np.unique(np.concatenate([y_train, y_test], axis=0))

acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
# zero_division=0 everywhere: a class that is never predicted scores 0 without
# emitting UndefinedMetricWarning, matching precision_recall_fscore_support below.
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "logistic_regression_reduced_plus",
    "accuracy": acc,
    "f1": {
        # labels=labels keeps the averages over the same classes as "per_class".
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}

with open(OUT_LR / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_LR / "classification_report.txt", "w") as f:
    f.write(rep)

# Confusion matrix with rows = true class, columns = predicted class.
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_LR / "confusion_matrix.csv", index=True)

# Per-cycle predictions
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_LR / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[ReducedPlus LogReg] test accuracy: {acc_pct:.2f}%")
print("\n[ReducedPlus LogReg] classification report:\n", rep)
print("\n[ReducedPlus LogReg] confusion matrix:\n", cm_df)
print("\n[ReducedPlus LogReg] per-cycle predictions (full):")
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 0, "display.max_colwidth", None):
    print(preds.to_string(index=False))
print("\n[ReducedPlus LogReg] per-cycle predictions shape:", preds.shape)
print("\n[ReducedPlus LogReg] outputs saved to:", OUT_LR.resolve())


[ReducedPlus LogReg] test accuracy: 70.00%

[ReducedPlus LogReg] classification report:
               precision    recall  f1-score   support

           0     0.8333    1.0000    0.9091         5
           1     0.0000    0.0000    0.0000         5
           2     0.5000    1.0000    0.6667         5
           3     1.0000    0.8000    0.8889         5

    accuracy                         0.7000        20
   macro avg     0.5833    0.7000    0.6162        20
weighted avg     0.5833    0.7000    0.6162        20


[ReducedPlus LogReg] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       5       0       0       0
true_1       0       0       5       0
true_2       0       0       5       0
true_3       1       0       0       4

[ReducedPlus LogReg] per-cycle predictions (full):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       0
   Anise_cycle_2    Anise       0       0
   Anise_cycle_3    Anise       0       0
   Anise_cycle_4    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
# Reviewer note: SVM (RBF) on Features_ReducedPlus; no cross-validation.
# Reviewer note: outputs saved under Features_ReducedPlus/outputs/SVM; accuracy printed as percentage.
# Reviewer note: per-cycle predictions are printed in full to console and saved to CSV.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Train a standardised RBF-kernel SVM on the ReducedPlus train table, evaluate
# it once on the ReducedPlus test table, and save the model, metrics and
# predictions to disk.

# Base paths
OUT_RP = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_ReducedPlus"
TRAIN_RP = f"{OUT_RP}/train_reduced_plus.csv"
TEST_RP  = f"{OUT_RP}/test_reduced_plus.csv"

# Output directories (created up front so the writes below cannot fail on a missing folder)
OUT_ROOT = Path(OUT_RP) / "outputs"
OUT_SVM  = OUT_ROOT / "SVM"
OUT_SVM.mkdir(parents=True, exist_ok=True)

ID_COLS = ["group_id","spice","target"]
TARGET_COL = "target"

# Load data
train = pd.read_csv(TRAIN_RP)
test  = pd.read_csv(TEST_RP)

# Features are every column except the ID/label columns; labels are cast to int.
X_train = train.drop(columns=ID_COLS, errors="ignore")
y_train = train[TARGET_COL].astype(int).values
X_test  = test.drop(columns=ID_COLS, errors="ignore")
y_test  = test[TARGET_COL].astype(int).values

# Identification columns carried into the per-cycle prediction table (if present).
meta_cols = [c for c in ["group_id","spice"] if c in test.columns]
meta = test[meta_cols].copy() if meta_cols else pd.DataFrame(index=test.index)

# Build and fit model (no CV). The scaler sits inside the pipeline, so it is
# fitted on the training data only.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", C=1.0, gamma="scale", probability=False, random_state=42))
])
pipe.fit(X_train, y_train)

# Persist model
joblib.dump(pipe, OUT_SVM / "model.joblib")

# Predict and evaluate
y_pred = pipe.predict(X_test)
# Label set shared by every metric below: all classes seen in train or test.
labels = np.unique(np.concatenate([y_train, y_test], axis=0))

acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
# zero_division=0 everywhere: a class that is never predicted scores 0 without
# emitting UndefinedMetricWarning, matching precision_recall_fscore_support below.
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "svm_rbf_reduced_plus",
    "accuracy": acc,
    "f1": {
        # labels=labels keeps the averages over the same classes as "per_class".
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}

with open(OUT_SVM / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_SVM / "classification_report.txt", "w") as f:
    f.write(rep)

# Confusion matrix with rows = true class, columns = predicted class.
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_SVM / "confusion_matrix.csv", index=True)

# Per-cycle predictions
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_SVM / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[ReducedPlus SVM] test accuracy: {acc_pct:.2f}%")
print("\n[ReducedPlus SVM] classification report:\n", rep)
print("\n[ReducedPlus SVM] confusion matrix:\n", cm_df)
print("\n[ReducedPlus SVM] per-cycle predictions (full):")
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 0, "display.max_colwidth", None):
    print(preds.to_string(index=False))
print("\n[ReducedPlus SVM] per-cycle predictions shape:", preds.shape)
print("\n[ReducedPlus SVM] outputs saved to:", OUT_SVM.resolve())


[ReducedPlus SVM] test accuracy: 50.00%

[ReducedPlus SVM] classification report:
               precision    recall  f1-score   support

           0     0.4545    1.0000    0.6250         5
           1     0.0000    0.0000    0.0000         5
           2     0.4286    0.6000    0.5000         5
           3     1.0000    0.4000    0.5714         5

    accuracy                         0.5000        20
   macro avg     0.4708    0.5000    0.4241        20
weighted avg     0.4708    0.5000    0.4241        20


[ReducedPlus SVM] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       5       0       0       0
true_1       3       0       2       0
true_2       2       0       3       0
true_3       1       0       2       2

[ReducedPlus SVM] per-cycle predictions (full):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       0
   Anise_cycle_2    Anise       0       0
   Anise_cycle_3    Anise       0       0
   Anise_cycle_4    Anise       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Reviewer note: XGBoost on Features_ReducedPlus; no cross-validation.
# Reviewer note: outputs saved under Features_ReducedPlus/outputs/XGBoost; accuracy printed as percentage.
# Reviewer note: per-cycle predictions are printed in full to console and saved to CSV.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Train an XGBoost classifier on the ReducedPlus train table, evaluate it once
# on the ReducedPlus test table, and save the model, metrics and predictions
# to disk.

# Base paths
OUT_RP = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_ReducedPlus"
TRAIN_RP = f"{OUT_RP}/train_reduced_plus.csv"
TEST_RP  = f"{OUT_RP}/test_reduced_plus.csv"

# Output directories (created up front so the writes below cannot fail on a missing folder)
OUT_ROOT = Path(OUT_RP) / "outputs"
OUT_XGB  = OUT_ROOT / "XGBoost"
OUT_XGB.mkdir(parents=True, exist_ok=True)

ID_COLS = ["group_id","spice","target"]
TARGET_COL = "target"

# Load data
train = pd.read_csv(TRAIN_RP)
test  = pd.read_csv(TEST_RP)

# Features are every column except the ID/label columns; labels are cast to int.
X_train = train.drop(columns=ID_COLS, errors="ignore")
y_train = train[TARGET_COL].astype(int).values
X_test  = test.drop(columns=ID_COLS, errors="ignore")
y_test  = test[TARGET_COL].astype(int).values

# Identification columns carried into the per-cycle prediction table (if present).
meta_cols = [c for c in ["group_id","spice"] if c in test.columns]
meta = test[meta_cols].copy() if meta_cols else pd.DataFrame(index=test.index)

# Build and fit model (no CV)
# num_class is taken from the distinct training labels.
# NOTE(review): presumably the targets are encoded 0..num_classes-1 - confirm.
num_classes = int(np.unique(y_train).shape[0])
xgb = XGBClassifier(
    objective="multi:softmax",
    num_class=num_classes,
    eval_metric="mlogloss",
    n_estimators=400,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
xgb.fit(X_train, y_train)

# Persist model
joblib.dump(xgb, OUT_XGB / "model.joblib")

# Predict and evaluate
y_pred = xgb.predict(X_test)
# Label set shared by every metric below: all classes seen in train or test.
labels = np.unique(np.concatenate([y_train, y_test], axis=0))

acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
# zero_division=0 everywhere: a class that is never predicted scores 0 without
# emitting UndefinedMetricWarning, matching precision_recall_fscore_support below.
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "xgboost_reduced_plus",
    "accuracy": acc,
    "f1": {
        # labels=labels keeps the averages over the same classes as "per_class".
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}

with open(OUT_XGB / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_XGB / "classification_report.txt", "w") as f:
    f.write(rep)

# Confusion matrix with rows = true class, columns = predicted class.
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_XGB / "confusion_matrix.csv", index=True)

# Per-cycle predictions
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_XGB / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[ReducedPlus XGBoost] test accuracy: {acc_pct:.2f}%")
print("\n[ReducedPlus XGBoost] classification report:\n", rep)
print("\n[ReducedPlus XGBoost] confusion matrix:\n", cm_df)
print("\n[ReducedPlus XGBoost] per-cycle predictions (full):")
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 0, "display.max_colwidth", None):
    print(preds.to_string(index=False))
print("\n[ReducedPlus XGBoost] per-cycle predictions shape:", preds.shape)
print("\n[ReducedPlus XGBoost] outputs saved to:", OUT_XGB.resolve())


[ReducedPlus XGBoost] test accuracy: 45.00%

[ReducedPlus XGBoost] classification report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         5
           1     0.0000    0.0000    0.0000         5
           2     0.6667    0.8000    0.7273         5
           3     0.5556    1.0000    0.7143         5

    accuracy                         0.4500        20
   macro avg     0.3056    0.4500    0.3604        20
weighted avg     0.3056    0.4500    0.3604        20


[ReducedPlus XGBoost] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       0       5       0       0
true_1       0       0       2       3
true_2       0       0       4       1
true_3       0       0       0       5

[ReducedPlus XGBoost] per-cycle predictions (full):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       1
   Anise_cycle_2    Anise       0       1
   Anise_cycle_3    Anise       0       1
   Anise_cycle_4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# Reviewer note: KNN on Features_ReducedPlus; no cross-validation.
# Reviewer note: outputs saved under Features_ReducedPlus/outputs/KNN; accuracy printed as percentage.
# Reviewer note: per-cycle predictions are printed in full to console and saved to CSV.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Train a standardised k-nearest-neighbours classifier on the ReducedPlus train
# table, evaluate it once on the ReducedPlus test table, and save the model,
# metrics and predictions to disk.

# Base paths
OUT_RP = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_ReducedPlus"
TRAIN_RP = f"{OUT_RP}/train_reduced_plus.csv"
TEST_RP  = f"{OUT_RP}/test_reduced_plus.csv"

# Output directories (created up front so the writes below cannot fail on a missing folder)
OUT_ROOT = Path(OUT_RP) / "outputs"
OUT_KNN  = OUT_ROOT / "KNN"
OUT_KNN.mkdir(parents=True, exist_ok=True)

ID_COLS = ["group_id","spice","target"]
TARGET_COL = "target"

# Load data
train = pd.read_csv(TRAIN_RP)
test  = pd.read_csv(TEST_RP)

# Features are every column except the ID/label columns; labels are cast to int.
X_train = train.drop(columns=ID_COLS, errors="ignore")
y_train = train[TARGET_COL].astype(int).values
X_test  = test.drop(columns=ID_COLS, errors="ignore")
y_test  = test[TARGET_COL].astype(int).values

# Identification columns carried into the per-cycle prediction table (if present).
meta_cols = [c for c in ["group_id","spice"] if c in test.columns]
meta = test[meta_cols].copy() if meta_cols else pd.DataFrame(index=test.index)

# Build and fit model (no CV). The scaler sits inside the pipeline, so it is
# fitted on the training data only.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", KNeighborsClassifier(n_neighbors=7, weights="distance", metric="minkowski", p=2))
])
pipe.fit(X_train, y_train)

# Persist model
joblib.dump(pipe, OUT_KNN / "model.joblib")

# Predict and evaluate
y_pred = pipe.predict(X_test)
# Label set shared by every metric below: all classes seen in train or test.
labels = np.unique(np.concatenate([y_train, y_test], axis=0))

acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
# zero_division=0 everywhere: a class that is never predicted scores 0 without
# emitting UndefinedMetricWarning, matching precision_recall_fscore_support below.
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "knn_reduced_plus",
    "accuracy": acc,
    "f1": {
        # labels=labels keeps the averages over the same classes as "per_class".
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}

with open(OUT_KNN / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_KNN / "classification_report.txt", "w") as f:
    f.write(rep)

# Confusion matrix with rows = true class, columns = predicted class.
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_KNN / "confusion_matrix.csv", index=True)

# Per-cycle predictions
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_KNN / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[ReducedPlus KNN] test accuracy: {acc_pct:.2f}%")
print("\n[ReducedPlus KNN] classification report:\n", rep)
print("\n[ReducedPlus KNN] confusion matrix:\n", cm_df)
print("\n[ReducedPlus KNN] per-cycle predictions (full):")
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 0, "display.max_colwidth", None):
    print(preds.to_string(index=False))
print("\n[ReducedPlus KNN] per-cycle predictions shape:", preds.shape)
print("\n[ReducedPlus KNN] outputs saved to:", OUT_KNN.resolve())


[ReducedPlus KNN] test accuracy: 65.00%

[ReducedPlus KNN] classification report:
               precision    recall  f1-score   support

           0     0.5556    1.0000    0.7143         5
           1     0.0000    0.0000    0.0000         5
           2     0.5714    0.8000    0.6667         5
           3     1.0000    0.8000    0.8889         5

    accuracy                         0.6500        20
   macro avg     0.5317    0.6500    0.5675        20
weighted avg     0.5317    0.6500    0.5675        20


[ReducedPlus KNN] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       5       0       0       0
true_1       2       0       3       0
true_2       1       0       4       0
true_3       1       0       0       4

[ReducedPlus KNN] per-cycle predictions (full):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       0
   Anise_cycle_2    Anise       0       0
   Anise_cycle_3    Anise       0       0
   Anise_cycle_4    Anise       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
