In [None]:
# Reviewer note: mounts Google Drive in Colab; the mount is skipped when /content/drive is already mounted.

import os
from google.colab import drive

# Mount Google Drive only when nothing is mounted at the target path yet;
# otherwise just report that the existing mount is being reused.
if not os.path.ismount('/content/drive'):
    drive.mount('/content/drive', force_remount=False)
else:
    print("OK: /content/drive is already mounted.")


Mounted at /content/drive


In [None]:
# Reviewer note: Build ReducedPlus2 (RP2) features from the wide feature tables; paths point at the current project folder (BASE).
# Reviewer note: context columns (CTX_COLS) are carried over; adds per-sensor early/mid/tail means, peaks, ratios, coarse slopes, and sensor-pair early ratios.

import pandas as pd
import numpy as np
import re, os
from pathlib import Path

# Project folder on the mounted Drive, and the folder that receives the RP2 tables.
BASE = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2"
OUT_RP2 = f"{BASE}/Features_ReducedPlus2"
# Create the output folder (and any missing parents); a no-op when it exists.
os.makedirs(OUT_RP2, exist_ok=True)

# Identifier/label columns copied through unchanged by build_rp2.
ID_COLS = [
    "group_id",
    "spice",
    "target",
]
# Context columns carried into the RP2 table when present in the source file.
CTX_COLS = [
    "temp_mean",
    "rh_mean",
    "pressure_mean",
]

def collect_rel_cols(df):
    """Group the ``S<sensor>_H<step>_log_mean_rel`` columns of ``df`` by sensor.

    Returns a dict mapping each sensor index (int) to a list of
    ``(step, column_name)`` tuples in ascending step order. Columns whose
    names do not match the pattern are ignored. Sensors appear in the order
    in which their first matching column occurs in ``df``.
    """
    name_re = re.compile(r"^S(\d+)_H(\d+)_log_mean_rel$")
    grouped = {}
    for col in df.columns:
        hit = name_re.match(col)
        if hit is None:
            continue
        sensor, step = (int(part) for part in hit.groups())
        if sensor not in grouped:
            grouped[sensor] = []
        grouped[sensor].append((step, col))
    # Sort on the step only (stable), so equal steps keep their column order.
    return {
        sensor: sorted(entries, key=lambda entry: entry[0])
        for sensor, entries in grouped.items()
    }

def build_rp2(src_csv, dst_csv):
    """Build the ReducedPlus2 (RP2) feature table for one split and write it to CSV.

    Reads the wide table at ``src_csv`` and, for every row, emits:

    * the columns listed in ``CTX_COLS`` that exist in the file;
    * per sensor (as grouped by ``collect_rel_cols``): means over steps
      1-3 / 4-6 / 7-9 (early / mid / tail), the early maximum, the 1-based
      step of the maximum over steps 1-9, mid/early and tail/early ratios,
      and a two-point slope for each window;
    * early-mean ratios for the sensor pairs (0,1), (2,3), (4,5), (6,7)
      when both sensors of a pair are present.

    A step that has no column counts as 0.0. The ``ID_COLS`` columns are
    placed in front of the features, every feature column is coerced to
    numeric (unparseable values become NaN), and the table is written to
    ``dst_csv`` without the index.

    Args:
        src_csv: path of the wide feature CSV to read.
        dst_csv: path of the RP2 CSV to write.

    Raises:
        KeyError: if ``src_csv`` lacks any of the ``ID_COLS`` columns.
    """
    df = pd.read_csv(src_csv)

    # Fail before the per-row work rather than after it.
    missing_ids = [c for c in ID_COLS if c not in df.columns]
    if missing_ids:
        raise KeyError(f"{src_csv}: missing required ID columns {missing_ids}")

    per_sensor = collect_rel_cols(df)

    # Loop-invariant pieces, computed once instead of once per row.
    ctx_cols = [c for c in CTX_COLS if c in df.columns]
    pairs = [
        (a, b)
        for a, b in ((0, 1), (2, 3), (4, 5), (6, 7))
        if a in per_sensor and b in per_sensor
    ]
    eps = 1e-6  # keeps a ratio denominator non-zero when the early mean is exactly 0

    rows = []
    for _, r in df.iterrows():
        # keep context vars
        feats = {c: r[c] for c in ctx_cols}
        early_means = {}  # sensor -> early mean, reused by the pair ratios

        # per-sensor shape summaries
        for s, seq in per_sensor.items():
            vmap = {h: float(r[c]) for h, c in seq}
            # Steps 1..9 only (step 0 is excluded as the baseline); missing steps -> 0.0
            vals_1_9 = [vmap.get(k, 0.0) for k in range(1, 10)]
            early_vals = vals_1_9[0:3]
            mid_vals = vals_1_9[3:6]
            tail_vals = vals_1_9[6:9]

            # Kept as numpy floats so a zero denominator below yields inf/nan
            # (with a RuntimeWarning) instead of raising ZeroDivisionError.
            early = np.mean(early_vals)
            mid = np.mean(mid_vals)
            tail = np.mean(tail_vals)
            early_means[s] = early

            feats[f"S{s}_early_mean_rel"] = float(early)
            feats[f"S{s}_mid_mean_rel"]   = float(mid)
            feats[f"S{s}_tail_mean_rel"]  = float(tail)

            # early max and 1-based peak step across 1..9
            feats[f"S{s}_early_max_rel"] = float(np.max(early_vals))
            feats[f"S{s}_peak_step"]     = float(int(np.argmax(vals_1_9)) + 1)

            # ratios and coarse (two-point) slopes
            feats[f"S{s}_decay_ratio"] = float(mid / (early + eps))
            feats[f"S{s}_tail_ratio"]  = float(tail / (early + eps))
            feats[f"S{s}_early_slope"] = float((early_vals[2] - early_vals[0]) / 2.0)
            feats[f"S{s}_mid_slope"]   = float((mid_vals[2] - mid_vals[0]) / 2.0)
            feats[f"S{s}_tail_slope"]  = float((tail_vals[2] - tail_vals[0]) / 2.0)

        # sensor-pair early ratios, reusing the means computed above
        for a, b in pairs:
            feats[f"S{a}S{b}_early_ratio"] = float(early_means[a] / (early_means[b] + eps))

        rows.append(feats)

    out = pd.DataFrame(rows)
    out = pd.concat([df[ID_COLS], out], axis=1)

    # enforce numeric on features
    for c in out.columns:
        if c in ID_COLS:
            continue
        out[c] = pd.to_numeric(out[c], errors="coerce")

    out.to_csv(dst_csv, index=False)
    print(f"[OK] Wrote RP2: {dst_csv} | feature cols = {out.shape[1]-len(ID_COLS)}")

# Build ReducedPlus2 for train and test (current context paths):
# <BASE>/Features/<split>/<split>_features.csv -> <OUT_RP2>/<split>_reduced_plus2.csv
for split in ("train", "test"):
    build_rp2(
        f"{BASE}/Features/{split}/{split}_features.csv",
        f"{OUT_RP2}/{split}_reduced_plus2.csv",
    )


[OK] Wrote RP2: /content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_ReducedPlus2/train_reduced_plus2.csv | feature cols = 87
[OK] Wrote RP2: /content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_ReducedPlus2/test_reduced_plus2.csv | feature cols = 87


In [None]:
# Reviewer note: Random Forest on Features_ReducedPlus2; no cross-validation.
# Reviewer note: outputs saved under Features_ReducedPlus2/outputs/RandomForest; accuracy printed as percentage.
# Reviewer note: per-cycle predictions are printed in full to console and saved to CSV.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Column roles: identifier/label columns are never fed to the model.
TARGET_COL = "target"
ID_COLS    = ["group_id", "spice", "target"]

# Base paths (updated context)
BASE      = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2"
OUT_RP2   = f"{BASE}/Features_ReducedPlus2"
TRAIN_RP2 = f"{OUT_RP2}/train_reduced_plus2.csv"
TEST_RP2  = f"{OUT_RP2}/test_reduced_plus2.csv"

# All Random Forest artifacts go under <OUT_RP2>/outputs/RandomForest
OUT_ROOT = Path(OUT_RP2, "outputs")
OUT_RF   = OUT_ROOT.joinpath("RandomForest")
OUT_RF.mkdir(parents=True, exist_ok=True)

# Read both splits, then separate model inputs from integer labels.
train, test = pd.read_csv(TRAIN_RP2), pd.read_csv(TEST_RP2)
X_train, X_test = (frame.drop(columns=ID_COLS, errors="ignore") for frame in (train, test))
y_train, y_test = (frame[TARGET_COL].astype(int).values for frame in (train, test))

# Identifier columns kept aside so predictions can be reported per cycle.
meta_cols = [name for name in ("group_id", "spice") if name in test.columns]
if meta_cols:
    meta = test[meta_cols].copy()
else:
    meta = pd.DataFrame(index=test.index)

# Build and fit the forest (fit returns the estimator itself), then persist it.
rf = RandomForestClassifier(
    n_estimators=1200,
    max_depth=None,
    min_samples_leaf=2,
    max_features="sqrt",
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)
joblib.dump(rf, OUT_RF / "model.joblib")

# Predict and evaluate
y_pred = rf.predict(X_test)
# Score over every class seen in either split so the report, the per-class
# table, the F1 averages and the confusion matrix all share one label set.
labels = np.unique(np.concatenate([y_train, y_test], axis=0))

acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
# zero_division=0 on every metric call: a class that is never predicted scores 0
# without emitting UndefinedMetricWarning, consistently across all outputs.
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "random_forest_reduced_plus2",
    "accuracy": acc,
    "f1": {
        # labels=labels keeps these averages in agreement with the
        # "macro avg" / "weighted avg" rows of the classification report.
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}

with open(OUT_RF / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_RF / "classification_report.txt", "w") as f:
    f.write(rep)

# NOTE(review): the output recorded for this cell shows every true-0 cycle
# predicted as class 1 while classes 2 and 3 are fully recalled. This may
# indicate a target-encoding mismatch between the train and test CSVs —
# verify the upstream label mapping.
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_RF / "confusion_matrix.csv", index=True)

# Per-cycle predictions: identifiers (when available) next to true/predicted labels
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_RF / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[ReducedPlus2 RF] test accuracy: {acc_pct:.2f}%")
print("\n[ReducedPlus2 RF] classification report:\n", rep)
print("\n[ReducedPlus2 RF] confusion matrix:\n", cm_df)
print("\n[ReducedPlus2 RF] per-cycle predictions (full):")
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 0, "display.max_colwidth", None):
    print(preds.to_string(index=False))
print("\n[ReducedPlus2 RF] per-cycle predictions shape:", preds.shape)
print("\n[ReducedPlus2 RF] outputs saved to:", OUT_RF.resolve())


[ReducedPlus2 RF] test accuracy: 55.00%

[ReducedPlus2 RF] classification report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         5
           1     0.1667    0.2000    0.1818         5
           2     1.0000    1.0000    1.0000         5
           3     1.0000    1.0000    1.0000         5

    accuracy                         0.5500        20
   macro avg     0.5417    0.5500    0.5455        20
weighted avg     0.5417    0.5500    0.5455        20


[ReducedPlus2 RF] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       0       5       0       0
true_1       4       1       0       0
true_2       0       0       5       0
true_3       0       0       0       5

[ReducedPlus2 RF] per-cycle predictions (full):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       1
   Anise_cycle_2    Anise       0       1
   Anise_cycle_3    Anise       0       1
   Anise_cycle_4    Anise       

In [None]:
# Reviewer note: Logistic Regression on Features_ReducedPlus2; no cross-validation.
# Reviewer note: outputs saved under Features_ReducedPlus2/outputs/LogisticRegression; accuracy printed as percentage.
# Reviewer note: per-cycle predictions are printed in full to console and saved to CSV.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Base paths (updated context)
BASE    = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2"
OUT_RP2 = f"{BASE}/Features_ReducedPlus2"

TRAIN_RP2 = f"{OUT_RP2}/train_reduced_plus2.csv"
TEST_RP2  = f"{OUT_RP2}/test_reduced_plus2.csv"

# Output directories: all Logistic Regression artifacts go under
# <OUT_RP2>/outputs/LogisticRegression
OUT_ROOT = Path(OUT_RP2) / "outputs"
OUT_LR   = OUT_ROOT / "LogisticRegression"
OUT_LR.mkdir(parents=True, exist_ok=True)

# Identifier/label columns are never fed to the model.
ID_COLS    = ["group_id","spice","target"]
TARGET_COL = "target"

# Load data
train = pd.read_csv(TRAIN_RP2)
test  = pd.read_csv(TEST_RP2)

# Split each table into model inputs and integer labels.
X_train = train.drop(columns=ID_COLS, errors="ignore")
y_train = train[TARGET_COL].astype(int).values
X_test  = test.drop(columns=ID_COLS, errors="ignore")
y_test  = test[TARGET_COL].astype(int).values

# Identifier columns kept aside so predictions can be reported per cycle.
meta_cols = [c for c in ["group_id","spice"] if c in test.columns]
meta = test[meta_cols].copy() if meta_cols else pd.DataFrame(index=test.index)

# Build and fit model: standardize features, then fit the classifier.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    # multi_class is intentionally not passed: "auto" was already the default,
    # and scikit-learn deprecated the parameter in 1.5, so passing it only
    # triggers a FutureWarning (and fails once the parameter is removed).
    ("clf", LogisticRegression(max_iter=8000, random_state=42))
])
pipe.fit(X_train, y_train)

# Persist model
joblib.dump(pipe, OUT_LR / "model.joblib")

# Predict and evaluate
y_pred = pipe.predict(X_test)
# Score over every class seen in either split so the report, the per-class
# table, the F1 averages and the confusion matrix all share one label set.
labels = np.unique(np.concatenate([y_train, y_test], axis=0))

acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
# zero_division=0 on every metric call: a class that is never predicted scores 0
# without emitting UndefinedMetricWarning, consistently across all outputs.
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "logistic_regression_reduced_plus2",
    "accuracy": acc,
    "f1": {
        # labels=labels keeps these averages in agreement with the
        # "macro avg" / "weighted avg" rows of the classification report.
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}

with open(OUT_LR / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_LR / "classification_report.txt", "w") as f:
    f.write(rep)

# NOTE(review): the output recorded for this cell shows every true-0 cycle
# predicted as class 1 while classes 2 and 3 are fully recalled. This may
# indicate a target-encoding mismatch between the train and test CSVs —
# verify the upstream label mapping.
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_LR / "confusion_matrix.csv", index=True)

# Per-cycle predictions: identifiers (when available) next to true/predicted labels
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_LR / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[ReducedPlus2 LogReg] test accuracy: {acc_pct:.2f}%")
print("\n[ReducedPlus2 LogReg] classification report:\n", rep)
print("\n[ReducedPlus2 LogReg] confusion matrix:\n", cm_df)
print("\n[ReducedPlus2 LogReg] per-cycle predictions (full):")
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 0, "display.max_colwidth", None):
    print(preds.to_string(index=False))
print("\n[ReducedPlus2 LogReg] per-cycle predictions shape:", preds.shape)
print("\n[ReducedPlus2 LogReg] outputs saved to:", OUT_LR.resolve())


[ReducedPlus2 LogReg] test accuracy: 50.00%

[ReducedPlus2 LogReg] classification report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         5
           1     0.0000    0.0000    0.0000         5
           2     0.6250    1.0000    0.7692         5
           3     1.0000    1.0000    1.0000         5

    accuracy                         0.5000        20
   macro avg     0.4062    0.5000    0.4423        20
weighted avg     0.4062    0.5000    0.4423        20


[ReducedPlus2 LogReg] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       0       5       0       0
true_1       2       0       3       0
true_2       0       0       5       0
true_3       0       0       0       5

[ReducedPlus2 LogReg] per-cycle predictions (full):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       1
   Anise_cycle_2    Anise       0       1
   Anise_cycle_3    Anise       0       1
   Anise_cycle_4



In [9]:
# Reviewer note: SVM (RBF) on Features_ReducedPlus2; no cross-validation.
# Reviewer note: saves ALL artifacts into a single folder: Features_ReducedPlus2/outputs/SVM.
# Reviewer note: accuracy printed as percentage; full per-cycle predictions printed and saved.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Column roles: identifier/label columns are never fed to the model.
TARGET_COL = "target"
ID_COLS    = ["group_id", "spice", "target"]

# Input tables produced by the RP2 build step.
BASE      = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2"
OUT_RP2   = f"{BASE}/Features_ReducedPlus2"
TRAIN_RP2 = f"{OUT_RP2}/train_reduced_plus2.csv"
TEST_RP2  = f"{OUT_RP2}/test_reduced_plus2.csv"

# Single output folder for SVM: <OUT_RP2>/outputs/SVM
OUT_ROOT = Path(OUT_RP2, "outputs")
OUT_SVM  = OUT_ROOT.joinpath("SVM")
OUT_SVM.mkdir(parents=True, exist_ok=True)

# Read both splits, then separate model inputs from integer labels.
train, test = pd.read_csv(TRAIN_RP2), pd.read_csv(TEST_RP2)
X_train, X_test = (frame.drop(columns=ID_COLS, errors="ignore") for frame in (train, test))
y_train, y_test = (frame[TARGET_COL].astype(int).values for frame in (train, test))

# Identifier columns kept aside so predictions can be reported per cycle.
meta_cols = [name for name in ("group_id", "spice") if name in test.columns]
if meta_cols:
    meta = test[meta_cols].copy()
else:
    meta = pd.DataFrame(index=test.index)

# Model: standardize features, then an RBF-kernel SVC (fit returns the pipeline).
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", C=1.0, gamma="scale", probability=False, random_state=42)),
]).fit(X_train, y_train)

# Save model in the single folder
joblib.dump(pipe, OUT_SVM / "model.joblib")

# Predict and evaluate
y_pred = pipe.predict(X_test)
# Score over every class seen in either split so the report, the per-class
# table, the F1 averages and the confusion matrix all share one label set.
labels = np.unique(np.concatenate([y_train, y_test], axis=0))
acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
# zero_division=0 on every metric call: a class that is never predicted scores 0
# without emitting UndefinedMetricWarning, consistently across all outputs.
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "svm_rbf_reduced_plus2",
    "accuracy": acc,
    "f1": {
        # labels=labels keeps these averages in agreement with the
        # "macro avg" / "weighted avg" rows of the classification report.
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}
with open(OUT_SVM / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_SVM / "classification_report.txt", "w") as f:
    f.write(rep)

# NOTE(review): the output recorded for this cell shows every true-0 cycle
# predicted as class 1 while classes 2 and 3 are fully recalled. This may
# indicate a target-encoding mismatch between the train and test CSVs —
# verify the upstream label mapping.
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_SVM / "confusion_matrix.csv", index=True)

# Per-cycle predictions: identifiers (when available) next to true/predicted labels
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_SVM / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[RP2 SVM] test accuracy: {acc_pct:.2f}%")
print("\n[RP2 SVM] classification report:\n", rep)
print("\n[RP2 SVM] confusion matrix:\n", cm_df)
print("\n[RP2 SVM] per-cycle predictions (full):")
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 0, "display.max_colwidth", None):
    print(preds.to_string(index=False))
print("\n[RP2 SVM] outputs saved to:", OUT_SVM.resolve())


[RP2 SVM] test accuracy: 50.00%

[RP2 SVM] classification report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         5
           1     0.0000    0.0000    0.0000         5
           2     0.7143    1.0000    0.8333         5
           3     1.0000    1.0000    1.0000         5

    accuracy                         0.5000        20
   macro avg     0.4286    0.5000    0.4583        20
weighted avg     0.4286    0.5000    0.4583        20


[RP2 SVM] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       0       5       0       0
true_1       3       0       2       0
true_2       0       0       5       0
true_3       0       0       0       5

[RP2 SVM] per-cycle predictions (full):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       1
   Anise_cycle_2    Anise       0       1
   Anise_cycle_3    Anise       0       1
   Anise_cycle_4    Anise       0       1
   Anise_cycle_5    An

In [10]:
# Reviewer note: XGBoost on Features_ReducedPlus2; no cross-validation.
# Reviewer note: saves ALL artifacts into a single folder: Features_ReducedPlus2/outputs/XGBoost.
# Reviewer note: accuracy printed as percentage; full per-cycle predictions printed and saved.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Column roles: identifier/label columns are never fed to the model.
TARGET_COL = "target"
ID_COLS    = ["group_id", "spice", "target"]

# Input tables produced by the RP2 build step.
BASE      = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2"
OUT_RP2   = f"{BASE}/Features_ReducedPlus2"
TRAIN_RP2 = f"{OUT_RP2}/train_reduced_plus2.csv"
TEST_RP2  = f"{OUT_RP2}/test_reduced_plus2.csv"

# Single output folder for XGBoost: <OUT_RP2>/outputs/XGBoost
OUT_ROOT = Path(OUT_RP2, "outputs")
OUT_XGB  = OUT_ROOT.joinpath("XGBoost")
OUT_XGB.mkdir(parents=True, exist_ok=True)

# Read both splits, then separate model inputs from integer labels.
train, test = pd.read_csv(TRAIN_RP2), pd.read_csv(TEST_RP2)
X_train, X_test = (frame.drop(columns=ID_COLS, errors="ignore") for frame in (train, test))
y_train, y_test = (frame[TARGET_COL].astype(int).values for frame in (train, test))

# Identifier columns kept aside so predictions can be reported per cycle.
meta_cols = [name for name in ("group_id", "spice") if name in test.columns]
if meta_cols:
    meta = test[meta_cols].copy()
else:
    meta = pd.DataFrame(index=test.index)

# Model
# num_class is the number of distinct training labels.
# NOTE(review): presumably the labels are the contiguous integers 0..K-1 — confirm.
num_classes = int(len(np.unique(y_train)))
xgb = XGBClassifier(
    objective="multi:softmax",
    num_class=num_classes,
    eval_metric="mlogloss",
    n_estimators=400,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Save model in the single folder
joblib.dump(xgb, OUT_XGB / "model.joblib")

# Predict and evaluate
y_pred = xgb.predict(X_test)
# Score over every class seen in either split so the report, the per-class
# table, the F1 averages and the confusion matrix all share one label set.
labels = np.unique(np.concatenate([y_train, y_test], axis=0))
acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
# zero_division=0 on every metric call: a class that is never predicted scores 0
# without emitting UndefinedMetricWarning, consistently across all outputs.
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "xgboost_reduced_plus2",
    "accuracy": acc,
    "f1": {
        # labels=labels keeps these averages in agreement with the
        # "macro avg" / "weighted avg" rows of the classification report.
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}
with open(OUT_XGB / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_XGB / "classification_report.txt", "w") as f:
    f.write(rep)

# NOTE(review): the output recorded for this cell shows every true-0 cycle
# predicted as class 1 while classes 2 and 3 are fully recalled. This may
# indicate a target-encoding mismatch between the train and test CSVs —
# verify the upstream label mapping.
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_XGB / "confusion_matrix.csv", index=True)

# Per-cycle predictions: identifiers (when available) next to true/predicted labels
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_XGB / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[RP2 XGBoost] test accuracy: {acc_pct:.2f}%")
print("\n[RP2 XGBoost] classification report:\n", rep)
print("\n[RP2 XGBoost] confusion matrix:\n", cm_df)
print("\n[RP2 XGBoost] per-cycle predictions (full):")
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 0, "display.max_colwidth", None):
    print(preds.to_string(index=False))
print("\n[RP2 XGBoost] outputs saved to:", OUT_XGB.resolve())


[RP2 XGBoost] test accuracy: 50.00%

[RP2 XGBoost] classification report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         5
           1     0.0000    0.0000    0.0000         5
           2     1.0000    1.0000    1.0000         5
           3     0.8333    1.0000    0.9091         5

    accuracy                         0.5000        20
   macro avg     0.4583    0.5000    0.4773        20
weighted avg     0.4583    0.5000    0.4773        20


[RP2 XGBoost] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       0       5       0       0
true_1       4       0       0       1
true_2       0       0       5       0
true_3       0       0       0       5

[RP2 XGBoost] per-cycle predictions (full):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       1
   Anise_cycle_2    Anise       0       1
   Anise_cycle_3    Anise       0       1
   Anise_cycle_4    Anise       0       1
   Ani

In [11]:
# Reviewer note: KNN on Features_ReducedPlus2; no cross-validation.
# Reviewer note: saves ALL artifacts into a single folder: Features_ReducedPlus2/outputs/KNN.
# Reviewer note: accuracy printed as percentage; full per-cycle predictions printed and saved.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Column roles: identifier/label columns are never fed to the model.
TARGET_COL = "target"
ID_COLS    = ["group_id", "spice", "target"]

# Input tables produced by the RP2 build step.
BASE      = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2"
OUT_RP2   = f"{BASE}/Features_ReducedPlus2"
TRAIN_RP2 = f"{OUT_RP2}/train_reduced_plus2.csv"
TEST_RP2  = f"{OUT_RP2}/test_reduced_plus2.csv"

# Single output folder for KNN: <OUT_RP2>/outputs/KNN
OUT_ROOT = Path(OUT_RP2, "outputs")
OUT_KNN  = OUT_ROOT.joinpath("KNN")
OUT_KNN.mkdir(parents=True, exist_ok=True)

# Read both splits, then separate model inputs from integer labels.
train, test = pd.read_csv(TRAIN_RP2), pd.read_csv(TEST_RP2)
X_train, X_test = (frame.drop(columns=ID_COLS, errors="ignore") for frame in (train, test))
y_train, y_test = (frame[TARGET_COL].astype(int).values for frame in (train, test))

# Identifier columns kept aside so predictions can be reported per cycle.
meta_cols = [name for name in ("group_id", "spice") if name in test.columns]
if meta_cols:
    meta = test[meta_cols].copy()
else:
    meta = pd.DataFrame(index=test.index)

# Model: standardize features, then distance-weighted 7-NN with the
# Minkowski metric at p=2 (fit returns the pipeline).
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", KNeighborsClassifier(n_neighbors=7, weights="distance", metric="minkowski", p=2)),
]).fit(X_train, y_train)

# Save model in the single folder
joblib.dump(pipe, OUT_KNN / "model.joblib")

# Predict and evaluate
y_pred = pipe.predict(X_test)
# Score over every class seen in either split so the report, the per-class
# table, the F1 averages and the confusion matrix all share one label set.
labels = np.unique(np.concatenate([y_train, y_test], axis=0))
acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
# zero_division=0 on every metric call: a class that is never predicted scores 0
# without emitting UndefinedMetricWarning, consistently across all outputs.
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "knn_reduced_plus2",
    "accuracy": acc,
    "f1": {
        # labels=labels keeps these averages in agreement with the
        # "macro avg" / "weighted avg" rows of the classification report.
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}
with open(OUT_KNN / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_KNN / "classification_report.txt", "w") as f:
    f.write(rep)

# NOTE(review): the output recorded for this cell shows every true-0 cycle
# predicted as class 1 while classes 2 and 3 are fully recalled. This may
# indicate a target-encoding mismatch between the train and test CSVs —
# verify the upstream label mapping.
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_KNN / "confusion_matrix.csv", index=True)

# Per-cycle predictions: identifiers (when available) next to true/predicted labels
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_KNN / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[RP2 KNN] test accuracy: {acc_pct:.2f}%")
print("\n[RP2 KNN] classification report:\n", rep)
print("\n[RP2 KNN] confusion matrix:\n", cm_df)
print("\n[RP2 KNN] per-cycle predictions (full):")
with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 0, "display.max_colwidth", None):
    print(preds.to_string(index=False))
print("\n[RP2 KNN] outputs saved to:", OUT_KNN.resolve())


[RP2 KNN] test accuracy: 50.00%

[RP2 KNN] classification report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         5
           1     0.0000    0.0000    0.0000         5
           2     0.8333    1.0000    0.9091         5
           3     1.0000    1.0000    1.0000         5

    accuracy                         0.5000        20
   macro avg     0.4583    0.5000    0.4773        20
weighted avg     0.4583    0.5000    0.4773        20


[RP2 KNN] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       0       5       0       0
true_1       4       0       1       0
true_2       0       0       5       0
true_3       0       0       0       5

[RP2 KNN] per-cycle predictions (full):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       1
   Anise_cycle_2    Anise       0       1
   Anise_cycle_3    Anise       0       1
   Anise_cycle_4    Anise       0       1
   Anise_cycle_5    An