In [None]:
# Drive mount logic kept minimal and predictable.
# Detect existing mount to avoid duplicate prompts; mount if absent.

import os
from google.colab import drive

# Attach Google Drive only when nothing is mounted at the mount point yet,
# so re-running this cell does not request a second mount.
DRIVE_MOUNT_POINT = '/content/drive'

if not os.path.ismount(DRIVE_MOUNT_POINT):
    drive.mount(DRIVE_MOUNT_POINT, force_remount=False)
else:
    print("OK: /content/drive is already mounted.")

Mounted at /content/drive


In [None]:
# Reviewer note: paths updated to Attempt_1_4_and_5_combined_version_2.
# Reviewer note: reduction keeps rel, log_slope_per_s, log_std, plus context means; IDs are preserved for downstream joins.

import os
from pathlib import Path
import pandas as pd
import re

# Inputs: the full feature tables for the train and test splits.
TRAIN5 = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features/train/train_features.csv"
TEST5 = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features/test/test_features.csv"

# Destination folder for the reduced feature tables. It is created up front
# (together with any missing parent folders); an existing folder is left as is.
OUT_REDUCED = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_Reduced"
os.makedirs(OUT_REDUCED, exist_ok=True)

# Identifier/label columns: never treated as features.
ID_COLS = ["group_id", "spice", "target"]
# Context means: always kept as features when present.
CTX_COLS = ["temp_mean", "rh_mean", "pressure_mean"]


def select_reduced_cols(df):
    """Return the names of the columns that make up the reduced feature set.

    Columns are visited in the frame's own order and kept when they are one of
    the context means in ``CTX_COLS`` or when their name ends in ``_rel``,
    ``_log_slope_per_s`` or ``_log_std``. Columns listed in ``ID_COLS`` and
    columns whose name ends in ``_n`` are always left out.

    Args:
        df: DataFrame whose column names are inspected (values are not read).

    Returns:
        list: Selected column names, in the order they appear in ``df``.
    """
    def _is_reduced_feature(name):
        # The order of these checks matters: ID columns are rejected first,
        # context means are accepted before any suffix rule is applied.
        if name in ID_COLS:
            return False
        if name in CTX_COLS:
            return True
        if name.endswith("_n"):
            return False
        if re.search(r"_rel$", name):
            return True
        return name.endswith(("_log_slope_per_s", "_log_std"))

    return [name for name in df.columns if _is_reduced_feature(name)]

def make_reduced(src_csv, dst_csv):
    """Write a reduced copy of a feature CSV: ID columns plus the selected features.

    Args:
        src_csv: Path of the full feature table to read.
        dst_csv: Path the reduced table is written to (without the index).

    Returns:
        None. The reduced table is written to ``dst_csv`` and a one-line
        summary is printed.
    """
    full = pd.read_csv(src_csv)
    feature_names = select_reduced_cols(full)

    # ID columns come first so downstream joins find them in a fixed place;
    # the selected features follow in their original column order.
    id_part = full[ID_COLS]
    feature_part = full[feature_names]
    pd.concat([id_part, feature_part], axis=1).to_csv(dst_csv, index=False)

    print(f"[OK] Wrote reduced: {dst_csv}  | kept {len(feature_names)} feature columns")

# Reduce both splits with the same column rules: (source CSV, destination CSV).
_REDUCTION_JOBS = [
    (TRAIN5, f"{OUT_REDUCED}/train_reduced.csv"),
    (TEST5, f"{OUT_REDUCED}/test_reduced.csv"),
]
for _src_csv, _dst_csv in _REDUCTION_JOBS:
    make_reduced(_src_csv, _dst_csv)

[OK] Wrote reduced: /content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_Reduced/train_reduced.csv  | kept 483 feature columns
[OK] Wrote reduced: /content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_Reduced/test_reduced.csv  | kept 483 feature columns


In [None]:
# Reviewer note: Random Forest on Reduced features; no cross-validation.
# Reviewer note: outputs are saved under Features_Reduced/outputs/RandomForest; accuracy printed as percentage.
# Reviewer note: per-cycle prediction table is printed; per-cycle summary removed as requested.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Base path for Reduced features. Set here as well (same value as in the
# reduction cell) so this cell does not depend on kernel state left behind by
# an earlier cell and can run on a fresh kernel.
OUT_REDUCED = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_Reduced"

# Output directory structure
OUT_ROOT = Path(OUT_REDUCED) / "outputs"
OUT_RF   = OUT_ROOT / "RandomForest"
OUT_RF.mkdir(parents=True, exist_ok=True)

ID_COLS = ["group_id","spice","target"]
TARGET_COL = "target"

# Load reduced train/test
train = pd.read_csv(f"{OUT_REDUCED}/train_reduced.csv")
test  = pd.read_csv(f"{OUT_REDUCED}/test_reduced.csv")

# Every non-ID column is a model input; the label column is cast to int.
X_train = train.drop(columns=ID_COLS, errors="ignore")
y_train = train[TARGET_COL].astype(int).values
X_test  = test.drop(columns=ID_COLS, errors="ignore")
y_test  = test[TARGET_COL].astype(int).values

# Optional metadata for per-cycle predictions
meta_cols = [c for c in ["group_id","spice"] if c in test.columns]
meta = test[meta_cols].copy() if meta_cols else pd.DataFrame(index=test.index)

# Build and fit model
rf = RandomForestClassifier(
    n_estimators=800,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features="sqrt",
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42
)
rf.fit(X_train, y_train)

# Persist model
joblib.dump(rf, OUT_RF / "model.joblib")

# Predict on test
y_pred  = rf.predict(X_test)
# Evaluate over every class seen in either split, so a class the model never
# predicts still gets its own row/column in the report and confusion matrix.
labels  = np.unique(np.concatenate([y_train, y_test], axis=0))

# Metrics
# All metric calls share labels=labels and zero_division=0 so that the text
# report, the per-class table and the averaged F1 values in metrics.json are
# computed over the same class set, and classes with no predicted samples score
# 0 without emitting UndefinedMetricWarning.
acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "random_forest",
    "accuracy": acc,
    "f1": {
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}
with open(OUT_RF / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

# Save classification report
with open(OUT_RF / "classification_report.txt", "w") as f:
    f.write(rep)

# Confusion matrix as CSV with clear headers (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_RF / "confusion_matrix.csv", index=True)

# Per-cycle predictions dataframe: one row per test sample, metadata first
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_RF / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[RandomForest] test accuracy: {acc_pct:.2f}%")
print("\n[RandomForest] classification report:\n", rep)
print("\n[RandomForest] confusion matrix:\n", cm_df)

# Print per-cycle prediction table to console (full table; no summary)
print("\n[RandomForest] per_cycle_predictions (full table):")
print(preds.to_string(index=False))

print("\n[RandomForest] outputs saved to:", OUT_RF.resolve())


[RandomForest] test accuracy: 45.00%

[RandomForest] classification report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         5
           1     0.0000    0.0000    0.0000         5
           2     0.5000    1.0000    0.6667         5
           3     1.0000    0.8000    0.8889         5

    accuracy                         0.4500        20
   macro avg     0.3750    0.4500    0.3889        20
weighted avg     0.3750    0.4500    0.3889        20


[RandomForest] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       0       5       0       0
true_1       1       0       4       0
true_2       0       0       5       0
true_3       0       0       1       4

[RandomForest] per_cycle_predictions (full table):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       1
   Anise_cycle_2    Anise       0       1
   Anise_cycle_3    Anise       0       1
   Anise_cycle_4    Anise       0     

In [None]:
# Reviewer note: Logistic Regression on Reduced features; no cross-validation.
# Reviewer note: outputs are saved under Features_Reduced/outputs/LogisticRegression; accuracy printed as percentage.
# Reviewer note: per-cycle prediction table is printed to the console (full table); no additional summary.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Base path for Reduced features. Set here as well (same value as in the
# reduction cell) so this cell does not depend on kernel state left behind by
# an earlier cell and can run on a fresh kernel.
OUT_REDUCED = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_Reduced"

# Output directory structure
OUT_ROOT = Path(OUT_REDUCED) / "outputs"
OUT_LR   = OUT_ROOT / "LogisticRegression"
OUT_LR.mkdir(parents=True, exist_ok=True)

ID_COLS = ["group_id","spice","target"]
TARGET_COL = "target"

# Load reduced train/test
train = pd.read_csv(f"{OUT_REDUCED}/train_reduced.csv")
test  = pd.read_csv(f"{OUT_REDUCED}/test_reduced.csv")

# Every non-ID column is a model input; the label column is cast to int.
X_train = train.drop(columns=ID_COLS, errors="ignore")
y_train = train[TARGET_COL].astype(int).values
X_test  = test.drop(columns=ID_COLS, errors="ignore")
y_test  = test[TARGET_COL].astype(int).values

# Optional metadata for per-cycle predictions
meta_cols = [c for c in ["group_id","spice"] if c in test.columns]
meta = test[meta_cols].copy() if meta_cols else pd.DataFrame(index=test.index)

# Build and fit model
# The scaler lives inside the pipeline so it is fitted on the training split only.
# multi_class is left at its default: "auto" was already the default behaviour,
# and passing the parameter explicitly is deprecated since scikit-learn 1.5.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=5000, random_state=42))
])
pipe.fit(X_train, y_train)

# Persist model
joblib.dump(pipe, OUT_LR / "model.joblib")

# Predict on test
y_pred  = pipe.predict(X_test)
# Evaluate over every class seen in either split, so a class the model never
# predicts still gets its own row/column in the report and confusion matrix.
labels  = np.unique(np.concatenate([y_train, y_test], axis=0))

# Metrics
# All metric calls share labels=labels and zero_division=0 so that the text
# report, the per-class table and the averaged F1 values in metrics.json are
# computed over the same class set, and classes with no predicted samples score
# 0 without emitting UndefinedMetricWarning.
acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "logistic_regression",
    "accuracy": acc,
    "f1": {
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}
with open(OUT_LR / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

# Save classification report
with open(OUT_LR / "classification_report.txt", "w") as f:
    f.write(rep)

# Confusion matrix as CSV with clear headers (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_LR / "confusion_matrix.csv", index=True)

# Per-cycle predictions: one row per test sample, metadata first
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_LR / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[LogisticRegression] test accuracy: {acc_pct:.2f}%")
print("\n[LogisticRegression] classification report:\n", rep)
print("\n[LogisticRegression] confusion matrix:\n", cm_df)

# Print per-cycle prediction table to console (full table; no summary)
print("\n[LogisticRegression] per_cycle_predictions (full table):")
print(preds.to_string(index=False))

print("\n[LogisticRegression] outputs saved to:", OUT_LR.resolve())


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LogisticRegression] test accuracy: 70.00%

[LogisticRegression] classification report:
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000         5
           1     0.0000    0.0000    0.0000         5
           2     0.4545    1.0000    0.6250         5
           3     1.0000    0.8000    0.8889         5

    accuracy                         0.7000        20
   macro avg     0.6136    0.7000    0.6285        20
weighted avg     0.6136    0.7000    0.6285        20


[LogisticRegression] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       5       0       0       0
true_1       0       0       5       0
true_2       0       0       5       0
true_3       0       0       1       4

[LogisticRegression] per_cycle_predictions (full table):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       0
   Anise_cycle_2    Anise       0       0
   Anise_cycle_3    Anise       0       0
   Anise_cycle

In [None]:
# Reviewer note: SVM (RBF) on Reduced features; no cross-validation.
# Reviewer note: outputs are saved under Features_Reduced/outputs/SVM; accuracy printed as percentage.
# Reviewer note: per-cycle prediction table is printed to the console (full table); no additional summary.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Base path for Reduced features
OUT_REDUCED = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_Reduced"
TRAIN_RED = f"{OUT_REDUCED}/train_reduced.csv"
TEST_RED  = f"{OUT_REDUCED}/test_reduced.csv"

# Output directory structure
OUT_ROOT = Path(OUT_REDUCED) / "outputs"
OUT_SVM  = OUT_ROOT / "SVM"
OUT_SVM.mkdir(parents=True, exist_ok=True)

ID_COLS = ["group_id","spice","target"]
TARGET_COL = "target"

# Load Reduced train/test
train = pd.read_csv(TRAIN_RED)
test  = pd.read_csv(TEST_RED)

# Every non-ID column is a model input; the label column is cast to int.
X_train = train.drop(columns=ID_COLS, errors="ignore")
y_train = train[TARGET_COL].astype(int).values
X_test  = test.drop(columns=ID_COLS, errors="ignore")
y_test  = test[TARGET_COL].astype(int).values

# Optional metadata for per-cycle predictions
meta_cols = [c for c in ["group_id","spice"] if c in test.columns]
meta = test[meta_cols].copy() if meta_cols else pd.DataFrame(index=test.index)

# Build and fit model (no CV)
# The scaler lives inside the pipeline so it is fitted on the training split only.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", C=1.0, gamma="scale", probability=False, random_state=42))
])
pipe.fit(X_train, y_train)

# Persist model
joblib.dump(pipe, OUT_SVM / "model.joblib")

# Predict on test
y_pred  = pipe.predict(X_test)
# Evaluate over every class seen in either split, so a class the model never
# predicts still gets its own row/column in the report and confusion matrix.
labels  = np.unique(np.concatenate([y_train, y_test], axis=0))

# Metrics
# All metric calls share labels=labels and zero_division=0 so that the text
# report, the per-class table and the averaged F1 values in metrics.json are
# computed over the same class set, and classes with no predicted samples score
# 0 without emitting UndefinedMetricWarning.
acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "svm_rbf",
    "accuracy": acc,
    "f1": {
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}
with open(OUT_SVM / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

# Save classification report
with open(OUT_SVM / "classification_report.txt", "w") as f:
    f.write(rep)

# Confusion matrix as CSV with clear headers (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_SVM / "confusion_matrix.csv", index=True)

# Per-cycle predictions: one row per test sample, metadata first
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_SVM / "per_cycle_predictions.csv", index=False)

# Console summary
print(f"[SVM_RBF_Reduced] test accuracy: {acc_pct:.2f}%")
print("\n[SVM_RBF_Reduced] classification report:\n", rep)
print("\n[SVM_RBF_Reduced] confusion matrix:\n", cm_df)

# Print per-cycle prediction table to console (full table; no summary)
print("\n[SVM_RBF_Reduced] per_cycle_predictions (full table):")
print(preds.to_string(index=False))

print("\n[SVM_RBF_Reduced] outputs saved to:", OUT_SVM.resolve())


[SVM_RBF_Reduced] test accuracy: 40.00%

[SVM_RBF_Reduced] classification report:
               precision    recall  f1-score   support

           0     0.4545    1.0000    0.6250         5
           1     0.0000    0.0000    0.0000         5
           2     0.3333    0.6000    0.4286         5
           3     0.0000    0.0000    0.0000         5

    accuracy                         0.4000        20
   macro avg     0.1970    0.4000    0.2634        20
weighted avg     0.1970    0.4000    0.2634        20


[SVM_RBF_Reduced] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       5       0       0       0
true_1       3       0       2       0
true_2       2       0       3       0
true_3       1       0       4       0

[SVM_RBF_Reduced] per_cycle_predictions (full table):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       0
   Anise_cycle_2    Anise       0       0
   Anise_cycle_3    Anise       0       0
   Anise_cycle_4    Anise 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Reviewer note: XGBoost on Features_Reduced; no cross-validation.
# Reviewer note: all artifacts saved in a single folder Features_Reduced/outputs/XGBoost.
# Reviewer note: accuracy printed as percentage; per-cycle predictions printed to console (full table).

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Base path for Reduced features
OUT_REDUCED = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_Reduced"
TRAIN_RED = f"{OUT_REDUCED}/train_reduced.csv"
TEST_RED  = f"{OUT_REDUCED}/test_reduced.csv"

# Single output folder (no subdirectories)
OUT_XGB = Path(OUT_REDUCED) / "outputs" / "XGBoost"
OUT_XGB.mkdir(parents=True, exist_ok=True)

ID_COLS = ["group_id","spice","target"]
TARGET_COL = "target"

# Load Reduced train/test
train = pd.read_csv(TRAIN_RED)
test  = pd.read_csv(TEST_RED)

# Every non-ID column is a model input; the label column is cast to int.
X_train = train.drop(columns=ID_COLS, errors="ignore")
y_train = train[TARGET_COL].astype(int).values
X_test  = test.drop(columns=ID_COLS, errors="ignore")
y_test  = test[TARGET_COL].astype(int).values

# Optional metadata for per-cycle predictions
meta_cols = [c for c in ["group_id","spice"] if c in test.columns]
meta = test[meta_cols].copy() if meta_cols else pd.DataFrame(index=test.index)

# Configure and fit model (no CV)
# num_class is taken from the distinct labels in the training split.
num_classes = int(np.unique(y_train).shape[0])
xgb = XGBClassifier(
    objective="multi:softmax",
    num_class=num_classes,
    eval_metric="mlogloss",
    n_estimators=400,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
xgb.fit(X_train, y_train)

# Persist model into the same folder
joblib.dump(xgb, OUT_XGB / "final_model_xgb_reduced.joblib")

# Predict on test
y_pred  = xgb.predict(X_test)
# Evaluate over every class seen in either split, so a class the model never
# predicts still gets its own row/column in the report and confusion matrix.
labels  = np.unique(np.concatenate([y_train, y_test], axis=0))

# Metrics
# All metric calls share labels=labels and zero_division=0 so that the text
# report, the per-class table and the averaged F1 values in metrics.json are
# computed over the same class set, and classes with no predicted samples score
# 0 without emitting UndefinedMetricWarning.
acc = float(accuracy_score(y_test, y_pred))
acc_pct = acc * 100.0
rep = classification_report(y_test, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "xgboost_multi_softmax_no_cv",
    "accuracy": acc,
    "f1": {
        "macro": float(f1_score(y_test, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_test, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    }
}

# Write all eval artifacts into the same folder
with open(OUT_XGB / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_XGB / "classification_report.txt", "w") as f:
    f.write(rep)

# Confusion matrix: rows = true class, columns = predicted class
cm = confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_XGB / "confusion_matrix.csv", index=True)

# Per-cycle predictions: one row per test sample, metadata first
preds = meta.copy()
preds["y_true"] = y_test
preds["y_pred"] = y_pred
preds.to_csv(OUT_XGB / "test_predictions.csv", index=False)

# Console summary
print(f"[XGBoost_Reduced] test accuracy: {acc_pct:.2f}%")
print("\n[XGBoost_Reduced] classification report:\n", rep)
print("\n[XGBoost_Reduced] confusion matrix:\n", cm_df)

# Print per-cycle prediction table to console (full table; no summary)
print("\n[XGBoost_Reduced] per_cycle_predictions (full table):")
print(preds.to_string(index=False))

print("\n[XGBoost_Reduced] outputs saved to:", OUT_XGB.resolve())


[XGBoost_Reduced] test accuracy: 10.00%

[XGBoost_Reduced] classification report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         5
           1     0.0000    0.0000    0.0000         5
           2     0.1818    0.4000    0.2500         5
           3     0.0000    0.0000    0.0000         5

    accuracy                         0.1000        20
   macro avg     0.0455    0.1000    0.0625        20
weighted avg     0.0455    0.1000    0.0625        20


[XGBoost_Reduced] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       0       5       0       0
true_1       1       0       4       0
true_2       0       0       2       3
true_3       0       0       5       0

[XGBoost_Reduced] per_cycle_predictions (full table):
        group_id    spice  y_true  y_pred
   Anise_cycle_1    Anise       0       1
   Anise_cycle_2    Anise       0       1
   Anise_cycle_3    Anise       0       1
   Anise_cycle_4    Anise 

In [16]:
# Reviewer note: KNN on Features_Reduced with CV+GridSearch; all artifacts saved in a single output folder.
# Reviewer note: per-cycle prediction table is printed to the console (full table); accuracy printed as percentage.

import os, json, joblib
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, f1_score
)

# Base paths
OUT_REDUCED = "/content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_Reduced"
TRAIN_RED = f"{OUT_REDUCED}/train_reduced.csv"
TEST_RED  = f"{OUT_REDUCED}/test_reduced.csv"

# Single output folder (no subdirectories)
OUT_KNN = Path(OUT_REDUCED) / "outputs" / "KNN"
OUT_KNN.mkdir(parents=True, exist_ok=True)

# Load data
ID_COLS = ["group_id","spice","target"]
TARGET_COL = "target"

tr = pd.read_csv(TRAIN_RED)
te = pd.read_csv(TEST_RED)

# The model is fed bare NumPy arrays, so column names are not available to
# scikit-learn for validation. Select the test columns explicitly in training
# order so a reordered or incomplete test file cannot silently misalign features.
feature_cols = [c for c in tr.columns if c not in ID_COLS]
missing_in_test = [c for c in feature_cols if c not in te.columns]
if missing_in_test:
    raise ValueError(f"test_reduced.csv is missing feature columns: {missing_in_test}")

X_tr = tr[feature_cols].values
y_tr = tr[TARGET_COL].astype(int).values

X_te = te[feature_cols].values
y_te = te[TARGET_COL].astype(int).values

# Optional metadata for per-cycle predictions
meta_cols = [c for c in ["group_id","spice"] if c in te.columns]
meta = te[meta_cols].copy() if meta_cols else pd.DataFrame(index=te.index)

# Pipeline and CV grid
# The scaler is part of the pipeline, so it is re-fitted on each CV training fold.
pipe_knn = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", KNeighborsClassifier())
])

param_grid = {
    "clf__n_neighbors": [1, 3, 5, 7],
    "clf__weights": ["uniform", "distance"]
}

# 5-fold stratified CV on the training split only; refit=True retrains the
# best configuration on the full training split afterwards.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gs = GridSearchCV(
    estimator=pipe_knn,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    refit=True,
    verbose=0
)
gs.fit(X_tr, y_tr)

print(f"[CV KNN-Reduced] best mean acc: {gs.best_score_:.4f}")
print("[CV KNN-Reduced] best params:", gs.best_params_)

# Save CV summary into the single output folder
with open(OUT_KNN / "cv_knn_reduced.json","w") as f:
    json.dump({
        "best_score": float(gs.best_score_),
        "best_params": gs.best_params_,
        "n_features": int(X_tr.shape[1]),
        "n_samples": int(X_tr.shape[0])
    }, f, indent=2)

# Evaluate best estimator on held-out test
best_knn = gs.best_estimator_
y_pred = best_knn.predict(X_te)

# Evaluate over every class seen in either split, so a class the model never
# predicts still gets its own row/column in the report and confusion matrix.
labels = np.unique(np.concatenate([y_tr, y_te], axis=0))
# All metric calls share labels=labels and zero_division=0 so that the text
# report, the per-class table and the averaged F1 values in metrics.json are
# computed over the same class set, and classes with no predicted samples score
# 0 without emitting UndefinedMetricWarning.
acc = float(accuracy_score(y_te, y_pred))
acc_pct = acc * 100.0
rep = classification_report(y_te, y_pred, labels=labels, digits=4, zero_division=0)
p, r, f1, s = precision_recall_fscore_support(y_te, y_pred, labels=labels, zero_division=0)

metrics = {
    "model": "knn_cv_selected",
    "accuracy": acc,
    "f1": {
        "macro": float(f1_score(y_te, y_pred, labels=labels, average="macro", zero_division=0)),
        "weighted": float(f1_score(y_te, y_pred, labels=labels, average="weighted", zero_division=0))
    },
    "per_class": {
        str(int(lbl)): {"precision": float(pi), "recall": float(ri), "f1": float(fi), "support": int(si)}
        for lbl, pi, ri, fi, si in zip(labels, p, r, f1, s)
    },
    "best_params": gs.best_params_
}

# Write all eval artifacts into the same folder
with open(OUT_KNN / "metrics.json","w") as f:
    json.dump(metrics, f, indent=2)

with open(OUT_KNN / "classification_report.txt","w") as f:
    f.write(rep)

# Confusion matrix: rows = true class, columns = predicted class
cm = confusion_matrix(y_te, y_pred, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{int(i)}" for i in labels],
    columns=[f"pred_{int(i)}" for i in labels]
)
cm_df.to_csv(OUT_KNN / "confusion_matrix.csv", index=True)

# Per-cycle predictions: one row per test sample, metadata first
preds = meta.copy()
preds["y_true"] = y_te
preds["y_pred"] = y_pred
preds.to_csv(OUT_KNN / "test_predictions.csv", index=False)

# Persist model into the same folder
joblib.dump(best_knn, OUT_KNN / "final_model_knn_reduced.joblib")
print(f"[OK] Saved model to {OUT_KNN / 'final_model_knn_reduced.joblib'}")

# Console summary
print(f"[TEST KNN-Reduced] accuracy: {acc_pct:.2f}%")
print("\n[TEST KNN-Reduced] classification report:\n", rep)
print("\n[TEST KNN-Reduced] confusion matrix:\n", cm_df)

# Print per-cycle prediction table to console (full table; no summary)
print("\n[TEST KNN-Reduced] per_cycle_predictions (full table):")
print(preds.to_string(index=False))

print("\n[TEST KNN-Reduced] outputs saved to:", OUT_KNN.resolve())


[CV KNN-Reduced] best mean acc: 0.9500
[CV KNN-Reduced] best params: {'clf__n_neighbors': 5, 'clf__weights': 'uniform'}
[OK] Saved model to /content/drive/My Drive/Final_Year_Project/Attempt_1_4_and_5_combined_version_2/Features_Reduced/outputs/KNN/final_model_knn_reduced.joblib
[TEST KNN-Reduced] accuracy: 70.00%

[TEST KNN-Reduced] classification report:
               precision    recall  f1-score   support

           0     0.6250    1.0000    0.7692         5
           1     1.0000    0.2000    0.3333         5
           2     0.5714    0.8000    0.6667         5
           3     1.0000    0.8000    0.8889         5

    accuracy                         0.7000        20
   macro avg     0.7991    0.7000    0.6645        20
weighted avg     0.7991    0.7000    0.6645        20


[TEST KNN-Reduced] confusion matrix:
         pred_0  pred_1  pred_2  pred_3
true_0       5       0       0       0
true_1       1       1       3       0
true_2       1       0       4       0
true_3    