In [1]:
# Reviewer note: Colab Drive mount and path configuration for resistance feature

import os
import sys
from pathlib import Path
from google.colab import drive

# Mount Google Drive if not already mounted
if not os.path.ismount("/content/drive"):
    drive.mount("/content/drive", force_remount=False)

# Folders containing the CSVs for resistance feature
TRAIN_DIR = Path("/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/train")
TEST_DIR  = Path("/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/test")

# Base output directory for all resistance models; model cells will create their own subfolders
BASE_OUT = Path("/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs")
BASE_OUT.mkdir(parents=True, exist_ok=True)

# Helper to resolve a single CSV inside a directory
def resolve_single_csv(dir_path: Path) -> Path:
    candidates = list(dir_path.glob("*.csv"))
    if len(candidates) == 0:
        raise FileNotFoundError(f"No CSV found in {dir_path}. Put exactly one CSV file in that folder.")
    if len(candidates) > 1:
        names = [c.name for c in candidates]
        raise FileExistsError(f"Multiple CSV files found in {dir_path}: {names}. Keep exactly one or specify the exact file.")
    return candidates[0]

TRAIN_CSV = resolve_single_csv(TRAIN_DIR)
TEST_CSV  = resolve_single_csv(TEST_DIR)

print("Train CSV:", TRAIN_CSV)
print("Test  CSV:", TEST_CSV)
print("Base out :", BASE_OUT)


Mounted at /content/drive
Train CSV: /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/train/Train_All_Specimens_Resistance.csv
Test  CSV: /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/test/Test_All_Specimens_Resistance.csv
Base out : /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs


In [2]:
# Reviewer note: dataset loading for single feature 'resistance_gassensor'; no preprocessing or scaling

import pandas as pd
import numpy as np

# Single raw feature column and label column
FEATURES = ["resistance_gassensor"]
LABEL_COL = "target"

# Load train and test exactly as-is
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

# Basic checks to fail fast if the schema is off
missing_train = [c for c in FEATURES + [LABEL_COL] if c not in train_df.columns]
missing_test  = [c for c in FEATURES + [LABEL_COL] if c not in test_df.columns]
if missing_train:
    raise ValueError(f"Train is missing columns: {missing_train}")
if missing_test:
    raise ValueError(f"Test is missing columns: {missing_test}")

# Extract raw numpy arrays for scikit-learn
X_train = train_df[FEATURES].values
y_train = train_df[LABEL_COL].values
X_test  = test_df[FEATURES].values
y_test  = test_df[LABEL_COL].values

print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes :", X_test.shape, y_test.shape)
print("Unique train labels:", np.unique(y_train))
print("Unique test  labels:", np.unique(y_test))


Train shapes: (107200, 1) (107200,)
Test shapes : (107200, 1) (107200,)
Unique train labels: [0 1 2 3]
Unique test  labels: [0 1 2 3]


In [3]:
# Reviewer note: Random Forest on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs"
))
OUT_DIR = BASE_OUT / "RandomForest"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predict on test set
y_pred = clf.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix
unique_classes = np.sort(np.unique(y_test))
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Save classification report
report_csv = OUT_DIR / "rf_resistance_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "rf_resistance_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "rf_resistance_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"rf_resistance_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Save metrics summary
metrics_json = OUT_DIR / "rf_resistance_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "RandomForestClassifier",
            "n_estimators": 300,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "rf_resistance_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"Random Forest (resistance) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


Random Forest (resistance) accuracy: 39.89%

Classification report:
              precision    recall  f1-score   support

           0     0.4841    0.4997    0.4918     26800
           1     0.4006    0.4039    0.4022     26800
           2     0.3572    0.3426    0.3498     26800
           3     0.3494    0.3494    0.3494     26800

    accuracy                         0.3989    107200
   macro avg     0.3978    0.3989    0.3983    107200
weighted avg     0.3978    0.3989    0.3983    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              13392             3113               7100             3195
true_Chilli (1)              2494            10825               4046             9435
true_Cinnamon (2)            8899             3908               9182             4811
true_Nutmeg (3)  

In [4]:
# Reviewer note: Logistic Regression on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs"
))
OUT_DIR = BASE_OUT / "LogisticRegression"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Safety fallback for label column
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# Multinomial logistic regression suitable for 4 classes
clf = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    max_iter=2000
)

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predict on test set
y_pred = clf.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Labeled headers for confusion matrix
unique_classes = np.sort(np.unique(y_test))
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Save classification report
report_csv = OUT_DIR / "lr_resistance_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "lr_resistance_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "lr_resistance_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"lr_resistance_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Save metrics summary
metrics_json = OUT_DIR / "lr_resistance_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "LogisticRegression",
            "multi_class": "multinomial",
            "solver": "lbfgs",
            "max_iter": 2000,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping added when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "lr_resistance_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"Logistic Regression (resistance) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)




Logistic Regression (resistance) accuracy: 25.0%

Classification report:
              precision    recall  f1-score   support

           0     0.2500    1.0000    0.4000     26800
           1     0.0000    0.0000    0.0000     26800
           2     0.0000    0.0000    0.0000     26800
           3     0.0000    0.0000    0.0000     26800

    accuracy                         0.2500    107200
   macro avg     0.0625    0.2500    0.1000    107200
weighted avg     0.0625    0.2500    0.1000    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              26800                0                  0                0
true_Chilli (1)             26800                0                  0                0
true_Cinnamon (2)           26800                0                  0                0
true_Nutmeg 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
# Reviewer note: MLP on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs"
))
OUT_DIR = BASE_OUT / "MLP"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Safety fallback for label column
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# MLP configured for multiclass on a single numeric feature
clf = MLPClassifier(
    hidden_layer_sizes=(128,),
    activation="relu",
    solver="adam",
    max_iter=600,
    random_state=42,
    early_stopping=False
)

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predict on test set
y_pred = clf.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Labeled headers for confusion matrix
unique_classes = np.sort(np.unique(y_test))
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Save classification report
report_csv = OUT_DIR / "mlp_resistance_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "mlp_resistance_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "mlp_resistance_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"mlp_resistance_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Save metrics summary
metrics_json = OUT_DIR / "mlp_resistance_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "MLPClassifier",
            "hidden_layer_sizes": [128],
            "activation": "relu",
            "solver": "adam",
            "max_iter": 600,
            "early_stopping": False,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping added when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "mlp_resistance_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"MLP (resistance) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


MLP (resistance) accuracy: 25.0%

Classification report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000     26800
           1     0.2500    1.0000    0.4000     26800
           2     0.0000    0.0000    0.0000     26800
           3     0.0000    0.0000    0.0000     26800

    accuracy                         0.2500    107200
   macro avg     0.0625    0.2500    0.1000    107200
weighted avg     0.0625    0.2500    0.1000    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)                  0            26800                  0                0
true_Chilli (1)                 0            26800                  0                0
true_Cinnamon (2)               0            26800                  0                0
true_Nutmeg (3)             

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Reviewer note: linear SVM for single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs"
))
OUT_DIR = BASE_OUT / "SVM"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Safety fallback for label column
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# Linear SVM configuration suitable for large sample count and small feature count
clf = LinearSVC(
    C=1.0,
    dual=False,
    max_iter=5000,
    random_state=42
)

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predict on test set
y_pred = clf.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Labeled headers for confusion matrix
unique_classes = np.sort(np.unique(y_test))
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Save classification report
report_csv = OUT_DIR / "svm_resistance_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "svm_resistance_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "svm_resistance_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"svm_resistance_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Save metrics summary
metrics_json = OUT_DIR / "svm_resistance_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "LinearSVC",
            "C": 1.0,
            "dual": False,
            "max_iter": 5000,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping added when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "svm_resistance_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"SVM (LinearSVC, resistance) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


SVM (LinearSVC, resistance) accuracy: 25.0%

Classification report:
              precision    recall  f1-score   support

           0     0.2500    1.0000    0.4000     26800
           1     0.0000    0.0000    0.0000     26800
           2     0.0000    0.0000    0.0000     26800
           3     0.0000    0.0000    0.0000     26800

    accuracy                         0.2500    107200
   macro avg     0.0625    0.2500    0.1000    107200
weighted avg     0.0625    0.2500    0.1000    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              26800                0                  0                0
true_Chilli (1)             26800                0                  0                0
true_Cinnamon (2)           26800                0                  0                0
true_Nutmeg (3)  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# Reviewer note: SGD linear classifier on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs"
))
OUT_DIR = BASE_OUT / "SGD"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Safety fallback for label column
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# SGD configured for multinomial logistic loss on a single numeric feature
clf = SGDClassifier(
    loss="log_loss",
    penalty="l2",
    alpha=1e-4,
    max_iter=5000,
    tol=1e-4,
    random_state=42
)

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predict on test set
y_pred = clf.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Labeled headers for confusion matrix
unique_classes = np.sort(np.unique(y_test))
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Save classification report
report_csv = OUT_DIR / "sgd_resistance_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "sgd_resistance_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "sgd_resistance_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"sgd_resistance_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Save metrics summary
metrics_json = OUT_DIR / "sgd_resistance_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "SGDClassifier",
            "loss": "log_loss",
            "penalty": "l2",
            "alpha": 1e-4,
            "max_iter": 5000,
            "tol": 1e-4,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping added when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "sgd_resistance_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"SGD (resistance) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


SGD (resistance) accuracy: 25.0%

Classification report:
              precision    recall  f1-score   support

           0     0.2500    1.0000    0.4000     26800
           1     0.0000    0.0000    0.0000     26800
           2     0.0000    0.0000    0.0000     26800
           3     0.0000    0.0000    0.0000     26800

    accuracy                         0.2500    107200
   macro avg     0.0625    0.2500    0.1000    107200
weighted avg     0.0625    0.2500    0.1000    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              26800                0                  0                0
true_Chilli (1)             26800                0                  0                0
true_Cinnamon (2)           26800                0                  0                0
true_Nutmeg (3)             

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Reviewer note: HistGradientBoosting classifier on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs"
))
OUT_DIR = BASE_OUT / "GradientBoosting"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Safety fallback for label column
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# HistGradientBoosting scales to large datasets; early_stopping disabled to avoid internal validation split
clf = HistGradientBoostingClassifier(
    loss="log_loss",
    learning_rate=0.1,
    max_iter=200,
    max_depth=None,
    early_stopping=False,
    random_state=42
)

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predict on test set
y_pred = clf.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Labeled headers for confusion matrix
unique_classes = np.sort(np.unique(y_test))
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Save classification report
report_csv = OUT_DIR / "gb_resistance_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "gb_resistance_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "gb_resistance_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"gb_resistance_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Save metrics summary
metrics_json = OUT_DIR / "gb_resistance_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "HistGradientBoostingClassifier",
            "loss": "log_loss",
            "learning_rate": 0.1,
            "max_iter": 200,
            "max_depth": None,
            "early_stopping": False,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping added when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "gb_resistance_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"Gradient Boosting (resistance) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


Gradient Boosting (resistance) accuracy: 41.93%

Classification report:
              precision    recall  f1-score   support

           0     0.5371    0.4829    0.5086     26800
           1     0.4233    0.4560    0.4390     26800
           2     0.3785    0.3785    0.3785     26800
           3     0.3515    0.3597    0.3556     26800

    accuracy                         0.4193    107200
   macro avg     0.4226    0.4193    0.4204    107200
weighted avg     0.4226    0.4193    0.4204    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              12943             2734               7592             3531
true_Chilli (1)              1628            12222               3613             9337
true_Cinnamon (2)            7934             3806              10145             4915
true_Nutmeg (

In [9]:
# Reviewer note: AdaBoost on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs"
))
OUT_DIR = BASE_OUT / "AdaBoost"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Safety fallback for label column
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# AdaBoost with decision stump base estimator; SAMME for multiclass on current sklearn in Colab
clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1, random_state=42),
    n_estimators=300,
    learning_rate=0.5,
    algorithm="SAMME",
    random_state=42
)

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predict on test set
y_pred = clf.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Labeled headers for confusion matrix
unique_classes = np.sort(np.unique(y_test))
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Save classification report
report_csv = OUT_DIR / "ada_resistance_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "ada_resistance_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "ada_resistance_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"ada_resistance_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Save metrics summary
metrics_json = OUT_DIR / "ada_resistance_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "AdaBoostClassifier",
            "base_estimator": "DecisionTree(max_depth=1)",
            "n_estimators": 300,
            "learning_rate": 0.5,
            "algorithm": "SAMME",
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping added when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "ada_resistance_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"AdaBoost (resistance) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)




AdaBoost (resistance) accuracy: 36.92%

Classification report:
              precision    recall  f1-score   support

           0     0.4959    0.6204    0.5512     26800
           1     0.4157    0.4238    0.4197     26800
           2     0.2268    0.2889    0.2541     26800
           3     0.3154    0.1438    0.1975     26800

    accuracy                         0.3692    107200
   macro avg     0.3634    0.3692    0.3556    107200
weighted avg     0.3634    0.3692    0.3556    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              16627             1690               6795             1688
true_Chilli (1)              1790            11357               9213             4440
true_Cinnamon (2)           12940             3883               7742             2235
true_Nutmeg (3)       

In [10]:
# Reviewer note: KNN on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs"
))
OUT_DIR = BASE_OUT / "KNN"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Safety fallback for label column
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# KNN configuration suitable for low-dimensional numeric input
clf = KNeighborsClassifier(
    n_neighbors=7,
    weights="distance",
    algorithm="kd_tree",
    leaf_size=30,
    p=2,
    n_jobs=-1
)

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predict on test set
y_pred = clf.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Labeled headers for confusion matrix
unique_classes = np.sort(np.unique(y_test))
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Save classification report
report_csv = OUT_DIR / "knn_resistance_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "knn_resistance_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "knn_resistance_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"knn_resistance_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Save metrics summary
metrics_json = OUT_DIR / "knn_resistance_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "KNeighborsClassifier",
            "n_neighbors": 7,
            "weights": "distance",
            "algorithm": "kd_tree",
            "leaf_size": 30,
            "p": 2,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping added when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "knn_resistance_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"KNN (resistance) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


KNN (resistance) accuracy: 33.45%

Classification report:
              precision    recall  f1-score   support

           0     0.3036    0.7812    0.4372     26800
           1     0.4101    0.3794    0.3942     26800
           2     0.3475    0.1056    0.1620     26800
           3     0.3626    0.0716    0.1196     26800

    accuracy                         0.3345    107200
   macro avg     0.3559    0.3345    0.2782    107200
weighted avg     0.3559    0.3345    0.2782    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              20936             2674               2409              781
true_Chilli (1)             13899            10169               1195             1537
true_Cinnamon (2)           19240             3674               2830             1056
true_Nutmeg (3)            

In [11]:
# Reviewer note: 1D CNN on a single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import os
import sys
import time
import json
import numpy as np
import pandas as pd
from pathlib import Path

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs"
))
OUT_DIR = BASE_OUT / "CNN"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Safety fallback for label column
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# Prepare tensors for CNN; reshape only, no scaling or normalization
X_train_cnn = X_train.astype(np.float32).reshape(-1, 1, 1)
X_test_cnn  = X_test.astype(np.float32).reshape(-1, 1, 1)
num_classes = int(len(np.unique(y_train)))

# Minimal 1D CNN for a length-1 sequence; kernel_size=1 acts as a learned linear transform
inputs = keras.Input(shape=(1, 1))
x = layers.Conv1D(filters=16, kernel_size=1, activation="relu")(inputs)
x = layers.Conv1D(filters=16, kernel_size=1, activation="relu")(x)
x = layers.Flatten()(x)
x = layers.Dense(32, activation="relu")(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs, name="cnn_resistance_1d")

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# Train without validation split to keep datasets unchanged
start = time.time()
history = model.fit(
    X_train_cnn, y_train,
    epochs=25,
    batch_size=1024,
    verbose=1
)
train_time = time.time() - start

# Predictions and metrics
y_pred_proba = model.predict(X_test_cnn, batch_size=4096, verbose=0)
y_pred = np.argmax(y_pred_proba, axis=1)

acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Labeled headers for confusion matrix
unique_classes = np.sort(np.unique(y_test))
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Save classification report
report_csv = OUT_DIR / "cnn_resistance_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "cnn_resistance_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model as .keras; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "cnn_resistance_model.keras"
model_path = model_base if not model_base.exists() else OUT_DIR / f"cnn_resistance_model_{time.strftime('%Y%m%d_%H%M%S')}.keras"
model.save(model_path)

# Save metrics summary
metrics_json = OUT_DIR / "cnn_resistance_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "Keras 1D CNN",
            "input_shape": [1, 1],
            "epochs": 25,
            "batch_size": 1024,
            "optimizer": "adam",
            "loss": "sparse_categorical_crossentropy",
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping added when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "cnn_resistance_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"CNN (resistance) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


Epoch 1/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.2491 - loss: 47655.1602
Epoch 2/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.2518 - loss: 1370.4247
Epoch 3/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.2499 - loss: 1846.0724
Epoch 4/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.2477 - loss: 1637.3838
Epoch 5/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2493 - loss: 1642.0951
Epoch 6/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2503 - loss: 1316.3752
Epoch 7/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2457 - loss: 1297.3737
Epoch 8/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2489 - loss: 1156.9955
Epoch 9/25
[1m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


CNN (resistance) accuracy: 25.0%

Classification report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000     26800
           1     0.2500    1.0000    0.4000     26800
           2     0.0000    0.0000    0.0000     26800
           3     0.0000    0.0000    0.0000     26800

    accuracy                         0.2500    107200
   macro avg     0.0625    0.2500    0.1000    107200
weighted avg     0.0625    0.2500    0.1000    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)                  0            26800                  0                0
true_Chilli (1)                 0            26800                  0                0
true_Cinnamon (2)               0            26800                  0                0
true_Nutmeg (3)             

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# Reviewer note: XGBoost on a single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/resistance/outputs"
))
OUT_DIR = BASE_OUT / "XGBoost"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Safety fallback for label column
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

num_classes = int(np.unique(y_train).size)

# Configuration for multiclass classification on CPU
clf = XGBClassifier(
    objective="multi:softmax",
    num_class=num_classes,
    n_estimators=400,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    tree_method="hist",
    eval_metric="mlogloss",
    random_state=42,
    n_jobs=-1,
    verbosity=0
)

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predict on test set
y_pred = clf.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Labeled headers for confusion matrix
unique_classes = np.sort(np.unique(y_test))
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Save classification report
report_csv = OUT_DIR / "xgb_resistance_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "xgb_resistance_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "xgb_resistance_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"xgb_resistance_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Save metrics summary
metrics_json = OUT_DIR / "xgb_resistance_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "XGBClassifier",
            "objective": "multi:softmax",
            "num_class": num_classes,
            "n_estimators": 400,
            "max_depth": 6,
            "learning_rate": 0.1,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "reg_lambda": 1.0,
            "tree_method": "hist",
            "eval_metric": "mlogloss",
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping added when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "xgb_resistance_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"XGBoost (resistance) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


XGBoost (resistance) accuracy: 40.82%

Classification report:
              precision    recall  f1-score   support

           0     0.5095    0.4542    0.4803     26800
           1     0.4111    0.4597    0.4341     26800
           2     0.3744    0.3750    0.3747     26800
           3     0.3476    0.3437    0.3456     26800

    accuracy                         0.4082    107200
   macro avg     0.4107    0.4082    0.4087    107200
weighted avg     0.4107    0.4082    0.4087    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              12173             3734               7431             3462
true_Chilli (1)              1621            12320               3621             9238
true_Cinnamon (2)            8283             3877              10050             4590
true_Nutmeg (3)        