In [1]:
# Reviewer note: Colab Drive mount and path configuration for pressure feature

import os
from pathlib import Path
from google.colab import drive

# Mount Google Drive if not already mounted
if not os.path.ismount("/content/drive"):
    drive.mount("/content/drive", force_remount=False)

# Single root for everything related to the pressure feature. The train, test and
# output locations are derived from it so the three paths cannot drift apart.
PRESSURE_ROOT = Path("/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure")

# Folders containing the CSVs for pressure feature
TRAIN_DIR = PRESSURE_ROOT / "train"
TEST_DIR  = PRESSURE_ROOT / "test"

# Base output directory for all pressure models; model cells will create their own subfolders
BASE_OUT = PRESSURE_ROOT / "outputs"
BASE_OUT.mkdir(parents=True, exist_ok=True)

# Helper to resolve a single CSV inside a directory
def resolve_single_csv(dir_path: Path) -> Path:
    """Return the one and only CSV file located directly inside ``dir_path``.

    Only regular files whose name matches ``*.csv`` are considered; a
    sub-directory that happens to be named ``something.csv`` is ignored.
    Candidates are sorted so the error message is the same on every run.

    Args:
        dir_path: Directory expected to hold exactly one CSV file.

    Returns:
        Path of the single CSV file found.

    Raises:
        FileNotFoundError: No CSV file is present in ``dir_path``.
        FileExistsError: More than one CSV file is present in ``dir_path``.
    """
    # glob() yields entries in filesystem order and also matches directories,
    # so filter to files and sort for a deterministic result.
    candidates = sorted(p for p in dir_path.glob("*.csv") if p.is_file())
    if not candidates:
        raise FileNotFoundError(f"No CSV found in {dir_path}. Put exactly one CSV file in that folder.")
    if len(candidates) > 1:
        names = [c.name for c in candidates]
        raise FileExistsError(f"Multiple CSV files found in {dir_path}: {names}. Keep exactly one or specify the exact file.")
    return candidates[0]

# Locate the single CSV in each split folder (train is resolved first, then test)
TRAIN_CSV, TEST_CSV = (resolve_single_csv(folder) for folder in (TRAIN_DIR, TEST_DIR))

# Echo the resolved locations so the run log records which files were used
for caption, location in (
    ("Train CSV:", TRAIN_CSV),
    ("Test  CSV:", TEST_CSV),
    ("Base out :", BASE_OUT),
):
    print(caption, location)


Mounted at /content/drive
Train CSV: /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/train/Train_All_Specimens_Pressure.csv
Test  CSV: /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/test/Test_All_Specimens_Pressure.csv
Base out : /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/outputs


In [2]:
# Reviewer note: dataset loading for single feature 'pressure'; no preprocessing or scaling

import pandas as pd
import numpy as np

# Column configuration: one raw predictor column plus the class-label column
FEATURES = ["pressure"]
LABEL_COL = "target"

# Read both splits verbatim from the CSV files resolved earlier
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

# Schema check: every required column must be present in both frames.
# Train is reported first, then test, if anything is missing.
required_columns = [*FEATURES, LABEL_COL]
missing_train = [name for name in required_columns if name not in train_df.columns]
missing_test  = [name for name in required_columns if name not in test_df.columns]
if missing_train:
    raise ValueError(f"Train is missing columns: {missing_train}")
if missing_test:
    raise ValueError(f"Test is missing columns: {missing_test}")

# Plain numpy views for scikit-learn: X is 2-D (rows x len(FEATURES)), y is 1-D
X_train, y_train = train_df[FEATURES].values, train_df[LABEL_COL].values
X_test,  y_test  = test_df[FEATURES].values,  test_df[LABEL_COL].values

# Quick sanity summary of what was loaded
print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes :", X_test.shape, y_test.shape)
print("Unique train labels:", np.unique(y_train))
print("Unique test  labels:", np.unique(y_test))


Train shapes: (107200, 1) (107200,)
Test shapes : (107200, 1) (107200,)
Unique train labels: [0 1 2 3]
Unique test  labels: [0 1 2 3]


In [3]:
# Reviewer note: Random Forest on single raw feature (pressure); add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model.
# globals().get keeps the value from the setup cell when it exists and only
# falls back to the literal path when BASE_OUT has not been defined yet.
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/outputs"
))
OUT_DIR = BASE_OUT / "RandomForest"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

# Wall-clock duration of fit() only; prediction and file writing are excluded
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predict on test set
y_pred = clf.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
# zero_division=0 gives the same numbers as the default ("warn" also reports 0)
# but does not emit an UndefinedMetricWarning for classes that are never predicted.
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)

# confusion_matrix() orders rows/columns by the sorted union of true and predicted
# labels. Build that label list explicitly and pass it in, so the headers below
# always have the same length and order as the matrix, even if the model predicts
# a class that does not occur in y_test.
cm_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

# Optional spice mapping if present; falls back to blank if column absent.
# For each target value the most frequent spice name in the test frame is used.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix, e.g. "Anise (0)", or just "0" without a mapping
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in cm_labels
]

# Classes that actually occur in the test labels; drives the per-class summary
unique_classes = np.sort(np.unique(y_test))

# Save classification report
report_csv = OUT_DIR / "rf_pressure_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "rf_pressure_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed.
# NOTE(review): the safety guard above exits when OUT_DIR is not empty, so
# model_base normally does not exist here; the timestamp fallback is defensive.
model_base = OUT_DIR / "rf_pressure_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"rf_pressure_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Save metrics summary; the hyperparameter is read back from the estimator so the
# record cannot drift from the constructor call above.
metrics_json = OUT_DIR / "rf_pressure_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "RandomForestClassifier",
            "n_estimators": clf.n_estimators,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "rf_pressure_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"Random Forest (pressure) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


Random Forest (pressure) accuracy: 69.08%

Classification report:
              precision    recall  f1-score   support

           0     0.4581    0.4198    0.4381     26800
           1     1.0000    1.0000    1.0000     26800
           2     0.3733    0.3436    0.3578     26800
           3     0.8597    1.0000    0.9246     26800

    accuracy                         0.6908    107200
   macro avg     0.6728    0.6908    0.6801    107200
weighted avg     0.6728    0.6908    0.6801    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              11250                0              15461               89
true_Chilli (1)                 0            26800                  0                0
true_Cinnamon (2)           13307                0               9208             4285
true_Nutmeg (3)    

In [4]:
# Reviewer note: Logistic Regression on single raw feature (pressure); add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model.
# globals().get keeps the value from the setup cell when it exists and only
# falls back to the literal path when BASE_OUT has not been defined yet.
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/outputs"
))
OUT_DIR = BASE_OUT / "LogisticRegression"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Model configured for multinomial classification without scaling.
# The explicit multi_class="multinomial" argument is omitted: it is deprecated in
# recent scikit-learn releases, and with the lbfgs solver and more than two
# classes the multinomial formulation is what the default already selects.
clf = LogisticRegression(
    solver="lbfgs",
    max_iter=2000
)

# Wall-clock duration of fit() only; prediction and file writing are excluded
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and core metrics
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
# zero_division=0 gives the same numbers as the default ("warn" also reports 0)
# but does not emit an UndefinedMetricWarning for classes that are never predicted.
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)

# confusion_matrix() orders rows/columns by the sorted union of true and predicted
# labels. Build that label list explicitly and pass it in, so the headers below
# always have the same length and order as the matrix, even if the model predicts
# a class that does not occur in y_test.
cm_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

# Optional spice mapping if present; falls back to blank if column absent.
# Grouping uses LABEL_COL (same column the labels were read from) rather than a
# repeated string literal, consistent with the other model cells.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix, e.g. "Anise (0)", or just "0" without a mapping
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in cm_labels
]

# Classes that actually occur in the test labels; drives the per-class summary
unique_classes = np.sort(np.unique(y_test))

# Persist classification report
report_csv = OUT_DIR / "lr_pressure_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "lr_pressure_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed.
# NOTE(review): the safety guard above exits when OUT_DIR is not empty, so
# model_base normally does not exist here; the timestamp fallback is defensive.
model_base = OUT_DIR / "lr_pressure_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"lr_pressure_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths. Hyperparameters are read back from the
# estimator; "multi_class" stays in the record so the JSON layout is unchanged.
metrics_json = OUT_DIR / "lr_pressure_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "LogisticRegression",
            "multi_class": "multinomial",
            "solver": clf.solver,
            "max_iter": clf.max_iter,
            "feature_set": list(FEATURES),
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "lr_pressure_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"Logistic Regression (pressure) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)




Logistic Regression (pressure) accuracy: 79.39%

Classification report:
              precision    recall  f1-score   support

           0     0.8351    0.3013    0.4428     26800
           1     1.0000    1.0000    1.0000     26800
           2     0.5558    0.8743    0.6796     26800
           3     0.9379    1.0000    0.9680     26800

    accuracy                         0.7939    107200
   macro avg     0.8322    0.7939    0.7726    107200
weighted avg     0.8322    0.7939    0.7726    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)               8075                0              18725                0
true_Chilli (1)                 0            26800                  0                0
true_Cinnamon (2)            1594                0              23432             1774
true_Nutmeg (

In [5]:
# Reviewer note: MLP on single raw feature (pressure); add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model.
# globals().get keeps the value from the setup cell when it exists and only
# falls back to the literal path when BASE_OUT has not been defined yet.
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/outputs"
))
OUT_DIR = BASE_OUT / "MLP"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): the recorded output of this cell shows every test row predicted
# as class 3 (25% accuracy). Presumably the unscaled raw feature prevents the
# network from training; confirm before interpreting this result.
clf = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    activation="relu",
    solver="adam",
    max_iter=600,
    random_state=42,
    early_stopping=False
)

# Wall-clock duration of fit() only; prediction and file writing are excluded
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and core metrics
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
# zero_division=0 gives the same numbers as the default ("warn" also reports 0)
# but does not emit an UndefinedMetricWarning for classes that are never predicted.
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)

# confusion_matrix() orders rows/columns by the sorted union of true and predicted
# labels. Build that label list explicitly and pass it in, so the headers below
# always have the same length and order as the matrix, even if the model predicts
# a class that does not occur in y_test.
cm_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

# Optional spice mapping if present; falls back to blank if column absent.
# Grouping uses LABEL_COL (same column the labels were read from) rather than a
# repeated string literal, consistent with the other model cells.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix, e.g. "Anise (0)", or just "0" without a mapping
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in cm_labels
]

# Classes that actually occur in the test labels; drives the per-class summary
unique_classes = np.sort(np.unique(y_test))

# Persist classification report
report_csv = OUT_DIR / "mlp_pressure_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "mlp_pressure_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed.
# NOTE(review): the safety guard above exits when OUT_DIR is not empty, so
# model_base normally does not exist here; the timestamp fallback is defensive.
model_base = OUT_DIR / "mlp_pressure_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"mlp_pressure_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths. Hyperparameters are read back from the
# estimator so the record cannot drift from the constructor call above.
metrics_json = OUT_DIR / "mlp_pressure_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "MLPClassifier",
            "hidden_layer_sizes": list(clf.hidden_layer_sizes),
            "activation": clf.activation,
            "solver": clf.solver,
            "max_iter": clf.max_iter,
            "early_stopping": clf.early_stopping,
            "feature_set": list(FEATURES),
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "mlp_pressure_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"MLP (pressure) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


MLP (pressure) accuracy: 25.0%

Classification report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000     26800
           1     0.0000    0.0000    0.0000     26800
           2     0.0000    0.0000    0.0000     26800
           3     0.2500    1.0000    0.4000     26800

    accuracy                         0.2500    107200
   macro avg     0.0625    0.2500    0.1000    107200
weighted avg     0.0625    0.2500    0.1000    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)                  0                0                  0            26800
true_Chilli (1)                 0                0                  0            26800
true_Cinnamon (2)               0                0                  0            26800
true_Nutmeg (3)               

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Reviewer note: Linear SVM on single raw feature (pressure); add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model.
# globals().get keeps the value from the setup cell when it exists and only
# falls back to the literal path when BASE_OUT has not been defined yet.
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/outputs"
))
OUT_DIR = BASE_OUT / "SVM"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): the recorded output of this cell shows every test row predicted
# as class 3 (25% accuracy). Presumably the unscaled raw feature is the cause;
# confirm before interpreting this result.
clf = LinearSVC(
    C=1.0,
    random_state=42,
    dual=False,
    max_iter=5000
)

# Wall-clock duration of fit() only; prediction and file writing are excluded
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and core metrics
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
# zero_division=0 gives the same numbers as the default ("warn" also reports 0)
# but does not emit an UndefinedMetricWarning for classes that are never predicted.
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)

# confusion_matrix() orders rows/columns by the sorted union of true and predicted
# labels. Build that label list explicitly and pass it in, so the headers below
# always have the same length and order as the matrix, even if the model predicts
# a class that does not occur in y_test.
cm_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

# Optional spice mapping if present; falls back to blank if column absent.
# Grouping uses LABEL_COL (same column the labels were read from) rather than a
# repeated string literal, consistent with the other model cells.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix, e.g. "Anise (0)", or just "0" without a mapping
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in cm_labels
]

# Classes that actually occur in the test labels; drives the per-class summary
unique_classes = np.sort(np.unique(y_test))

# Persist classification report
report_csv = OUT_DIR / "svm_pressure_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "svm_pressure_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed.
# NOTE(review): the safety guard above exits when OUT_DIR is not empty, so
# model_base normally does not exist here; the timestamp fallback is defensive.
model_base = OUT_DIR / "svm_pressure_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"svm_pressure_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths. Hyperparameters are read back from the
# estimator so the record cannot drift from the constructor call above.
metrics_json = OUT_DIR / "svm_pressure_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "LinearSVC",
            "C": clf.C,
            "dual": clf.dual,
            "max_iter": clf.max_iter,
            "feature_set": list(FEATURES),
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per class correct and incorrect counts with optional spice mapping
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "svm_pressure_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"SVM (pressure) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


SVM (pressure) accuracy: 25.0%

Classification report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000     26800
           1     0.0000    0.0000    0.0000     26800
           2     0.0000    0.0000    0.0000     26800
           3     0.2500    1.0000    0.4000     26800

    accuracy                         0.2500    107200
   macro avg     0.0625    0.2500    0.1000    107200
weighted avg     0.0625    0.2500    0.1000    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)                  0                0                  0            26800
true_Chilli (1)                 0                0                  0            26800
true_Cinnamon (2)               0                0                  0            26800
true_Nutmeg (3)               

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# Reviewer note: SGD on single raw feature (pressure); add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Normalize BASE_OUT with a default and derive OUT_DIR for this model.
# globals().get keeps the value from the setup cell when it exists and only
# falls back to the literal path when BASE_OUT has not been defined yet.
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/outputs"
))
OUT_DIR = BASE_OUT / "SGD"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): the recorded output of this cell shows every test row predicted
# as class 3 (25% accuracy). Presumably the unscaled raw feature is the cause;
# confirm before interpreting this result.
clf = SGDClassifier(
    loss="log_loss",
    penalty="l2",
    alpha=1e-4,
    max_iter=5000,
    tol=1e-4,
    random_state=42
)

# Wall-clock duration of fit() only; prediction and file writing are excluded
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and core metrics
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
# zero_division=0 gives the same numbers as the default ("warn" also reports 0)
# but does not emit an UndefinedMetricWarning for classes that are never predicted.
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)

# confusion_matrix() orders rows/columns by the sorted union of true and predicted
# labels. Build that label list explicitly and pass it in, so the headers below
# always have the same length and order as the matrix, even if the model predicts
# a class that does not occur in y_test.
cm_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

# Optional spice mapping if present; falls back to blank if column absent.
# For each target value the most frequent spice name in the test frame is used.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix, e.g. "Anise (0)", or just "0" without a mapping
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in cm_labels
]

# Classes that actually occur in the test labels; drives the per-class summary
unique_classes = np.sort(np.unique(y_test))

# Persist classification report
report_csv = OUT_DIR / "sgd_pressure_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "sgd_pressure_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed.
# NOTE(review): the safety guard above exits when OUT_DIR is not empty, so
# model_base normally does not exist here; the timestamp fallback is defensive.
model_base = OUT_DIR / "sgd_pressure_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"sgd_pressure_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths. Hyperparameters are read back from the
# estimator so the record cannot drift from the constructor call above.
metrics_json = OUT_DIR / "sgd_pressure_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "SGDClassifier",
            "loss": clf.loss,
            "penalty": clf.penalty,
            "alpha": clf.alpha,
            "max_iter": clf.max_iter,
            "tol": clf.tol,
            "feature_set": list(FEATURES),
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "sgd_pressure_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"SGD (pressure) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


SGD (pressure) accuracy: 25.0%

Classification report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000     26800
           1     0.0000    0.0000    0.0000     26800
           2     0.0000    0.0000    0.0000     26800
           3     0.2500    1.0000    0.4000     26800

    accuracy                         0.2500    107200
   macro avg     0.0625    0.2500    0.1000    107200
weighted avg     0.0625    0.2500    0.1000    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)                  0                0                  0            26800
true_Chilli (1)                 0                0                  0            26800
true_Cinnamon (2)               0                0                  0            26800
true_Nutmeg (3)               

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Reviewer note: Gradient Boosting on single raw feature (pressure); add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Gradient Boosting cell: fit a HistGradientBoostingClassifier on X_train / y_train,
# evaluate it on X_test / y_test, and write the classification report, labeled
# confusion matrix, fitted model, metrics JSON and per-class outcomes into
# BASE_OUT / "GradientBoosting".
# X_train, y_train, X_test, y_test and test_df are not defined here; they are
# expected to exist already from an earlier cell.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/outputs"
))
OUT_DIR = BASE_OUT / "GradientBoosting"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback so the spice lookup below cannot fail with a NameError
LABEL_COL = globals().get("LABEL_COL", "target")

# Hyperparameters live in one dict that feeds both the estimator and the metrics
# JSON, so the recorded values cannot drift away from the values actually used.
gb_params = {
    "loss": "log_loss",
    "learning_rate": 0.1,
    "max_iter": 200,
    "max_depth": None,
    "early_stopping": False,
}
clf = HistGradientBoostingClassifier(**gb_params, random_state=42)

# Time only the fit call
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and core metrics
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)

# Per-class outcomes are reported for the classes present in y_test.
# The confusion matrix is built over the union of true and predicted classes and
# the same array is passed as `labels`, so the matrix shape always matches the
# headers even if the model predicts a class that never occurs in y_test.
unique_classes = np.sort(np.unique(y_test))
cm_classes = np.union1d(unique_classes, np.unique(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=cm_classes)

# Optional spice mapping if present; falls back to blank if column absent.
# For each label value the most frequent spice string is used.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix: "<spice> (<id>)" or just "<id>"
label_names = []
for c in cm_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Persist classification report (one row per class / average)
report_csv = OUT_DIR / "gb_pressure_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "gb_pressure_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "gb_pressure_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"gb_pressure_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "gb_pressure_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "HistGradientBoostingClassifier",
            **gb_params,
            "feature_set": ["pressure"],
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per class correct and incorrect counts with optional spice mapping
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "gb_pressure_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"Gradient Boosting (pressure) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


Gradient Boosting (pressure) accuracy: 64.11%

Classification report:
              precision    recall  f1-score   support

           0     0.3669    0.4071    0.3859     26800
           1     1.0000    1.0000    1.0000     26800
           2     0.2098    0.1574    0.1798     26800
           3     0.8770    1.0000    0.9344     26800

    accuracy                         0.6411    107200
   macro avg     0.6134    0.6411    0.6251    107200
weighted avg     0.6134    0.6411    0.6251    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              10909                0              15891                0
true_Chilli (1)                 0            26800                  0                0
true_Cinnamon (2)           18822                0               4218             3760
true_Nutmeg (3)

In [9]:
# Reviewer note: AdaBoost on single raw feature (pressure); add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# AdaBoost cell: fit an AdaBoostClassifier (depth-1 decision tree base estimator)
# on X_train / y_train, evaluate it on X_test / y_test, and write the
# classification report, labeled confusion matrix, fitted model, metrics JSON and
# per-class outcomes into BASE_OUT / "AdaBoost".
# X_train, y_train, X_test, y_test and test_df are not defined here; they are
# expected to exist already from an earlier cell.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/outputs"
))
OUT_DIR = BASE_OUT / "AdaBoost"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback so the spice lookup below cannot fail with a NameError
LABEL_COL = globals().get("LABEL_COL", "target")

# Hyperparameters that are also recorded in the metrics JSON; keeping them in one
# dict guarantees the JSON reflects the values the estimator was built with.
ada_params = {
    "n_estimators": 300,
    "learning_rate": 0.5,
    "algorithm": "SAMME",
}
clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1, random_state=42),
    random_state=42,
    **ada_params
)

# Time only the fit call
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and core metrics
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)

# Per-class outcomes are reported for the classes present in y_test.
# The confusion matrix is built over the union of true and predicted classes and
# the same array is passed as `labels`, so the matrix shape always matches the
# headers even if the model predicts a class that never occurs in y_test.
unique_classes = np.sort(np.unique(y_test))
cm_classes = np.union1d(unique_classes, np.unique(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=cm_classes)

# Optional spice mapping if present; falls back to blank if column absent.
# For each label value the most frequent spice string is used.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix: "<spice> (<id>)" or just "<id>"
label_names = []
for c in cm_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Persist classification report (one row per class / average)
report_csv = OUT_DIR / "ada_pressure_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "ada_pressure_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "ada_pressure_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"ada_pressure_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "ada_pressure_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "AdaBoostClassifier",
            "base_estimator": "DecisionTree(max_depth=1)",
            **ada_params,
            "feature_set": ["pressure"],
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per class correct and incorrect counts with optional spice mapping
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "ada_pressure_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"AdaBoost (pressure) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)




AdaBoost (pressure) accuracy: 63.94%

Classification report:
              precision    recall  f1-score   support

           0     0.3687    0.4251    0.3949     26800
           1     1.0000    1.0000    1.0000     26800
           2     0.1876    0.1327    0.1554     26800
           3     0.8777    1.0000    0.9349     26800

    accuracy                         0.6394    107200
   macro avg     0.6085    0.6394    0.6213    107200
weighted avg     0.6085    0.6394    0.6213    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              11392                0              15408                0
true_Chilli (1)                 0            26800                  0                0
true_Cinnamon (2)           19509                0               3557             3734
true_Nutmeg (3)         

In [10]:
# Reviewer note: KNN on single raw feature (pressure); add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# KNN cell: fit a KNeighborsClassifier on X_train / y_train, evaluate it on
# X_test / y_test, and write the classification report, labeled confusion matrix,
# fitted model, metrics JSON and per-class outcomes into BASE_OUT / "KNN".
# X_train, y_train, X_test, y_test and test_df are not defined here; they are
# expected to exist already from an earlier cell.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/outputs"
))
OUT_DIR = BASE_OUT / "KNN"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Use the configured label column when one exists; "target" (the name this cell
# previously hard-coded in the groupby) remains the fallback.
LABEL_COL = globals().get("LABEL_COL", "target")

# Hyperparameters that are also recorded in the metrics JSON; keeping them in one
# dict guarantees the JSON reflects the values the estimator was built with.
knn_params = {
    "n_neighbors": 7,
    "weights": "distance",
    "algorithm": "kd_tree",
    "leaf_size": 30,
    "p": 2,
}
clf = KNeighborsClassifier(**knn_params, n_jobs=-1)

# Time only the fit call
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and core metrics
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)

# Per-class outcomes are reported for the classes present in y_test.
# The confusion matrix is built over the union of true and predicted classes and
# the same array is passed as `labels`, so the matrix shape always matches the
# headers even if the model predicts a class that never occurs in y_test.
unique_classes = np.sort(np.unique(y_test))
cm_classes = np.union1d(unique_classes, np.unique(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=cm_classes)

# Optional spice mapping if present; falls back to blank if column absent.
# For each label value the most frequent spice string is used.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix: "<spice> (<id>)" or just "<id>"
label_names = []
for c in cm_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Persist classification report (one row per class / average)
report_csv = OUT_DIR / "knn_pressure_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "knn_pressure_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "knn_pressure_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"knn_pressure_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "knn_pressure_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "KNeighborsClassifier",
            **knn_params,
            "feature_set": ["pressure"],
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per class correct and incorrect counts with optional spice mapping
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "knn_pressure_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"KNN (pressure) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


KNN (pressure) accuracy: 70.17%

Classification report:
              precision    recall  f1-score   support

           0     0.4817    0.5146    0.4976     26800
           1     1.0000    1.0000    1.0000     26800
           2     0.3763    0.2921    0.3289     26800
           3     0.8653    1.0000    0.9278     26800

    accuracy                         0.7017    107200
   macro avg     0.6808    0.7017    0.6886    107200
weighted avg     0.6808    0.7017    0.6886    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              13790                0              12973               37
true_Chilli (1)                 0            26800                  0                0
true_Cinnamon (2)           14838                0               7827             4135
true_Nutmeg (3)              

In [11]:
# Reviewer note: 1D CNN on single raw feature (pressure); add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import numpy as np
import pandas as pd
from pathlib import Path

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# CNN cell: train a small Keras 1D CNN on X_train / y_train (each sample reshaped
# to sequence length 1, 1 channel), evaluate it on X_test / y_test, and write the
# classification report, labeled confusion matrix, saved model, metrics JSON and
# per-class outcomes into BASE_OUT / "CNN".
# X_train, y_train, X_test, y_test and test_df are not defined here; they are
# expected to exist already from an earlier cell.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/outputs"
))
OUT_DIR = BASE_OUT / "CNN"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# Training settings that are also recorded in the metrics JSON; defined once so
# the JSON reflects the values actually used.
cnn_epochs = 25
cnn_batch_size = 1024
cnn_learning_rate = 1e-3

# Prepare tensors for CNN by reshaping only (sequence length = 1, channels = 1)
X_train_cnn = X_train.astype(np.float32).reshape(-1, 1, 1)
X_test_cnn  = X_test.astype(np.float32).reshape(-1, 1, 1)

# The softmax output index is used directly as the predicted label below, and
# sparse_categorical_crossentropy indexes the output by label value, so the
# training labels must be exactly 0..num_classes-1. Fail early otherwise.
train_classes = np.unique(y_train)
num_classes = int(len(train_classes))
if not np.array_equal(train_classes, np.arange(num_classes)):
    raise ValueError(
        f"CNN cell expects integer labels 0..{num_classes - 1}, got classes {train_classes.tolist()}"
    )

# Build a minimal 1D CNN for sequence length one
# NOTE(review): the captured output of this cell shows the loss flat at about
# 1.39-1.41 (roughly ln 4) with accuracy about 0.25 from epoch 2 onward, i.e. the
# network does not appear to learn from the unscaled input. Adding a
# keras.layers.Normalization layer adapted on X_train_cnn is a likely remedy, but
# it changes the "raw feature" setup, so it is left as a decision for the author.
inputs = keras.Input(shape=(1, 1))
x = layers.Conv1D(filters=16, kernel_size=1, activation="relu")(inputs)
x = layers.Conv1D(filters=16, kernel_size=1, activation="relu")(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(32, activation="relu")(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs, name="cnn_pressure_1d")

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=cnn_learning_rate),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# Train without creating any derived validation split to keep datasets untouched
start = time.time()
history = model.fit(
    X_train_cnn, y_train,
    epochs=cnn_epochs,
    batch_size=cnn_batch_size,
    verbose=1
)
train_time = time.time() - start

# Predictions and metrics: argmax over the softmax output gives the class index
y_pred_proba = model.predict(X_test_cnn, batch_size=4096, verbose=0)
y_pred = np.argmax(y_pred_proba, axis=1)

acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)

# Per-class outcomes are reported for the classes present in y_test.
# The confusion matrix is built over the union of true and predicted classes and
# the same array is passed as `labels`, so the matrix shape always matches the
# headers even if the model predicts a class that never occurs in y_test.
unique_classes = np.sort(np.unique(y_test))
cm_classes = np.union1d(unique_classes, np.unique(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=cm_classes)

# Optional spice mapping if present; falls back to blank if column absent.
# For each label value the most frequent spice string is used.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix: "<spice> (<id>)" or just "<id>"
label_names = []
for c in cm_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Persist classification report (one row per class / average)
report_csv = OUT_DIR / "cnn_pressure_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "cnn_pressure_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model as .keras; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "cnn_pressure_model.keras"
model_path = model_base if not model_base.exists() else OUT_DIR / f"cnn_pressure_model_{time.strftime('%Y%m%d_%H%M%S')}.keras"
model.save(model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "cnn_pressure_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "Keras 1D CNN",
            "input_shape": [1, 1],
            "epochs": cnn_epochs,
            "batch_size": cnn_batch_size,
            "optimizer": "adam",
            "loss": "sparse_categorical_crossentropy",
            "feature_set": ["pressure"],
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per class correct and incorrect counts with optional spice mapping
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "cnn_pressure_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"CNN (pressure) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


Epoch 1/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.2534 - loss: 51.9115
Epoch 2/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2491 - loss: 1.4023
Epoch 3/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2526 - loss: 1.3940
Epoch 4/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2510 - loss: 1.3945
Epoch 5/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2500 - loss: 1.3985
Epoch 6/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2511 - loss: 1.3990
Epoch 7/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2507 - loss: 1.4017
Epoch 8/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2474 - loss: 1.4092
Epoch 9/25
[1m105/105[0m [32m━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# Reviewer note: XGBoost on single raw feature (pressure); add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# XGBoost cell: fit an XGBClassifier on X_train / y_train, evaluate it on
# X_test / y_test, and write the classification report, labeled confusion matrix,
# fitted model, metrics JSON and per-class outcomes into BASE_OUT / "XGBoost".
# X_train, y_train, X_test, y_test and test_df are not defined here; they are
# expected to exist already from an earlier cell.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/pressure/outputs"
))
OUT_DIR = BASE_OUT / "XGBoost"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# Number of distinct labels seen in training, passed to XGBoost as num_class
num_classes = int(np.unique(y_train).size)

# Hyperparameters that are also recorded in the metrics JSON; keeping them in one
# dict guarantees the JSON reflects the values the estimator was built with.
xgb_params = {
    "objective": "multi:softmax",
    "num_class": num_classes,
    "n_estimators": 400,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "tree_method": "hist",
    "eval_metric": "mlogloss",
}
clf = XGBClassifier(
    **xgb_params,
    random_state=42,
    n_jobs=-1,
    verbosity=0
)

# Time only the fit call
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and core metrics
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)

# Per-class outcomes are reported for the classes present in y_test.
# The confusion matrix is built over the union of true and predicted classes and
# the same array is passed as `labels`, so the matrix shape always matches the
# headers even if the model predicts a class that never occurs in y_test.
unique_classes = np.sort(np.unique(y_test))
cm_classes = np.union1d(unique_classes, np.unique(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=cm_classes)

# Optional spice mapping if present; falls back to blank if column absent.
# For each label value the most frequent spice string is used.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix: "<spice> (<id>)" or just "<id>"
label_names = []
for c in cm_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Persist classification report (one row per class / average)
report_csv = OUT_DIR / "xgb_pressure_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "xgb_pressure_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "xgb_pressure_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"xgb_pressure_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "xgb_pressure_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "XGBClassifier",
            **xgb_params,
            "feature_set": ["pressure"],
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per class correct and incorrect counts with optional spice mapping
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "xgb_pressure_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"XGBoost (pressure) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


XGBoost (pressure) accuracy: 64.05%

Classification report:
              precision    recall  f1-score   support

           0     0.3299    0.3973    0.3605     26800
           1     1.0000    1.0000    1.0000     26800
           2     0.2145    0.1646    0.1863     26800
           3     0.9724    1.0000    0.9860     26800

    accuracy                         0.6405    107200
   macro avg     0.6292    0.6405    0.6332    107200
weighted avg     0.6292    0.6405    0.6332    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              10648                0              16152                0
true_Chilli (1)                 0            26800                  0                0
true_Cinnamon (2)           21628                0               4411              761
true_Nutmeg (3)          