In [1]:
# Reviewer note: mount and path configuration for temperature-only run; paths kept isolated for portability

import os
import sys
from pathlib import Path
from google.colab import drive

# Mount Google Drive if not already mounted
if not os.path.ismount("/content/drive"):
    drive.mount("/content/drive", force_remount=False)

# Single root of the temperature-only run; every input and output location below is
# derived from it so the long Drive path is written (and edited) in exactly one place
TEMPERATURE_ROOT = Path("/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature")

# Folders containing the CSVs for temperature feature
TRAIN_DIR = TEMPERATURE_ROOT / "train"
TEST_DIR  = TEMPERATURE_ROOT / "test"

# Root output directory for all temperature models; per-model cells will create subfolders under this
# (Path is already imported at the top of this cell, so it is not re-imported here)
BASE_OUT = TEMPERATURE_ROOT / "outputs"
BASE_OUT.mkdir(parents=True, exist_ok=True)

# Helper to resolve a single CSV inside a directory
def resolve_single_csv(dir_path: Path) -> Path:
    """Return the only CSV file located directly inside ``dir_path``.

    Args:
        dir_path: Folder expected to contain exactly one ``*.csv`` file.

    Returns:
        Path of that single CSV file.

    Raises:
        FileNotFoundError: If ``dir_path`` is not an existing directory, or if
            it contains no CSV file.
        FileExistsError: If more than one CSV file is present.
    """
    dir_path = Path(dir_path)
    # Report a missing folder as such instead of the misleading "No CSV found"
    if not dir_path.is_dir():
        raise FileNotFoundError(f"Directory not found: {dir_path}. Check the Drive mount and the folder path.")
    # Keep regular files only (a sub-folder named "*.csv" is not a dataset) and sort
    # so the listing in the error message below is stable across runs
    candidates = sorted(p for p in dir_path.glob("*.csv") if p.is_file())
    if not candidates:
        raise FileNotFoundError(f"No CSV found in {dir_path}. Put exactly one CSV file in that folder.")
    if len(candidates) > 1:
        names = [c.name for c in candidates]
        raise FileExistsError(f"Multiple CSV files found in {dir_path}: {names}. Keep exactly one or specify the exact file.")
    return candidates[0]

# Locate the one CSV expected in each split folder (resolve_single_csv raises otherwise)
TRAIN_CSV = resolve_single_csv(TRAIN_DIR)
TEST_CSV = resolve_single_csv(TEST_DIR)

# Echo the resolved locations so a wrong Drive path is noticed before any training starts
for caption, location in (
    ("Train CSV:", TRAIN_CSV),
    ("Test  CSV:", TEST_CSV),
    ("Base out :", BASE_OUT),
):
    print(caption, location)


Mounted at /content/drive
Train CSV: /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/train/Train_All_Specimens_Temperature.csv
Test  CSV: /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/test/Test_All_Specimens_Temperature.csv
Base out : /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/outputs


In [2]:
# Reviewer note: load temperature-only feature and labels without any preprocessing

import pandas as pd
import numpy as np

# Column schema for this run: one raw feature column plus the class-label column
FEATURES = ["temperature"]
LABEL_COL = "target"

# Read both splits untouched; nothing is cleaned, scaled or re-ordered here
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Fail fast when a split lacks a required column; the train split is reported first
required_columns = FEATURES + [LABEL_COL]
missing_train = [name for name in required_columns if name not in train_df.columns]
missing_test = [name for name in required_columns if name not in test_df.columns]
if len(missing_train) > 0:
    raise ValueError(f"Train is missing columns: {missing_train}")
if len(missing_test) > 0:
    raise ValueError(f"Test is missing columns: {missing_test}")

# Plain numpy arrays for scikit-learn: a 2-D feature matrix (list selector) and a
# 1-D label vector (single-column selector) per split
X_train, y_train = train_df[FEATURES].values, train_df[LABEL_COL].values
X_test, y_test = test_df[FEATURES].values, test_df[LABEL_COL].values

# Quick sanity summary of what was loaded
print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes :", X_test.shape, y_test.shape)
for caption, labels in (("Unique train labels:", y_train), ("Unique test  labels:", y_test)):
    print(caption, np.unique(labels))


Train shapes: (107200, 1) (107200,)
Test shapes : (107200, 1) (107200,)
Unique train labels: [0 1 2 3]
Unique test  labels: [0 1 2 3]


In [3]:
# Reviewer note: Random Forest on raw temperature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Random Forest baseline on the raw temperature feature.
# Uses names created by earlier cells: X_train, y_train, X_test, y_test, test_df
# (and BASE_OUT / LABEL_COL when present). Trains the model, scores it on the test
# split only, and writes the classification report, labeled confusion matrix,
# per-class outcomes, metrics JSON and the fitted model under OUT_DIR.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/outputs"
))
OUT_DIR = BASE_OUT / "RandomForest"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

# Wall-clock time of fit() only; prediction time is not included
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and metrics on the test set only
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)

# zero_division=0 reports the same 0.0 the default would for a class that is never
# predicted, but without raising UndefinedMetricWarning on every call
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)
report_text = classification_report(y_test, y_pred, digits=4, zero_division=0)

# Pin the class order explicitly. The sorted union of true and predicted labels is the
# set confusion_matrix would infer by itself, so the headers built below always match
# the matrix shape, even if the model predicts a class that never occurs in y_test.
unique_classes = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
# (most frequent spice name per label value in the test frame)
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix, e.g. "Anise (0)" or just "0" without a mapping
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Persist classification report (one row per class / average, one column per metric)
report_csv = OUT_DIR / "rf_temperature_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "rf_temperature_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "rf_temperature_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"rf_temperature_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "rf_temperature_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "RandomForestClassifier",
            "n_estimators": 300,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; include spice mapping when available.
# A class that was only predicted (never true) has total == 0 and is reported as 0.0.
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "rf_temperature_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"Random Forest (temperature) accuracy: {acc_percent}%")
print("\nClassification report:")
print(report_text)

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


Random Forest (temperature) accuracy: 29.04%

Classification report:
              precision    recall  f1-score   support

           0     0.2512    0.2010    0.2233     26800
           1     0.2325    0.2341    0.2333     26800
           2     0.4548    0.3240    0.3784     26800
           3     0.2718    0.4024    0.3244     26800

    accuracy                         0.2904    107200
   macro avg     0.3026    0.2904    0.2899    107200
weighted avg     0.3026    0.2904    0.2899    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)               5387             6795               4518            10100
true_Chilli (1)              6463             6275               3044            11018
true_Cinnamon (2)            2442             7900               8684             7774
true_Nutmeg (3) 

In [4]:
# Reviewer note: Logistic Regression on raw temperature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Logistic Regression baseline on the raw temperature feature.
# Uses names created by earlier cells: X_train, y_train, X_test, y_test, test_df
# (and BASE_OUT / LABEL_COL when present). Trains the model, scores it on the test
# split only, and writes the classification report, labeled confusion matrix,
# per-class outcomes, metrics JSON and the fitted model under OUT_DIR.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/outputs"
))
OUT_DIR = BASE_OUT / "LogisticRegression"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# Multinomial logistic regression configuration without scaling.
# multi_class="multinomial" is no longer passed: the argument is deprecated in
# scikit-learn 1.5+, and with the lbfgs solver on a multiclass target the default
# already fits a multinomial model, so the trained model is the same.
clf = LogisticRegression(
    solver="lbfgs",
    max_iter=2000
)

# Wall-clock time of fit() only; prediction time is not included
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and metrics on the test set only
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)

# zero_division=0 reports the same 0.0 the default would for a class that is never
# predicted, but without raising UndefinedMetricWarning on every call
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)
report_text = classification_report(y_test, y_pred, digits=4, zero_division=0)

# Pin the class order explicitly. The sorted union of true and predicted labels is the
# set confusion_matrix would infer by itself, so the headers built below always match
# the matrix shape, even if the model predicts a class that never occurs in y_test.
unique_classes = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
# (most frequent spice name per label value in the test frame)
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix, e.g. "Anise (0)" or just "0" without a mapping
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Persist classification report (one row per class / average, one column per metric)
report_csv = OUT_DIR / "lr_temperature_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "lr_temperature_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "lr_temperature_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"lr_temperature_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
# ("multi_class" records the effective formulation, kept for comparability with earlier runs)
metrics_json = OUT_DIR / "lr_temperature_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "LogisticRegression",
            "multi_class": "multinomial",
            "solver": "lbfgs",
            "max_iter": 2000,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; include spice mapping when available.
# A class that was only predicted (never true) has total == 0 and is reported as 0.0.
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "lr_temperature_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"Logistic Regression (temperature) accuracy: {acc_percent}%")
print("\nClassification report:")
print(report_text)

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)




Logistic Regression (temperature) accuracy: 38.44%

Classification report:
              precision    recall  f1-score   support

           0     0.2511    0.0548    0.0900     26800
           1     0.2113    0.0507    0.0818     26800
           2     0.6300    0.5581    0.5919     26800
           3     0.3291    0.8740    0.4781     26800

    accuracy                         0.3844    107200
   macro avg     0.3554    0.3844    0.3104    107200
weighted avg     0.3554    0.3844    0.3104    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)               1469             1634               5568            18129
true_Chilli (1)              1415             1359               2090            21936
true_Cinnamon (2)            1765             2388              14956             7691
true_Nutme

In [5]:
# Reviewer note: MLP on raw temperature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# MLP baseline on the raw temperature feature.
# Uses names created by earlier cells: X_train, y_train, X_test, y_test, test_df
# (and BASE_OUT / LABEL_COL when present). Trains the model, scores it on the test
# split only, and writes the classification report, labeled confusion matrix,
# per-class outcomes, metrics JSON and the fitted model under OUT_DIR.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/outputs"
))
OUT_DIR = BASE_OUT / "MLP"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# MLP configured for multiclass on a single raw feature
clf = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    solver="adam",
    max_iter=600,
    random_state=42,
    early_stopping=False
)

# Wall-clock time of fit() only; prediction time is not included
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and core metrics on the test set only
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)

# zero_division=0 reports the same 0.0 the default would for a class that is never
# predicted, but without raising UndefinedMetricWarning on every call
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)
report_text = classification_report(y_test, y_pred, digits=4, zero_division=0)

# Pin the class order explicitly. The sorted union of true and predicted labels is the
# set confusion_matrix would infer by itself, so the headers built below always match
# the matrix shape, even if the model predicts a class that never occurs in y_test.
unique_classes = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
# (most frequent spice name per label value in the test frame)
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix, e.g. "Anise (0)" or just "0" without a mapping
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Persist classification report (one row per class / average, one column per metric)
report_csv = OUT_DIR / "mlp_temperature_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "mlp_temperature_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "mlp_temperature_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"mlp_temperature_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "mlp_temperature_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "MLPClassifier",
            "hidden_layer_sizes": [128, 64],
            "activation": "relu",
            "solver": "adam",
            "max_iter": 600,
            "early_stopping": False,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; include spice mapping when available.
# A class that was only predicted (never true) has total == 0 and is reported as 0.0.
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "mlp_temperature_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"MLP (temperature) accuracy: {acc_percent}%")
print("\nClassification report:")
print(report_text)

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


MLP (temperature) accuracy: 30.27%

Classification report:
              precision    recall  f1-score   support

           0     0.2624    0.2540    0.2581     26800
           1     0.1033    0.0066    0.0123     26800
           2     0.7485    0.3102    0.4386     26800
           3     0.2506    0.6399    0.3601     26800

    accuracy                         0.3027    107200
   macro avg     0.3412    0.3027    0.2673    107200
weighted avg     0.3412    0.3027    0.2673    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)               6807              424               2437            17132
true_Chilli (1)              8578              176                303            17743
true_Cinnamon (2)            1028             1040               8314            16418
true_Nutmeg (3)           

In [6]:
# Reviewer note: Linear SVM on raw temperature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Linear SVM baseline on the raw temperature feature.
# Uses names created by earlier cells: X_train, y_train, X_test, y_test, test_df
# (and BASE_OUT / LABEL_COL when present). Trains the model, scores it on the test
# split only, and writes the classification report, labeled confusion matrix,
# per-class outcomes, metrics JSON and the fitted model under OUT_DIR.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/outputs"
))
OUT_DIR = BASE_OUT / "SVM"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# Linear SVM configuration suitable for large n_samples and single feature
clf = LinearSVC(
    C=1.0,
    dual=False,
    max_iter=5000,
    random_state=42
)

# Wall-clock time of fit() only; prediction time is not included
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and core metrics on the test set only
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)

# zero_division=0 reports the same 0.0 the default would for a class that is never
# predicted, but without the UndefinedMetricWarning lines this cell used to print
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)
report_text = classification_report(y_test, y_pred, digits=4, zero_division=0)

# Pin the class order explicitly. The sorted union of true and predicted labels is the
# set confusion_matrix would infer by itself, so the headers built below always match
# the matrix shape, even if the model predicts a class that never occurs in y_test.
unique_classes = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
# (most frequent spice name per label value in the test frame)
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix, e.g. "Anise (0)" or just "0" without a mapping
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Persist classification report (one row per class / average, one column per metric)
report_csv = OUT_DIR / "svm_temperature_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "svm_temperature_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "svm_temperature_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"svm_temperature_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "svm_temperature_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "LinearSVC",
            "C": 1.0,
            "dual": False,
            "max_iter": 5000,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; include spice mapping when available.
# A class that was only predicted (never true) has total == 0 and is reported as 0.0.
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "svm_temperature_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"SVM (LinearSVC, temperature) accuracy: {acc_percent}%")
print("\nClassification report:")
print(report_text)

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVM (LinearSVC, temperature) accuracy: 39.08%

Classification report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000     26800
           1     0.2325    0.0035    0.0068     26800
           2     0.5798    0.6396    0.6082     26800
           3     0.3192    0.9201    0.4740     26800

    accuracy                         0.3908    107200
   macro avg     0.2829    0.3908    0.2723    107200
weighted avg     0.2829    0.3908    0.2723    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)                  0              117               7040            19643
true_Chilli (1)                 0               93               3314            23393
true_Cinnamon (2)               0              116              17140             9544
true_Nutmeg (3)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
# Reviewer note: SGD on raw temperature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# SGD (log-loss) baseline on the raw temperature feature.
# Uses names created by earlier cells: X_train, y_train, X_test, y_test, test_df
# (and BASE_OUT / LABEL_COL when present). Trains the model, scores it on the test
# split only, and writes the classification report, labeled confusion matrix,
# per-class outcomes, metrics JSON and the fitted model under OUT_DIR.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/outputs"
))
OUT_DIR = BASE_OUT / "SGD"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# Logistic-regression-style training via SGD on a single feature. Per the scikit-learn
# documentation SGDClassifier handles multiclass targets with one-vs-all binary
# classifiers, so this is not a single multinomial (softmax) model.
# NOTE(review): the feature is fed unscaled by design of this notebook; SGD is
# documented as sensitive to feature scale, so confirm that is acceptable here.
clf = SGDClassifier(
    loss="log_loss",
    penalty="l2",
    alpha=1e-4,
    max_iter=5000,
    tol=1e-4,
    random_state=42
)

# Wall-clock time of fit() only; prediction time is not included
start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start

# Predictions and core metrics on the test set only
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)

# zero_division=0 reports the same 0.0 the default would for a class that is never
# predicted, but without the UndefinedMetricWarning lines this cell used to print
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)
report_text = classification_report(y_test, y_pred, digits=4, zero_division=0)

# Pin the class order explicitly. The sorted union of true and predicted labels is the
# set confusion_matrix would infer by itself, so the headers built below always match
# the matrix shape, even if the model predicts a class that never occurs in y_test.
unique_classes = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
# (most frequent spice name per label value in the test frame)
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix, e.g. "Anise (0)" or just "0" without a mapping
label_names = []
for c in unique_classes:
    if spice_map:
        label_names.append(f"{spice_map.get(int(c), '')} ({int(c)})".strip())
    else:
        label_names.append(str(int(c)))

# Persist classification report (one row per class / average, one column per metric)
report_csv = OUT_DIR / "sgd_temperature_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "sgd_temperature_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "sgd_temperature_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"sgd_temperature_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "sgd_temperature_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "SGDClassifier",
            "loss": "log_loss",
            "penalty": "l2",
            "alpha": 1e-4,
            "max_iter": 5000,
            "tol": 1e-4,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; include spice mapping when available.
# A class that was only predicted (never true) has total == 0 and is reported as 0.0.
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "sgd_temperature_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"SGD (temperature) accuracy: {acc_percent}%")
print("\nClassification report:")
print(report_text)

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


SGD (temperature) accuracy: 25.0%

Classification report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000     26800
           1     0.0000    0.0000    0.0000     26800
           2     0.0000    0.0000    0.0000     26800
           3     0.2500    1.0000    0.4000     26800

    accuracy                         0.2500    107200
   macro avg     0.0625    0.2500    0.1000    107200
weighted avg     0.0625    0.2500    0.1000    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)                  0                0                  0            26800
true_Chilli (1)                 0                0                  0            26800
true_Cinnamon (2)               0                0                  0            26800
true_Nutmeg (3)            

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Reviewer note: Gradient Boosting on raw temperature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Gradient Boosting cell body: fit on the raw temperature feature, evaluate on the test set,
# and persist the report, confusion matrix, model, metrics JSON and per-class outcomes.
# Relies on X_train, y_train, X_test, y_test and test_df created by earlier cells.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/outputs"
))
OUT_DIR = BASE_OUT / "GradientBoosting"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# Hyperparameters are declared once so the fitted model and the metrics JSON cannot drift apart.
# early_stopping is disabled to avoid an internal validation split.
gb_params = {
    "loss": "log_loss",
    "learning_rate": 0.1,
    "max_iter": 200,
    "max_depth": None,
    "early_stopping": False,
}
clf = HistGradientBoostingClassifier(**gb_params, random_state=42)

# perf_counter is a monotonic clock, so the measured duration is immune to wall-clock adjustments
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set only
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)

# Use the union of true and predicted labels for every labeled artifact. confusion_matrix
# defaults to this union, so deriving headers from y_test alone would break the DataFrame
# construction whenever the model predicts a class that is absent from the test labels.
unique_classes = np.union1d(y_test, y_pred)

# zero_division=0 yields the same numbers as the default while suppressing the
# undefined-metric warning for classes that are never predicted.
report_dict = classification_report(
    y_test, y_pred, digits=4, output_dict=True, zero_division=0
)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report
report_csv = OUT_DIR / "gb_temperature_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "gb_temperature_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "gb_temperature_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"gb_temperature_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "gb_temperature_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "HistGradientBoostingClassifier",
            **gb_params,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; include spice mapping when available.
# A class that was only ever predicted has zero test rows, hence the total > 0 guard.
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "gb_temperature_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"Gradient Boosting (temperature) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
for saved_path in (report_csv, cm_csv, model_path, metrics_json, per_class_csv):
    print(" ", saved_path)


Gradient Boosting (temperature) accuracy: 31.44%

Classification report:
              precision    recall  f1-score   support

           0     0.2590    0.2841    0.2710     26800
           1     0.1320    0.0315    0.0509     26800
           2     0.6812    0.3693    0.4789     26800
           3     0.2699    0.5727    0.3669     26800

    accuracy                         0.3144    107200
   macro avg     0.3355    0.3144    0.2919    107200
weighted avg     0.3355    0.3144    0.2919    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)               7615             1591               3367            14227
true_Chilli (1)              9551              845                885            15519
true_Cinnamon (2)            1675             3450               9896            11779
true_Nutmeg 

In [9]:
# Reviewer note: AdaBoost on raw temperature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# AdaBoost cell body: fit decision-stump AdaBoost on the raw temperature feature, evaluate on
# the test set, and persist the report, confusion matrix, model, metrics JSON and per-class
# outcomes. Relies on X_train, y_train, X_test, y_test and test_df created by earlier cells.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/outputs"
))
OUT_DIR = BASE_OUT / "AdaBoost"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# Hyperparameters are declared once so the fitted model and the metrics JSON cannot drift apart.
# NOTE(review): newer scikit-learn releases deprecate the `algorithm` argument — confirm
# against the version installed in the runtime before upgrading.
ada_params = {
    "n_estimators": 300,
    "learning_rate": 0.5,
    "algorithm": "SAMME",
}

# AdaBoost with decision-stump base estimator
clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1, random_state=42),
    random_state=42,
    **ada_params
)

# perf_counter is a monotonic clock, so the measured duration is immune to wall-clock adjustments
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set only
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)

# Use the union of true and predicted labels for every labeled artifact. confusion_matrix
# defaults to this union, so deriving headers from y_test alone would break the DataFrame
# construction whenever the model predicts a class that is absent from the test labels.
unique_classes = np.union1d(y_test, y_pred)

# zero_division=0 yields the same numbers as the default while suppressing the
# undefined-metric warning emitted for classes that are never predicted.
report_dict = classification_report(
    y_test, y_pred, digits=4, output_dict=True, zero_division=0
)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report
report_csv = OUT_DIR / "ada_temperature_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "ada_temperature_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "ada_temperature_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"ada_temperature_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "ada_temperature_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "AdaBoostClassifier",
            "base_estimator": "DecisionTree(max_depth=1)",
            **ada_params,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; include spice mapping when available.
# A class that was only ever predicted has zero test rows, hence the total > 0 guard.
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "ada_temperature_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"AdaBoost (temperature) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
for saved_path in (report_csv, cm_csv, model_path, metrics_json, per_class_csv):
    print(" ", saved_path)




AdaBoost (temperature) accuracy: 32.23%

Classification report:
              precision    recall  f1-score   support

           0     0.2569    0.1779    0.2103     26800
           1     0.0000    0.0000    0.0000     26800
           2     0.6925    0.3618    0.4753     26800
           3     0.2691    0.7496    0.3961     26800

    accuracy                         0.3223    107200
   macro avg     0.3046    0.3223    0.2704    107200
weighted avg     0.3046    0.3223    0.2704    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)               4769                0               3281            18750
true_Chilli (1)              5851                0                768            20181
true_Cinnamon (2)            1487                0               9695            15618
true_Nutmeg (3)      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# Reviewer note: KNN on raw temperature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# KNN cell body: fit a distance-weighted k-nearest-neighbours model on the raw temperature
# feature, evaluate on the test set, and persist the report, confusion matrix, model, metrics
# JSON and per-class outcomes. Relies on X_train, y_train, X_test, y_test and test_df created
# by earlier cells.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/outputs"
))
OUT_DIR = BASE_OUT / "KNN"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# Hyperparameters are declared once so the fitted model and the metrics JSON cannot drift apart.
# kd_tree preferred for one feature and large sample count.
knn_params = {
    "n_neighbors": 7,
    "weights": "distance",
    "algorithm": "kd_tree",
    "leaf_size": 30,
    "p": 2,
}
clf = KNeighborsClassifier(**knn_params, n_jobs=-1)

# perf_counter is a monotonic clock, so the measured duration is immune to wall-clock adjustments
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set only
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)

# Use the union of true and predicted labels for every labeled artifact. confusion_matrix
# defaults to this union, so deriving headers from y_test alone would break the DataFrame
# construction whenever the model predicts a class that is absent from the test labels.
unique_classes = np.union1d(y_test, y_pred)

# zero_division=0 yields the same numbers as the default while suppressing the
# undefined-metric warning for classes that are never predicted.
report_dict = classification_report(
    y_test, y_pred, digits=4, output_dict=True, zero_division=0
)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report
report_csv = OUT_DIR / "knn_temperature_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "knn_temperature_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "knn_temperature_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"knn_temperature_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "knn_temperature_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "KNeighborsClassifier",
            **knn_params,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; include spice mapping when available.
# A class that was only ever predicted has zero test rows, hence the total > 0 guard.
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "knn_temperature_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"KNN (temperature) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
for saved_path in (report_csv, cm_csv, model_path, metrics_json, per_class_csv):
    print(" ", saved_path)


KNN (temperature) accuracy: 27.7%

Classification report:
              precision    recall  f1-score   support

           0     0.2517    0.2860    0.2678     26800
           1     0.2212    0.2810    0.2476     26800
           2     0.4346    0.2531    0.3199     26800
           3     0.2847    0.2879    0.2863     26800

    accuracy                         0.2770    107200
   macro avg     0.2981    0.2770    0.2804    107200
weighted avg     0.2981    0.2770    0.2804    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)               7665             8603               3701             6831
true_Chilli (1)              8867             7531               2682             7720
true_Cinnamon (2)            4418            10764               6784             4834
true_Nutmeg (3)            

In [11]:
# Reviewer note: CNN on raw temperature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import numpy as np
import pandas as pd
from pathlib import Path

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# CNN cell body: train a minimal Keras 1D CNN on the raw temperature feature, evaluate on the
# test set, and persist the report, confusion matrix, model, metrics JSON and per-class
# outcomes. Relies on X_train, y_train, X_test, y_test and test_df created by earlier cells.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/outputs"
))
OUT_DIR = BASE_OUT / "CNN"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Label column fallback for safety
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

# Training settings are declared once so the fit call and the metrics JSON cannot drift apart.
# The "optimizer" entry is a descriptive label only; the optimizer object is built below.
cnn_params = {
    "input_shape": [1, 1],
    "epochs": 25,
    "batch_size": 1024,
    "optimizer": "adam",
    "loss": "sparse_categorical_crossentropy",
}

# Reshape only; values remain unchanged. Single feature -> length-1 sequence with one channel
X_train_cnn = X_train.astype(np.float32).reshape(-1, 1, 1)
X_test_cnn  = X_test.astype(np.float32).reshape(-1, 1, 1)

# The softmax width equals the number of distinct training labels and predictions are argmax
# indices, so the labels must be exactly 0..K-1 for indices and labels to coincide.
train_labels = np.unique(y_train)
num_classes = int(train_labels.size)
if not np.array_equal(train_labels, np.arange(num_classes)):
    raise ValueError(
        f"CNN cell expects training labels to be exactly 0..{num_classes - 1}; "
        f"got {train_labels.tolist()}. Re-encode the labels before training."
    )

# Minimal 1D CNN with kernel_size=1 due to sequence length of 1
inputs = keras.Input(shape=tuple(cnn_params["input_shape"]))
x = layers.Conv1D(filters=32, kernel_size=1, activation="relu")(inputs)
x = layers.Conv1D(filters=32, kernel_size=1, activation="relu")(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs, name="cnn_temperature_1d")

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=cnn_params["loss"],
    metrics=["accuracy"]
)

# Train without creating a validation split to avoid dataset alteration.
# perf_counter is a monotonic clock, so the measured duration is immune to wall-clock adjustments.
start = time.perf_counter()
history = model.fit(
    X_train_cnn, y_train,
    epochs=cnn_params["epochs"],
    batch_size=cnn_params["batch_size"],
    verbose=1
)
train_time = time.perf_counter() - start

# Predictions and metrics on the test set only
y_pred_proba = model.predict(X_test_cnn, batch_size=4096, verbose=0)
y_pred = np.argmax(y_pred_proba, axis=1)

acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)

# Use the union of true and predicted labels for every labeled artifact. confusion_matrix
# defaults to this union, so deriving headers from y_test alone would break the DataFrame
# construction whenever the model predicts a class that is absent from the test labels.
unique_classes = np.union1d(y_test, y_pred)

# zero_division=0 yields the same numbers as the default while suppressing the
# undefined-metric warning emitted for classes that are never predicted.
report_dict = classification_report(
    y_test, y_pred, digits=4, output_dict=True, zero_division=0
)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report
report_csv = OUT_DIR / "cnn_temperature_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "cnn_temperature_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model as .keras; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "cnn_temperature_model.keras"
model_path = model_base if not model_base.exists() else OUT_DIR / f"cnn_temperature_model_{time.strftime('%Y%m%d_%H%M%S')}.keras"
model.save(model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "cnn_temperature_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "Keras 1D CNN",
            **cnn_params,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; include spice mapping when available.
# A class that was only ever predicted has zero test rows, hence the total > 0 guard.
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "cnn_temperature_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"CNN (temperature) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
for saved_path in (report_csv, cm_csv, model_path, metrics_json, per_class_csv):
    print(" ", saved_path)


Epoch 1/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.2491 - loss: 1.5148
Epoch 2/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2481 - loss: 1.3868
Epoch 3/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.2467 - loss: 1.3874
Epoch 4/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.2480 - loss: 1.3873
Epoch 5/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.2489 - loss: 1.3871
Epoch 6/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.2595 - loss: 1.3868
Epoch 7/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.2565 - loss: 1.3864
Epoch 8/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.2591 - loss: 1.3862
Epoch 9/25
[1m105/105[0m [32m━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# Reviewer note: XGBoost on raw temperature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# XGBoost cell body: fit a multiclass XGBoost model on the raw temperature feature, evaluate
# on the test set, and persist the report, confusion matrix, model, metrics JSON and per-class
# outcomes. Relies on X_train, y_train, X_test, y_test and test_df created by earlier cells.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/temperature/outputs"
))
OUT_DIR = BASE_OUT / "XGBoost"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = LABEL_COL if "LABEL_COL" in globals() else "target"

num_classes = int(np.unique(y_train).size)

# Hyperparameters are declared once so the fitted model and the metrics JSON cannot drift apart.
# Configuration targets multiclass classification on CPU.
xgb_params = {
    "objective": "multi:softmax",
    "num_class": num_classes,
    "n_estimators": 400,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "tree_method": "hist",
    "eval_metric": "mlogloss",
}
clf = XGBClassifier(**xgb_params, random_state=42, n_jobs=-1, verbosity=0)

# perf_counter is a monotonic clock, so the measured duration is immune to wall-clock adjustments
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set only
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)

# Use the union of true and predicted labels for every labeled artifact. confusion_matrix
# defaults to this union, so deriving headers from y_test alone would break the DataFrame
# construction whenever the model predicts a class that is absent from the test labels.
unique_classes = np.union1d(y_test, y_pred)

# zero_division=0 yields the same numbers as the default while suppressing the
# undefined-metric warning for classes that are never predicted.
report_dict = classification_report(
    y_test, y_pred, digits=4, output_dict=True, zero_division=0
)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for confusion matrix
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report
report_csv = OUT_DIR / "xgb_temperature_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "xgb_temperature_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "xgb_temperature_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"xgb_temperature_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths
metrics_json = OUT_DIR / "xgb_temperature_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "XGBClassifier",
            **xgb_params,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; include spice mapping when available.
# A class that was only ever predicted has zero test rows, hence the total > 0 guard.
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "xgb_temperature_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"XGBoost (temperature) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
for saved_path in (report_csv, cm_csv, model_path, metrics_json, per_class_csv):
    print(" ", saved_path)


XGBoost (temperature) accuracy: 31.74%

Classification report:
              precision    recall  f1-score   support

           0     0.2594    0.2735    0.2663     26800
           1     0.1422    0.0335    0.0542     26800
           2     0.6750    0.3804    0.4866     26800
           3     0.2712    0.5822    0.3701     26800

    accuracy                         0.3174    107200
   macro avg     0.3370    0.3174    0.2943    107200
weighted avg     0.3370    0.3174    0.2943    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)               7330             1578               3473            14419
true_Chilli (1)              9156              897                983            15764
true_Cinnamon (2)            1599             3260              10194            11747
true_Nutmeg (3)       