In [None]:
# Reviewer note: Colab Drive mount and path configuration for humidity feature

import os
import sys
from pathlib import Path
from google.colab import drive

# Mount Google Drive if not already mounted
DRIVE_MOUNT_POINT = "/content/drive"
if not os.path.ismount(DRIVE_MOUNT_POINT):
    drive.mount(DRIVE_MOUNT_POINT, force_remount=False)

# Single root for everything belonging to the humidity feature; the train, test
# and output locations are all derived from it so the prefix is written only once.
HUMIDITY_ROOT = Path("/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity")

# Folders containing the CSVs for humidity feature
TRAIN_DIR = HUMIDITY_ROOT / "train"
TEST_DIR  = HUMIDITY_ROOT / "test"

# Base output directory for all humidity models; model cells will create their own subfolders
BASE_OUT = HUMIDITY_ROOT / "outputs"
BASE_OUT.mkdir(parents=True, exist_ok=True)

# Helper to resolve a single CSV inside a directory
def resolve_single_csv(dir_path: Path) -> Path:
    """Return the one CSV file located directly inside ``dir_path``.

    Only regular files matching ``*.csv`` are considered; a sub-directory whose
    name happens to end in ``.csv`` is ignored. Candidates are sorted so the
    file list in the error message is deterministic.

    Args:
        dir_path: Directory expected to hold exactly one CSV file.

    Returns:
        Path of the single CSV file found.

    Raises:
        FileNotFoundError: No CSV file was found in ``dir_path``.
        FileExistsError: More than one CSV file was found in ``dir_path``.
    """
    candidates = sorted(p for p in dir_path.glob("*.csv") if p.is_file())
    if not candidates:
        raise FileNotFoundError(f"No CSV found in {dir_path}. Put exactly one CSV file in that folder.")
    if len(candidates) > 1:
        names = [c.name for c in candidates]
        raise FileExistsError(f"Multiple CSV files found in {dir_path}: {names}. Keep exactly one or specify the exact file.")
    return candidates[0]

# Resolve the one CSV expected in each data folder (train first, then test)
TRAIN_CSV, TEST_CSV = (resolve_single_csv(folder) for folder in (TRAIN_DIR, TEST_DIR))

# Echo the resolved locations so the run log records exactly which files were used
for caption, location in (
    ("Train CSV:", TRAIN_CSV),
    ("Test  CSV:", TEST_CSV),
    ("Base out :", BASE_OUT),
):
    print(caption, location)


Mounted at /content/drive
Train CSV: /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/train/Train_All_Specimens_Humidity.csv
Test  CSV: /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/test/Test_All_Specimens_Humidity.csv
Base out : /content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/outputs


In [None]:
# Reviewer note: dataset loading for single feature 'relative_humidity'; no preprocessing or scaling

import pandas as pd
import numpy as np

# Column schema: one raw input feature and the class label
FEATURES = ["relative_humidity"]
LABEL_COL = "target"

# Read both splits verbatim; nothing is cleaned, scaled or re-ordered
train_df, test_df = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

# Fail fast when either split lacks a required column (train is reported first)
expected_columns = [*FEATURES, LABEL_COL]
train_columns, test_columns = set(train_df.columns), set(test_df.columns)
missing_train = [name for name in expected_columns if name not in train_columns]
missing_test  = [name for name in expected_columns if name not in test_columns]
if missing_train:
    raise ValueError(f"Train is missing columns: {missing_train}")
if missing_test:
    raise ValueError(f"Test is missing columns: {missing_test}")

# Plain numpy views of features and labels for scikit-learn
X_train, y_train = train_df[FEATURES].values, train_df[LABEL_COL].values
X_test,  y_test  = test_df[FEATURES].values,  test_df[LABEL_COL].values

# Quick sanity summary of what was loaded
print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes :", X_test.shape, y_test.shape)
print("Unique train labels:", np.unique(y_train))
print("Unique test  labels:", np.unique(y_test))


Train shapes: (107200, 1) (107200,)
Test shapes : (107200, 1) (107200,)
Unique train labels: [0 1 2 3]
Unique test  labels: [0 1 2 3]


In [4]:
# Reviewer note: Random Forest on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Random Forest on the single raw humidity feature: fit, evaluate on the test
# split, and write the classification report, confusion matrix, fitted model,
# metrics JSON and per-class outcome table under BASE_OUT / "RandomForest".
# Relies on X_train, y_train, X_test, y_test and test_df from an earlier cell.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model (humidity feature)
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/outputs"
))
OUT_DIR = BASE_OUT / "RandomForest"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

# perf_counter is monotonic, so the duration is unaffected by wall-clock adjustments
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predict on test set
y_pred = clf.predict(X_test)

# Classes present in the ground truth OR the predictions (np.unique returns them sorted).
# Passing this list to confusion_matrix keeps the matrix shape aligned with
# label_names even if the model predicts a class that never occurs in y_test.
unique_classes = np.unique(np.concatenate([y_test, y_pred]))

# Metrics
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    # mode() ignores NaN, so a class whose spice values are all missing maps to ""
    spice_map = (
        test_df.groupby(LABEL_COL)["spice"]
        .agg(lambda s: s.mode().iat[0] if s.notna().any() else "")
        .to_dict()
    )

# Build labeled headers for confusion matrix, e.g. "Anise (0)" or just "0"
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Save classification report
report_csv = OUT_DIR / "rf_humidity_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Save confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "rf_humidity_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "rf_humidity_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"rf_humidity_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Save metrics summary; the hyperparameter is read back from the estimator so the
# record cannot drift from the configuration actually used
metrics_json = OUT_DIR / "rf_humidity_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "RandomForestClassifier",
            "feature": "relative_humidity",
            "n_estimators": clf.n_estimators,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Per-class correct and incorrect counts; spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    # total is 0 for a class that was only ever predicted, never observed
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "rf_humidity_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output
print(f"Random Forest (humidity) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


Random Forest (humidity) accuracy: 48.19%

Classification report:
              precision    recall  f1-score   support

           0     0.4331    0.6171    0.5090     26800
           1     0.4515    0.3676    0.4053     26800
           2     0.3185    0.3248    0.3216     26800
           3     0.8340    0.6181    0.7100     26800

    accuracy                         0.4819    107200
   macro avg     0.5093    0.4819    0.4865    107200
weighted avg     0.5093    0.4819    0.4865    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              16539             2792               6957              512
true_Chilli (1)              6514             9853               8582             1851
true_Cinnamon (2)           13014             4148               8704              934
true_Nutmeg (3)    

In [5]:
# Reviewer note: Logistic Regression on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Logistic Regression on the single raw humidity feature: fit, evaluate on the
# test split, and write the classification report, confusion matrix, fitted
# model, metrics JSON and per-class outcome table under BASE_OUT / "LogisticRegression".
# Relies on X_train, y_train, X_test, y_test and test_df from an earlier cell.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model (humidity feature)
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/outputs"
))
OUT_DIR = BASE_OUT / "LogisticRegression"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# Model configuration suitable for multinomial classification without scaling.
# With the lbfgs solver a multiclass target is already fitted as a multinomial
# model; the explicit multi_class argument is deprecated since scikit-learn 1.5,
# so it is no longer passed.
clf = LogisticRegression(
    solver="lbfgs",
    max_iter=2000
)

# perf_counter is monotonic, so the duration is unaffected by wall-clock adjustments
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set
y_pred = clf.predict(X_test)

# Classes present in the ground truth OR the predictions (np.unique returns them sorted).
# Passing this list to confusion_matrix keeps the matrix shape aligned with
# label_names even if the model predicts a class that never occurs in y_test.
unique_classes = np.unique(np.concatenate([y_test, y_pred]))

acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    # mode() ignores NaN, so a class whose spice values are all missing maps to ""
    spice_map = (
        test_df.groupby(LABEL_COL)["spice"]
        .agg(lambda s: s.mode().iat[0] if s.notna().any() else "")
        .to_dict()
    )

# Build labeled headers for confusion matrix, e.g. "Anise (0)" or just "0"
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report
report_csv = OUT_DIR / "lr_humidity_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "lr_humidity_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "lr_humidity_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"lr_humidity_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths; solver and max_iter are read back from
# the estimator so the record cannot drift from the configuration actually used
metrics_json = OUT_DIR / "lr_humidity_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "LogisticRegression",
            "feature": "relative_humidity",
            "multi_class": "multinomial",
            "solver": clf.solver,
            "max_iter": clf.max_iter,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    # total is 0 for a class that was only ever predicted, never observed
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "lr_humidity_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"Logistic Regression (humidity) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)




Logistic Regression (humidity) accuracy: 47.13%

Classification report:
              precision    recall  f1-score   support

           0     0.4059    0.8687    0.5533     26800
           1     0.3409    0.2171    0.2652     26800
           2     0.2078    0.0813    0.1169     26800
           3     0.8634    0.7182    0.7841     26800

    accuracy                         0.4713    107200
   macro avg     0.4545    0.4713    0.4299    107200
weighted avg     0.4545    0.4713    0.4299    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              23280             1700               1522              298
true_Chilli (1)             12298             5817               6785             1900
true_Cinnamon (2)           21777             1997               2179              847
true_Nutmeg (

In [6]:
# Reviewer note: MLP on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# MLP on the single raw humidity feature: fit, evaluate on the test split, and
# write the classification report, confusion matrix, fitted model, metrics JSON
# and per-class outcome table under BASE_OUT / "MLP".
# Relies on X_train, y_train, X_test, y_test and test_df from an earlier cell.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model (humidity feature)
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/outputs"
))
OUT_DIR = BASE_OUT / "MLP"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# MLP configuration for multiclass; no scaling to honor raw-data requirement
clf = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    activation="relu",
    solver="adam",
    max_iter=600,
    random_state=42,
    early_stopping=False
)

# perf_counter is monotonic, so the duration is unaffected by wall-clock adjustments
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set
y_pred = clf.predict(X_test)

# Classes present in the ground truth OR the predictions (np.unique returns them sorted).
# Passing this list to confusion_matrix keeps the matrix shape aligned with
# label_names even if the model predicts a class that never occurs in y_test.
unique_classes = np.unique(np.concatenate([y_test, y_pred]))

acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    # mode() ignores NaN, so a class whose spice values are all missing maps to ""
    spice_map = (
        test_df.groupby(LABEL_COL)["spice"]
        .agg(lambda s: s.mode().iat[0] if s.notna().any() else "")
        .to_dict()
    )

# Build labeled headers for confusion matrix, e.g. "Anise (0)" or just "0"
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report
report_csv = OUT_DIR / "mlp_humidity_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "mlp_humidity_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "mlp_humidity_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"mlp_humidity_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths; hyperparameters are read back from the
# estimator so the record cannot drift from the configuration actually used
metrics_json = OUT_DIR / "mlp_humidity_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "MLPClassifier",
            "feature": "relative_humidity",
            "hidden_layer_sizes": list(clf.hidden_layer_sizes),
            "activation": clf.activation,
            "solver": clf.solver,
            "max_iter": clf.max_iter,
            "early_stopping": clf.early_stopping,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    # total is 0 for a class that was only ever predicted, never observed
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "mlp_humidity_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"MLP (humidity) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


MLP (humidity) accuracy: 60.85%

Classification report:
              precision    recall  f1-score   support

           0     0.5437    0.6875    0.6072     26800
           1     0.5646    0.6613    0.6091     26800
           2     0.3706    0.1714    0.2344     26800
           3     0.8292    0.9137    0.8694     26800

    accuracy                         0.6085    107200
   macro avg     0.5770    0.6085    0.5800    107200
weighted avg     0.5770    0.6085    0.5800    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              18425             4800               2758              817
true_Chilli (1)              1266            17722               5044             2768
true_Cinnamon (2)           14199             6550               4594             1457
true_Nutmeg (3)              

In [7]:
# Reviewer note: Linear SVM on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Linear SVM on the single raw humidity feature: fit, evaluate on the test
# split, and write the classification report, confusion matrix, fitted model,
# metrics JSON and per-class outcome table under BASE_OUT / "SVM".
# Relies on X_train, y_train, X_test, y_test and test_df from an earlier cell.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model (humidity feature)
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/outputs"
))
OUT_DIR = BASE_OUT / "SVM"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# Linear SVM configuration; dual=False preferred when n_samples >> n_features
clf = LinearSVC(
    C=1.0,
    dual=False,
    max_iter=5000,
    random_state=42
)

# perf_counter is monotonic, so the duration is unaffected by wall-clock adjustments
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set
y_pred = clf.predict(X_test)

# Classes present in the ground truth OR the predictions (np.unique returns them sorted).
# Passing this list to confusion_matrix keeps the matrix shape aligned with
# label_names even if the model predicts a class that never occurs in y_test.
unique_classes = np.unique(np.concatenate([y_test, y_pred]))

acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    # mode() ignores NaN, so a class whose spice values are all missing maps to ""
    spice_map = (
        test_df.groupby(LABEL_COL)["spice"]
        .agg(lambda s: s.mode().iat[0] if s.notna().any() else "")
        .to_dict()
    )

# Build labeled headers for confusion matrix, e.g. "Anise (0)" or just "0"
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report
report_csv = OUT_DIR / "svm_humidity_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "svm_humidity_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "svm_humidity_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"svm_humidity_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths; hyperparameters are read back from the
# estimator so the record cannot drift from the configuration actually used
metrics_json = OUT_DIR / "svm_humidity_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "LinearSVC",
            "feature": "relative_humidity",
            "C": clf.C,
            "dual": clf.dual,
            "max_iter": clf.max_iter,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    # total is 0 for a class that was only ever predicted, never observed
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "svm_humidity_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"SVM (LinearSVC, humidity) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


SVM (LinearSVC, humidity) accuracy: 49.35%

Classification report:
              precision    recall  f1-score   support

           0     0.3568    0.9341    0.5164     26800
           1     0.6690    0.0255    0.0491     26800
           2     0.2321    0.0182    0.0337     26800
           3     0.7872    0.9963    0.8795     26800

    accuracy                         0.4935    107200
   macro avg     0.5113    0.4935    0.3697    107200
weighted avg     0.5113    0.4935    0.3697    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              25034              152                235             1379
true_Chilli (1)             20559              683               1374             4184
true_Cinnamon (2)           24571               89                487             1653
true_Nutmeg (3)   

In [8]:
# Reviewer note: SGD on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# SGD classifier on the single raw humidity feature: fit, evaluate on the test
# split, and write the classification report, confusion matrix, fitted model,
# metrics JSON and per-class outcome table under BASE_OUT / "SGD".
# Relies on X_train, y_train, X_test, y_test and test_df from an earlier cell.

# Normalize BASE_OUT with a default and derive OUT_DIR for this model (humidity feature)
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/outputs"
))
OUT_DIR = BASE_OUT / "SGD"

# Safety guard: stop if an existing folder is not empty to protect prior runs
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    print(f"Safety guard: output folder already exists and is not empty: {OUT_DIR}")
    print("Create a new folder or archive/clear the existing one before proceeding.")
    sys.exit(1)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# SGD with logistic loss. SGDClassifier handles multiclass targets by combining
# one-vs-all binary classifiers, so this is NOT a multinomial (softmax) model.
clf = SGDClassifier(
    loss="log_loss",
    penalty="l2",
    alpha=1e-4,
    max_iter=5000,
    tol=1e-4,
    random_state=42
)

# perf_counter is monotonic, so the duration is unaffected by wall-clock adjustments
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set
y_pred = clf.predict(X_test)

# Classes present in the ground truth OR the predictions (np.unique returns them sorted).
# Passing this list to confusion_matrix keeps the matrix shape aligned with
# label_names even if the model predicts a class that never occurs in y_test.
unique_classes = np.unique(np.concatenate([y_test, y_pred]))

acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
# zero_division=0 reports 0.0 for a class that is never predicted (same value as
# the default) without emitting an undefined-metric warning on every call
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; falls back to blank if column absent
spice_map = {}
if "spice" in test_df.columns:
    # mode() ignores NaN, so a class whose spice values are all missing maps to ""
    spice_map = (
        test_df.groupby(LABEL_COL)["spice"]
        .agg(lambda s: s.mode().iat[0] if s.notna().any() else "")
        .to_dict()
    )

# Build labeled headers for confusion matrix, e.g. "Anise (0)" or just "0"
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report
report_csv = OUT_DIR / "sgd_humidity_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "sgd_humidity_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "sgd_humidity_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"sgd_humidity_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics and file paths; hyperparameters are read back from the
# estimator so the record cannot drift from the configuration actually used
metrics_json = OUT_DIR / "sgd_humidity_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "SGDClassifier",
            "feature": "relative_humidity",
            "loss": clf.loss,
            "penalty": clf.penalty,
            "alpha": clf.alpha,
            "max_iter": clf.max_iter,
            "tol": clf.tol,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    # total is 0 for a class that was only ever predicted, never observed
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "sgd_humidity_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"SGD (humidity) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4, zero_division=0))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


SGD (humidity) accuracy: 52.33%

Classification report:
              precision    recall  f1-score   support

           0     0.4876    0.7562    0.5929     26800
           1     0.4296    0.7855    0.5555     26800
           2     0.0000    0.0000    0.0000     26800
           3     0.8881    0.5513    0.6803     26800

    accuracy                         0.5233    107200
   macro avg     0.4513    0.5233    0.4572    107200
weighted avg     0.4513    0.5233    0.4572    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              20267             6445                  0               88
true_Chilli (1)              4177            21051                  0             1572
true_Cinnamon (2)           17123             9476                  0              201
true_Nutmeg (3)              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Reviewer note: Gradient Boosting on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Resolve the output root: reuse BASE_OUT from the setup cell when it is defined,
# otherwise fall back to the default Drive location. OUT_DIR is this model's subfolder.
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/outputs"
))
OUT_DIR = BASE_OUT / "GradientBoosting"

# Safety guard: refuse to write into a non-empty folder so results of a prior run are protected.
# Raising FileExistsError (rather than sys.exit) matches the error style of the setup cell
# and stops the cell with an ordinary, readable traceback.
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    raise FileExistsError(
        f"Safety guard: output folder already exists and is not empty: {OUT_DIR}. "
        "Create a new folder or archive/clear the existing one before proceeding."
    )

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# Hyperparameters that are also written to the metrics JSON. They are defined once so the
# saved metadata cannot drift from the configuration that was actually trained.
GB_PARAMS = {
    "loss": "log_loss",
    "learning_rate": 0.1,
    "max_iter": 200,
    "max_depth": None,
    "early_stopping": False,  # disabled to avoid an internal validation split
}

clf = HistGradientBoostingClassifier(**GB_PARAMS, random_state=42)

# perf_counter is monotonic, so the measured duration is not affected by system clock changes
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)

# Sorted classes that occur in the true OR the predicted labels. Passing them explicitly as
# `labels` guarantees the matrix axes line up with the headers built below, even when the
# model predicts a class that does not occur in y_test.
unique_classes = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; stays empty if the column is absent.
# Each label is mapped to the most frequent spice name among its rows.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for the confusion matrix: "<spice> (<label>)" or just "<label>"
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report (one row per class / average)
report_csv = OUT_DIR / "gb_humidity_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers (rows = true labels, columns = predictions)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "gb_humidity_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "gb_humidity_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"gb_humidity_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics, hyperparameters and file paths
metrics_json = OUT_DIR / "gb_humidity_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "HistGradientBoostingClassifier",
            "feature": "relative_humidity",
            **GB_PARAMS,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    # A class that only appears in the predictions has no true rows; report 0.0 instead of dividing by zero
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "gb_humidity_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"Gradient Boosting (humidity) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


Gradient Boosting (humidity) accuracy: 59.71%

Classification report:
              precision    recall  f1-score   support

           0     0.5507    0.6686    0.6039     26800
           1     0.5553    0.5895    0.5719     26800
           2     0.3647    0.2394    0.2891     26800
           3     0.8345    0.8909    0.8618     26800

    accuracy                         0.5971    107200
   macro avg     0.5763    0.5971    0.5817    107200
weighted avg     0.5763    0.5971    0.5817    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              17919             4258               3901              722
true_Chilli (1)              1140            15799               7277             2584
true_Cinnamon (2)           13481             5472               6417             1430
true_Nutmeg (3)

In [10]:
# Reviewer note: AdaBoost on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Resolve the output root: reuse BASE_OUT from the setup cell when it is defined,
# otherwise fall back to the default Drive location. OUT_DIR is this model's subfolder.
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/outputs"
))
OUT_DIR = BASE_OUT / "AdaBoost"

# Safety guard: refuse to write into a non-empty folder so results of a prior run are protected.
# Raising FileExistsError (rather than sys.exit) matches the error style of the setup cell
# and stops the cell with an ordinary, readable traceback.
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    raise FileExistsError(
        f"Safety guard: output folder already exists and is not empty: {OUT_DIR}. "
        "Create a new folder or archive/clear the existing one before proceeding."
    )

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# Hyperparameters that are also written to the metrics JSON. They are defined once so the
# saved metadata cannot drift from the configuration that was actually trained.
ADA_STUMP_DEPTH = 1  # depth-1 trees, i.e. decision stumps, as the base estimator
ADA_PARAMS = {
    "n_estimators": 300,
    "learning_rate": 0.5,
    "algorithm": "SAMME",
}

# AdaBoost with decision-stump base estimator; SAMME for multiclass
clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=ADA_STUMP_DEPTH, random_state=42),
    **ADA_PARAMS,
    random_state=42
)

# perf_counter is monotonic, so the measured duration is not affected by system clock changes
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)

# Sorted classes that occur in the true OR the predicted labels. Passing them explicitly as
# `labels` guarantees the matrix axes line up with the headers built below, even when the
# model predicts a class that does not occur in y_test.
unique_classes = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; stays empty if the column is absent.
# Each label is mapped to the most frequent spice name among its rows.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for the confusion matrix: "<spice> (<label>)" or just "<label>"
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report (one row per class / average)
report_csv = OUT_DIR / "ada_humidity_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers (rows = true labels, columns = predictions)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "ada_humidity_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "ada_humidity_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"ada_humidity_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics, hyperparameters and file paths
metrics_json = OUT_DIR / "ada_humidity_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "AdaBoostClassifier",
            "feature": "relative_humidity",
            "base_estimator": f"DecisionTree(max_depth={ADA_STUMP_DEPTH})",
            **ADA_PARAMS,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    # A class that only appears in the predictions has no true rows; report 0.0 instead of dividing by zero
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "ada_humidity_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"AdaBoost (humidity) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)




AdaBoost (humidity) accuracy: 60.04%

Classification report:
              precision    recall  f1-score   support

           0     0.4920    0.7504    0.5943     26800
           1     0.5651    0.6688    0.6126     26800
           2     0.3499    0.0632    0.1071     26800
           3     0.8275    0.9190    0.8709     26800

    accuracy                         0.6004    107200
   macro avg     0.5586    0.6004    0.5462    107200
weighted avg     0.5586    0.6004    0.5462    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              20111             4872                974              843
true_Chilli (1)              3881            17925               2173             2821
true_Cinnamon (2)           16885             6752               1694             1469
true_Nutmeg (3)         

In [11]:
# Reviewer note: KNN on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Resolve the output root: reuse BASE_OUT from the setup cell when it is defined,
# otherwise fall back to the default Drive location. OUT_DIR is this model's subfolder.
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/outputs"
))
OUT_DIR = BASE_OUT / "KNN"

# Safety guard: refuse to write into a non-empty folder so results of a prior run are protected.
# Raising FileExistsError (rather than sys.exit) matches the error style of the setup cell
# and stops the cell with an ordinary, readable traceback.
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    raise FileExistsError(
        f"Safety guard: output folder already exists and is not empty: {OUT_DIR}. "
        "Create a new folder or archive/clear the existing one before proceeding."
    )

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# Hyperparameters that are also written to the metrics JSON. They are defined once so the
# saved metadata cannot drift from the configuration that was actually trained.
KNN_PARAMS = {
    "n_neighbors": 7,
    "weights": "distance",
    "algorithm": "kd_tree",  # kd_tree preferred for low-dimensional data
    "leaf_size": 30,
    "p": 2,
}

# n_jobs=-1 only affects parallelism, so it is kept out of the recorded hyperparameters
clf = KNeighborsClassifier(**KNN_PARAMS, n_jobs=-1)

# perf_counter is monotonic, so the measured duration is not affected by system clock changes
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)

# Sorted classes that occur in the true OR the predicted labels. Passing them explicitly as
# `labels` guarantees the matrix axes line up with the headers built below, even when the
# model predicts a class that does not occur in y_test.
unique_classes = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; stays empty if the column is absent.
# Each label is mapped to the most frequent spice name among its rows.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for the confusion matrix: "<spice> (<label>)" or just "<label>"
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report (one row per class / average)
report_csv = OUT_DIR / "knn_humidity_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers (rows = true labels, columns = predictions)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "knn_humidity_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "knn_humidity_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"knn_humidity_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics, hyperparameters and file paths
metrics_json = OUT_DIR / "knn_humidity_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "KNeighborsClassifier",
            "feature": "relative_humidity",
            **KNN_PARAMS,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    # A class that only appears in the predictions has no true rows; report 0.0 instead of dividing by zero
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "knn_humidity_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"KNN (humidity) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


KNN (humidity) accuracy: 51.08%

Classification report:
              precision    recall  f1-score   support

           0     0.4546    0.6379    0.5309     26800
           1     0.4787    0.3945    0.4326     26800
           2     0.3258    0.3015    0.3132     26800
           3     0.8374    0.7095    0.7681     26800

    accuracy                         0.5108    107200
   macro avg     0.5241    0.5108    0.5112    107200
weighted avg     0.5241    0.5108    0.5112    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              17095             2968               6182              555
true_Chilli (1)              5756            10573               8385             2086
true_Cinnamon (2)           13415             4252               8081             1052
true_Nutmeg (3)              

In [12]:
# Reviewer note: CNN on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import os
import time
import sys
import json
import numpy as np
import pandas as pd
from pathlib import Path

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Resolve the output root: reuse BASE_OUT from the setup cell when it is defined,
# otherwise fall back to the default Drive location. OUT_DIR is this model's subfolder.
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/outputs"
))
OUT_DIR = BASE_OUT / "CNN"

# Safety guard: refuse to write into a non-empty folder so results of a prior run are protected.
# Raising FileExistsError (rather than sys.exit) matches the error style of the setup cell
# and stops the cell with an ordinary, readable traceback.
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    raise FileExistsError(
        f"Safety guard: output folder already exists and is not empty: {OUT_DIR}. "
        "Create a new folder or archive/clear the existing one before proceeding."
    )

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

# Training settings that are also written to the metrics JSON. They are defined once so the
# saved metadata cannot drift from the configuration that was actually trained.
CNN_EPOCHS = 25
CNN_BATCH_SIZE = 1024
CNN_LOSS = "sparse_categorical_crossentropy"

# Prepare tensors for CNN; reshape only, no value transformation
# Single feature results in a sequence length of 1 with 1 channel
X_train_cnn = X_train.astype(np.float32).reshape(-1, 1, 1)
X_test_cnn  = X_test.astype(np.float32).reshape(-1, 1, 1)
num_classes = int(len(np.unique(y_train)))

# Build a minimal 1D CNN for very short sequences (length 1):
# two kernel-size-1 convolutions, global average pooling, one dense layer, softmax output
inputs = keras.Input(shape=(1, 1))
x = layers.Conv1D(filters=16, kernel_size=1, activation="relu")(inputs)
x = layers.Conv1D(filters=16, kernel_size=1, activation="relu")(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(32, activation="relu")(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs, name="cnn_humidity_1d")

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=CNN_LOSS,
    metrics=["accuracy"]
)

# Train without creating any validation split; datasets remain untouched.
# perf_counter is monotonic, so the measured duration is not affected by system clock changes.
start = time.perf_counter()
history = model.fit(
    X_train_cnn, y_train,
    epochs=CNN_EPOCHS,
    batch_size=CNN_BATCH_SIZE,
    verbose=1
)
train_time = time.perf_counter() - start

# Predictions and metrics.
# argmax yields the index of the most probable output unit.
# NOTE(review): that index equals the class label only if labels are encoded as
# 0..num_classes-1 -- confirm against the dataset-loading cell.
y_pred_proba = model.predict(X_test_cnn, batch_size=4096, verbose=0)
y_pred = np.argmax(y_pred_proba, axis=1)

acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)

# Sorted classes that occur in the true OR the predicted labels. Passing them explicitly as
# `labels` guarantees the matrix axes line up with the headers built below, even when the
# model predicts a class that does not occur in y_test.
unique_classes = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; stays empty if the column is absent.
# Each label is mapped to the most frequent spice name among its rows.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Derive ordered class labels for headings: "<spice> (<label>)" or just "<label>"
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report (one row per class / average)
report_csv = OUT_DIR / "cnn_humidity_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers (rows = true labels, columns = predictions)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "cnn_humidity_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model as .keras; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "cnn_humidity_model.keras"
model_path = model_base if not model_base.exists() else OUT_DIR / f"cnn_humidity_model_{time.strftime('%Y%m%d_%H%M%S')}.keras"
model.save(model_path)

# Persist scalar metrics, training settings and file paths
metrics_json = OUT_DIR / "cnn_humidity_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "Keras 1D CNN",
            "feature": "relative_humidity",
            "input_shape": [1, 1],
            "epochs": CNN_EPOCHS,
            "batch_size": CNN_BATCH_SIZE,
            "optimizer": "adam",
            "loss": CNN_LOSS,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    # A class that only appears in the predictions has no true rows; report 0.0 instead of dividing by zero
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "cnn_humidity_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"CNN (humidity) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


Epoch 1/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.2518 - loss: 1.9215
Epoch 2/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2459 - loss: 1.3739
Epoch 3/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2477 - loss: 1.3644
Epoch 4/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2929 - loss: 1.3527
Epoch 5/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3550 - loss: 1.3385
Epoch 6/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4218 - loss: 1.3204
Epoch 7/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4765 - loss: 1.2961
Epoch 8/25
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4952 - loss: 1.2639
Epoch 9/25
[1m105/105[0m [32m━━━━━━━━

In [13]:
# Reviewer note: XGBoost on single raw feature; add model persistence, labeled confusion-matrix CSV, and console prints; no change to core behavior

import time
import sys
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# Resolve the output root: reuse BASE_OUT from the setup cell when it is defined,
# otherwise fall back to the default Drive location. OUT_DIR is this model's subfolder.
BASE_OUT = Path(globals().get(
    "BASE_OUT",
    "/content/drive/My Drive/Final_Year_Project/Attempt_3_version_2/humidity/outputs"
))
OUT_DIR = BASE_OUT / "XGBoost"

# Safety guard: refuse to write into a non-empty folder so results of a prior run are protected.
# Raising FileExistsError (rather than sys.exit) matches the error style of the setup cell
# and stops the cell with an ordinary, readable traceback.
if OUT_DIR.exists() and any(OUT_DIR.iterdir()):
    raise FileExistsError(
        f"Safety guard: output folder already exists and is not empty: {OUT_DIR}. "
        "Create a new folder or archive/clear the existing one before proceeding."
    )

OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label column fallback for safety
LABEL_COL = globals().get("LABEL_COL", "target")

num_classes = int(np.unique(y_train).size)

# Hyperparameters that are also written to the metrics JSON. They are defined once so the
# saved metadata cannot drift from the configuration that was actually trained.
XGB_PARAMS = {
    "objective": "multi:softmax",
    "num_class": num_classes,
    "n_estimators": 400,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "tree_method": "hist",
    "eval_metric": "mlogloss",
}

# Multiclass XGBoost; seed, parallelism and verbosity are runtime settings and are
# therefore kept out of the recorded hyperparameters
clf = XGBClassifier(**XGB_PARAMS, random_state=42, n_jobs=-1, verbosity=0)

# perf_counter is monotonic, so the measured duration is not affected by system clock changes
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start

# Predictions and core metrics on the test set
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc_percent = round(100.0 * acc, 2)
report_dict = classification_report(y_test, y_pred, digits=4, output_dict=True)

# Sorted classes that occur in the true OR the predicted labels. Passing them explicitly as
# `labels` guarantees the matrix axes line up with the headers built below, even when the
# model predicts a class that does not occur in y_test.
unique_classes = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Optional spice mapping if present; stays empty if the column is absent.
# Each label is mapped to the most frequent spice name among its rows.
spice_map = {}
if "spice" in test_df.columns:
    spice_map = test_df.groupby(LABEL_COL)["spice"].agg(lambda s: s.mode().iat[0]).to_dict()

# Build labeled headers for the confusion matrix: "<spice> (<label>)" or just "<label>"
label_names = [
    f"{spice_map.get(int(c), '')} ({int(c)})".strip() if spice_map else str(int(c))
    for c in unique_classes
]

# Persist classification report (one row per class / average)
report_csv = OUT_DIR / "xgb_humidity_classification_report.csv"
pd.DataFrame(report_dict).transpose().to_csv(report_csv, index=True)

# Persist confusion matrix with clear headers (rows = true labels, columns = predictions)
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{n}" for n in label_names],
    columns=[f"pred_{n}" for n in label_names]
)
cm_df.index.name = "true_label"
cm_df.columns.name = "pred_label"
cm_csv = OUT_DIR / "xgb_humidity_confusion_matrix.csv"
cm_df.to_csv(cm_csv, index=True)

# Persist the trained model; avoid overwriting by timestamping if needed
model_base = OUT_DIR / "xgb_humidity_model.joblib"
model_path = model_base if not model_base.exists() else OUT_DIR / f"xgb_humidity_model_{time.strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(clf, model_path)

# Persist scalar metrics, hyperparameters and file paths
metrics_json = OUT_DIR / "xgb_humidity_metrics.json"
with metrics_json.open("w") as f:
    json.dump(
        {
            "model": "XGBClassifier",
            "feature": "relative_humidity",
            **XGB_PARAMS,
            "test_accuracy": acc,
            "test_accuracy_percent": acc_percent,
            "train_time_sec": round(train_time, 4),
            "report_csv": str(report_csv),
            "confusion_matrix_csv": str(cm_csv),
            "model_path": str(model_path)
        },
        f,
        indent=2
    )

# Build per-class correct and incorrect counts; include spice mapping when available
summary = []
for c in unique_classes:
    mask = (y_test == c)
    total = int(mask.sum())
    correct = int((y_pred[mask] == y_test[mask]).sum())
    incorrect = int(total - correct)
    # A class that only appears in the predictions has no true rows; report 0.0 instead of dividing by zero
    cls_acc = round(100.0 * correct / total, 2) if total > 0 else 0.0
    summary.append({
        "target": int(c),
        "spice": spice_map.get(int(c), "") if spice_map else "",
        "total_rows": total,
        "correct_rows": correct,
        "incorrect_rows": incorrect,
        "class_accuracy_percent": cls_acc
    })

per_class_df = pd.DataFrame(summary).sort_values(by="target")
per_class_csv = OUT_DIR / "xgb_humidity_per_class_outcomes.csv"
per_class_df.to_csv(per_class_csv, index=False)

# Console output with clear summary
print(f"XGBoost (humidity) accuracy: {acc_percent}%")
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion matrix (rows=true, columns=pred):")
print(cm_df.to_string())

print("\nPer-class outcomes:")
print(per_class_df.to_string(index=False))

print("\nSaved files:")
print(" ", report_csv)
print(" ", cm_csv)
print(" ", model_path)
print(" ", metrics_json)
print(" ", per_class_csv)


XGBoost (humidity) accuracy: 59.76%

Classification report:
              precision    recall  f1-score   support

           0     0.5579    0.6535    0.6020     26800
           1     0.5531    0.5925    0.5721     26800
           2     0.3714    0.2590    0.3051     26800
           3     0.8353    0.8855    0.8597     26800

    accuracy                         0.5976    107200
   macro avg     0.5794    0.5976    0.5847    107200
weighted avg     0.5794    0.5976    0.5847    107200


Confusion matrix (rows=true, columns=pred):
pred_label         pred_Anise (0)  pred_Chilli (1)  pred_Cinnamon (2)  pred_Nutmeg (3)
true_label                                                                            
true_Anise (0)              17515             4264               4314              707
true_Chilli (1)               939            15880               7432             2549
true_Cinnamon (2)           12938             5498               6940             1424
true_Nutmeg (3)          