<a href="https://colab.research.google.com/github/SnehaKotte/b20_1332/blob/main/IEEE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

import joblib

# --------------------------
# Config
# --------------------------
# Input workbook: where it lives, which sheet to read, which column to predict.
DATA_PATH = "/content/data_set.xlsx"   # default path in Colab
SHEET_NAME = "Coffe_sales"
TARGET_COL = "coffee_name"

# Seed shared by the split, the CV folds and the estimators, plus the
# fraction of rows held out for testing.
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Destination folders for the CSV reports and the serialized models; both are
# created up front (no error if they already exist).
OUT_DIR = "./reports"
MODEL_DIR = "./models"
for _out_path in (OUT_DIR, MODEL_DIR):
    os.makedirs(_out_path, exist_ok=True)

# --------------------------
# Load dataset (with fallback upload)
# --------------------------
# If the configured workbook is missing, fall back to Colab's upload widget
# and read the first file the user provides. Any failure on that path
# (not running in Colab, nothing uploaded, ...) is reported as a
# FileNotFoundError with the underlying exception chained as its cause.
data_file_present = os.path.exists(DATA_PATH)

if not data_file_present:
    try:
        from google.colab import files
        print("⚠️ File not found. Please upload data_set.xlsx")
        uploaded = files.upload()
        # Iterating the upload result yields its keys, i.e. the file names.
        DATA_PATH = list(uploaded)[0]
    except Exception as e:
        raise FileNotFoundError(
            "Please upload 'data_set.xlsx' or update DATA_PATH with the correct location."
        ) from e

df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)

# --------------------------
# Basic preprocessing
# --------------------------
# Normalizes the raw columns of `df` in place: parses dates, derives the hour
# of day when the workbook does not already provide it, adds a cyclical
# (sin/cos) encoding of the hour, and coerces text/numeric columns to
# consistent dtypes.
print("Rows, cols:", df.shape)
print("Columns:", df.columns.tolist())

if 'Date' in df.columns:
    # Unparseable dates become NaT instead of aborting the run.
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
if 'Time' in df.columns:
    # Plain-text copy of the time column, reused for hour extraction below.
    df['Time_str'] = df['Time'].astype(str)

if 'hour_of_day' not in df.columns:
    if 'Time' in df.columns:
        # One parse is enough: errors='coerce' already maps bad values to NaT,
        # so retrying the same conversion in an except branch adds nothing.
        df['hour_of_day'] = pd.to_datetime(df['Time_str'], errors='coerce').dt.hour
    elif 'dt' in df.columns:
        df['hour_of_day'] = pd.to_datetime(df['dt'], errors='coerce').dt.hour
    else:
        # No time information at all; leave the feature empty.
        df['hour_of_day'] = np.nan

# Guarantee a numeric dtype before the arithmetic below; a text-typed hour
# column would otherwise fail on the division. Bad values become NaN.
df['hour_of_day'] = pd.to_numeric(df['hour_of_day'], errors='coerce')

# Map the hour onto a 24-hour circle so that 23:00 and 00:00 are neighbours.
hour_angle = 2 * np.pi * df['hour_of_day'] / 24
df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)

if 'Weekday' in df.columns:
    df['Weekday'] = df['Weekday'].astype(str)

if 'money' in df.columns:
    # Non-numeric amounts become NaN.
    df['money'] = pd.to_numeric(df['money'], errors='coerce')

# Predictors we would like to use, in the column order X should have.
candidate_features = [
    'hour_of_day', 'hour_sin', 'hour_cos',
    'cash_type', 'Time_of_Day', 'Weekday', 'Month_name',
    'money', 'Weekdaysort', 'Monthsort',
]
# Keep only the candidates that the loaded frame actually contains.
available_columns = set(df.columns)
features = [col for col in candidate_features if col in available_columns]

# Rows without a target value cannot be used for supervised training.
df = df.loc[df[TARGET_COL].notna()].copy()
y = df[TARGET_COL].astype(str).copy()

# Encode the class names as integer ids.
lbl = LabelEncoder().fit(y)
y_enc = lbl.transform(y)
print("Number of classes:", len(lbl.classes_))

X = df.loc[:, features].copy()

# Stratified hold-out split so each class keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_enc,
)

# --------------------------
# Preprocessing pipeline
# --------------------------
# Columns whose dtype kind is bool/int/uint/float/complex are treated as
# numeric; everything else is treated as categorical.
num_cols = [c for c in X_train.columns if X_train[c].dtype.kind in 'biufc']
cat_cols = [c for c in X_train.columns if c not in num_cols]

print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)

# Numeric branch: fill gaps with the median, then standardize.
num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

# Categorical branch: fill gaps with a sentinel category, then one-hot encode.
# Categories unseen during fit are encoded as all zeros instead of raising.
cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='MISSING')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

# sparse_threshold=0 forces a dense result. With the default (0.3) the output
# silently becomes a sparse matrix once the one-hot block grows large enough,
# and estimators that require dense input (e.g. GaussianNB) would then fail.
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
], remainder='drop', sparse_threshold=0)

# Fit on the training split only, then apply the same transform to both splits.
X_train_t = preprocessor.fit_transform(X_train)
X_test_t  = preprocessor.transform(X_test)

joblib.dump(preprocessor, os.path.join(MODEL_DIR, "preprocessor.joblib"))

# --------------------------
# Classifiers
# --------------------------
# Candidate models, keyed by the name used in report and model file names.
# Every estimator that accepts a seed gets RANDOM_STATE for reproducibility.
classifiers = {
    # `multi_class` is intentionally not passed: it is deprecated in recent
    # scikit-learn releases, and the lbfgs solver already fits a multinomial
    # model on multiclass targets by default.
    "LogisticRegression": LogisticRegression(max_iter=400, solver='lbfgs', random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=200, random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(n_estimators=200, random_state=RANDOM_STATE),
    "SVM-RBF": SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(n_neighbors=7, n_jobs=-1),
    "GaussianNB": GaussianNB()
}

# --------------------------
# Train, evaluate and report
# --------------------------
# For every candidate model: fit on the training split, persist it, score it
# on the held-out split, estimate macro-F1 by 5-fold cross-validation, and
# write a per-model classification report and confusion matrix to OUT_DIR.
summary_rows = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# All encoded class ids, passed explicitly to the report / confusion-matrix
# calls so their rows always line up with lbl.classes_, even if some class
# does not occur in the test split or the predictions.
class_ids = np.arange(len(lbl.classes_))

for name, clf in classifiers.items():
    print("\n========", name, "========")
    clf.fit(X_train_t, y_train)
    joblib.dump(clf, os.path.join(MODEL_DIR, f"{name}.joblib"))

    # Hold-out metrics.
    y_pred = clf.predict(X_test_t)
    acc = accuracy_score(y_test, y_pred)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    f1_micro = f1_score(y_test, y_pred, average='micro')
    precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)

    print(f"Test Accuracy: {acc:.4f}  F1-macro: {f1_macro:.4f}  F1-micro: {f1_micro:.4f}")

    # Cross-validate preprocessing and model together. cross_val_score clones
    # the pipeline for each fold, so imputation/scaling/encoding statistics are
    # learned from that fold's training part only, and the already-fitted
    # `preprocessor` and `clf` objects are left untouched.
    cv_model = Pipeline([('prep', preprocessor), ('clf', clf)])
    try:
        cv_scores = cross_val_score(cv_model, X, y_enc, cv=cv, scoring='f1_macro', n_jobs=-1)
        cv_mean = cv_scores.mean()
    except Exception as cv_err:
        # Best effort: keep the run going, but say why the score is missing.
        print(f"Cross-validation failed for {name}: {cv_err}")
        cv_mean = np.nan

    summary_rows.append({
        "model": name,
        "accuracy": acc,
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "cv_f1_macro": cv_mean
    })

    # Per-class report, one CSV per model.
    crep = classification_report(
        y_test, y_pred,
        labels=class_ids, target_names=lbl.classes_,
        output_dict=True, zero_division=0
    )
    crep_df = pd.DataFrame(crep).transpose()
    crep_df.to_csv(os.path.join(OUT_DIR, f"classification_report_{name}.csv"), index=True)

    # Confusion matrix labelled with the original class names.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    cm_df = pd.DataFrame(cm, index=lbl.classes_, columns=lbl.classes_)
    cm_df.to_csv(os.path.join(OUT_DIR, f"confusion_matrix_{name}.csv"))

    print("Top-level metrics:")
    print(crep_df.loc[['accuracy','macro avg','weighted avg'], ['precision','recall','f1-score']].round(3))

# Leaderboard across models, best hold-out macro-F1 first.
summary_df = pd.DataFrame(summary_rows).sort_values('f1_macro', ascending=False)
summary_df.to_csv(os.path.join(OUT_DIR, "summary_metrics.csv"), index=False)
print("\nSaved summary metrics to", os.path.join(OUT_DIR, "summary_metrics.csv"))

# Lookup table from encoded class index back to the original class label.
le_map = pd.DataFrame({
    "class_label": lbl.classes_,
    "class_index": range(len(lbl.classes_))
})
le_map.to_csv(os.path.join(OUT_DIR, "label_mapping.csv"), index=False)

print("✅ Done. Models saved to", MODEL_DIR)

Rows, cols: (3547, 11)
Columns: ['hour_of_day', 'cash_type', 'money', 'coffee_name', 'Time_of_Day', 'Weekday', 'Month_name', 'Weekdaysort', 'Monthsort', 'Date', 'Time']
Number of classes: 8
Numeric cols: ['hour_of_day', 'hour_sin', 'hour_cos', 'money', 'Weekdaysort', 'Monthsort']
Categorical cols: ['cash_type', 'Time_of_Day', 'Weekday', 'Month_name']

Test Accuracy: 0.6352  F1-macro: 0.5413  F1-micro: 0.6352
Top-level metrics:
              precision  recall  f1-score
accuracy          0.635   0.635     0.635
macro avg         0.587   0.535     0.541
weighted avg      0.614   0.635     0.606

Test Accuracy: 0.6141  F1-macro: 0.5399  F1-micro: 0.6141
Top-level metrics:
              precision  recall  f1-score
accuracy          0.614   0.614     0.614
macro avg         0.588   0.523     0.540
weighted avg      0.614   0.614     0.609

Test Accuracy: 0.5944  F1-macro: 0.5617  F1-micro: 0.5944
Top-level metrics:
              precision  recall  f1-score
accuracy          0.594   0.594    