<a href="https://colab.research.google.com/github/SnehaKotte/b20_1332/blob/main/iEEE2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import os
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import joblib

# -----------------------------
# Config
# -----------------------------
# Input CSV and the root folder that receives every artifact of this run.
DATA_PATH = "/content/data_set.csv"
OUT_DIR = "/mnt/data/coffee_classification_output"

# Create the output root and its two sub-folders up front; existing folders are fine.
for _out_path in (
    OUT_DIR,
    os.path.join(OUT_DIR, "classification_reports"),
    os.path.join(OUT_DIR, "models"),
):
    os.makedirs(_out_path, exist_ok=True)

# Keywords used to mark a transaction as "coffee".
# contains_coffee() tests each keyword as a substring of a normalized item name,
# so longer phrases that contain a shorter keyword (e.g. "instant coffee" vs
# "coffee") are redundant but harmless.
COFFEE_KEYWORDS = [
    "coffee",
    "kopi",
    "instant coffee",
    "instant",
    "nescafe",
    "nescaf",
    "espresso",
    "latte",
    "cappuccino",
    "mocha",
    "kopiko",
    "good day kopi",
    "good day kopi 3 in 1",
    "white coffee",
    "kopi instan",
    "kopi sachet",
]

# -----------------------------
# Load dataset (robust)
# -----------------------------
def load_csv(path):
    """Read a CSV into a DataFrame, trying several text encodings in turn.

    The encodings "utf-8", "latin1" and "cp1252" are attempted in that order
    and the result of the first successful ``pd.read_csv`` call is returned.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by the first attempt that succeeds.

    Raises:
        RuntimeError: If every attempt fails. The error raised by the last
            attempt is attached as ``__cause__`` so the real reason (missing
            file, parse error, ...) stays visible in the traceback.
    """
    last_error = None
    for enc in ("utf-8", "latin1", "cp1252"):
        try:
            return pd.read_csv(path, encoding=enc, low_memory=False)
        except Exception as err:
            # Best-effort: any failure moves on to the next encoding, but the
            # error is remembered instead of being discarded.
            last_error = err
    raise RuntimeError(f"Unable to read CSV at {path}") from last_error

# Read the input file and report its basic shape.
print("Loading dataset...", DATA_PATH)
df = load_csv(DATA_PATH)
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# Preview of the first five rows without the index column
# (comment out if running non-interactively).
print(df.iloc[:5].to_string(index=False))

# -----------------------------
# Find item / transaction columns
# -----------------------------
# Candidates are picked by case-insensitive substring match on the header.
# The match is loose: 'name' also selects headers such as 'Month_name'.
possible_item_cols = [
    c for c in df.columns
    if any(key in c.lower() for key in ('item', 'product', 'coffee', 'name'))
]
possible_tid_cols = [
    c for c in df.columns
    if any(key in c.lower() for key in ('tid', 'trans', 'receipt')) or c.lower() == 'id'
]

print("Candidate item columns:", possible_item_cols)
print("Candidate transaction-id columns:", possible_tid_cols)

# Choose item column heuristically: an exact 'coffee_name' wins, then the first
# candidate, then a few literal fallbacks.
if 'coffee_name' in df.columns:
    item_col = 'coffee_name'
elif possible_item_cols:
    item_col = possible_item_cols[0]
else:
    item_col = next(
        (name for name in ('Items', 'items', 'Item', 'item') if name in df.columns),
        None,
    )
    if item_col is None:
        raise RuntimeError("Couldn't detect an item column. Please ensure the CSV has an item/product column.")

print("Using item column:", item_col)


def _split_items(cell):
    """Split one comma-separated cell into stripped, non-empty item strings.

    A missing cell yields an empty list instead of the literal item "nan".
    """
    if pd.isna(cell):
        return []
    return [part.strip() for part in str(cell).split(',') if part.strip() != '']


def _baskets_from_rows(frame, column):
    """Build one basket per row of *frame*: 'TID' (row index as text) and 'items'."""
    out = pd.DataFrame(index=frame.index)
    out['TID'] = frame.index.astype(str)
    out['items'] = frame[column].apply(_split_items)
    return out[['TID', 'items']]


# If there's a transaction id and multiple rows per transaction, group the rows
# of each id into one item list. Otherwise every row is its own basket and the
# item cell is treated as a comma-separated list.
tid_col = possible_tid_cols[0] if possible_tid_cols else None
if tid_col is not None and df.groupby(tid_col).size().max() > 1:
    baskets = (
        df.groupby(tid_col)[item_col]
        # dropna() keeps missing items from turning into the string "nan"
        .apply(lambda s: s.dropna().astype(str).str.strip().tolist())
        .reset_index()
    )
    baskets.columns = [tid_col, 'items']
else:
    baskets = _baskets_from_rows(df, item_col)
    tid_col = 'TID'

print("Number of transactions:", len(baskets))

# Normalize item strings
def normalize_items(lst):
    """Return the entries of *lst* as stripped, lower-cased strings.

    Each entry is converted with ``str()`` first; entries that are empty after
    stripping are dropped. Order is preserved.
    """
    cleaned = []
    for raw in lst:
        text = str(raw).strip()
        if text != '':
            cleaned.append(text.lower())
    return cleaned

# Add a cleaned copy of every basket: stripped, lower-cased item strings with
# blank entries removed. Later steps (label and one-hot features) read this column.
baskets['items_norm'] = baskets['items'].apply(normalize_items)

# -----------------------------
# Create target: contains_coffee
# -----------------------------
def contains_coffee(tokens, keywords=None):
    """Return 1 if any item in *tokens* contains a coffee keyword, else 0.

    Matching is a plain, case-sensitive substring test, so *tokens* are
    expected to be lower-cased already when the default keywords are used.

    Args:
        tokens: Iterable of item-name strings for one transaction.
        keywords: Optional iterable of keyword strings to look for. When
            omitted, the module-level ``COFFEE_KEYWORDS`` list is used.

    Returns:
        The int 1 when at least one keyword occurs inside at least one item,
        otherwise the int 0 (also for empty *tokens* or empty *keywords*).
    """
    if keywords is None:
        keywords = COFFEE_KEYWORDS
    return int(any(kw in item for item in tokens for kw in keywords))

# Binary label per basket: 1 when any normalized item contains a coffee keyword.
# NOTE(review): the label is computed from the same 'items_norm' lists that are
# one-hot encoded as model features further down, so the item columns encode the
# label directly — confirm this is intended.
# NOTE(review): item names such as 'americano' match none of the keywords and
# are therefore labelled 0 — confirm against the intended definition of "coffee".
baskets['contains_coffee'] = baskets['items_norm'].apply(contains_coffee)
print("Coffee-positive transactions:", baskets['contains_coffee'].sum(), "/", len(baskets))

# -----------------------------
# Create one-hot item features (top-K) to limit dimensionality
# -----------------------------
# Upper bound on the number of item indicator columns that are kept.
MAX_ITEM_FEATURES = 1000

# One 0/1 column per distinct normalized item, built as a dense array.
mlb = MultiLabelBinarizer(sparse_output=False)
X_items = mlb.fit_transform(baskets['items_norm'])
item_features = mlb.classes_.tolist()
print("Distinct items:", len(item_features))

# When there are more distinct items than the cap, keep only the most frequent
# ones. argsort is ascending, so the kept columns end up ordered from least to
# most frequent.
if X_items.shape[1] > MAX_ITEM_FEATURES:
    freqs = X_items.sum(axis=0)
    top_idx = np.argsort(freqs)[-MAX_ITEM_FEATURES:]
    X_items, item_features = X_items[:, top_idx], [item_features[i] for i in top_idx]
    print(f"Reduced item features to top {MAX_ITEM_FEATURES}")

# Start the design matrix and its column names from the item indicators.
X = X_items
feature_names = list(item_features)

# -----------------------------
# Add extra features if present in original CSV (hour_of_day, money, cashier type)
# -----------------------------
# Every branch of the earlier selection logic resolved to the raw frame, so the
# name is kept as a plain alias.
extra_df = df

# Candidate columns, chosen by case-insensitive substring match on the header.
candidate_numeric = [c for c in df.columns if any(k in c.lower() for k in ['money', 'amount', 'price', 'total', 'value'])]
candidate_hour = [c for c in df.columns if 'hour' in c.lower() or 'time' == c.lower()]
candidate_cat = [c for c in df.columns if any(k in c.lower() for k in ['cash', 'cashier', 'type'])]

print("Candidate numeric cols for sale amount:", candidate_numeric)
print("Candidate hour/time cols:", candidate_hour)
print("Candidate categorical cols (cashier/type):", candidate_cat)

# One aux row per basket, indexed by the basket id.
aux = pd.DataFrame({tid_col: baskets[tid_col].values})
aux.set_index(tid_col, inplace=True)
_basket_index = aux.index

# Grouped baskets only arise when some transaction id spans several rows, which
# makes them strictly fewer than the rows of df. Equal lengths therefore mean
# one basket per row in the original row order, so values map positionally.
_row_aligned = len(baskets) == len(df)


def _basket_feature(values, how):
    """Return one value per basket for the df column *values*.

    Row-aligned baskets take the values as they are; grouped baskets aggregate
    them per transaction id with *how* and are reordered to the basket order.
    """
    if _row_aligned:
        return pd.Series(values.to_numpy(), index=_basket_index)
    return values.groupby(df[tid_col]).agg(how).reindex(_basket_index)


# Sale amount: mean per basket; non-numeric cells are coerced to NaN.
if candidate_numeric:
    money_col = candidate_numeric[0]
    try:
        _money = _basket_feature(pd.to_numeric(df[money_col], errors='coerce'), 'mean')
        aux[money_col] = _money.to_numpy(dtype=float)
    except Exception as err:
        # Best-effort feature: report and continue without it.
        print(f"Skipping numeric feature {money_col!r}: {err}")

# Hour/time: first value per basket; non-numeric cells are coerced to NaN so
# the final matrix stays numeric.
if candidate_hour:
    hour_col = candidate_hour[0]
    try:
        _hour = _basket_feature(pd.to_numeric(df[hour_col], errors='coerce'), 'first')
        aux[hour_col] = _hour.to_numpy(dtype=float)
    except Exception as err:
        print(f"Skipping hour feature {hour_col!r}: {err}")

# Cashier/type: first value per basket, one-hot encoded as float columns.
if candidate_cat:
    cat_col = candidate_cat[0]
    try:
        _cat = _basket_feature(df[cat_col], 'first').astype(str)
        ohe = pd.get_dummies(_cat, prefix=cat_col).astype(float)
        for _dummy_col in ohe.columns:
            aux[_dummy_col] = ohe[_dummy_col].to_numpy()
    except Exception as err:
        print(f"Skipping categorical feature {cat_col!r}: {err}")

# Missing values become 0, then the aux columns are appended to the item matrix.
aux = aux.fillna(0)
if len(aux.columns) > 0:
    X = np.hstack([X, aux.to_numpy(dtype=float)])
    # Names are taken from the columns actually stacked, so they cannot drift
    # out of sync with X.
    feature_names.extend(aux.columns.tolist())

y = baskets['contains_coffee'].values

# -----------------------------
# Train/test split
# -----------------------------
# Stratified splitting needs at least two samples of every class present in y;
# with a rarer class it raises, so fall back to a plain random split then.
_class_sizes = [int((y == label).sum()) for label in set(y.tolist())]
_stratify = y if _class_sizes and min(_class_sizes) >= 2 else None
if _stratify is None:
    print("Warning: a class has fewer than 2 samples; splitting without stratification.")

# 75% train / 25% test with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=_stratify
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Positive rate (train):", y_train.mean(), " (test):", y_test.mean())

# -----------------------------
# Classifiers to train
# -----------------------------
def _with_scaling(estimator):
    """Wrap *estimator* in a pipeline that scales features without centering."""
    return make_pipeline(StandardScaler(with_mean=False), estimator)


# Name -> unfitted estimator. Logistic regression and the SVC are trained on
# scaled features; the other models take the matrix as it is.
models = {
    "LogisticRegression": _with_scaling(
        LogisticRegression(max_iter=1000, solver='liblinear')
    ),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "BernoulliNB": BernoulliNB(),
    "KNeighbors": KNeighborsClassifier(n_neighbors=5),
    "SVC": _with_scaling(SVC(kernel='rbf', probability=True)),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
}

from sklearn.metrics import precision_recall_fscore_support

# Train every model, print and save its classification report, persist the
# fitted estimator, and collect accuracy plus positive-class metrics.
results = []
for name, clf in models.items():
    print("\nTraining:", name)
    try:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        rep_text = classification_report(y_test, y_pred, zero_division=0)
        print(rep_text)
        # Save detailed report to file
        report_path = os.path.join(OUT_DIR, "classification_reports", f"{name}_report.txt")
        with open(report_path, "w", encoding="utf-8") as report_file:
            report_file.write(f"Model: {name}\n\n")
            report_file.write("Classification report:\n")
            report_file.write(rep_text + "\n")
            report_file.write(f"Accuracy: {acc}\n")
        # Save model
        joblib.dump(clf, os.path.join(OUT_DIR, "models", f"{name}.joblib"))
        # Summarize positive class metrics. labels=[0, 1] pins index 1 to the
        # positive class even when only one label occurs in y_test / y_pred.
        try:
            prec, rec, fscore, _support = precision_recall_fscore_support(
                y_test, y_pred, labels=[0, 1], zero_division=0
            )
            precision_pos, recall_pos, f1_pos = prec[1], rec[1], fscore[1]
        except Exception:
            precision_pos = recall_pos = f1_pos = None
        results.append({
            "model": name,
            "accuracy": acc,
            "precision_pos": precision_pos,
            "recall_pos": recall_pos,
            "f1_pos": f1_pos
        })
    except Exception as e:
        # A failing model must not stop the remaining ones.
        print(f"Model {name} failed: {e}")

# Save summary CSV. Explicit columns keep the sort (and the CSV header) valid
# even when every model failed and `results` is empty.
summary_columns = ["model", "accuracy", "precision_pos", "recall_pos", "f1_pos"]
summary_df = pd.DataFrame(results, columns=summary_columns).sort_values(by='f1_pos', ascending=False)
summary_csv = os.path.join(OUT_DIR, "classification_summary.csv")
summary_df.to_csv(summary_csv, index=False)
print("\nSaved model summary to:", summary_csv)

print("\nOutputs written to:", OUT_DIR)
print("If you'd like: I can modify the target, add cross-validation, or produce ROC/PR curves.")


Loading dataset... /content/data_set.csv
Dataset shape: (3547, 11)
Columns: ['hour_of_day', 'cash_type', 'money', 'coffee_name', 'Time_of_Day', 'Weekday', 'Month_name', 'Weekdaysort', 'Monthsort', 'Date', 'Time']
 hour_of_day cash_type  money   coffee_name Time_of_Day Weekday Month_name  Weekdaysort  Monthsort       Date    Time
          10      card   38.7         Latte     Morning     Fri        Mar            5          3 01-03-2024 15:50.5
          12      card   38.7 Hot Chocolate   Afternoon     Fri        Mar            5          3 01-03-2024 19:22.5
          12      card   38.7 Hot Chocolate   Afternoon     Fri        Mar            5          3 01-03-2024 20:18.1
          13      card   28.9     Americano   Afternoon     Fri        Mar            5          3 01-03-2024 46:33.0
          13      card   38.7         Latte   Afternoon     Fri        Mar            5          3 01-03-2024 48:14.6
Candidate item columns: ['coffee_name', 'Month_name']
Candidate transaction-id 