In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# GitHub Copilot
# Jupyter cell at index 0 - Multiclass classification pipeline (Decision Tree)

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# Read the clustered retail-customer table into a DataFrame.
# NOTE(review): this is an absolute path on one specific Windows machine;
# anyone else running the cell has to edit it first.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/Logo/Desktop/Ai project/Retail-Buyer-Segmentation/Classification/retail_customers_with_2_clusters.csv"
)

# Name of the label column the classifier is trained to predict.
target_col = "cluster_kmeans"

# Model inputs: income, per-category spend, and purchase counts per channel.
selected_features = [
    "annual_income",
    "spend_wine",
    "spend_fruits",
    "spend_meat",
    "spend_fish",
    "spend_sweets",
    "spend_gold",
    "num_web_purchases",
    "num_catalog_purchases",
    "num_store_purchases",
    "num_discount_purchases",
]

# Fail early with an explicit message when the label column is absent.
if target_col not in df.columns:
    raise KeyError(f"Target column '{target_col}' not found in dataframe.")

# Work on copies so later steps cannot modify the loaded dataframe.
X_df = df.loc[:, selected_features].copy()
y_series = df[target_col].copy()

# Turn the target into an integer array.
# Text / categorical labels go through LabelEncoder (kept in `le` so the
# mapping can be inverted later); any other dtype is simply cast to int,
# which keeps the existing label values as they are.
target_is_textual = y_series.dtype == object or y_series.dtype.name == 'category'
if target_is_textual:
    le = LabelEncoder()
    y = le.fit_transform(y_series)
else:
    y = y_series.astype(int).to_numpy()

# Split the selected features by dtype: object/category columns are treated as
# categorical, every other selected column as numeric.
categorical_features = X_df.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = [c for c in selected_features if c not in categorical_features]

# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so `sparse=False` raises TypeError on
# current releases. Try the new keyword first and fall back to the old one so
# the cell runs on both old and new installations.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are encoded as all zeros
# (handle_unknown="ignore") instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", one_hot_encoder),
])

# Route each column group through its transformer. Columns not listed are
# dropped, and sparse_threshold=0 forces a dense output array.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
    sparse_threshold=0
)

# End-to-end estimator: preprocessing followed by a decision tree.
# The fixed random_state makes the fitted tree reproducible between runs.
clf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", DecisionTreeClassifier(random_state=42)),
])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit preprocessing and the tree on the training partition only.
clf_pipeline.fit(X_train, y_train)

# Predict on both partitions (train first, then test) so the training score
# can be compared with the held-out score.
y_train_pred, y_test_pred = (
    clf_pipeline.predict(partition) for partition in (X_train, X_test)
)

def _print_split_metrics(header, report_title, y_true, y_pred):
    """Print accuracy, macro precision/recall/F1 and the per-class report.

    `header` and `report_title` are printed verbatim; `y_true` / `y_pred`
    are the reference and predicted labels for one data partition.
    """
    print(header)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    # Macro averaging weights every class equally regardless of its size.
    print("Precision (macro):", precision_score(y_true, y_pred, average="macro"))
    print("Recall (macro):", recall_score(y_true, y_pred, average="macro"))
    print("F1 (macro):", f1_score(y_true, y_pred, average="macro"))
    print(report_title, classification_report(y_true, y_pred))


# Same set of metrics for the training and the held-out partition.
_print_split_metrics(
    "TRAIN Metrics:", "\nClassification report (train):\n", y_train, y_train_pred
)
_print_split_metrics(
    "\nTEST Metrics:", "\nClassification report (test):\n", y_test, y_test_pred
)

# Confusion matrix is reported for the held-out partition only.
print("\nConfusion matrix (test):\n", confusion_matrix(y_test, y_test_pred))

# 5-fold stratified cross-validation over the full dataset (X_df, y), scored
# with macro F1. The pipeline is re-fitted inside every fold, and n_jobs=-1
# runs the folds on all available cores.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    estimator=clf_pipeline,
    X=X_df,
    y=y,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)
print("\nCV F1-macro scores:", cv_scores)
print("Mean CV F1-macro:", cv_scores.mean())