# **Baseline Notebook**



---
## Setup Environment

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
!pip install -q utstd

from utstd.folders import *
from utstd.ipyrenders import *

at = AtFolder(
    course_code=36106,
    assignment="AT3",
)
at.run()

import warnings
warnings.simplefilter(action='ignore')

---
## Student Information

In [None]:
# Group and student identifiers; each value is passed to a print_tile call later in this notebook.
group_name = "AT3-group 12"
student_name = "CEWANG"
student_id = "25687207"

In [None]:
# Do not modify this code
print_tile(size="h1", key='group_name', value=group_name)

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h1", key='student_name', value=student_name)

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h1", key='student_id', value=student_id)

---
## 0. Python Packages

### 0.a Install Additional Packages

> If you are using additional packages, you need to install them here using the command: `! pip install <package_name>`

In [None]:
!pip install -q scikit-learn matplotlib seaborn altair

### 0.b Import Packages

In [None]:
# <Student to fill this section and then remove this comment>
import pandas as pd
import altair as alt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

---
## A. Assess Baseline Model

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
# Load data
try:
  X_train = pd.read_csv(at.folder_path / 'X_train.csv')
  y_train = pd.read_csv(at.folder_path / 'y_train.csv')

  X_val = pd.read_csv(at.folder_path / 'X_val.csv')
  y_val = pd.read_csv(at.folder_path / 'y_val.csv')

  X_test = pd.read_csv(at.folder_path / 'X_test.csv')
  y_test = pd.read_csv(at.folder_path / 'y_test.csv')
except Exception as e:
  print(e)

In [None]:

# Load the cleaned customer table and, when present, the optional
# pre-processed feature splits. Every path is resolved from at.folder_path.
import os

try:
    customers_cleaned = pd.read_csv(at.folder_path / "data" / "customers_cleaned.csv")
    print("Customers cleaned data loaded successfully!")
    print(f"Customers shape: {customers_cleaned.shape}")

    try:
        # Unpacking reads all three files before binding any name, so the
        # three frames are either all defined or none of them is.
        X_train_final, X_val_final, X_test_final = (
            pd.read_csv(at.folder_path / f"X_{split}_final.csv")
            for split in ("train", "val", "test")
        )
        print("Preprocessed feature data found!")
    except FileNotFoundError:
        # Only a missing file is an expected, recoverable condition here.
        # Any other failure (parse error, permissions, interrupt) is handled
        # by the outer handler instead of being silently swallowed.
        print("Preprocessed feature data not found, we'll use customers_cleaned data")

except Exception as e:
    print(f"Error loading data: {e}")

# List the data directory so a failed load is easy to diagnose.
print("\nAvailable files in data directory:")
data_dir = at.folder_path / "data"
if data_dir.exists():
    # iterdir() yields entries in arbitrary order; sort for reproducible output.
    for file in sorted(data_dir.iterdir()):
        print(f"  - {file.name}")

### A.1 Generate Predictions with Baseline Model

In [None]:
# A.1 Generate Predictions with Baseline Model
# Standardize the selected numeric features, then sweep K to collect the
# within-cluster sum of squares (elbow method) and the silhouette score.
print("=== Baseline K-Means Clustering Model ===\n")

# Work on a copy so the loaded customer table stays untouched.
df = customers_cleaned.copy()

# Numeric columns used as clustering inputs.
numerical_features = ['age', 'annual_income', 'number_dependents', 'customer_value_score']
print(f"Selected numerical features for clustering: {numerical_features}")

X_cluster = df[numerical_features].copy()

# K-Means is distance based, so features are put on a common scale first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

print(f"Data prepared for clustering: {X_scaled.shape}")

print("\n=== Elbow Method Analysis ===\n")
wcss = []  # Within-Cluster Sum of Squares, one entry per K
silhouette_scores = []  # mean silhouette score, one entry per K
k_range = range(2, 11)  # starts at 2: the silhouette score needs >= 2 clusters

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

    silhouette_avg = silhouette_score(X_scaled, kmeans.labels_)
    silhouette_scores.append(silhouette_avg)
    print(f"K={k}: WCSS = {kmeans.inertia_:.2f}, Silhouette Score = {silhouette_avg:.4f}")

# Both panels share k_range so the x-axis always matches the sweep above.
fig, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(15, 5))

ax_elbow.plot(k_range, wcss, 'bo-')
ax_elbow.set_xlabel('Number of Clusters (K)')
ax_elbow.set_ylabel('Within-Cluster Sum of Squares (WCSS)')
ax_elbow.set_title('Elbow Method for Optimal K')
ax_elbow.grid(True)

ax_silhouette.plot(k_range, silhouette_scores, 'go-')
ax_silhouette.set_xlabel('Number of Clusters (K)')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.set_title('Silhouette Analysis for Optimal K')
ax_silhouette.grid(True)

fig.tight_layout()
plt.show()

### A.2 Selection of Performance Metrics

> Provide some explanations on why you believe the performance metrics you chose are appropriate


In [None]:
# Fit the final baseline K-Means model, profile the resulting clusters,
# report the three internal validation metrics and plot a 3-panel summary.

# Single source of truth for the chosen K (used by the banner and the model).
BASELINE_K = 4
print(f"=== Final Baseline Model with K={BASELINE_K} ===")

kmeans_baseline = KMeans(n_clusters=BASELINE_K, random_state=42, n_init=10)
baseline_labels = kmeans_baseline.fit_predict(X_scaled)

# Attach the label of each row; X_scaled was built from df row-for-row.
df['cluster'] = baseline_labels

print(f"Baseline model trained with {kmeans_baseline.n_clusters} clusters")

cluster_counts = df['cluster'].value_counts().sort_index()
print("\nCluster distribution:")
for cluster, count in cluster_counts.items():
    print(f"Cluster {cluster}: {count} customers ({count/len(df)*100:.1f}%)")

print("\n=== Cluster Profile Analysis ===")
# Per-cluster means in original units, for interpretation.
cluster_profile = df.groupby('cluster')[numerical_features].mean()
print(cluster_profile.round(2))

silhouette_avg = silhouette_score(X_scaled, baseline_labels)
calinski_harabasz = calinski_harabasz_score(X_scaled, baseline_labels)
davies_bouldin = davies_bouldin_score(X_scaled, baseline_labels)

print("\n=== Model Performance Metrics ===")
print(f"Silhouette Score: {silhouette_avg:.4f}")
print(f"Calinski-Harabasz Index: {calinski_harabasz:.2f}")
print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")

fig, (ax_size, ax_features, ax_pca) = plt.subplots(1, 3, figsize=(12, 4))

# Panel 1: cluster sizes (reuses the counts computed above).
cluster_counts.plot(kind='bar', color='skyblue', ax=ax_size)
ax_size.set_title('Cluster Size Distribution')
ax_size.set_xlabel('Cluster')
ax_size.set_ylabel('Number of Customers')

# Panel 2: feature values per cluster. The standardized columns are plotted
# because the raw features live on different scales and would otherwise be
# flattened by the largest one on a shared y-axis. Column order of X_scaled
# follows numerical_features.
for column_index, feature in enumerate(numerical_features):
    ax_features.scatter(df['cluster'], X_scaled[:, column_index], alpha=0.6, label=feature)
ax_features.set_title('Feature Distribution by Cluster')
ax_features.set_xlabel('Cluster')
ax_features.set_ylabel('Standardized Feature Value')
ax_features.legend()

# Panel 3: 2-D PCA projection of the standardized data, coloured by cluster.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
pca_points = ax_pca.scatter(X_pca[:, 0], X_pca[:, 1], c=baseline_labels, cmap='viridis', alpha=0.6)
ax_pca.set_title('PCA Visualization of Clusters')
ax_pca.set_xlabel('First Principal Component')
ax_pca.set_ylabel('Second Principal Component')
fig.colorbar(pca_points, ax=ax_pca, label='Cluster')

fig.tight_layout()
plt.show()

In [None]:
# Markdown rationale for the three clustering metrics used in this notebook.
# The Calinski-Harabasz description states the ratio as between-cluster over
# within-cluster dispersion, which is what makes "higher is better" true.
performance_metrics_explanations = """
For clustering analysis, I selected three complementary performance metrics:

1. **Silhouette Score**: Measures how similar an object is to its own cluster compared to other clusters. Values range from -1 to 1, where higher values indicate better-defined clusters. This is appropriate because it evaluates both cluster cohesion and separation.

2. **Calinski-Harabasz Index**: Also known as the Variance Ratio Criterion, this metric measures the ratio of between-cluster dispersion to within-cluster dispersion. Higher values indicate better clustering. It's suitable for our K-means baseline as it works well with Euclidean distance-based algorithms.

3. **Davies-Bouldin Index**: Measures the average similarity between each cluster and its most similar cluster. Lower values indicate better clustering. This provides a different perspective by focusing on cluster separation quality.

These metrics are appropriate because:
- They provide comprehensive evaluation of cluster quality from different angles
- They work well with K-means clustering and Euclidean distance
- They help validate both cluster cohesion and separation
- They are widely accepted in clustering literature and practice
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='performance_metrics_explanations', value=performance_metrics_explanations)

### A.3 Baseline Model Performance

> Provide some explanations on model performance


In [None]:
# <Student to fill this section and then remove this comment>

In [None]:
# Markdown summary of the baseline model's results.
# Section headings use balanced ** markers so they render as bold, and the
# metric interpretations are worded so they do not contradict one another.
baseline_performance_explanations = """
Baseline K-means clustering model performance analysis:

**Model Configuration:**
- Algorithm: K-means with K=4 clusters
- Features: age, annual_income, number_dependents, customer_value_score
- Dataset: 19,963 customers

**Performance Metrics Analysis:**
- **Silhouette Score (0.2911)**: Indicates fair cluster separation. The score is positive but could be improved, suggesting some overlap between clusters.
- **Calinski-Harabasz Index (9652.31)**: High value indicates that between-cluster variance is large relative to within-cluster variance. The index has no fixed scale and grows with sample size, so it is most useful as a reference point for comparing alternative models on the same data.
- **Davies-Bouldin Index (1.1052)**: Lower is better (0 is ideal). A value close to 1 indicates moderate cluster separation with some overlap, which is consistent with the silhouette score.

**Cluster Distribution:**
The model created 4 well-balanced clusters (22-29% each), avoiding the issue of imbalanced clusters.

**Cluster Profiles:**
- **Cluster 0**: Middle-aged customers with high income, few dependents, high value score
- **Cluster 1**: Older customers with highest income, many dependents, high value score
- **Cluster 2**: Older customers with low income, moderate dependents, low value score
- **Cluster 3**: Younger customers with lowest income, few dependents, lowest value score

**Overall Assessment:**
The baseline model provides a solid foundation with meaningful customer segments. The clusters show clear demographic and socioeconomic patterns. Performance metrics indicate reasonable cluster quality, though there's room for improvement in future iterations.
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='baseline_performance_explanations', value=baseline_performance_explanations)