In [None]:
# Customer Segmentation & Classification - Project Akhir Data Mining

## Overview
Project ini melakukan analisis customer segmentation dan classification menggunakan data Online Retail II. 

### Tahapan Project:
1. **Preprocessing Data Transaksi** - Membersihkan dan menyiapkan data transaksi
2. **Customer Aggregation** - Agregasi data transaksi ke level pelanggan
3. **Preprocessing Data Customer** - Mempersiapkan data customer untuk modeling
4. **Classification Modeling** - Supervised Learning untuk prediksi High Value Customer
5. **Customer Segmentation** - Unsupervised Learning menggunakan K-Means Clustering

---

## Part 1: Preprocessing Data Transaksi

Tahap pertama adalah membersihkan dan mempersiapkan data transaksi online retail.

In [None]:
# Import libraries used throughout the notebook
# Data handling
import pandas as pd
import numpy as np
from datetime import datetime
# Plotting
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket filter may also hide pandas deprecation /
# chained-assignment warnings raised by later cells — consider narrowing it.
warnings.filterwarnings('ignore')

# Confirmation message so the reader can see the cell ran
print("âœ“ Libraries berhasil di-import")

In [None]:
# Read the raw Online Retail II transaction export into memory
df = pd.read_csv('FinalDM/Raw/online_retail_II.csv')

row_count, column_count = df.shape
divider = "=" * 60

# Quick load report: size of the frame and the column names it carries
print(divider, "DATA BERHASIL DIMUAT", divider, sep="\n")
print(f"Jumlah baris: {row_count:,}")
print(f"Jumlah kolom: {column_count}")
print("\nKolom-kolom dalam dataset:")
print(list(df.columns))
print("\n5 Baris Pertama:")

# Trailing expression so the notebook renders the first five rows
df.head()

In [None]:
# Dataset overview: column dtypes, missing-value counts, summary statistics
divider = "=" * 60
print(divider, "INFORMASI DATASET", divider, sep="\n")

print("\nTipe Data:", df.dtypes, sep="\n")
print("\nMissing Values:", df.isna().sum(), sep="\n")
print("\nStatistik Deskriptif:")

# Trailing expression so the notebook renders the describe() table
df.describe()

### 1.1 Handle Missing Values

In [None]:
# Handle missing values
print("="*60)
print("HANDLE MISSING VALUES")
print("="*60)

# Rows without an invoice number, stock code or customer id cannot be
# attributed to a customer, so they are dropped. `.copy()` detaches the
# result from the original frame so later column assignments are safe.
initial_rows = len(df)
df = df.dropna(subset=['Invoice', 'StockCode', 'Customer ID']).copy()
removed = initial_rows - len(df)
print(f"\nâœ“ Dihapus {removed:,} baris dengan missing Invoice/StockCode/Customer ID")

# Fill the remaining descriptive gaps with a placeholder.
# The result is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary Series, is deprecated, and has no effect under Copy-on-Write.
df = df.fillna({'Description': 'Unknown', 'Country': 'Unknown'})
print(f"âœ“ Missing Description & Country diisi dengan 'Unknown'")

print(f"\nJumlah baris sekarang: {len(df):,}")
print(f"Missing values setelah handling:")
print(df.isnull().sum())

### 1.2 Remove Duplicates & Handle Negative Values

In [None]:
# Remove exact duplicate rows
print("="*60)
print("REMOVE DUPLICATES")
print("="*60)

initial_rows = len(df)
df = df.drop_duplicates()
duplicates_removed = initial_rows - len(df)
print(f"âœ“ Dihapus {duplicates_removed:,} baris duplikat")

# Handle negative quantities (cancellations / returns)
print("\n" + "="*60)
print("HANDLE NEGATIVE VALUES")
print("="*60)

negative_quantity = (df['Quantity'] < 0).sum()
print(f"Transaksi dengan Quantity negatif: {negative_quantity:,}")

# Keep only strictly positive quantities. Note that this also drops rows whose
# Quantity is exactly 0, so `removed` can exceed `negative_quantity`.
# `.copy()` makes the filtered frame independent of the original so that the
# column assignments in the following cells do not write into a slice.
initial_rows = len(df)
df = df[df['Quantity'] > 0].copy()
removed = initial_rows - len(df)
print(f"âœ“ Dihapus {removed:,} baris dengan Quantity negatif")

print(f"\nJumlah baris sekarang: {len(df):,}")

### 1.3 Data Type Conversion & Feature Engineering

In [None]:
# Data type conversion
print("="*60)
print("DATA TYPE CONVERSION")
print("="*60)

# Coerce the numeric columns first. `errors='coerce'` turns unparsable values
# into NaN, and casting NaN to int64 raises, so rows that failed to parse are
# removed before the cast instead of crashing the cell.
quantity_numeric = pd.to_numeric(df['Quantity'], errors='coerce')
price_numeric = pd.to_numeric(df['Price'], errors='coerce')
parsable_rows = quantity_numeric.notna() & price_numeric.notna()
invalid_rows = int((~parsable_rows).sum())
if invalid_rows:
    print(f"Baris dengan Quantity/Price tidak valid yang dihapus: {invalid_rows:,}")

# Work on an independent copy so the assignments below never target a slice
df = df.loc[parsable_rows].copy()

df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['Quantity'] = quantity_numeric[parsable_rows].astype('int64')
df['Price'] = price_numeric[parsable_rows].astype('float64')
# Go through int64 before str: the column is read as float, and a direct
# astype(str) would produce ids such as '12345.0' instead of '12345'.
df['Customer ID'] = pd.to_numeric(df['Customer ID']).astype('int64').astype(str)

print("âœ“ InvoiceDate -> datetime")
print("âœ“ Quantity -> int64")
print("âœ“ Price -> float64")
print("âœ“ Customer ID -> object")

# Feature engineering
print("\n" + "="*60)
print("FEATURE ENGINEERING")
print("="*60)

# Monetary value of each transaction line
df['TotalAmount'] = df['Quantity'] * df['Price']
print("âœ“ TotalAmount = Quantity Ã— Price")

# Calendar features derived from the invoice timestamp
df['Year'] = df['InvoiceDate'].dt.year
df['Month'] = df['InvoiceDate'].dt.month
df['Day'] = df['InvoiceDate'].dt.day
df['Hour'] = df['InvoiceDate'].dt.hour
df['DayOfWeek'] = df['InvoiceDate'].dt.dayofweek
# dayofweek is 0 = Monday ... 6 = Sunday, so 5 and 6 mark the weekend
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6])

print("âœ“ Fitur waktu: Year, Month, Day, Hour, DayOfWeek, IsWeekend")

print(f"\nJumlah kolom sekarang: {df.shape[1]}")
print(f"Kolom baru: {['TotalAmount', 'Year', 'Month', 'Day', 'Hour', 'DayOfWeek', 'IsWeekend']}")

In [None]:
# Persist the cleaned transactions for the aggregation stage
output_path = 'FinalDM/Raw/online_retail_II_preprocessed.csv'
df.to_csv(output_path, index=False)

# Collect the summary lines first, then emit them in one go
first_invoice = df['InvoiceDate'].min()
last_invoice = df['InvoiceDate'].max()
summary_lines = [
    "=" * 60,
    "PREPROCESSING SELESAI",
    "=" * 60,
    f"âœ“ Data berhasil disimpan: {output_path}",
    f"âœ“ Total baris: {len(df):,}",
    f"âœ“ Total kolom: {df.shape[1]}",
    f"âœ“ Periode: {first_invoice} s/d {last_invoice}",
]
print("\n".join(summary_lines))

# Trailing expression so the notebook renders a preview of the saved frame
df.head()

---

## Part 2: Customer Aggregation

Mengagregasi data transaksi ke level pelanggan untuk membuat fitur RFM (Recency, Frequency, Monetary).

In [None]:
# Reload the cleaned transactions written at the end of Part 1
df_trans = pd.read_csv('FinalDM/Raw/online_retail_II_preprocessed.csv')
# The CSV stores timestamps as text, so restore the datetime dtype
df_trans['InvoiceDate'] = pd.to_datetime(df_trans['InvoiceDate'])

invoice_dates = df_trans['InvoiceDate']
unique_customers = df_trans['Customer ID'].nunique()

divider = "=" * 60
print(divider, "DATA PREPROCESSED DIMUAT", divider, sep="\n")
print(f"Total transaksi: {len(df_trans):,}")
print(f"Customer unik: {unique_customers:,}")
print(f"Periode: {invoice_dates.min()} s/d {invoice_dates.max()}")

In [None]:
# Aggregate transactions to one row per customer
divider = "=" * 60
print(divider, "AGREGASI KE LEVEL PELANGGAN", divider, sep="\n")

# Recency is measured against the day after the latest invoice in the data
reference_date = df_trans['InvoiceDate'].max() + pd.Timedelta(days=1)
print(f"Tanggal acuan untuk Recency: {reference_date}")

# Named aggregation yields the final column names directly; only the grouping
# key needs renaming afterwards.
customer_agg = (
    df_trans
    .groupby('Customer ID')
    .agg(
        TotalSpending=('TotalAmount', 'sum'),
        TotalTransaction=('Invoice', 'nunique'),
        TotalQuantity=('Quantity', 'sum'),
        AvgPrice=('Price', 'mean'),
        LastTransactionDate=('InvoiceDate', 'max'),
    )
    .reset_index()
    .rename(columns={'Customer ID': 'CustomerID'})
)

# Whole days elapsed between the customer's last purchase and the reference date
days_since_last = reference_date - customer_agg['LastTransactionDate']
customer_agg['Recency'] = days_since_last.dt.days

# Round the monetary columns to two decimals
for money_col in ('AvgPrice', 'TotalSpending'):
    customer_agg[money_col] = customer_agg[money_col].round(2)

print("\nâœ“ Agregasi selesai!")
print(f"âœ“ Jumlah pelanggan: {len(customer_agg):,}")

# Trailing expression so the notebook renders the first ten customers
customer_agg.head(10)

In [None]:
# Summary statistics for every aggregated feature
divider = "=" * 60
print(divider, "STATISTIK FITUR AGREGASI", divider, sep="\n")

for col in ['TotalSpending', 'TotalTransaction', 'TotalQuantity', 'AvgPrice', 'Recency']:
    feature_values = customer_agg[col]
    print(f"\n{col}:")
    # (label, value) pairs keep the four report lines in one place
    for stat_label, stat_value in (
        ("Min", feature_values.min()),
        ("Max", feature_values.max()),
        ("Mean", feature_values.mean()),
        ("Median", feature_values.median()),
    ):
        print(f"  {stat_label}: {stat_value:.2f}")

print("\n" + divider)
print("TOP 10 CUSTOMER BY TOTAL SPENDING")
print(divider)

# Restrict to the report columns, then take the ten biggest spenders
top_columns = ['CustomerID', 'TotalSpending', 'TotalTransaction', 'Recency']
customer_agg[top_columns].nlargest(10, 'TotalSpending')

In [None]:
# Write the customer-level table; the raw last-transaction timestamp is left
# out because Recency already carries that information
df_to_save = customer_agg.drop(columns=['LastTransactionDate'])
output_path = 'FinalDM/Raw/customer_aggregated.csv'
df_to_save.to_csv(output_path, index=False)

report_lines = [
    "=" * 60,
    "CUSTOMER AGGREGATION SELESAI",
    "=" * 60,
    f"âœ“ Data disimpan: {output_path}",
    f"âœ“ Total customer: {len(df_to_save):,}",
    f"âœ“ Fitur: {list(df_to_save.columns)}",
]
print("\n".join(report_lines))

# Trailing expression so the notebook renders a preview of the saved frame
df_to_save.head()

---

## Part 3: Preprocessing Customer Data

Mempersiapkan data customer untuk modeling dengan konversi tipe data dan validasi.

In [None]:
# Load the customer-level table produced by Part 2
df_cust = pd.read_csv('FinalDM/Raw/customer_aggregated.csv')

print("="*60)
print("CUSTOMER DATA LOADED")
print("="*60)
print(f"Total customers: {len(df_cust):,}")
print(f"Columns: {list(df_cust.columns)}")
print(f"\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df_cust.info()
print(f"\nSample Data:")
# Trailing expression so the notebook renders the first five customers
df_cust.head()

In [None]:
# Convert CustomerID to object dtype and validate the customer table
print("="*60)
print("KONVERSI CUSTOMER ID")
print("="*60)

print(f"Tipe data sebelum: {df_cust['CustomerID'].dtype}")

# Rows without an id cannot be cast to integer, so drop them first
missing = df_cust['CustomerID'].isnull().sum()
if missing > 0:
    print(f"Missing CustomerID: {missing}")
    df_cust = df_cust.dropna(subset=['CustomerID']).copy()

# Cast through int64 to strip any float suffix, then store as object so the
# id is treated as a label rather than a numeric feature
df_cust['CustomerID'] = df_cust['CustomerID'].astype('int64').astype('object')

print(f"Tipe data setelah: {df_cust['CustomerID'].dtype}")
print(f"âœ“ CustomerID berhasil dikonversi ke object")

# Data validation
print("\n" + "="*60)
print("VALIDASI DATA")
print("="*60)

# Running total of problems found, so the final verdict reflects the checks
validation_issues = 0

# Negative values in any numeric column (CustomerID is object dtype by now
# and is therefore not part of this selection)
numeric_cols = df_cust.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    negative_count = (df_cust[col] < 0).sum()
    validation_issues += int(negative_count)
    print(f"{col}: {negative_count} nilai negatif")

# Each customer must appear exactly once
duplicates = df_cust['CustomerID'].duplicated().sum()
validation_issues += int(duplicates)
print(f"\nDuplikat CustomerID: {duplicates}")

# Only claim the data is valid when every check above came back clean
if validation_issues == 0:
    print("âœ“ Data valid dan siap untuk modeling")
else:
    print(f"PERINGATAN: {validation_issues} masalah validasi ditemukan - periksa data sebelum modeling")

In [None]:
# Persist the validated customer table for the modeling stages
output_path = 'FinalDM/Modelling/customer_aggregated_preprocessed.csv'
df_cust.to_csv(output_path, index=False)

report_lines = [
    "=" * 60,
    "CUSTOMER PREPROCESSING SELESAI",
    "=" * 60,
    f"âœ“ Data disimpan: {output_path}",
    f"âœ“ Total customers: {len(df_cust):,}",
    "âœ“ Siap untuk Classification & Clustering",
]
print("\n".join(report_lines))

# Trailing expression so the notebook renders a preview of the saved frame
df_cust.head()

---

## Part 4: Classification Modeling - High Value Customer (HVC)

Supervised Learning untuk memprediksi High Value Customer menggunakan Logistic Regression dan Random Forest.

In [None]:
# Import Libraries for Modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)

# Read the model-ready customer table written at the end of Part 3
df_model = pd.read_csv('FinalDM/Modelling/customer_aggregated_preprocessed.csv')

header_lines = [
    "=" * 60,
    "DATA LOADED FOR MODELING",
    "=" * 60,
    f"Total customers: {len(df_model):,}",
]
print("\n".join(header_lines))

# Trailing expression so the notebook renders the first five rows
df_model.head()

### 4.1 Create Target Variable: HVC (High Value Customer)

HVC didefinisikan berdasarkan persentil ke-75 dari TotalSpending: customer dengan TotalSpending di atas nilai tersebut diberi label `HVC = 1`, sisanya `HVC = 0`.

In [None]:
# Create the HVC label: 1 when TotalSpending is strictly above the 75th
# percentile, 0 otherwise
percentile_75 = df_model['TotalSpending'].quantile(0.75)
df_model['HVC'] = (df_model['TotalSpending'] > percentile_75).astype(int)

print("="*60)
print("HIGH VALUE CUSTOMER (HVC) LABELS")
print("="*60)
print(f"Threshold (P75): Â£{percentile_75:,.2f}")
print(f"\nDistribusi HVC:")
print(df_model['HVC'].value_counts())
print(f"\nPersentase:")
print(df_model['HVC'].value_counts(normalize=True) * 100)

# Visualisation
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Class distribution. value_counts() orders by frequency, not by label, so the
# counts are re-indexed to the fixed order [0, 1]; this guarantees the
# hard-coded tick labels and colours below match the bars, and keeps both
# bars present even if one class is empty.
hvc_counts = df_model['HVC'].value_counts().reindex([0, 1], fill_value=0)
hvc_counts.plot(kind='bar', ax=axes[0], color=['#3498db', '#e74c3c'])
axes[0].set_title('Distribusi HVC', fontsize=14, fontweight='bold')
axes[0].set_xlabel('HVC Label')
axes[0].set_ylabel('Jumlah Customer')
axes[0].set_xticklabels(['Non-HVC (0)', 'HVC (1)'], rotation=0)

# TotalSpending distribution per class
df_model.boxplot(column='TotalSpending', by='HVC', ax=axes[1])
# boxplot(by=...) adds its own figure-level "Boxplot grouped by ..." title,
# which collides with the panel titles; clear it.
fig.suptitle('')
axes[1].set_title('TotalSpending by HVC', fontsize=14, fontweight='bold')
axes[1].set_xlabel('HVC Label')
axes[1].set_ylabel('TotalSpending (Â£)')

plt.tight_layout()
plt.show()

### 4.2 Feature Selection & Train-Test Split

Features: TotalTransaction, TotalQuantity, AvgPrice, Recency. TotalSpending tidak dipakai sebagai fitur karena label HVC diturunkan langsung darinya, sehingga menyertakannya akan menyebabkan data leakage.

In [None]:
# Feature selection: TotalSpending is left out because the HVC label is
# derived from it
feature_cols = ['TotalTransaction', 'TotalQuantity', 'AvgPrice', 'Recency']
X, y = df_model[feature_cols], df_model['HVC']

divider = "=" * 60
print(divider, "FEATURE SELECTION", divider, sep="\n")
print(f"Features: {feature_cols}")
print("Target: HVC")
print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")

# 80/20 split, stratified on the label so both parts keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("\n" + divider)
print("TRAIN-TEST SPLIT")
print(divider)
total_samples = len(X)
for split_name, split_frame in (("Train", X_train), ("Test", X_test)):
    split_size = len(split_frame)
    print(f"{split_name}: {split_size} samples ({split_size/total_samples*100:.1f}%)")

print("\nTrain HVC distribution:", y_train.value_counts(), sep="\n")
print("\nTest HVC distribution:", y_test.value_counts(), sep="\n")

### 4.3 Model Training

- **Logistic Regression**: Dengan StandardScaler
- **Random Forest**: Tanpa scaling (tree-based model)

In [None]:
def _score_predictions(y_true, y_pred):
    """Return (accuracy, precision, recall, f1) for a set of predictions."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )


def _print_scores(model_name, accuracy, precision, recall, f1):
    """Print the four metrics of one trained model, four decimals each."""
    print(f"\nâœ“ {model_name} trained!")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")


def _print_banner(title):
    """Print a section title framed by two 60-character rules."""
    print("=" * 60)
    print(title)
    print("=" * 60)


# Standardise the features for Logistic Regression; the scaler is fitted on
# the training split only and then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

_print_banner("MODEL 1: LOGISTIC REGRESSION")

# Logistic Regression on the scaled features
model_lr = LogisticRegression(random_state=42, max_iter=1000)
model_lr.fit(X_train_scaled, y_train)
y_pred_lr = model_lr.predict(X_test_scaled)

acc_lr, prec_lr, rec_lr, f1_lr = _score_predictions(y_test, y_pred_lr)
_print_scores("Logistic Regression", acc_lr, prec_lr, rec_lr, f1_lr)

print()
_print_banner("MODEL 2: RANDOM FOREST")

# Random Forest on the unscaled features
model_rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

acc_rf, prec_rf, rec_rf, f1_rf = _score_predictions(y_test, y_pred_rf)
_print_scores("Random Forest", acc_rf, prec_rf, rec_rf, f1_rf)

### 4.4 Model Evaluation & Visualization

In [None]:
# Confusion matrices for both models, side by side
cm_lr = confusion_matrix(y_test, y_pred_lr)
cm_rf = confusion_matrix(y_test, y_pred_rf)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
heatmap_panels = [
    (cm_lr, 'Blues', 'Confusion Matrix - Logistic Regression'),
    (cm_rf, 'Greens', 'Confusion Matrix - Random Forest'),
]
for panel_ax, (matrix, palette, panel_title) in zip(axes, heatmap_panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette, ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Predicted')
    panel_ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()

# Metric comparison table
divider = "=" * 60
print(divider, "PERBANDINGAN MODEL", divider, sep="\n")

metric_rows = [
    ['Logistic Regression', acc_lr, prec_lr, rec_lr, f1_lr],
    ['Random Forest', acc_rf, prec_rf, rec_rf, f1_rf],
]
metrics_df = pd.DataFrame(
    metric_rows,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
)

print(metrics_df.to_string(index=False))

# Grouped bar chart of the same table, one group per model
metrics_df.set_index('Model').plot(kind='bar', figsize=(10, 6), rot=0)
plt.title('Model Performance Comparison', fontsize=14, fontweight='bold')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.legend(loc='lower right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

---

## Part 5: Customer Segmentation - K-Means Clustering

Unsupervised Learning untuk mengelompokkan customer berdasarkan perilaku pembelian.

In [None]:
# Import Libraries for Clustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Read the model-ready customer table again as the clustering input
df_cluster = pd.read_csv('FinalDM/Modelling/customer_aggregated_preprocessed.csv')

header_lines = [
    "=" * 60,
    "DATA LOADED FOR CLUSTERING",
    "=" * 60,
    f"Total customers: {len(df_cluster):,}",
    f"Columns: {list(df_cluster.columns)}",
]
print("\n".join(header_lines))

# Trailing expression so the notebook renders the first five rows
df_cluster.head()

### 5.1 Feature Selection & Preprocessing

Menggunakan 5 fitur numerik: TotalSpending, TotalTransaction, TotalQuantity, AvgPrice, Recency

In [None]:
# Features fed to K-Means: all five numeric customer attributes
cluster_features = ['TotalSpending', 'TotalTransaction', 'TotalQuantity', 'AvgPrice', 'Recency']
X_cluster = df_cluster[cluster_features]

divider = "=" * 60
print(divider, "FEATURE SELECTION", divider, sep="\n")
print(f"Features: {cluster_features}")
print(f"Shape: {X_cluster.shape}")

# Standardise every feature before clustering
scaler_cluster = StandardScaler()
scaler_cluster.fit(X_cluster)
X_cluster_scaled = scaler_cluster.transform(X_cluster)

print("\nâœ“ Data berhasil di-scale menggunakan StandardScaler")
print(f"Scaled data shape: {X_cluster_scaled.shape}")

### 5.2 Elbow Method - Menentukan K Optimal

In [None]:
# Elbow method: fit K-Means for each candidate K and record the inertia
divider = "=" * 60
print(divider, "ELBOW METHOD", divider, sep="\n")
print("Mencari K optimal (2-10)...")

K_range = range(2, 11)
inertias = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_cluster_scaled)
    current_inertia = kmeans.inertia_
    inertias.append(current_inertia)
    print(f"K={k}: Inertia={current_inertia:.2f}")

# Inertia versus K; the bend of the curve suggests a reasonable K
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(K_range, inertias, marker='o', linewidth=2, markersize=8)
elbow_ax.set_xlabel('Number of Clusters (K)', fontsize=12)
elbow_ax.set_ylabel('Inertia (Within-Cluster Sum of Squares)', fontsize=12)
elbow_ax.set_title('Elbow Method - Determining Optimal K', fontsize=14, fontweight='bold')
elbow_ax.grid(True, alpha=0.3)
elbow_ax.set_xticks(list(K_range))
elbow_fig.tight_layout()
plt.show()

print("\nâœ“ Dari grafik Elbow, K optimal dapat ditentukan di titik 'siku' kurva")

### 5.3 K-Means Clustering dengan K=4

In [None]:
# Final K-Means model with the chosen number of clusters
optimal_k = 4

divider = "=" * 60
print(divider, f"K-MEANS CLUSTERING (K={optimal_k})", divider, sep="\n")

# Fit on the scaled features and attach the cluster label to every customer
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df_cluster['Cluster'] = kmeans.fit_predict(X_cluster_scaled)

print("\nâœ“ K-Means clustering selesai!")
print(f"âœ“ Inertia: {kmeans.inertia_:.2f}")

# Customers per cluster, ordered by cluster id
cluster_sizes = df_cluster['Cluster'].value_counts().sort_index()
print("\n" + divider)
print("DISTRIBUSI CLUSTER")
print(divider)
print(cluster_sizes)

# Bar chart of the same counts
bar_colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
plt.figure(figsize=(8, 5))
cluster_sizes.plot(kind='bar', color=bar_colors)
plt.title(f'Customer Distribution Across {optimal_k} Clusters', fontsize=14, fontweight='bold')
plt.xlabel('Cluster')
plt.ylabel('Number of Customers')
plt.xticks(rotation=0)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

### 5.4 Analisis & Interpretasi Cluster

In [None]:
# Cluster profile: mean of every clustering feature per cluster
divider = "=" * 60
print(divider, "PROFIL SETIAP CLUSTER", divider, sep="\n")

cluster_profile = df_cluster.groupby('Cluster')[cluster_features].mean()
print("\nRata-rata fitur per cluster:")
print(cluster_profile.round(2))

# Heatmap of the profile, features as rows and clusters as columns
plt.figure(figsize=(12, 6))
sns.heatmap(
    cluster_profile.T,
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Average Value'},
)
plt.title('Cluster Profile Heatmap', fontsize=14, fontweight='bold')
plt.xlabel('Cluster')
plt.ylabel('Features')
plt.tight_layout()
plt.show()

# Per-cluster summary in text form
print("\n" + divider)
print("INTERPRETASI CLUSTER")
print(divider)

for cluster_id in range(optimal_k):
    profile = cluster_profile.loc[cluster_id]
    member_count = (df_cluster['Cluster'] == cluster_id).sum()
    print(
        f"\nðŸ“Œ CLUSTER {cluster_id}:",
        f"   â€¢ TotalSpending: Â£{profile['TotalSpending']:,.2f}",
        f"   â€¢ TotalTransaction: {profile['TotalTransaction']:.0f}",
        f"   â€¢ TotalQuantity: {profile['TotalQuantity']:,.0f}",
        f"   â€¢ AvgPrice: Â£{profile['AvgPrice']:.2f}",
        f"   â€¢ Recency: {profile['Recency']:.0f} hari",
        f"   â€¢ Jumlah customer: {member_count:,}",
        sep="\n",
    )

### 5.5 Visualisasi dengan PCA (2D)

In [None]:
# Project the scaled features onto two principal components for plotting
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_cluster_scaled)
variance_share = pca.explained_variance_ratio_

divider = "=" * 60
print(divider, "PCA DIMENSIONALITY REDUCTION", divider, sep="\n")
print(f"Original dimensions: {X_cluster_scaled.shape[1]}")
print(f"Reduced dimensions: {X_pca.shape[1]}")
print("\nExplained variance ratio:")
print(f"  PC1: {variance_share[0]:.4f} ({variance_share[0]*100:.2f}%)")
print(f"  PC2: {variance_share[1]:.4f} ({variance_share[1]*100:.2f}%)")
print(f"  Total: {variance_share.sum():.4f} ({variance_share.sum()*100:.2f}%)")

# One scatter layer per cluster so each gets its own colour and legend entry
plt.figure(figsize=(12, 8))
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
point_style = dict(alpha=0.6, s=50, edgecolors='k', linewidth=0.5)
for cluster_id in range(optimal_k):
    mask = df_cluster['Cluster'] == cluster_id
    cluster_points = X_pca[mask.to_numpy()]
    plt.scatter(
        cluster_points[:, 0],
        cluster_points[:, 1],
        c=colors[cluster_id],
        label=f'Cluster {cluster_id}',
        **point_style,
    )

# Centroids live in the scaled feature space, so they go through the same
# PCA projection before being drawn
centroids_pca = pca.transform(kmeans.cluster_centers_)
plt.scatter(
    centroids_pca[:, 0],
    centroids_pca[:, 1],
    c='black',
    marker='X',
    s=300,
    linewidths=2,
    edgecolors='white',
    label='Centroids',
)

plt.xlabel(f'PC1 ({variance_share[0]*100:.2f}%)', fontsize=12)
plt.ylabel(f'PC2 ({variance_share[1]*100:.2f}%)', fontsize=12)
plt.title(
    'Customer Segmentation - K-Means Clustering (PCA Visualization)',
    fontsize=14,
    fontweight='bold',
)
plt.legend(loc='best', fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Write the customer table, now including the Cluster column, to disk
output_path = 'FinalDM/Modelling/customer_segmentation_results.csv'
df_cluster.to_csv(output_path, index=False)

report_lines = [
    "=" * 60,
    "CLUSTERING SELESAI",
    "=" * 60,
    f"âœ“ Hasil clustering disimpan: {output_path}",
    f"âœ“ Total customers: {len(df_cluster):,}",
    f"âœ“ Number of clusters: {optimal_k}",
    f"âœ“ Features used: {cluster_features}",
]
print("\n".join(report_lines))

# Trailing expression so the notebook renders the first ten labelled customers
df_cluster.head(10)

---

## Kesimpulan

### Preprocessing
- Data transaksi berhasil dibersihkan dari missing values, duplikat, dan nilai negatif
- Feature engineering berhasil menambahkan fitur TotalAmount dan fitur waktu
- Data diagregasi ke level customer dengan fitur RFM (Recency, Frequency, Monetary)

### Classification (High Value Customer)
- **Logistic Regression** dan **Random Forest** berhasil memprediksi HVC
- Model dievaluasi menggunakan Accuracy, Precision, Recall, dan F1-Score
- Features: TotalTransaction, TotalQuantity, AvgPrice, Recency (tanpa data leakage)

### Clustering (Customer Segmentation)
- **K-Means Clustering** dengan K=4 berhasil mengelompokkan customer
- Elbow Method digunakan untuk menentukan K optimal
- Setiap cluster memiliki karakteristik yang berbeda berdasarkan perilaku pembelian
- Visualisasi menggunakan PCA menunjukkan separasi cluster yang jelas

### Output Files
1. `online_retail_II_preprocessed.csv` - Data transaksi yang sudah dipreprocessing
2. `customer_aggregated.csv` - Data agregasi level customer
3. `customer_aggregated_preprocessed.csv` - Data customer siap modeling
4. `customer_segmentation_results.csv` - Hasil clustering dengan label cluster

---

**Project by: [Nama Anda]**  
**Course: Data Mining - Semester V**  
**Date: December 2025**