In [11]:
# ==========================================
# BAGIAN 1: IMPORT LIBRARY & LOAD DATA
# ==========================================
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [12]:
# Update path untuk local - letakkan file Excel di folder yang sama dengan notebook
# Atau sesuaikan path sesuai lokasi file Anda
import os
file_path = 'SPE-OPT-31122025.xlsx'  # File di folder yang sama dengan notebook
# Alternatif: file_path = r'D:\ICON+\Prediksi Flow Produk\SPE-OPT-31122025.xlsx'

print("1. Sedang membaca file (ini mungkin memakan waktu untuk 200k baris)...")
try:
    # Jika file csv, ganti read_excel jadi read_csv
    df = pd.read_excel(file_path)
    print(f"   Sukses! Total data: {len(df)} baris.")
except Exception as e:
    print(f"   Error: {e}")
    print(f"   Pastikan file 'SPE-OPT-31122025.xlsx' ada di folder yang sama dengan notebook ini.")

1. Sedang membaca file (ini mungkin memakan waktu untuk 200k baris)...
   Sukses! Total data: 275088 baris.


In [18]:
# ==========================================
# BAGIAN 2: CLEANING & ADVANCED FEATURE ENGINEERING (MODIFIED)
# ==========================================
print("2. Membersihkan & Menambah Fitur Canggih...")

# Define columns (Added Price and Bandwidth)
col_id = 'idPerusahaan'
col_prod = 'namaProduk'
col_date = 'tanggalAwalKontrak' # Or 'tanggalBuatPermohonan' if available
col_segmen = 'segmenCustomer'
col_sbu = 'sbuOwner'
col_price = 'hargaPelanggan' # NEW
col_bw = 'bandwidthBaru' # NEW

# Select columns
cols_to_use = [col_id, col_prod, col_date, col_segmen, col_sbu, col_price, col_bw]
df_ml = df[cols_to_use].copy()

# 1. Clean Data Types
df_ml[col_date] = pd.to_datetime(df_ml[col_date], errors='coerce')

# Clean Price (Remove commas/dots if string)
def clean_currency(x):
    try:
        return float(str(x).replace(',', '.').replace('nan', '0'))
    except:
        return 0
df_ml[col_price] = df_ml[col_price].apply(clean_currency)
df_ml[col_bw] = pd.to_numeric(df_ml[col_bw], errors='coerce').fillna(0)

# 2. Sort is CRITICAL for flow
df_ml.dropna(subset=[col_prod, col_date], inplace=True)
df_ml.sort_values(by=[col_id, col_date], inplace=True)

# 3. Create "Flow" Features (The Magic Sauce)
grouped = df_ml.groupby(col_id)

# Feature A: Previous Product (What did they buy BEFORE this?)
df_ml['Prev_Product'] = grouped[col_prod].shift(1).fillna('New Customer')

# Feature B: Days Since Last Order (Gap time)
df_ml['Prev_Date'] = grouped[col_date].shift(1)
df_ml['Days_Since_Last'] = (df_ml[col_date] - df_ml['Prev_Date']).dt.days.fillna(-1)

# Feature C: Order Sequence (1st order, 2nd order, etc.)
df_ml['Order_Seq'] = grouped.cumcount() + 1

# 4. Create Target (Next Product)
df_ml['Next_Product'] = grouped[col_prod].shift(-1)

# Filter valid data for training
train_data_raw = df_ml.dropna(subset=['Next_Product']).copy()
latest_status = df_ml.groupby(col_id).tail(1).copy()

# Filter Rare Products (Same as your code)
target_counts = train_data_raw['Next_Product'].value_counts()
valid_targets = target_counts[target_counts >= 2].index
train_data = train_data_raw[train_data_raw['Next_Product'].isin(valid_targets)].copy()

# ==========================================
# ADVANCED FEATURE ENGINEERING (NEW)
# ==========================================
print("   Menambahkan fitur lanjutan...")

# 1. Time-based features
train_data['Year'] = train_data[col_date].dt.year
train_data['Month'] = train_data[col_date].dt.month
train_data['Quarter'] = train_data[col_date].dt.quarter
train_data['DayOfWeek'] = train_data[col_date].dt.dayofweek
train_data['IsWeekend'] = (train_data['DayOfWeek'] >= 5).astype(int)

# 2. Customer behavior aggregations
customer_stats = train_data.groupby(col_id).agg({
    col_price: ['mean', 'std', 'min', 'max', 'count'],
    col_bw: ['mean', 'std', 'min', 'max'],
    'Days_Since_Last': ['mean', 'std'],
    'Order_Seq': 'max'
}).reset_index()

customer_stats.columns = [col_id] + [f'Customer_{col[0]}_{col[1]}' if col[1] else col[0] for col in customer_stats.columns[1:]]
train_data = train_data.merge(customer_stats, on=col_id, how='left')

# 3. Product transition patterns (how often does this transition happen?)
train_data['Transition'] = train_data[col_prod].astype(str) + '_TO_' + train_data['Prev_Product'].astype(str)
transition_counts = train_data['Transition'].value_counts().to_dict()
train_data['Transition_Frequency'] = train_data['Transition'].map(transition_counts)

# 4. Price and bandwidth ratios
train_data['Price_Ratio'] = train_data[col_price] / (train_data['Customer_hargaPelanggan_mean'] + 1e-6)
train_data['BW_Ratio'] = train_data[col_bw] / (train_data['Customer_bandwidthBaru_mean'] + 1e-6)

# 5. Interaction features
train_data['Price_BW_Interaction'] = train_data[col_price] * train_data[col_bw]
train_data['Days_Order_Interaction'] = train_data['Days_Since_Last'] * train_data['Order_Seq']

# 6. Customer loyalty/recency features
train_data['Is_Repeat_Customer'] = (train_data['Order_Seq'] > 1).astype(int)
train_data['Is_Long_Gap'] = (train_data['Days_Since_Last'] > train_data['Customer_Days_Since_Last_mean']).astype(int)

# 7. Segment-SBU interaction
train_data['Segmen_SBU'] = train_data[col_segmen].astype(str) + '_' + train_data[col_sbu].astype(str)

# 8. ADVANCED FEATURES - Rolling Statistics (Moving Window)
print("   Menambahkan rolling statistics...")
train_data_sorted = train_data.sort_values(by=[col_id, col_date])
grouped_sorted = train_data_sorted.groupby(col_id)

# Rolling mean untuk price dan bandwidth (window 3)
train_data_sorted['Price_Rolling_Mean_3'] = grouped_sorted[col_price].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
train_data_sorted['BW_Rolling_Mean_3'] = grouped_sorted[col_bw].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
train_data_sorted['Days_Rolling_Mean_3'] = grouped_sorted['Days_Since_Last'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())

# 9. Product Popularity Features
print("   Menambahkan fitur popularitas produk...")
product_popularity = train_data[col_prod].value_counts().to_dict()
train_data['Product_Popularity'] = train_data[col_prod].map(product_popularity)
train_data['Prev_Product_Popularity'] = train_data['Prev_Product'].map(product_popularity).fillna(0)

# 10. Customer Purchase Frequency
print("   Menambahkan frekuensi pembelian customer...")
customer_frequency = train_data.groupby(col_id).size().to_dict()
train_data['Customer_Frequency'] = train_data[col_id].map(customer_frequency)

# Simpan dictionary untuk digunakan di prediksi (global scope)
globals()['product_popularity'] = product_popularity
globals()['customer_frequency'] = customer_frequency

# 11. Time-based Advanced Features
print("   Menambahkan fitur waktu lanjutan...")
train_data['Days_Since_Year_Start'] = (train_data[col_date] - pd.to_datetime(train_data['Year'].astype(str) + '-01-01')).dt.days
train_data['Days_Since_Month_Start'] = (train_data[col_date] - pd.to_datetime(train_data[col_date].dt.to_period('M').astype(str))).dt.days
train_data['Is_Month_End'] = (train_data[col_date].dt.day > 25).astype(int)
train_data['Is_Quarter_End'] = train_data['Month'].isin([3, 6, 9, 12]).astype(int)

# 12. Price Change Features
print("   Menambahkan fitur perubahan harga...")
train_data_sorted['Price_Change'] = grouped_sorted[col_price].diff().fillna(0)
train_data_sorted['Price_Change_Pct'] = grouped_sorted[col_price].pct_change().fillna(0)
train_data_sorted['BW_Change'] = grouped_sorted[col_bw].diff().fillna(0)

# Merge rolling features back
train_data = train_data_sorted.copy()

# 13. Customer-Product Affinity (berapa kali customer beli produk ini sebelumnya)
print("   Menambahkan affinity customer-produk...")
customer_product_count = train_data.groupby([col_id, col_prod]).size().reset_index(name='Customer_Product_Count')
train_data = train_data.merge(customer_product_count, on=[col_id, col_prod], how='left')
train_data['Customer_Product_Count'] = train_data['Customer_Product_Count'].fillna(0)

# 14. Segment-Product Affinity
print("   Menambahkan affinity segmen-produk...")
segmen_product_count = train_data.groupby([col_segmen, col_prod]).size().reset_index(name='Segmen_Product_Count')
train_data = train_data.merge(segmen_product_count, on=[col_segmen, col_prod], how='left')
train_data['Segmen_Product_Count'] = train_data['Segmen_Product_Count'].fillna(0)

# Simpan untuk digunakan di prediksi (global scope)
globals()['customer_product_count'] = customer_product_count
globals()['segmen_product_count'] = segmen_product_count

# 15. Normalized Features (untuk membantu model)
print("   Menambahkan fitur normalisasi...")
train_data['Order_Seq_Normalized'] = train_data['Order_Seq'] / (train_data['Customer_Order_Seq_max'] + 1e-6)
train_data['Days_Since_Last_Normalized'] = train_data['Days_Since_Last'] / (train_data['Customer_Days_Since_Last_mean'] + 1e-6)

# Fill NaN values from aggregations
train_data = train_data.fillna(0)

print(f"   Data Ready! Total fitur: {len(train_data.columns)} kolom")

2. Membersihkan & Menambah Fitur Canggih...
   Menambahkan fitur lanjutan...
   Menambahkan rolling statistics...
   Menambahkan fitur popularitas produk...
   Menambahkan frekuensi pembelian customer...
   Menambahkan fitur waktu lanjutan...
   Menambahkan fitur perubahan harga...
   Menambahkan affinity customer-produk...
   Menambahkan affinity segmen-produk...
   Menambahkan fitur normalisasi...
   Data Ready! Total fitur: 48 kolom


In [19]:
# ==========================================
# BAGIAN 3: ENCODING (PERBAIKAN - MENANGANI DATA HISTORY)
# ==========================================
print("3. Encoding Data & Persiapan Fitur...")

# 1. Inisialisasi Encoders
le_prod = LabelEncoder()
le_prev_prod = LabelEncoder()
le_segmen = LabelEncoder()
le_sbu = LabelEncoder()
le_target = LabelEncoder()

# 2. Fit Encoders (Belajar dari seluruh kemungkinan data)
# Gabung Produk saat ini + Previous + Next supaya encoder tahu semua jenis produk
all_products = pd.concat([
    df_ml[col_prod],
    df_ml['Prev_Product'],
    df_ml['Next_Product'].dropna()
]).unique().astype(str)

le_prod.fit(all_products)
le_prev_prod.fit(all_products) # Gunakan list produk yang sama
le_target.fit(all_products)    # Gunakan list produk yang sama

# Fit segmen & SBU
le_segmen.fit(df_ml[col_segmen].astype(str))
le_sbu.fit(df_ml[col_sbu].astype(str))

# 3. Transformasi Data Training (Ubah teks jadi angka)
# Kita buat kolom baru berakhiran '_code'
train_data['prod_code'] = le_prod.transform(train_data[col_prod].astype(str))
train_data['prev_prod_code'] = le_prev_prod.transform(train_data['Prev_Product'].astype(str))
train_data['segmen_code'] = le_segmen.transform(train_data[col_segmen].astype(str))
train_data['sbu_code'] = le_sbu.transform(train_data[col_sbu].astype(str))
train_data['target_code'] = le_target.transform(train_data['Next_Product'].astype(str))

print("   Encoding Selesai!")
print("   Fitur Siap: Produk, History Produk, Harga, Durasi, Segmen, dll.")

3. Encoding Data & Persiapan Fitur...
   Encoding Selesai!
   Fitur Siap: Produk, History Produk, Harga, Durasi, Segmen, dll.


In [20]:
# ==========================================
# BAGIAN 4: TRAINING XGBOOST DENGAN FITUR LENGKAP
# ==========================================
print("\n4. Training XGBoost dengan Fitur Lengkap...")

# Pastikan semua import dan variabel tersedia
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Pastikan variabel dari cell sebelumnya tersedia
required_vars = ['train_data', 'col_price', 'col_bw', 'col_id', 'col_prod', 'col_date', 'col_segmen', 'col_sbu']
missing_vars = [v for v in required_vars if v not in globals()]
if missing_vars:
    raise Exception(f"ERROR: Variabel berikut tidak ditemukan: {missing_vars}. Pastikan Cell 2 dan 3 sudah dijalankan!")

try:
    # 1. Encoding fitur tambahan
    print("   Step 1: Encoding fitur tambahan...")
    le_segmen_sbu = LabelEncoder()
    le_transition = LabelEncoder()

    # Encode interaction features
    train_data['segmen_sbu_code'] = le_segmen_sbu.fit_transform(train_data['Segmen_SBU'].astype(str))
    train_data['transition_code'] = le_transition.fit_transform(train_data['Transition'].astype(str))
    print("   ✓ Encoding selesai")

    # 2. Definisi Fitur Lengkap (X) - Menggunakan semua fitur yang sudah di-engineer
    print("   Step 2: Menyiapkan fitur...")
    features = [
        'prod_code',
        'prev_prod_code',
        'segmen_code',
        'sbu_code',
        'segmen_sbu_code',
        'Days_Since_Last',
        col_price,
        col_bw,
        'Order_Seq',
        'Year',
        'Month',
        'Quarter',
        'DayOfWeek',
        'IsWeekend',
        'Customer_hargaPelanggan_mean',
        'Customer_hargaPelanggan_std',
        'Customer_bandwidthBaru_mean',
        'Customer_bandwidthBaru_std',
        'Customer_Days_Since_Last_mean',
        'Customer_Order_Seq_max',
        'Transition_Frequency',
        'Price_Ratio',
        'BW_Ratio',
        'Price_BW_Interaction',
        'Days_Order_Interaction',
        'Is_Repeat_Customer',
        'Is_Long_Gap',
        'transition_code',
        # NEW ADVANCED FEATURES
        'Price_Rolling_Mean_3',
        'BW_Rolling_Mean_3',
        'Days_Rolling_Mean_3',
        'Product_Popularity',
        'Prev_Product_Popularity',
        'Customer_Frequency',
        'Days_Since_Year_Start',
        'Days_Since_Month_Start',
        'Is_Month_End',
        'Is_Quarter_End',
        'Price_Change',
        'Price_Change_Pct',
        'BW_Change',
        'Customer_Product_Count',
        'Segmen_Product_Count',
        'Order_Seq_Normalized',
        'Days_Since_Last_Normalized'
    ]

    # Filter features that exist in train_data
    available_features = [f for f in features if f in train_data.columns]
    missing_features = [f for f in features if f not in train_data.columns]
    if missing_features:
        print(f"   Warning: Fitur yang tidak ditemukan: {missing_features}")
    
    X = train_data[available_features].copy()
    print(f"   ✓ Total fitur yang digunakan: {len(available_features)}")

    # 3. Clean data - replace inf and NaN
    print("   Step 3: Membersihkan data...")
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(0)

    # Check for any remaining issues
    if X.isnull().any().any():
        print("   Warning: Masih ada NaN, mengisi dengan 0...")
        X = X.fillna(0)
    print("   ✓ Data bersih")

    # 4. KHUSUS TARGET (y): Buat Encoder Baru Setelah Filter
    print("   Step 4: Encoding target...")
    le_target_final = LabelEncoder()
    y = le_target_final.fit_transform(train_data['Next_Product'].astype(str))
    print(f"   ✓ Target encoded, jumlah kelas: {len(np.unique(y))}")

    # 5. Split Data dengan error handling untuk stratify
    print("   Step 5: Membagi data training dan test...")
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.2,
            random_state=42,
            stratify=y
        )
        print(f"   ✓ Data split dengan stratify")
    except ValueError as e:
        print(f"   Warning: Stratify gagal ({e}), menggunakan split tanpa stratify...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.2,
            random_state=42
        )
        print(f"   ✓ Data split tanpa stratify")
    
    print(f"   Data training: {len(X_train)}, Data test: {len(X_test)}")

    # 6. Training Model dengan Parameter yang Dioptimalkan (IMPROVED)
    print("   Step 6: Training model dengan parameter yang lebih optimal...")
    
    # Handle class imbalance dengan scale_pos_weight
    from collections import Counter
    class_counts = Counter(y_train)
    max_class_count = max(class_counts.values())
    scale_pos_weight_dict = {cls: max_class_count / count for cls, count in class_counts.items()}
    # XGBoost tidak support per-class weights untuk multi-class, jadi kita gunakan parameter lain
    
    model = xgb.XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        n_estimators=600,  # Increased lebih banyak
        max_depth=9,  # Increased untuk capture lebih kompleks
        learning_rate=0.03,  # Reduced lebih jauh untuk better generalization
        subsample=0.85,  # Slightly increased
        colsample_bytree=0.85,  # Slightly increased
        colsample_bylevel=0.8,  # NEW: subsample columns per level
        min_child_weight=2,  # Reduced untuk lebih fleksibel
        gamma=0.15,  # Increased untuk regularization
        reg_alpha=0.2,  # Increased L1 regularization
        reg_lambda=1.5,  # Increased L2 regularization
        max_delta_step=1,  # Help with imbalanced classes
        random_state=42,
        n_jobs=-1,
        tree_method='hist'  # Faster and often better
    )

    # Fit model (tanpa eval_set untuk menghindari error)
    model.fit(X_train, y_train)

    print(f"   ✓ Model berhasil dilatih dengan {model.n_estimators} iterasi")
    print("   ✓ Training selesai!")

except Exception as e:
    print(f"   ERROR: {type(e).__name__}: {str(e)}")
    import traceback
    traceback.print_exc()
    raise


4. Training XGBoost dengan Fitur Lengkap...
   Step 1: Encoding fitur tambahan...
   ✓ Encoding selesai
   Step 2: Menyiapkan fitur...
   ✓ Total fitur yang digunakan: 38
   Step 3: Membersihkan data...
   ✓ Data bersih
   Step 4: Encoding target...
   ✓ Target encoded, jumlah kelas: 211
   Step 5: Membagi data training dan test...
   ✓ Data split dengan stratify
   Data training: 180338, Data test: 45085
   Step 6: Training model dengan parameter yang lebih optimal...
   ✓ Model berhasil dilatih dengan 600 iterasi
   ✓ Training selesai!


In [21]:
# ==========================================
# OPSIONAL: HYPERPARAMETER TUNING (UNTUK AKURASI MAKSIMAL)
# ==========================================
# Uncomment bagian ini jika ingin melakukan tuning lebih lanjut
# WARNING: Proses ini memakan waktu lama (bisa 30-60 menit)

# print("\n4b. Hyperparameter Tuning (Opsional - Memakan Waktu)...")
# 
# # Define parameter grid
# param_grid = {
#     'max_depth': [6, 8, 10],
#     'learning_rate': [0.03, 0.05, 0.1],
#     'n_estimators': [300, 500, 700],
#     'subsample': [0.7, 0.8, 0.9],
#     'colsample_bytree': [0.7, 0.8, 0.9],
#     'min_child_weight': [1, 3, 5],
#     'gamma': [0, 0.1, 0.2],
#     'reg_alpha': [0, 0.1, 0.5],
#     'reg_lambda': [0.5, 1.0, 1.5]
# }
# 
# # Create base model
# base_model = xgb.XGBClassifier(
#     objective='multi:softprob',
#     eval_metric='mlogloss',
#     random_state=42,
#     n_jobs=-1
# )
# 
# # Randomized search (lebih cepat dari GridSearch)
# random_search = RandomizedSearchCV(
#     base_model,
#     param_distributions=param_grid,
#     n_iter=20,  # Jumlah kombinasi yang dicoba
#     cv=3,  # 3-fold CV untuk mempercepat
#     scoring='accuracy',
#     n_jobs=-1,
#     random_state=42,
#     verbose=1
# )
# 
# random_search.fit(X_train, y_train)
# 
# print(f"\n   Best Parameters: {random_search.best_params_}")
# print(f"   Best CV Score: {random_search.best_score_*100:.2f}%")
# 
# # Gunakan model terbaik
# model = random_search.best_estimator_
# 
# # Fit ulang dengan early stopping
# model.fit(
#     X_train, y_train,
#     eval_set=[(X_train, y_train), (X_test, y_test)],
#     early_stopping_rounds=20,
#     verbose=False
# )

In [None]:
# ==========================================
# BAGIAN 5: EVALUASI MODEL
# ==========================================
print("\n5. Evaluasi Model pada Data Test...")

# Prediksi pada test set
preds = model.predict(X_test)
acc = accuracy_score(y_test, preds)
print(f"   Akurasi Model (Test Set): {acc*100:.2f}%")

# Classification report untuk detail per kelas
print("\n   Classification Report (Top 10 kelas):")
unique_classes = np.unique(y_test)
if len(unique_classes) > 10:
    # Ambil 10 kelas dengan sample terbanyak
    class_counts = pd.Series(y_test).value_counts().head(10)
    top_classes = class_counts.index.tolist()
    report = classification_report(y_test, preds, labels=top_classes, 
                                   target_names=[le_target_final.inverse_transform([c])[0] for c in top_classes],
                                   zero_division=0, output_dict=True)
    print(f"   (Menampilkan 10 kelas dengan sample terbanyak dari {len(unique_classes)} total kelas)")
else:
    report = classification_report(y_test, preds, 
                                   target_names=[le_target_final.inverse_transform([c])[0] for c in unique_classes],
                                   zero_division=0)

# Feature importance
print("\n   Top 15 Fitur Paling Penting:")
feature_importance = pd.DataFrame({
    'feature': available_features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance.head(15).to_string(index=False))

# ==========================================
# BAGIAN 6: PREDIKSI & EXPORT HASIL
# ==========================================
print("\n6. Menghasilkan Prediksi Akhir...")

# Ambil data kondisi terakhir setiap customer untuk diprediksi
X_latest = latest_status.copy()

# Tambahkan fitur lanjutan untuk data prediksi
X_latest['Year'] = X_latest[col_date].dt.year
X_latest['Month'] = X_latest[col_date].dt.month
X_latest['Quarter'] = X_latest[col_date].dt.quarter
X_latest['DayOfWeek'] = X_latest[col_date].dt.dayofweek
X_latest['IsWeekend'] = (X_latest['DayOfWeek'] >= 5).astype(int)

# Customer stats untuk data prediksi
X_latest = X_latest.merge(customer_stats, on=col_id, how='left')

# Transition features
X_latest['Transition'] = X_latest[col_prod].astype(str) + '_TO_' + X_latest['Prev_Product'].astype(str)
X_latest['Transition_Frequency'] = X_latest['Transition'].map(transition_counts).fillna(0)

# Ratios
X_latest['Price_Ratio'] = X_latest[col_price] / (X_latest['Customer_hargaPelanggan_mean'] + 1e-6)
X_latest['BW_Ratio'] = X_latest[col_bw] / (X_latest['Customer_bandwidthBaru_mean'] + 1e-6)

# Interactions
X_latest['Price_BW_Interaction'] = X_latest[col_price] * X_latest[col_bw]
X_latest['Days_Order_Interaction'] = X_latest['Days_Since_Last'] * X_latest['Order_Seq']
X_latest['Is_Repeat_Customer'] = (X_latest['Order_Seq'] > 1).astype(int)
X_latest['Is_Long_Gap'] = (X_latest['Days_Since_Last'] > X_latest['Customer_Days_Since_Last_mean']).astype(int)
X_latest['Segmen_SBU'] = X_latest[col_segmen].astype(str) + '_' + X_latest[col_sbu].astype(str)

# NEW: Advanced features untuk prediksi
# Rolling statistics (gunakan nilai terakhir sebagai proxy)
X_latest['Price_Rolling_Mean_3'] = X_latest[col_price]  # Proxy: gunakan nilai saat ini
X_latest['BW_Rolling_Mean_3'] = X_latest[col_bw]
X_latest['Days_Rolling_Mean_3'] = X_latest['Days_Since_Last']

# Product popularity
X_latest['Product_Popularity'] = X_latest[col_prod].map(product_popularity).fillna(0)
X_latest['Prev_Product_Popularity'] = X_latest['Prev_Product'].map(product_popularity).fillna(0)

# Customer frequency
X_latest['Customer_Frequency'] = X_latest[col_id].map(customer_frequency).fillna(1)

# Time-based advanced
X_latest['Days_Since_Year_Start'] = (X_latest[col_date] - pd.to_datetime(X_latest['Year'].astype(str) + '-01-01')).dt.days
X_latest['Days_Since_Month_Start'] = (X_latest[col_date] - pd.to_datetime(X_latest[col_date].dt.to_period('M').astype(str))).dt.days
X_latest['Is_Month_End'] = (X_latest[col_date].dt.day > 25).astype(int)
X_latest['Is_Quarter_End'] = X_latest['Month'].isin([3, 6, 9, 12]).astype(int)

# Price change (proxy: 0 karena tidak ada history)
X_latest['Price_Change'] = 0
X_latest['Price_Change_Pct'] = 0
X_latest['BW_Change'] = 0

# Customer-Product and Segment-Product affinity
X_latest = X_latest.merge(customer_product_count, on=[col_id, col_prod], how='left')
X_latest['Customer_Product_Count'] = X_latest['Customer_Product_Count'].fillna(0)
X_latest = X_latest.merge(segmen_product_count, on=[col_segmen, col_prod], how='left')
X_latest['Segmen_Product_Count'] = X_latest['Segmen_Product_Count'].fillna(0)

# Normalized features
X_latest['Order_Seq_Normalized'] = X_latest['Order_Seq'] / (X_latest['Customer_Order_Seq_max'] + 1e-6)
X_latest['Days_Since_Last_Normalized'] = X_latest['Days_Since_Last'] / (X_latest['Customer_Days_Since_Last_mean'] + 1e-6)

# Fill NaN
X_latest = X_latest.fillna(0)

# Encoding fitur untuk data prediksi (dengan error handling)
def safe_transform(encoder, values):
    """Transform dengan handling untuk nilai yang tidak terlihat"""
    result = []
    for val in values:
        val_str = str(val)
        if val_str in encoder.classes_:
            result.append(encoder.transform([val_str])[0])
        else:
            # Gunakan nilai default (0 atau nilai pertama)
            result.append(0)
    return result

X_latest['prod_code'] = safe_transform(le_prod, X_latest[col_prod].astype(str))
X_latest['prev_prod_code'] = safe_transform(le_prev_prod, X_latest['Prev_Product'].astype(str))
X_latest['segmen_code'] = safe_transform(le_segmen, X_latest[col_segmen].astype(str))
X_latest['sbu_code'] = safe_transform(le_sbu, X_latest[col_sbu].astype(str))

# Encode interaction features (handle unseen values)
X_latest['segmen_sbu_code'] = X_latest['Segmen_SBU'].apply(
    lambda x: le_segmen_sbu.transform([str(x)])[0] if str(x) in le_segmen_sbu.classes_ else 0
)
X_latest['transition_code'] = X_latest['Transition'].apply(
    lambda x: le_transition.transform([str(x)])[0] if str(x) in le_transition.classes_ else 0
)

# Pilih kolom fitur yang sama persis dengan saat training
X_final_pred = X_latest[available_features].copy()

# Clean data prediksi - replace inf and NaN
X_final_pred = X_final_pred.replace([np.inf, -np.inf], np.nan)
X_final_pred = X_final_pred.fillna(0)

# Pastikan semua kolom ada
missing_cols = set(available_features) - set(X_final_pred.columns)
if missing_cols:
    print(f"   Warning: Kolom yang hilang: {missing_cols}, mengisi dengan 0...")
    for col in missing_cols:
        X_final_pred[col] = 0

# Pastikan urutan kolom sama dengan training
X_final_pred = X_final_pred[available_features]

# Lakukan Prediksi
print("   Melakukan prediksi...")
pred_codes = model.predict(X_final_pred)

# PENTING: Decode pakai le_target_final (yang dibuat di Bagian 4)
pred_names = le_target_final.inverse_transform(pred_codes)

# Simpan Hasil
hasil_akhir = latest_status[[col_id, col_prod, col_segmen, col_sbu, col_date]].copy()
hasil_akhir.rename(columns={col_prod: 'Produk_Saat_Ini', col_date: 'Tanggal_Terakhir'}, inplace=True)
hasil_akhir['Rekomendasi_Produk_Berikutnya'] = pred_names
hasil_akhir['Beda_ato_ngga'] = np.where(hasil_akhir['Produk_Saat_Ini'] == hasil_akhir['Rekomendasi_Produk_Berikutnya'], 'Sama', 'Beda')

# Export - file akan disimpan di folder yang sama dengan notebook
output_filename = 'Hasil_Prediksi_Flow_Fixed_2.xlsx'
hasil_akhir.to_excel(output_filename, index=False)
print(f"   Selesai! File tersimpan di: {output_filename}")


5. Evaluasi Model pada Data Test...
   Akurasi Model (Test Set): 79.94%

   Classification Report (Top 10 kelas):
   (Menampilkan 10 kelas dengan sample terbanyak dari 198 total kelas)

   Top 15 Fitur Paling Penting:
                     feature  importance
                   prod_code    0.176622
             transition_code    0.094369
              prev_prod_code    0.064246
                 segmen_code    0.050245
      Customer_Order_Seq_max    0.049218
        Transition_Frequency    0.047204
             segmen_sbu_code    0.047028
        Segmen_Product_Count    0.046669
      Customer_Product_Count    0.038998
 Customer_bandwidthBaru_mean    0.036975
  Customer_bandwidthBaru_std    0.031130
Customer_hargaPelanggan_mean    0.026722
 Customer_hargaPelanggan_std    0.025853
                    sbu_code    0.024618
                        Year    0.020363

6. Menghasilkan Prediksi Akhir...
   Melakukan prediksi...
   Selesai! File tersimpan di: Hasil_Prediksi_Flow_Fixed.xlsx
