In [None]:
# ==========================================
# BAGIAN 1: IMPORT LIBRARY & LOAD DATA
# ==========================================
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:
# Local input path: put the Excel workbook in the same folder as this notebook,
# or change `file_path` so it points at the file's real location.
import os
file_path = 'SPE-OPT-31122025.xlsx'  # resolved relative to the notebook's working directory
# Alternative: file_path = r'D:\ICON+\Prediksi Flow Produk\SPE-OPT-31122025.xlsx'

print("1. Sedang membaca file (ini mungkin memakan waktu untuk 200k baris)...")
try:
    # If the source is a CSV file, swap read_excel for read_csv.
    df = pd.read_excel(file_path)
    print(f"   Sukses! Total data: {len(df)} baris.")
except Exception as e:
    print(f"   Error: {e}")
    print(f"   Pastikan file 'SPE-OPT-31122025.xlsx' ada di folder yang sama dengan notebook ini.")
    # Re-raise after printing the hints: without this the notebook keeps
    # running with `df` undefined (or with a stale `df` left over from an
    # earlier kernel session), and the failure surfaces later as a confusing
    # NameError or as results computed from the wrong data.
    raise

1. Sedang membaca file (ini mungkin memakan waktu untuk 200k baris)...
   Sukses! Total data: 275088 baris.


In [4]:
# ==========================================
# PART 2: CLEANING & ADVANCED FEATURE ENGINEERING (MODIFIED)
# ==========================================
print("2. Membersihkan & Menambah Fitur Canggih...")

# Source column names used throughout the notebook.
# `col_date` could be switched to 'tanggalBuatPermohonan' if that column is available.
col_id, col_prod, col_date = 'idPerusahaan', 'namaProduk', 'tanggalAwalKontrak'
col_segmen, col_sbu = 'segmenCustomer', 'sbuOwner'
# Price and bandwidth were added later as extra model inputs.
col_price, col_bw = 'hargaPelanggan', 'bandwidthBaru'

# Work on an independent copy restricted to the columns the model needs,
# so the raw frame `df` is never modified.
cols_to_use = [col_id, col_prod, col_date, col_segmen, col_sbu, col_price, col_bw]
df_ml = df.loc[:, cols_to_use].copy()

# 1. Normalise data types: dates that cannot be parsed become NaT.
df_ml[col_date] = pd.to_datetime(df_ml[col_date], errors='coerce')

# Clean Price: parse raw price values (numbers or formatted strings) into floats.
def clean_currency(x):
    """Convert a raw price value to ``float``; return ``0.0`` if it cannot be parsed.

    Parsing rules for string input:
      * both ',' and '.' present -> whichever appears last is the decimal mark,
        the other one is a thousands separator ('1.234.567,50' and
        '1,234,567.50' both give 1234567.5);
      * the same separator repeated -> thousands separators ('1.234.567' gives
        1234567.0);
      * a single ',' or a single '.' -> decimal mark ('12,5' gives 12.5), which
        is what the previous implementation did for such values.

    Missing values (``None``, NaN, the string 'nan') and unparseable text give
    ``0.0``. The result is always a ``float``.
    """
    if x is None:
        return 0.0

    # Real numbers need no string round-trip; bool is excluded so True/False
    # keep falling through to the "unparseable -> 0.0" path.
    if isinstance(x, (int, float)) and not isinstance(x, bool):
        value = float(x)
        return 0.0 if value != value else value  # NaN is the only value != itself

    text = str(x).strip()
    n_comma = text.count(',')
    n_dot = text.count('.')

    if n_comma and n_dot:
        if text.rfind(',') > text.rfind('.'):
            # European/Indonesian style: dots group thousands, comma is decimal.
            text = text.replace('.', '').replace(',', '.')
        else:
            # US style: commas group thousands, dot is decimal.
            text = text.replace(',', '')
    elif n_comma > 1:
        text = text.replace(',', '')
    elif n_dot > 1:
        text = text.replace('.', '')
    else:
        text = text.replace(',', '.')

    try:
        value = float(text)
    except ValueError:
        return 0.0
    return 0.0 if value != value else value
# Numeric clean-up: prices go through clean_currency, bandwidth values that
# cannot be read as numbers become 0.
df_ml[col_price] = df_ml[col_price].apply(clean_currency)
df_ml[col_bw] = pd.to_numeric(df_ml[col_bw], errors='coerce').fillna(0)

# 2. Drop rows without a product or a usable date, then order each customer's
#    rows chronologically. The shift-based features below depend on this order.
df_ml = (
    df_ml
    .dropna(subset=[col_prod, col_date])
    .sort_values(by=[col_id, col_date])
)

# 3. Per-customer "flow" features
grouped = df_ml.groupby(col_id)

# Feature A: product on the customer's previous row
# (the customer's first row gets the placeholder 'New Customer').
df_ml['Prev_Product'] = grouped[col_prod].shift().fillna('New Customer')

# Feature B: gap in days since the customer's previous row (-1 when there is none).
df_ml['Prev_Date'] = grouped[col_date].shift()
df_ml['Days_Since_Last'] = df_ml[col_date].sub(df_ml['Prev_Date']).dt.days.fillna(-1)

# Feature C: 1-based position of the row within the customer's history.
df_ml['Order_Seq'] = grouped.cumcount().add(1)

# 4. Target: product on the customer's next row (NaN on the customer's last row).
df_ml['Next_Product'] = grouped[col_prod].shift(periods=-1)

# Rows with a known next product form the training pool; the last row of every
# customer is kept separately as the state to predict from.
train_data_raw = df_ml.loc[df_ml['Next_Product'].notna()].copy()
latest_status = df_ml.groupby(col_id).tail(1).copy()

# Keep only targets that occur at least twice in the training pool.
target_counts = train_data_raw['Next_Product'].value_counts()
valid_targets = target_counts.loc[target_counts.ge(2)].index
train_data = train_data_raw.loc[train_data_raw['Next_Product'].isin(valid_targets)].copy()

print(f"   Data Ready! Added features: Price, Bandwidth, Days_Since_Last, Prev_Product")

2. Membersihkan & Menambah Fitur Canggih...
   Data Ready! Added features: Price, Bandwidth, Days_Since_Last, Prev_Product


In [6]:
# ==========================================
# PART 3: ENCODING (FIX - HANDLES HISTORY DATA)
# ==========================================
print("3. Encoding Data & Persiapan Fitur...")

# 1. Product vocabulary: current, previous and next product names combined, so
#    every product encoder knows every label it can meet (including the
#    'New Customer' placeholder stored in Prev_Product).
all_products = pd.concat(
    [df_ml[col_prod], df_ml['Prev_Product'], df_ml['Next_Product'].dropna()]
).unique().astype(str)

# 2. Create and fit the encoders. The three product encoders share one vocabulary.
le_prod = LabelEncoder().fit(all_products)
le_prev_prod = LabelEncoder().fit(all_products)
le_target = LabelEncoder().fit(all_products)

# Segment and SBU encoders are fitted on the full cleaned frame.
le_segmen = LabelEncoder().fit(df_ml[col_segmen].astype(str))
le_sbu = LabelEncoder().fit(df_ml[col_sbu].astype(str))

# 3. Turn the text columns of the training rows into integer codes,
#    stored in new columns with a '_code' suffix.
train_data = train_data.assign(
    prod_code=le_prod.transform(train_data[col_prod].astype(str)),
    prev_prod_code=le_prev_prod.transform(train_data['Prev_Product'].astype(str)),
    segmen_code=le_segmen.transform(train_data[col_segmen].astype(str)),
    sbu_code=le_sbu.transform(train_data[col_sbu].astype(str)),
    target_code=le_target.transform(train_data['Next_Product'].astype(str)),
)

print("   Encoding Selesai!")
print("   Fitur Siap: Produk, History Produk, Harga, Durasi, Segmen, dll.")

3. Encoding Data & Persiapan Fitur...
   Encoding Selesai!
   Fitur Siap: Produk, History Produk, Harga, Durasi, Segmen, dll.


In [8]:
# ==========================================
# PART 4: TRAINING XGBOOST (FIXED: RE-ENCODING TARGET)
# ==========================================
print("\n4. Training XGBoost...")

# 1. Feature matrix (X)
# Gaps in the integer codes of the feature columns are not a problem.
features = [
    'prod_code',
    'prev_prod_code',
    'segmen_code',
    'sbu_code',
    'Days_Since_Last',
    col_price,
    col_bw,
    'Order_Seq'
]

X = train_data[features]

# 2. Target (y): fit a fresh encoder on the filtered training rows so the class
# ids are renumbered from scratch. Per the original author's note this is the
# fix for the "Invalid classes" error. Decode predictions with this encoder.
le_target_final = LabelEncoder()
y = le_target_final.fit_transform(train_data['Next_Product'].astype(str))

# 3. Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 4. Train the model.
# `use_label_encoder` is intentionally not passed: the installed XGBoost no
# longer accepts it and only answers with a
# 'Parameters: { "use_label_encoder" } are not used.' warning.
model = xgb.XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1
)

model.fit(X_train, y_train)


4. Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# 5. Evaluation on the held-out split
preds = model.predict(X_test)
acc = accuracy_score(y_test, preds)
print(f"   Akurasi Model: {acc*100:.2f}%")

# ==========================================
# PART 5: PREDICTION & EXPORT OF RESULTS
# ==========================================
print("\n5. Menghasilkan Prediksi Akhir...")

# Start from each customer's most recent row and add the encoded feature
# columns, using the encoders fitted in Part 3.
X_latest = latest_status.assign(
    prod_code=le_prod.transform(latest_status[col_prod].astype(str)),
    prev_prod_code=le_prev_prod.transform(latest_status['Prev_Product'].astype(str)),
    segmen_code=le_segmen.transform(latest_status[col_segmen].astype(str)),
    sbu_code=le_sbu.transform(latest_status[col_sbu].astype(str)),
)

# Same feature columns, in the same order, as used for training.
X_final_pred = X_latest.loc[:, features]

# Predict class ids, then map them back to product names.
# IMPORTANT: decode with le_target_final (the encoder fitted in Part 4).
pred_codes = model.predict(X_final_pred)
pred_names = le_target_final.inverse_transform(pred_codes)

# Assemble the result table: one row per customer with the current product,
# the recommended next product and a same/different flag.
hasil_akhir = latest_status[[col_id, col_prod, col_segmen, col_sbu, col_date]].rename(
    columns={col_prod: 'Produk_Saat_Ini', col_date: 'Tanggal_Terakhir'}
)
hasil_akhir['Rekomendasi_Produk_Berikutnya'] = pred_names
hasil_akhir['Beda_ato_ngga'] = np.where(
    hasil_akhir['Produk_Saat_Ini'].eq(hasil_akhir['Rekomendasi_Produk_Berikutnya']),
    'Sama',
    'Beda',
)

# Export: the workbook is written next to the notebook (relative path).
output_filename = 'Hasil_Prediksi_Flow_Fixed.xlsx'
hasil_akhir.to_excel(output_filename, index=False)
print(f"   Selesai! File tersimpan di: {output_filename}")

   Akurasi Model: 77.50%

5. Menghasilkan Prediksi Akhir...
   Selesai! File tersimpan di: /content/drive/MyDrive/Icon Plus/Hasil_Prediksi_Flow_Fixed.xlsx
