In [4]:
# ==========================================
# BAGIAN 1: IMPORT LIBRARY & LOAD DATA
# ==========================================
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Local data path: keep the Excel file in the same folder as the notebook,
# or change file_path to wherever the file actually lives.
import os
file_path = 'SPE-OPT-31122025.xlsx'  # File in the same folder as the notebook
# Alternative: file_path = r'D:\ICON+\Prediksi Flow Produk\SPE-OPT-31122025.xlsx'

print("1. Sedang membaca file (ini mungkin memakan waktu untuk 200k baris)...")
try:
    # If the source is a CSV file, swap read_excel for read_csv
    df = pd.read_excel(file_path)
except Exception as e:
    print(f"   Error: {e}")
    print(f"   Pastikan file 'SPE-OPT-31122025.xlsx' ada di folder yang sama dengan notebook ini.")
    # Re-raise after printing the hint: every later cell needs `df`, so
    # swallowing the error here would only resurface as a confusing NameError
    # further down (and would let "Run All" continue past a failed load).
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist.
    print(f"   Sukses! Total data: {len(df)} baris.")

1. Sedang membaca file (ini mungkin memakan waktu untuk 200k baris)...
   Sukses! Total data: 275088 baris.


In [None]:
# ==========================================
# PART 2: CLEANING & RARE-TARGET FILTERING (FIX)
# ==========================================
print("2. Membersihkan & Memfilter Data...")

# Column names of the raw sheet that the rest of the notebook relies on.
col_id = 'idPerusahaan'
col_prod = 'namaProduk'
col_date = 'tanggalAwalKontrak'
col_segmen = 'segmenCustomer'
col_sbu = 'sbuOwner'
col_sales = 'namaSales'  # Optional

selected_columns = [col_id, col_prod, col_date, col_segmen, col_sbu, col_sales]
df_ml = df[selected_columns].copy()

# 1. Parse the contract date (unparseable values become NaT), drop rows that
#    lack a product or a valid date, then order each company's rows by date.
df_ml[col_date] = pd.to_datetime(df_ml[col_date], errors='coerce')
df_ml = (
    df_ml
    .dropna(subset=[col_prod, col_date])
    .sort_values(by=[col_id, col_date])
)

# 2. Normalise the categorical text columns: cast to str, upper-case, strip.
for text_col in (col_prod, col_segmen, col_sbu):
    as_text = df_ml[text_col].astype(str)
    df_ml[text_col] = as_text.str.upper().str.strip()

# 3. Target = the product on the same company's next row (NaN on its last row).
products_per_company = df_ml.groupby(col_id)[col_prod]
df_ml['Next_Product'] = products_per_company.shift(-1)

# Rows that have a next product are training candidates; the last row of each
# company is kept separately as the state to predict from.
has_next_product = df_ml['Next_Product'].notna()
train_data_raw = df_ml[has_next_product].copy()
latest_status = df_ml.groupby(col_id).tail(1).copy()

# --- RARE-TARGET FILTER ---
# Count how often each product occurs as a target and keep only the targets
# seen at least twice, so a later split can place each one on both sides.
target_counts = train_data_raw['Next_Product'].value_counts()
valid_targets = target_counts.loc[target_counts >= 2].index

is_valid_target = train_data_raw['Next_Product'].isin(valid_targets)
train_data = train_data_raw.loc[is_valid_target].copy()

print(f"   Data training awal: {len(train_data_raw)}")
print(f"   Data setelah filter produk langka: {len(train_data)}")
print(f"   (Produk yang cuma muncul 1x dibuang agar model tidak error)")

2. Membersihkan & Memfilter Data...
   Data training awal: 225441
   Data setelah filter produk langka: 225423
   (Produk yang cuma muncul 1x dibuang agar model tidak error)


In [None]:
# ==========================================
# PART 3: ENCODING (FIX)
# ==========================================
print("3. Encoding Data...")


def _encode_with_fallback(values, encoder, fallback_code=0):
    """Return the integer codes of ``values`` under a fitted ``LabelEncoder``.

    Builds the label -> code table once from ``encoder.classes_`` and applies
    it with a single vectorised ``Series.map`` instead of calling
    ``encoder.transform`` (plus a linear membership scan) for every row.

    Parameters
    ----------
    values : pandas.Series
        Labels to encode.
    encoder : LabelEncoder
        An already-fitted encoder.
    fallback_code : int, default 0
        Code given to labels the encoder has never seen. Note that 0 is also
        the code of the first class, so unseen labels are aliased to it.

    Returns
    -------
    pandas.Series
        int64 codes, index-aligned with ``values``.
    """
    # A label's code is its position in the encoder's classes_ array.
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    return values.map(code_by_label).fillna(fallback_code).astype('int64')


# Input encoders
le_prod_input = LabelEncoder()
le_segmen = LabelEncoder()
le_sbu = LabelEncoder()

# Fit on every value that can occur (current products and next products)
all_prods = pd.concat([df_ml[col_prod], df_ml['Next_Product'].dropna()]).unique()
le_prod_input.fit(all_prods)
le_segmen.fit(df_ml[col_segmen])
le_sbu.fit(df_ml[col_sbu])

# Target encoder: fitted ONLY on the filtered training targets, so its codes
# are exactly the classes the model is trained on.
le_target = LabelEncoder()
le_target.fit(train_data['Next_Product'])

# Transform the training rows
train_data['prod_code'] = le_prod_input.transform(train_data[col_prod])
train_data['segmen_code'] = le_segmen.transform(train_data[col_segmen])
train_data['sbu_code'] = le_sbu.transform(train_data[col_sbu])
train_data['target_code'] = le_target.transform(train_data['Next_Product'])

# Transform the latest-status rows.
# Labels unknown to an encoder fall back to code 0. The encoders above are
# fitted on df_ml, so the fallback is only a safety net here.
latest_status['prod_code'] = _encode_with_fallback(latest_status[col_prod], le_prod_input)
latest_status['segmen_code'] = _encode_with_fallback(latest_status[col_segmen], le_segmen)
latest_status['sbu_code'] = _encode_with_fallback(latest_status[col_sbu], le_sbu)

3. Encoding Data...


In [None]:
# ==========================================
# PART 4: TRAINING WITH A STRATIFIED SPLIT
# ==========================================
print("4. Training XGBoost...")

feature_columns = ['prod_code', 'segmen_code', 'sbu_code']
X = train_data[feature_columns]
y = train_data['target_code']

# IMPORTANT: stratify on y so the 80/20 split keeps the class proportions of
# the target on both the training and the test side.
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': y,  # <--- this is the key setting
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Multi-class classifier: one class per distinct encoded target.
xgb_params = {
    'objective': 'multi:softmax',
    'num_class': len(le_target.classes_),
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    'n_jobs': -1,
}
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Hold-out accuracy, reported as a percentage.
preds = model.predict(X_test)
accuracy_pct = accuracy_score(y_test, preds) * 100
print(f"   Model Berhasil Dilatih! Akurasi: {accuracy_pct:.2f}%")

4. Training XGBoost...
   Model Berhasil Dilatih! Akurasi: 73.04%


In [None]:
# ==========================================
# PART 5: PREDICTION & EXPORT (FIX)
# ==========================================
print("5. Menghasilkan Prediksi...")

# Model input: the encoded feature columns of each company's latest row
X_latest = latest_status[['prod_code', 'segmen_code', 'sbu_code']]

# The model outputs le_target codes, so decode them with le_target
# (not with the input product encoder) to get product names back.
pred_codes = model.predict(X_latest)
pred_names = le_target.inverse_transform(pred_codes)

# Assemble the report: identifying columns, friendlier headers, and the
# recommendation appended as the last column.
report_columns = [col_id, col_prod, col_segmen, col_sbu, col_sales, col_date]
report_headers = {col_prod: 'Produk_Saat_Ini', col_date: 'Tanggal_Terakhir'}
hasil_akhir = (
    latest_status[report_columns]
    .rename(columns=report_headers)
    .assign(Rekomendasi_Produk_Berikutnya=pred_names)
)

# Export to Excel; a bare file name is written relative to the working directory
output_filename = 'Hasil_Prediksi_XGBoost.xlsx'
hasil_akhir.to_excel(output_filename, index=False)

print(f"\nSELESAI! File hasil prediksi telah disimpan di:\n{output_filename}")

5. Menghasilkan Prediksi...

SELESAI! File hasil prediksi telah disimpan di:
Hasil_Prediksi_XGBoost.xlsx
