In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [4]:
import pandas as pd
import re
import os
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ==========================================
# 1. LOAD IDS FROM THE EXTERNAL FILE
# ==========================================
nama_file_txt = "error_5050.txt"

print(f"Membaca file: {nama_file_txt} ...")

try:
    # Explicit encoding so the read does not depend on the platform default.
    # Only ASCII 'EMP-<digits>' tokens are extracted below, so undecodable
    # bytes can safely be replaced instead of aborting the read.
    with open(nama_file_txt, "r", encoding="utf-8", errors="replace") as f:
        raw_ids = f.read()
except FileNotFoundError:
    print(f"‚ùå Error: File '{nama_file_txt}' tidak ditemukan.")
    # Raise SystemExit directly instead of calling exit(): exit() is the
    # interactive-shell helper and, inside Jupyter, asks the kernel to shut
    # down. Raising SystemExit only stops the current cell/script.
    raise SystemExit(1)

# Find every 'EMP-' followed by digits. re.findall keeps repeated matches, so
# de-duplicate (preserving first-seen order) to make the "unique IDs" count in
# the message below accurate.
target_ids = list(dict.fromkeys(re.findall(r'EMP-\d+', raw_ids)))
print(f"‚úÖ Berhasil memuat {len(target_ids)} ID unik untuk dilacak.")

# ==========================================
# 2. PREPROCESSING
# ==========================================
df = pd.read_csv("../data/dataset_miniproject.csv")

# Kept aside so predictions can be mapped back to employees after splitting.
ids_series = df['Employee_ID']
y_raw = df['Attrition_Risk_Level']

# Encode & scale: one-hot encode the features, integer-encode the target,
# then standardise the encoded features.
X_raw = df.drop(columns=['Employee_ID', 'Attrition_Risk_Level'])
X_encoded = pd.get_dummies(X_raw, drop_first=True)
le = LabelEncoder()
y_encoded = le.fit_transform(y_raw)
# NOTE(review): the scaler is fitted on the full dataset before any
# train/test split, so test-set statistics influence the training features.
# Left unchanged here because changing it would alter which rows are
# misclassified - confirm whether this must match the run that produced the
# ID list.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# ==========================================
# 3. FUNGSI PELACAK OTOMATIS
# ==========================================
def cek_status_di_split(ratio_test, nama_split):
    """Retrain the MLP on one train/test split and report the tracked IDs.

    Reads the module-level ``X_scaled``, ``y_encoded``, ``ids_series``,
    ``le`` and ``target_ids``.

    Args:
        ratio_test: Passed to ``train_test_split`` as ``test_size``.
        nama_split: Label of the split; printed in the progress message and
            stored in the ``Skenario_Split`` column of the result.

    Returns:
        A DataFrame holding the test-set rows whose ``Employee_ID`` is in
        ``target_ids``, with the columns ``Employee_ID``, ``Skenario_Split``,
        ``Label_Asli``, ``Prediksi_MLP``, ``Confidence`` and ``Status``.
    """
    print(f"\n‚è≥ Sedang memproses {nama_split} (Test Size: {ratio_test})...")

    # Stratified split with a fixed seed; the ID series is split alongside
    # the features so each test row can be traced back to its employee.
    bagian = train_test_split(
        X_scaled, y_encoded, ids_series,
        test_size=ratio_test,
        random_state=52,
        stratify=y_encoded,
    )
    fitur_latih, fitur_uji, label_latih, label_uji, _, id_uji = bagian

    # Fresh model for every call, trained only on this split's training part.
    model = MLPClassifier(
        hidden_layer_sizes=(128, 64, 32, 16),
        max_iter=300,
        random_state=52,
    )
    model.fit(fitur_latih, label_latih)

    kode_prediksi = model.predict(fitur_uji)
    probabilitas = model.predict_proba(fitur_uji)

    hasil_df = pd.DataFrame({
        'Employee_ID': id_uji,
        'Label_Asli_Code': label_uji,
        'Prediksi_Code': kode_prediksi,
        # Confidence is the highest class probability of each row.
        'Confidence': probabilitas.max(axis=1),
    })

    # Turn the integer codes back into the original label values.
    hasil_df['Label_Asli'] = le.inverse_transform(hasil_df['Label_Asli_Code'])
    hasil_df['Prediksi_MLP'] = le.inverse_transform(hasil_df['Prediksi_Code'])

    cocok = hasil_df['Label_Asli'] == hasil_df['Prediksi_MLP']
    hasil_df['Status'] = cocok.map({True: 'CORRECT', False: 'WRONG'})

    # Keep only the employees being tracked and tag them with the split name.
    terlacak = hasil_df['Employee_ID'].isin(target_ids)
    laporan = hasil_df.loc[terlacak].copy()
    laporan['Skenario_Split'] = nama_split

    return laporan[
        ['Employee_ID', 'Skenario_Split', 'Label_Asli', 'Prediksi_MLP', 'Confidence', 'Status']
    ]

# ==========================================
# 4. INTERACTIVE SCENARIO MENU
# ==========================================
print("\n" + "="*40)
print("   PILIH SKENARIO YANG INGIN DICARI")
print("="*40)
print("1. Split 50:50 Only")
print("2. Split 60:40 Only")
print("3. Split 70:30 Only")
print("4. Jalankan SEMUA dan Gabungkan (All in One)")

# Strip surrounding whitespace so an answer such as "2 " is still accepted.
pilihan = input("\nMasukkan angka pilihan (1-4): ").strip()

output_df = pd.DataFrame()
nama_file_output = ""

if pilihan == '1':
    output_df = cek_status_di_split(0.5, "50:50")
    nama_file_output = "Hasil_Analisis_50_50.csv"

elif pilihan == '2':
    output_df = cek_status_di_split(0.4, "60:40")
    nama_file_output = "Hasil_Analisis_60_40.csv"

elif pilihan == '3':
    output_df = cek_status_di_split(0.3, "70:30")
    nama_file_output = "Hasil_Analisis_70_30.csv"

elif pilihan == '4':
    print("\nMenjalankan semua skenario...")
    df1 = cek_status_di_split(0.5, "50:50")
    df2 = cek_status_di_split(0.4, "60:40")
    df3 = cek_status_di_split(0.3, "70:30")
    output_df = pd.concat([df1, df2, df3])
    nama_file_output = "Hasil_Analisis_Lengkap_Gabungan.csv"

else:
    print("‚ùå Pilihan tidak valid. Keluar.")
    # Raise SystemExit directly instead of calling exit(): exit() is the
    # interactive-shell helper and, inside Jupyter, asks the kernel to shut
    # down. Raising SystemExit only stops the current cell/script.
    raise SystemExit(1)

# ==========================================
# 5. EXPORT THE RESULT
# ==========================================
print("\n" + "-"*40)
print("HASIL ANALISIS:")
print("-" * 40)

# The emptiness check used to sit inside an `if not output_df.empty:` guard,
# which made it unreachable: an empty result printed nothing at all. An empty
# frame means none of the tracked IDs is in the test part of the chosen split.
if output_df.empty:
    print("‚ö†Ô∏è  Data Kosong: ID yang kamu cari semuanya masuk ke TRAINING SET pada split ini.")
    print("    Tidak ada data yang bisa disimpan.")
else:
    print(output_df.head().to_string(index=False))

    output_df.to_csv(nama_file_output, index=False)
    print(f"\n‚úÖ File CSV berhasil disimpan: {nama_file_output}")
    print(f"   Total baris: {len(output_df)}")

Membaca file: error_5050.txt ...
‚úÖ Berhasil memuat 108 ID unik untuk dilacak.

   PILIH SKENARIO YANG INGIN DICARI
1. Split 50:50 Only
2. Split 60:40 Only
3. Split 70:30 Only
4. Jalankan SEMUA dan Gabungkan (All in One)

‚è≥ Sedang memproses 60:40 (Test Size: 0.4)...

----------------------------------------
HASIL ANALISIS:
----------------------------------------
Employee_ID Skenario_Split  Label_Asli  Prediksi_MLP  Confidence  Status
  EMP-00407          60:40           1             1    1.000000 CORRECT
  EMP-01787          60:40           1             1    1.000000 CORRECT
  EMP-04168          60:40           2             1    0.838951   WRONG
  EMP-13445          60:40           2             2    1.000000 CORRECT
  EMP-11787          60:40           2             2    1.000000 CORRECT

‚úÖ File CSV berhasil disimpan: Hasil_Analisis_60_40.csv
   Total baris: 81


In [7]:
# ==========================================
# 1. LOAD IDS FROM THE EXTERNAL FILE
# ==========================================
nama_file_txt = "error_5050.txt"

print(f"Membaca file: {nama_file_txt} ...")

try:
    # Explicit encoding so the read does not depend on the platform default.
    # Only ASCII 'EMP-<digits>' tokens are extracted below, so undecodable
    # bytes can safely be replaced instead of aborting the read.
    with open(nama_file_txt, "r", encoding="utf-8", errors="replace") as f:
        raw_ids = f.read()
except FileNotFoundError:
    print(f"‚ùå Error: File '{nama_file_txt}' tidak ditemukan.")
    # Raise SystemExit directly instead of calling exit(): exit() is the
    # interactive-shell helper and, inside Jupyter, asks the kernel to shut
    # down. Raising SystemExit only stops the current cell/script.
    raise SystemExit(1)

# re.findall keeps repeated matches, so de-duplicate (preserving first-seen
# order) to make the "unique IDs" count in the message below accurate.
target_ids = list(dict.fromkeys(re.findall(r'EMP-\d+', raw_ids)))
print(f"‚úÖ Berhasil memuat {len(target_ids)} ID unik untuk dilacak.")

# ==========================================
# 2. PREPROCESSING
# ==========================================
df = pd.read_csv("../data/dataset_miniproject.csv")

# Kept aside so predictions can be mapped back to employees after splitting.
ids_series = df['Employee_ID']
y_raw = df['Attrition_Risk_Level']

# Encode & scale: one-hot encode the features, integer-encode the target,
# then standardise the encoded features.
X_raw = df.drop(columns=['Employee_ID', 'Attrition_Risk_Level'])
X_encoded = pd.get_dummies(X_raw, drop_first=True)
le = LabelEncoder()
y_encoded = le.fit_transform(y_raw)
# NOTE(review): the scaler is fitted on the full dataset before any
# train/test split, so test-set statistics influence the training features.
# Left unchanged here because changing it would alter which rows are
# misclassified - confirm whether this must match the run that produced the
# ID list.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# ==========================================
# 3. FUNGSI PELACAK OTOMATIS
# ==========================================
def cek_status_di_split(ratio_test, nama_split):
    """Retrain the MLP on one train/test split and report the tracked IDs.

    Reads the module-level ``X_scaled``, ``y_encoded``, ``ids_series``,
    ``le`` and ``target_ids``.

    Args:
        ratio_test: Passed to ``train_test_split`` as ``test_size``.
        nama_split: Label of the split; only used in the progress message.

    Returns:
        A DataFrame holding the test-set rows whose ``Employee_ID`` is in
        ``target_ids``, with the columns ``Employee_ID``, ``Label_Asli``,
        ``Prediksi_MLP``, ``Confidence`` and ``Status``.
    """
    print(f"\n‚è≥ Sedang memproses {nama_split} ... mohon tunggu.")

    # Stratified split with a fixed seed (52); the ID series is split together
    # with the features so each test row can be traced back to its employee.
    (fitur_latih, fitur_uji,
     label_latih, label_uji,
     _, id_uji) = train_test_split(
        X_scaled, y_encoded, ids_series,
        test_size=ratio_test,
        random_state=52,
        stratify=y_encoded,
    )

    # Fresh model for every call, trained only on this split's training part.
    pengklasifikasi = MLPClassifier(
        hidden_layer_sizes=(128, 64, 32, 16),
        max_iter=300,
        random_state=52,
    )
    pengklasifikasi.fit(fitur_latih, label_latih)

    kode_prediksi = pengklasifikasi.predict(fitur_uji)
    probabilitas = pengklasifikasi.predict_proba(fitur_uji)
    # Confidence is the highest class probability of each row.
    keyakinan = probabilitas.max(axis=1)

    hasil_df = pd.DataFrame({
        'Employee_ID': id_uji,
        'Label_Asli_Code': label_uji,
        'Prediksi_Code': kode_prediksi,
        'Confidence': keyakinan,
    })

    # Turn the integer codes back into the original label values.
    hasil_df['Label_Asli'] = le.inverse_transform(hasil_df['Label_Asli_Code'])
    hasil_df['Prediksi_MLP'] = le.inverse_transform(hasil_df['Prediksi_Code'])

    sama = hasil_df['Label_Asli'] == hasil_df['Prediksi_MLP']
    hasil_df['Status'] = sama.map({True: 'BENAR', False: 'SALAH'})

    # Keep only the employees listed in the ID file.
    kolom_laporan = ['Employee_ID', 'Label_Asli', 'Prediksi_MLP', 'Confidence', 'Status']
    laporan = hasil_df.loc[hasil_df['Employee_ID'].isin(target_ids)].copy()

    return laporan[kolom_laporan]

# ==========================================
# 4. INTERACTIVE MENU
# ==========================================
print("\n" + "="*40)
print("   MENU PILIHAN ANALISIS SPLIT")
print("="*40)
print("1. Cek di Split 50:50")
print("2. Cek di Split 60:40")
print("3. Cek di Split 70:30")
print("4. Jalankan SEMUA & Gabungkan (All-in-One)")
print("0. Keluar")

# Strip surrounding whitespace so an answer such as "3 " is still accepted.
pilihan = input("\nMasukkan nomor pilihan (0-4): ").strip()

output_df = pd.DataFrame()
nama_file_output = ""

if pilihan == '1':
    output_df = cek_status_di_split(0.5, "Split 50:50")
    nama_file_output = "Analisis_Error_Khusus_50_50.csv"

elif pilihan == '2':
    output_df = cek_status_di_split(0.4, "Split 60:40")
    nama_file_output = "Analisis_Error_Khusus_60_40.csv"

elif pilihan == '3':
    output_df = cek_status_di_split(0.3, "Split 70:30")
    nama_file_output = "Analisis_Error_Khusus_70_30.csv"

elif pilihan == '4':
    # Run every scenario, tag each result with its split, then stack them.
    df50 = cek_status_di_split(0.5, "Split 50:50")
    df60 = cek_status_di_split(0.4, "Split 60:40")
    df70 = cek_status_di_split(0.3, "Split 70:30")

    df50['Skenario'] = '50:50'
    df60['Skenario'] = '60:40'
    df70['Skenario'] = '70:30'

    output_df = pd.concat([df50, df60, df70])
    nama_file_output = "Analisis_Error_Gabungan_Lengkap.csv"

elif pilihan == '0':
    print("Keluar dari program.")
    # Raise SystemExit directly instead of calling exit(): exit() is the
    # interactive-shell helper and, inside Jupyter, asks the kernel to shut
    # down. Raising SystemExit only stops the current cell/script.
    raise SystemExit(0)

else:
    print("Pilihan tidak valid.")
    # Non-zero status: an unrecognised choice is an error, not a normal exit.
    raise SystemExit(1)

# ==========================================
# 5. SAVE THE RESULT
# ==========================================
if not output_df.empty:
    print(f"\n‚úÖ Analisis Selesai! Ditemukan {len(output_df)} baris data.")
    output_df.to_csv(nama_file_output, index=False)
    print(f"üìÇ File berhasil disimpan sebagai: {nama_file_output}")

    print("\nPreview Data:")
    print(output_df.head())
else:
    # An empty frame means none of the tracked IDs is in the test part of the
    # chosen split, so there is nothing to write.
    print("\n‚ö†Ô∏è Tidak ada data ID tersebut yang ditemukan di Test Set split ini.")
    print("(Kemungkinan ID tersebut masuk ke Training Set pada split ratio ini).")

Membaca file: error_5050.txt ...
‚úÖ Berhasil memuat 108 ID unik untuk dilacak.

   MENU PILIHAN ANALISIS SPLIT
1. Cek di Split 50:50
2. Cek di Split 60:40
3. Cek di Split 70:30
4. Jalankan SEMUA & Gabungkan (All-in-One)
0. Keluar

‚è≥ Sedang memproses Split 70:30 ... mohon tunggu.

‚úÖ Analisis Selesai! Ditemukan 59 baris data.
üìÇ File berhasil disimpan sebagai: Analisis_Error_Khusus_70_30.csv

Preview Data:
     Employee_ID  Label_Asli  Prediksi_MLP  Confidence Status
3581   EMP-03582           1             1    1.000000  BENAR
1781   EMP-01782           0             0    1.000000  BENAR
5935   EMP-05936           0             0    1.000000  BENAR
6303   EMP-06304           0             0    1.000000  BENAR
4167   EMP-04168           2             1    0.996063  SALAH
