# ===== 01. Penggunaan library ===== #

In [20]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from tabulate import tabulate

warnings.filterwarnings('ignore')

# ===== 02. Load Dataset ===== #

In [21]:
file_path = '../data/raw/Dataset Bahan Makanan & Alergen.xlsx'
df = pd.read_excel(file_path, sheet_name = 'Dataset')

In [28]:
import numpy as np
from collections import Counter

print("🔍 Informasi Dataset:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# **IMPORTANT: Check target distribution**
print("\n🎯 Distribusi Target 'Prediksi':")
print(df['Prediksi'].value_counts())
print(f"\nTotal samples: {len(df)}")

# **VERY IMPORTANT: Check encoded data**
print(f"\n🤖 Check Encoded Data:")
print(f"X_encoded shape: {X_encoded.shape}")
print(f"y_encoded shape: {y_encoded.shape}")
print(f"Unique y_encoded values: {np.unique(y_encoded)}")

# Check if there's data leakage or imbalanced classes
print(f"\ny_encoded distribution: {Counter(y_encoded)}")

# Check a few samples of encoded data to see if they're different
print(f"\n🔍 Sample of X_encoded (first 3 rows, first 10 columns):")
print(X_encoded.iloc[:3, :10])
print(f"...(showing first 10 of {X_encoded.shape[1]} total features)")

# Check if all samples are identical (this could explain the same prediction)
if len(X_encoded) > 1:
    first_row = X_encoded.iloc[0].values
    all_identical = True
    for i in range(1, min(10, len(X_encoded))):  # Check first 10 rows
        if not np.array_equal(first_row, X_encoded.iloc[i].values):
            all_identical = False
            break
    print(f"\n⚠️ Are first 10 rows identical? {all_identical}")
    
    # Check variance across features
    feature_variance = X_encoded.var()
    zero_variance_features = (feature_variance == 0).sum()
    print(f"Features with zero variance: {zero_variance_features} out of {len(feature_variance)}")
    print(f"Percentage of zero-variance features: {zero_variance_features/len(feature_variance)*100:.2f}%")
    
    # Show some features with highest variance
    top_variance_features = feature_variance.nlargest(10)
    print(f"\nTop 10 features with highest variance:")
    for feature, variance in top_variance_features.items():
        print(f"  {feature}: {variance:.4f}")

🔍 Informasi Dataset:
Shape: (399, 8)
Columns: ['Nama Produk Makanan', 'Bahan Utama', 'Pemanis', 'Lemak/Minyak', 'Penyedap Rasa', 'Alergen', 'Prediksi', 'Keterangan']

🎯 Distribusi Target 'Prediksi':
Prediksi
Mengandung Alergen          256
Tidak Mengandung Alergen    143
Name: count, dtype: int64

Total samples: 399

🤖 Check Encoded Data:
X_encoded shape: (399, 623)
y_encoded shape: (399,)
Unique y_encoded values: [0 1]

y_encoded distribution: Counter({np.int64(0): 256, np.int64(1): 143})

🔍 Sample of X_encoded (first 3 rows, first 10 columns):
   Nama Produk Makanan_Air Lemon Manis  Nama Produk Makanan_Apel  \
0                                False                     False   
1                                False                     False   
2                                False                     False   

   Nama Produk Makanan_Apel Karamel  Nama Produk Makanan_Apel Panggang  \
0                             False                              False   
1                          

# ===== 03. Pilih Atribut & Target ===== #

In [23]:
fitur = ['Nama Produk Makanan', 'Bahan Utama', 'Pemanis', 'Lemak/Minyak', 'Penyedap Rasa', 'Alergen']
target = 'Prediksi'

X = df[fitur]
y = df[target]

# ===== 04. Transformasi Nominal ke Numerik ===== #

In [24]:
X_encoded = pd.get_dummies(X)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ===== 05. Inisialisasi Model SVM + AdaBoost ===== #

In [25]:
svm_base = SVC(kernel = 'linear', probability = True, random_state = 42)
svm_adaboost_model = AdaBoostClassifier(estimator = svm_base, n_estimators = 50, random_state = 42)

# ===== 06. Evaluasi dengan Cross Validation (K = 10) ===== #

In [26]:
k = 10
cv = StratifiedKFold(n_splits = k, shuffle = True, random_state = 42)
cv_scores = cross_val_score(svm_adaboost_model, X_encoded, y_encoded, cv = cv, scoring = 'accuracy')

df_cv = pd.DataFrame({'Fold': [f'Fold {i+1}' for i in range(k)], 'Akurasi': [f"{score * 100:.2f}%" for score in cv_scores]})

df_cv.loc[k] = ['Rata-Rata', f"{cv_scores.mean() * 100:.2f}%"]

print("\n===== Evaluasi Cross Validation (K = 10) =====")
print(tabulate(df_cv, headers = 'keys', tablefmt = 'grid', showindex = False))

KeyboardInterrupt: 

# ===== 07. Pelatihan Model di Seluruh Data ===== #

In [30]:
svm_adaboost_model.fit(X_encoded, y_encoded)

0,1,2
,estimator,SVC(kernel='l...ndom_state=42)
,n_estimators,50
,learning_rate,1.0
,algorithm,'deprecated'
,random_state,42

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


# ===== 08. Prediksi Data Baru ===== #

In [31]:
# ===== DEBUGGING: Why all predictions are the same? =====

print("🔍 DEBUGGING: Model Analysis")
print(f"Label encoder classes: {le.classes_}")
print(f"Label mapping: 0 -> {le.classes_[0]}, 1 -> {le.classes_[1]}")

# Test with data from training set to see if model works correctly
print("\n=== Test with TRAINING data (should have different results) ===")
sample_indices = [0, 1, 10, 50, 100]  # Different samples from training data

for i in sample_indices:
    # Get original training sample
    train_sample = X_encoded.iloc[[i]]  # Keep as DataFrame with correct shape
    train_target = y_encoded[i]
    
    # Predict
    pred_train = svm_adaboost_model.predict(train_sample)
    prob_train = svm_adaboost_model.predict_proba(train_sample)
    
    # Convert back to label
    pred_label = le.inverse_transform(pred_train)[0]
    actual_label = le.classes_[train_target]
    
    confidence = prob_train[0][pred_train[0]] * 100
    
    print(f"Sample {i}: Actual={actual_label}, Predicted={pred_label}, Confidence={confidence:.2f}%")
    print(f"  Probabilities: {prob_train[0]}")

print("\n🤔 Now let's see what happens with NEW data...")

# Create a simple test case that should be very different
test_simple = {'Nama Produk Makanan': 'Test Simple', 'Bahan Utama': 'Test Bahan', 'Pemanis': 'Test Pemanis', 'Lemak/Minyak': 'Test Minyak', 'Penyedap Rasa': 'Test Rasa', 'Alergen': 'Test Alergen'}

df_test_simple = pd.DataFrame([test_simple])
df_test_simple_encoded = pd.get_dummies(df_test_simple)

print(f"\nTest simple encoded shape: {df_test_simple_encoded.shape}")
print(f"Training data shape: {X_encoded.shape}")

# Sync columns
for col in X_encoded.columns:
    if col not in df_test_simple_encoded.columns:
        df_test_simple_encoded[col] = 0

df_test_simple_encoded = df_test_simple_encoded[X_encoded.columns]

print(f"Test simple aligned shape: {df_test_simple_encoded.shape}")

# Check if test data is all zeros (this would explain the problem!)
non_zero_features = (df_test_simple_encoded != 0).sum().sum()
total_features = df_test_simple_encoded.shape[0] * df_test_simple_encoded.shape[1]
print(f"Non-zero features in test data: {non_zero_features} out of {total_features}")
print(f"Percentage non-zero: {non_zero_features/total_features*100:.4f}%")

# Make prediction
pred_simple = svm_adaboost_model.predict(df_test_simple_encoded)
prob_simple = svm_adaboost_model.predict_proba(df_test_simple_encoded)

print(f"\nSimple test prediction: {le.inverse_transform(pred_simple)[0]}")
print(f"Simple test confidence: {prob_simple[0][pred_simple[0]]*100:.2f}%")
print(f"Simple test probabilities: {prob_simple[0]}")

print("\n🎯 HYPOTHESIS: New data becomes mostly zeros after one-hot encoding!")
print("When we have new categories not seen in training, they all become 0 vectors.")
print("The model then always predicts the same thing for zero vectors!")

# Let's test with data that MATCHES training categories
print("\n=== Test with data using EXISTING training categories ===")

# Find some actual values from training data
print("Some actual training values:")
print(f"Nama Produk: {df['Nama Produk Makanan'].unique()[:5]}")
print(f"Bahan Utama: {df['Bahan Utama'].unique()[:5]}")
print(f"Pemanis: {df['Pemanis'].unique()[:3]}")
print(f"Lemak/Minyak: {df['Lemak/Minyak'].unique()[:3]}")

# Test with existing categories
test_existing = {
    'Nama Produk Makanan': df['Nama Produk Makanan'].iloc[0],
    'Bahan Utama': df['Bahan Utama'].iloc[1],  # Mix different values
    'Pemanis': df['Pemanis'].iloc[2], 
    'Lemak/Minyak': df['Lemak/Minyak'].iloc[3],
    'Penyedap Rasa': df['Penyedap Rasa'].iloc[4],
    'Alergen': df['Alergen'].iloc[5]
}

print(f"\nTesting with existing categories: {test_existing}")

df_test_existing = pd.DataFrame([test_existing])
df_test_existing_encoded = pd.get_dummies(df_test_existing)

for col in X_encoded.columns:
    if col not in df_test_existing_encoded.columns:
        df_test_existing_encoded[col] = 0
        
df_test_existing_encoded = df_test_existing_encoded[X_encoded.columns]

# Count non-zero features
non_zero_existing = (df_test_existing_encoded != 0).sum().sum()
print(f"Non-zero features with existing categories: {non_zero_existing}")

# Predict
pred_existing = svm_adaboost_model.predict(df_test_existing_encoded)
prob_existing = svm_adaboost_model.predict_proba(df_test_existing_encoded)

print(f"Prediction with existing categories: {le.inverse_transform(pred_existing)[0]}")
print(f"Confidence: {prob_existing[0][pred_existing[0]]*100:.2f}%")
print(f"Probabilities: {prob_existing[0]}")

🔍 DEBUGGING: Model Analysis
Label encoder classes: ['Mengandung Alergen' 'Tidak Mengandung Alergen']
Label mapping: 0 -> Mengandung Alergen, 1 -> Tidak Mengandung Alergen

=== Test with TRAINING data (should have different results) ===
Sample 0: Actual=Mengandung Alergen, Predicted=Mengandung Alergen, Confidence=71.34%
  Probabilities: [0.7133944 0.2866056]
Sample 1: Actual=Mengandung Alergen, Predicted=Mengandung Alergen, Confidence=71.34%
  Probabilities: [0.7133944 0.2866056]
Sample 10: Actual=Mengandung Alergen, Predicted=Mengandung Alergen, Confidence=60.56%
  Probabilities: [0.60557034 0.39442966]
Sample 50: Actual=Mengandung Alergen, Predicted=Mengandung Alergen, Confidence=60.56%
  Probabilities: [0.60557034 0.39442966]
Sample 100: Actual=Mengandung Alergen, Predicted=Mengandung Alergen, Confidence=60.56%
  Probabilities: [0.60557034 0.39442966]

🤔 Now let's see what happens with NEW data...

Test simple encoded shape: (1, 6)
Training data shape: (399, 623)
Test simple aligned 

# One-hot encoding data baru

In [None]:
df_baru_encoded = pd.get_dummies(df_baru)

# Sinkronisasi kolom (pastikan kolom sama dengan training)

In [None]:
for col in X_encoded.columns:
    if col not in df_baru_encoded.columns:
        df_baru_encoded[col] = 0

df_baru_encoded = df_baru_encoded[X_encoded.columns]

# Prediksi

In [None]:
prediksi = svm_adaboost_model.predict(df_baru_encoded)
probabilitas = svm_adaboost_model.predict_proba(df_baru_encoded)

# Konversi kembali ke target/label

In [None]:
hasil_target = le.inverse_transform(prediksi)
akurasi_prediksi = round(probabilitas[0][prediksi[0]] * 100, 2)

# Tambahkan data ke DataFrame

In [None]:
df_baru['Prediksi Alergen'] = hasil_target
df_baru['Akurasi (%)'] = akurasi_prediksi

# ===== OUTPUT ===== #

In [None]:
print("\n ===== Prediksi Data Baru =====")
print(tabulate(df_baru, headers = 'keys', tablefmt = 'grid', showindex = False))


 ===== Prediksi Data Baru =====
+-----------------------+---------------+-----------+----------------+-----------------+-----------+--------------------+---------------+
| Nama Produk Makanan   | Bahan Utama   | Pemanis   | Lemak/Minyak   | Penyedap Rasa   | Alergen   | Prediksi Alergen   |   Akurasi (%) |
| Kacang Tanah          | Kacang Tanah  | Gula Aren | Minyak Sawit   | MSG             | Telur     | Mengandung Alergen |         60.56 |
+-----------------------+---------------+-----------+----------------+-----------------+-----------+--------------------+---------------+
