In [2]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
warnings.filterwarnings('ignore')

print("Do")


Do


In [3]:
# Load the combined dataset and report its (rows, columns) shape.
df = pd.read_csv('../Dataset/DataGabungan.csv')
print(f"Dataset dimuat: {df.shape}")

# Boolean masks for the two target values: 0 is reported as "Lulus",
# 1 as "Gagal".
is_lulus = df['risiko_gagal'].eq(0)
is_gagal = df['risiko_gagal'].eq(1)

# Class balance: absolute count and share of all rows.
print(f"\nDistribusi target:")
print(f"   Lulus: {is_lulus.sum()} ({is_lulus.mean()*100:.1f}%)")
print(f"   Gagal: {is_gagal.sum()} ({is_gagal.mean()*100:.1f}%)")


Dataset dimuat: (1044, 35)

Distribusi target:
   Lulus: 814 (78.0%)
   Gagal: 230 (22.0%)


In [None]:
## 3.2 Feature Engineering
print("Feature Engineering")


def _is_yes(column: pd.Series) -> pd.Series:
    """Return 1 where ``column`` holds an affirmative flag, else 0.

    Both the Indonesian ('Ya') and the English ('yes') spelling are accepted,
    case-insensitively, because the notebook compares these same columns
    against 'Ya' in one cell and 'yes' in another. Missing values map to 0.
    """
    return column.astype(str).str.strip().str.lower().isin(['ya', 'yes']).astype(int)


# Build every engineered column in a dict first. This keeps the cell
# idempotent (re-running overwrites the same keys) and lets the summary below
# count the features that were actually created instead of relying on a
# hard-coded original column count.
new_features = {}

# Number of support sources the student has (0-3).
new_features['total_dukungan'] = (
    _is_yes(df['dukungan_sekolah']) +
    _is_yes(df['dukungan_keluarga']) +
    _is_yes(df['les_pribadi'])
)

# Mean of the mother's and father's education level.
new_features['avg_parent_edu'] = (df['pendidikan_ibu'] + df['pendidikan_ayah']) / 2

# Grade change between consecutive periods and the mean of the two changes.
# NOTE: perkembangan_G2_G3 and the average below are computed from
# nilai_akhir. If the target is derived from nilai_akhir they leak it and
# must be excluded from the model's feature matrix.
new_features['perkembangan_G1_G2'] = df['nilai_periode2'] - df['nilai_periode1']
new_features['perkembangan_G2_G3'] = df['nilai_akhir'] - df['nilai_periode2']
# The misspelled column name is kept as-is: it is a runtime key that other
# code may already reference.
new_features['rata_rata_Perkemvbangan'] = (
    new_features['perkembangan_G1_G2'] + new_features['perkembangan_G2_G3']
) / 2

# Binary flags: absences above the 75th percentile, study time <= 2,
# at least one past failure.
new_features['ketidakhadiran_tinggi'] = (
    df['ketidakhadiran'] > df['ketidakhadiran'].quantile(0.75)
).astype(int)
new_features['waktu_belajar_rendah'] = (df['waktu_belajar'] <= 2).astype(int)
new_features['ada_kegagalan'] = (df['jumlah_kegagalan'] > 0).astype(int)

# Lifestyle score: alcohol use and going out are inverted with (5 - x) so a
# higher score consistently reflects the more favourable value.
new_features['skor_gaya_hidup'] = (
    (5 - df['konsumsi_alkohol_harian']) + (5 - df['konsumsi_alkohol_akhir_pekan']) +
    df['kesehatan'] + (5 - df['keluar_dengan_teman']) + df['waktu_belajar']
) / 5

# Family stability (0-2): family relationship rated >= 4, parents living together.
new_features['stabilitas_keluarga'] = (
    (df['hubungan_keluarga'] >= 4).astype(int) +
    (df['status_orangtua'] == 'Tinggal bersama').astype(int)
)

# Access to support (0-3): home internet, school support, family support.
new_features['akses_dukungan'] = (
    _is_yes(df['internet_rumah']) +
    _is_yes(df['dukungan_sekolah']) +
    _is_yes(df['dukungan_keluarga'])
)

df = df.assign(**new_features)

print(f"seles")
print(f"   Fitur baru: {len(new_features)}")


Feature Engineering


KeyError: 'G2'

In [None]:

## 3.4 Encoding Kategorikal
print("Encoding variabel kategorikal")

# Engineered columns that are arithmetic functions of nilai_akhir:
# perkembangan_G2_G3 = nilai_akhir - nilai_periode2, and the average that
# includes it. Together with nilai_periode1 / nilai_periode2, which stay in
# X, they allow nilai_akhir to be reconstructed exactly, so dropping
# nilai_akhir alone does not remove it from the feature set.
# NOTE(review): presumably risiko_gagal is derived from nilai_akhir, which is
# why nilai_akhir is excluded -- confirm against the labelling step.
derived_from_final_grade = ['perkembangan_G2_G3', 'rata_rata_Perkemvbangan']

X = (
    df.drop(columns=['risiko_gagal', 'nilai_akhir'])
      .drop(columns=derived_from_final_grade, errors='ignore')
)
y = df['risiko_gagal']

categorical_cols = X.select_dtypes(include='object').columns.tolist()
print(f"   Kolom kategorikal: {len(categorical_cols)}")

# --- Yes/no flags -> 0/1 -------------------------------------------------
binary_cols = [
    'dukungan_sekolah', 'dukungan_keluarga', 'les_pribadi',
    'kegiatan_ekstra', 'tk', 'pendidikan_tinggi',
    'internet_rumah', 'pacaran'
]

# The feature-engineering cell compares these columns against 'Ya', so a
# strict == 'yes' test here would encode every row as 0 if the data is in
# Indonesian. Accept both spellings, case-insensitively.
affirmative_values = ['yes', 'ya']

for col in binary_cols:
    if col in X.columns:
        normalized = X[col].astype(str).str.strip().str.lower()
        X[col + '_binary'] = normalized.isin(affirmative_values).astype(int)

# --- Low-cardinality nominal columns -> integer labels -------------------
label_encoders = {}
# Both 'status_ortu' and 'status_orangtua' are listed because the notebook
# refers to the parents'-status column under both names; whichever exists in
# X is encoded, the other is skipped by the membership check below.
label_encode_cols = [
    'sekolah', 'jenis_kelamin', 'tipe_alamat',
    'ukuran_keluarga', 'status_ortu', 'status_orangtua', 'wali', 'subject'
]

for col in label_encode_cols:
    if col in X.columns:
        le = LabelEncoder()
        X[col + '_encoded'] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le

# --- Higher-cardinality nominal columns -> target encoding ---------------
# NOTE: the encoders are fitted on the full dataset, i.e. before the
# train/test split, so the encoded values carry label information from rows
# that later end up in the test set. Fitting on the training rows only would
# require splitting before this step.
target_encoders = {}
target_encode_cols = ['pekerjaan_ibu', 'pekerjaan_ayah', 'alasan_sekolah']

for col in target_encode_cols:
    if col in X.columns:
        te = TargetEncoder(cols=[col])
        X[col + '_encoded'] = te.fit_transform(X[col], y)
        target_encoders[col] = te

# Every raw categorical column is dropped below. Report the ones that no
# encoder handled, so a misspelled column name cannot silently remove a
# feature from the model.
handled_cols = set(binary_cols) | set(label_encoders) | set(target_encoders)
unencoded_cols = [c for c in categorical_cols if c not in handled_cols]
if unencoded_cols:
    print(f"   PERINGATAN: kolom kategorikal dibuang tanpa encoding: {unencoded_cols}")

cols_to_drop = categorical_cols + binary_cols
X = X.drop(columns=[c for c in cols_to_drop if c in X.columns])

print(f"Encoding selesai: {len(X.columns)} fitur numerik")


In [None]:
## 3.5 Seleksi Fitur

print("Menghapus fitur berkorelasi tinggi")

# Absolute pairwise correlations between all feature columns.
corr_matrix = X.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair is inspected once; every other cell becomes NaN.
strict_upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_triangle = corr_matrix.where(strict_upper_mask)

# A column is dropped when it correlates above 0.95 with any column that
# precedes it. NaN cells compare as False and are therefore ignored.
exceeds_threshold = (upper_triangle > 0.95).any(axis=0)
to_drop = upper_triangle.columns[exceeds_threshold].tolist()

print(f"   Fitur dengan korelasi >0.95: {len(to_drop)}")
X = X.drop(columns=to_drop)
print(f"Fitur akhir: {len(X.columns)}")


In [None]:
## 3.7 Train-Test Split

# 70/30 split, stratified on the target so both partitions keep the same
# class ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

n_total = len(X)
n_train = len(X_train)
n_test = len(X_test)

# Masks for the two target values inside the training partition.
train_lulus = y_train == 0
train_gagal = y_train == 1

print("TTS:")
print(f"   Training: {n_train} ({n_train/n_total*100:.1f}%)")
print(f"   Test: {n_test} ({n_test/n_total*100:.1f}%)")
print(f"\n   Train - Lulus: {train_lulus.sum()} ({train_lulus.mean()*100:.1f}%)")
print(f"   Train - Gagal: {train_gagal.sum()} ({train_gagal.mean()*100:.1f}%)")


In [None]:
# ## 3.8 Feature Scaling

# Fit the scaler on the training partition only, then apply the same
# transformation to the test partition. The scaled arrays are wrapped back
# into DataFrames that keep the original column names and row index.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Sanity check: average of the per-column means and standard deviations.
print("StandardScaler")
print(f"   Mean: {X_train_scaled.mean().mean():.2e}")
print(f"   Std: {X_train_scaled.std().mean():.2f}")

# Side-by-side boxplots of three columns before and after scaling.
boxplot_cols = ['nilai_periode1', 'nilai_periode2', 'ketidakhadiran']
panels = [
    (X_train, 'Sebelum Scaling', 'Nilai'),
    (X_train_scaled, 'Setelah Scaling', 'Nilai Standar'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (frame, title, ylabel) in zip(axes, panels):
    ax.boxplot(frame[boxplot_cols].values)
    ax.set_xticklabels(boxplot_cols)
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()


In [None]:
## 3.9 Simpan Dataset

# Output path -> object to persist. The feature matrices are the scaled
# versions; the targets are written unchanged. All files are written without
# the row index and with a header row.
csv_outputs = {
    '../Dataset/X_train.csv': X_train_scaled,
    '../Dataset/X_test.csv': X_test_scaled,
    '../Dataset/y_train.csv': y_train,
    '../Dataset/y_test.csv': y_test,
}

for csv_path, data in csv_outputs.items():
    data.to_csv(csv_path, index=False, header=True)

print("Dataset tersimpan:")


In [None]:
## 3.10 Simpan Artifacts

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise the first dump fails on a fresh checkout.
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# File name -> fitted preprocessing object. feature_cols.pkl stores the
# column order of the training matrix so later inputs can be aligned to it.
artifacts = {
    'scaler.pkl': scaler,
    'label_encoders.pkl': label_encoders,
    'target_encoders.pkl': target_encoders,
    'feature_cols.pkl': X_train.columns.tolist(),
}

for filename, artifact in artifacts.items():
    joblib.dump(artifact, f"{models_dir}/{filename}")
