In [8]:
# ==============================
# 1. Import Library
# ==============================
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Optional: miceforest (hanya jika ingin imputasi model)
try:
    import miceforest as mf
    MICE_AVAILABLE = True
except ImportError:
    MICE_AVAILABLE = False
    print("Library miceforest belum diinstal. Lewati tahap imputasi model ini.")


# ==============================
# 2. Load Dataset
# ==============================
features = pd.read_csv("features.csv")
labels = pd.read_csv("labels.csv")

print("Data features.csv dan labels.csv berhasil dimuat.\n")

print("=== Features (5 baris pertama) ===")
display(features.head())
print("\n=== Labels (5 baris pertama) ===")
display(labels.head())


# ==============================
# 3. Penjelasan Kolom
# ==============================
print("\nDeskripsi Kolom Features:")
print(features.info())

print("\nDeskripsi Kolom Labels:")
print(labels.info())


# ==============================
# 4. Agregasi antara features dan labels
# ==============================
# Gabungkan berdasarkan kolom yang sama (jika ada)
common_cols = set(features.columns).intersection(labels.columns)
if common_cols:
    key = list(common_cols)[0]
    data = pd.merge(features, labels, on=key, how="inner")
    print(f"Data berhasil digabung berdasarkan kolom '{key}'.")
else:
    data = pd.concat([features.reset_index(drop=True),
                      labels.reset_index(drop=True)], axis=1)
    print("Tidak ada kolom kunci yang sama. Data digabung berdasarkan index.")

print("\nData Gabungan (5 baris pertama):")
display(data.head())


# ==============================
# 5. Tipe Data Setiap Kolom
# ==============================
print("\nTipe data setiap kolom:")
print(data.dtypes)


# ==============================
# 6. Statistik Deskriptif
# ==============================
print("\nStatistik deskriptif untuk kolom numerik:")
display(data.describe(include=[np.number]).T)

print("\nStatistik deskriptif untuk kolom kategorikal:")
display(data.describe(include=[object]).T)

print("\nAnalisis singkat:")
print("- Statistik deskriptif menunjukkan nilai mean, std, dan quartile untuk memahami distribusi data.")
print("- Perhatikan nilai minimum, maksimum, serta kemungkinan adanya data kosong atau outlier.")


# ==============================
# 7. Slicing Data
# ==============================
print("\n20 baris pertama:")
display(data.head(20))

print("\n20 baris terakhir:")
display(data.tail(20))

if len(data) > 100:
    print("\nBaris index 80 - 100:")
    display(data.iloc[80:101])
else:
    print("\nDataset memiliki kurang dari 100 baris, tidak dapat menampilkan index 80–100.")


# ==============================
# 8. Tangani Nilai Hilang (NaN)
# ==============================
print("\nJumlah nilai hilang sebelum imputasi:")
print(data.isnull().sum())

# Pisahkan kolom numerik dan kategorikal
num_cols = data.select_dtypes(include=[np.number]).columns
obj_cols = data.select_dtypes(include=['object']).columns

# Imputasi numerik (gunakan median / mean / mode)
imputer_num = SimpleImputer(strategy='median')
data[num_cols] = imputer_num.fit_transform(data[num_cols])

# Imputasi kategorikal (gunakan modus)
for col in obj_cols:
    mode_value = data[col].mode()[0]
    data[col] = data[col].fillna(mode_value)

print("\nJumlah nilai hilang setelah imputasi:")
print(data.isnull().sum())


# ==============================
# 9. Encoding (opsional)
# ==============================
use_encoding = "label"  # ubah ke 'onehot' jika ingin OneHotEncoder

if use_encoding == "label":
    print("\nMenggunakan LabelEncoder untuk kolom kategorikal.")
    le = LabelEncoder()
    for col in obj_cols:
        data[col] = le.fit_transform(data[col].astype(str))
elif use_encoding == "onehot":
    print("\nMenggunakan OneHotEncoder untuk kolom kategorikal.")
    data = pd.get_dummies(data, columns=obj_cols)

print("\nContoh hasil setelah encoding:")
display(data.head())


# ==============================
# 10. Hasil Akhir Dataset
# ==============================
print("\nDataset akhir setelah preprocessing:")
display(data.head())

# Simpan hasil ke file CSV
data.to_csv("preprocessed_output.csv", index=False)
print("\nFile hasil preprocessing disimpan sebagai preprocessed_output.csv")


Library miceforest belum diinstal. Lewati tahap imputasi model ini.
Data features.csv dan labels.csv berhasil dimuat.

=== Features (5 baris pertama) ===


Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,0,6634,single,1.0,3.0,9773.0,day,1.0,,Portuguese,...,0.0,0.0,6.0,6.0,6.0,15.333333,0.0,11.1,0.6,2.02
1,1,6459,single,1.0,1.0,,day,1.0,136.0,Portuguese,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.4,0.5,1.79
2,2,2238,single,42.0,1.0,9500.0,day,1.0,120.0,Portuguese,...,0.0,4.0,8.0,8.0,7.0,13.285714,0.0,12.7,3.7,-1.7
3,3,2479,single,1.0,1.0,,day,1.0,141.0,Portuguese,...,0.0,0.0,6.0,7.0,6.0,13.142857,0.0,11.1,0.6,2.02
4,4,4260,single,44.0,1.0,,day,39.0,120.0,Spanish,...,0.0,1.0,6.0,,3.0,12.0,0.0,12.4,0.5,1.79



=== Labels (5 baris pertama) ===


Unnamed: 0,Target
0,Graduate
1,Graduate
2,Graduate
3,Graduate
4,Dropout



Deskripsi Kolom Features:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3096 entries, 0 to 3095
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Unnamed: 0                                      3096 non-null   int64  
 1   Student Id                                      3096 non-null   int64  
 2   Marital status                                  3012 non-null   object 
 3   Application mode                                2924 non-null   float64
 4   Application order                               2988 non-null   float64
 5   Course                                          2944 non-null   float64
 6   Daytime/evening attendance	                     3016 non-null   object 
 7   Previous qualification                          2973 non-null   float64
 8   Previous qualification (grade)                  3011 non-null   float64
 9   Nacionality   

Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,6634,single,1.0,3.0,9773.0,day,1.0,,Portuguese,...,0.0,6.0,6.0,6.0,15.333333,0.0,11.1,0.6,2.02,Graduate
1,1,6459,single,1.0,1.0,,day,1.0,136.0,Portuguese,...,0.0,0.0,0.0,0.0,0.0,0.0,12.4,0.5,1.79,Graduate
2,2,2238,single,42.0,1.0,9500.0,day,1.0,120.0,Portuguese,...,4.0,8.0,8.0,7.0,13.285714,0.0,12.7,3.7,-1.7,Graduate
3,3,2479,single,1.0,1.0,,day,1.0,141.0,Portuguese,...,0.0,6.0,7.0,6.0,13.142857,0.0,11.1,0.6,2.02,Graduate
4,4,4260,single,44.0,1.0,,day,39.0,120.0,Spanish,...,1.0,6.0,,3.0,12.0,0.0,12.4,0.5,1.79,Dropout



Tipe data setiap kolom:
Unnamed: 0                                          int64
Student Id                                          int64
Marital status                                     object
Application mode                                  float64
Application order                                 float64
Course                                            float64
Daytime/evening attendance\t                       object
Previous qualification                            float64
Previous qualification (grade)                    float64
Nacionality                                        object
Mother's qualification                            float64
Father's qualification                            float64
Mother's occupation                               float64
Father's occupation                               float64
Admission grade                                   float64
Displaced                                         float64
Educational special needs                      

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,3096.0,1547.5,893.882543,0.0,773.75,1547.5,2321.25,3095.0
Student Id,3096.0,5470.623062,2582.483019,1000.0,3288.5,5441.5,7712.25,9997.0
Application mode,2924.0,18.785568,17.532674,1.0,1.0,17.0,39.0,53.0
Application order,2988.0,1.73494,1.327527,1.0,1.0,1.0,2.0,9.0
Course,2944.0,8817.929348,2148.407432,33.0,9085.0,9238.0,9556.0,9991.0
Previous qualification,2973.0,4.685503,10.439709,1.0,1.0,1.0,1.0,43.0
Previous qualification (grade),3011.0,132.647293,13.322662,95.0,124.0,133.1,140.0,190.0
Mother's qualification,2951.0,19.47001,15.636952,1.0,2.0,19.0,37.0,44.0
Father's qualification,3000.0,22.408667,15.291008,1.0,3.0,19.0,37.0,44.0
Mother's occupation,3034.0,11.495715,28.173386,0.0,4.0,5.0,9.0,194.0



Statistik deskriptif untuk kolom kategorikal:


Unnamed: 0,count,unique,top,freq
Marital status,3012,6,single,2674
Daytime/evening attendance\t,3016,2,day,2701
Nacionality,2911,19,Portuguese,2836
Target,3096,3,Graduate,1546



Analisis singkat:
- Statistik deskriptif menunjukkan nilai mean, std, dan quartile untuk memahami distribusi data.
- Perhatikan nilai minimum, maksimum, serta kemungkinan adanya data kosong atau outlier.

20 baris pertama:


Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,6634,single,1.0,3.0,9773.0,day,1.0,,Portuguese,...,0.0,6.0,6.0,6.0,15.333333,0.0,11.1,0.6,2.02,Graduate
1,1,6459,single,1.0,1.0,,day,1.0,136.0,Portuguese,...,0.0,0.0,0.0,0.0,0.0,0.0,12.4,0.5,1.79,Graduate
2,2,2238,single,42.0,1.0,9500.0,day,1.0,120.0,Portuguese,...,4.0,8.0,8.0,7.0,13.285714,0.0,12.7,3.7,-1.7,Graduate
3,3,2479,single,1.0,1.0,,day,1.0,141.0,Portuguese,...,0.0,6.0,7.0,6.0,13.142857,0.0,11.1,0.6,2.02,Graduate
4,4,4260,single,44.0,1.0,,day,39.0,120.0,Spanish,...,1.0,6.0,,3.0,12.0,0.0,12.4,0.5,1.79,Dropout
5,5,3544,married,39.0,1.0,9853.0,day,1.0,,Portuguese,...,0.0,7.0,10.0,3.0,13.0,0.0,11.1,0.6,2.02,Dropout
6,6,4956,single,39.0,1.0,9085.0,day,1.0,133.1,,...,0.0,6.0,14.0,4.0,12.0,0.0,11.1,0.6,2.02,Dropout
7,7,6758,single,1.0,4.0,9670.0,day,1.0,130.0,Portuguese,...,0.0,6.0,7.0,6.0,13.428571,0.0,8.9,1.4,3.51,Enrolled
8,8,2484,single,1.0,1.0,9773.0,day,1.0,135.0,Portuguese,...,0.0,6.0,7.0,6.0,12.166667,0.0,11.1,0.6,2.02,Graduate
9,9,9845,single,39.0,1.0,9670.0,day,1.0,120.0,Portuguese,...,0.0,5.0,8.0,4.0,11.25,0.0,7.6,2.6,0.32,Dropout



20 baris terakhir:


Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
3076,3076,6757,single,17.0,,9254.0,day,1.0,122.0,Portuguese,...,0.0,6.0,11.0,3.0,11.333333,0.0,7.6,2.6,0.32,Enrolled
3077,3077,5750,single,1.0,2.0,9500.0,day,1.0,140.0,Portuguese,...,0.0,8.0,8.0,8.0,13.88125,0.0,9.4,-0.8,-3.12,Graduate
3078,3078,2599,single,1.0,1.0,9085.0,day,1.0,131.0,Portuguese,...,0.0,5.0,5.0,,12.8,0.0,10.8,1.4,1.74,Graduate
3079,3079,3249,single,17.0,1.0,9238.0,day,1.0,120.0,Portuguese,...,0.0,6.0,13.0,2.0,10.666667,0.0,16.2,0.3,-0.92,Dropout
3080,3080,3366,divorced,39.0,1.0,9003.0,day,12.0,133.1,Portuguese,...,10.0,13.0,14.0,13.0,14.230769,,15.5,2.8,-4.06,Graduate
3081,3081,3462,single,39.0,1.0,171.0,day,1.0,147.0,Portuguese,...,0.0,0.0,0.0,0.0,0.0,0.0,16.2,0.3,-0.92,Dropout
3082,3082,9194,single,17.0,5.0,9147.0,day,1.0,131.0,Portuguese,...,0.0,5.0,11.0,5.0,11.4,4.0,16.2,,-0.92,Graduate
3083,3083,5574,single,1.0,1.0,9500.0,day,1.0,140.0,Portuguese,...,0.0,8.0,11.0,8.0,14.881818,0.0,16.2,0.3,-0.92,Graduate
3084,3084,8311,single,1.0,2.0,9773.0,day,1.0,130.0,Portuguese,...,0.0,6.0,6.0,6.0,12.333333,0.0,12.4,0.5,1.79,Graduate
3085,3085,2304,single,17.0,1.0,9238.0,day,,139.0,Portuguese,...,0.0,6.0,7.0,6.0,14.0,0.0,15.5,2.8,-4.06,Graduate



Baris index 80 - 100:


Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
80,80,1238,single,17.0,1.0,9670.0,day,1.0,155.0,Portuguese,...,,6.0,10.0,4.0,13.0,0.0,12.7,3.7,-1.7,Enrolled
81,81,4121,single,39.0,1.0,9003.0,day,1.0,,Portuguese,...,0.0,,8.0,0.0,0.0,0.0,15.5,2.8,-4.06,Dropout
82,82,2890,single,1.0,1.0,171.0,day,1.0,159.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,12.4,,1.79,Dropout
83,83,4358,single,1.0,1.0,9147.0,day,1.0,150.0,Ukrainian,...,0.0,5.0,7.0,5.0,13.0,0.0,9.4,-0.8,-3.12,Graduate
84,84,4896,single,,1.0,9991.0,evening,1.0,140.0,Portuguese,...,0.0,5.0,5.0,,0.0,0.0,15.5,2.8,-4.06,Dropout
85,85,4735,single,18.0,1.0,9500.0,day,1.0,144.0,Portuguese,...,,8.0,8.0,7.0,13.542857,0.0,12.7,3.7,-1.7,Graduate
86,86,1047,single,1.0,1.0,9773.0,day,1.0,117.0,Portuguese,...,0.0,6.0,11.0,3.0,13.0,0.0,15.5,2.8,-4.06,Graduate
87,87,7430,single,39.0,2.0,9556.0,day,19.0,133.1,Portuguese,...,0.0,8.0,9.0,8.0,11.9625,0.0,13.9,-0.3,0.79,Graduate
88,88,8527,single,1.0,2.0,9500.0,day,1.0,145.0,Portuguese,...,0.0,7.0,7.0,6.0,12.95,0.0,7.6,2.6,0.32,Graduate
89,89,5420,single,43.0,1.0,,day,1.0,121.0,Portuguese,...,0.0,6.0,8.0,4.0,11.25,0.0,9.4,,-3.12,Enrolled



Jumlah nilai hilang sebelum imputasi:
Unnamed: 0                                          0
Student Id                                          0
Marital status                                     84
Application mode                                  172
Application order                                 108
Course                                            152
Daytime/evening attendance\t                       80
Previous qualification                            123
Previous qualification (grade)                     85
Nacionality                                       185
Mother's qualification                            145
Father's qualification                             96
Mother's occupation                                62
Father's occupation                               144
Admission grade                                   141
Displaced                                         122
Educational special needs                         119
Debtor                                     

Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0.0,6634.0,4,1.0,3.0,9773.0,0,1.0,133.1,13,...,0.0,6.0,6.0,6.0,15.333333,0.0,11.1,0.6,2.02,2
1,1.0,6459.0,4,1.0,1.0,9238.0,0,1.0,136.0,13,...,0.0,0.0,0.0,0.0,0.0,0.0,12.4,0.5,1.79,2
2,2.0,2238.0,4,42.0,1.0,9500.0,0,1.0,120.0,13,...,4.0,8.0,8.0,7.0,13.285714,0.0,12.7,3.7,-1.7,2
3,3.0,2479.0,4,1.0,1.0,9238.0,0,1.0,141.0,13,...,0.0,6.0,7.0,6.0,13.142857,0.0,11.1,0.6,2.02,2
4,4.0,4260.0,4,44.0,1.0,9238.0,0,39.0,120.0,17,...,1.0,6.0,8.0,3.0,12.0,0.0,12.4,0.5,1.79,0



Dataset akhir setelah preprocessing:


Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0.0,6634.0,4,1.0,3.0,9773.0,0,1.0,133.1,13,...,0.0,6.0,6.0,6.0,15.333333,0.0,11.1,0.6,2.02,2
1,1.0,6459.0,4,1.0,1.0,9238.0,0,1.0,136.0,13,...,0.0,0.0,0.0,0.0,0.0,0.0,12.4,0.5,1.79,2
2,2.0,2238.0,4,42.0,1.0,9500.0,0,1.0,120.0,13,...,4.0,8.0,8.0,7.0,13.285714,0.0,12.7,3.7,-1.7,2
3,3.0,2479.0,4,1.0,1.0,9238.0,0,1.0,141.0,13,...,0.0,6.0,7.0,6.0,13.142857,0.0,11.1,0.6,2.02,2
4,4.0,4260.0,4,44.0,1.0,9238.0,0,39.0,120.0,17,...,1.0,6.0,8.0,3.0,12.0,0.0,12.4,0.5,1.79,0



File hasil preprocessing disimpan sebagai preprocessed_output.csv
