# LIBRARY

In [41]:
!pip install miceforest



In [42]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import re
import miceforest as mf
import warnings
warnings.filterwarnings("ignore")

# LOAD DATASET

In [43]:
features = pd.read_csv("/content/features.csv")
labels = pd.read_csv("/content/labels.csv")

# 1️. Tampilkan data yang telah diunduh, kemudian jelaskan semua kolom yang terdapat pada dataset tersebut.


In [44]:
features.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
Student Id,6634,6459,2238,2479,4260
Marital status,single,single,single,single,single
Application mode,1.0,1.0,42.0,1.0,44.0
Application order,3.0,1.0,1.0,1.0,1.0
Course,9773.0,,9500.0,,
Daytime/evening attendance\t,day,day,day,day,day
Previous qualification,1.0,1.0,1.0,1.0,39.0
Previous qualification (grade),,136.0,120.0,141.0,120.0
Nacionality,Portuguese,Portuguese,Portuguese,Portuguese,Spanish


In [45]:
labels.head()

Unnamed: 0,Target
0,Graduate
1,Graduate
2,Graduate
3,Graduate
4,Dropout


PENJELASAN KOLOM DATASET:

1. Student Id : ID unik mahasiswa.
2. Marital status : Status pernikahan mahasiswa (single, married, dll).
3. Application mode : Cara atau jalur pendaftaran mahasiswa.
4. Application order : Urutan pilihan program studi yang diajukan saat mendaftar.
5. Course : Kode program studi (misalnya 9773 = Informatika, 9500 = Manajemen, dll).
6. Daytime/evening attendance : Jenis kelas (day = reguler pagi, evening = kelas malam).
7. Previous qualification : Jenis kualifikasi pendidikan sebelumnya (misal SMA, diploma).
8. Previous qualification (grade) : Nilai akhir dari pendidikan sebelumnya.
9. Nacionality : Kewarganegaraan mahasiswa.
10. Mother's qualification : Pendidikan terakhir ibu.
11. Father's qualification : Pendidikan terakhir ayah.
12. Mother's occupation : Pekerjaan ibu.
13. Father's occupation : Pekerjaan ayah.
14. Admission grade : Nilai ujian masuk universitas.
15. Displaced : Apakah mahasiswa tinggal jauh dari keluarga (1 = ya, 0 = tidak).
16. Educational special needs : Apakah mahasiswa memiliki kebutuhan khusus (1 = ya, 0 = tidak).
17. Debtor : Apakah mahasiswa memiliki tunggakan pembayaran (1 = ya, 0 = tidak).
18. Tuition fees up to date : Apakah pembayaran biaya kuliah sudah lunas.
19. Gender : Jenis kelamin mahasiswa (male/female).
20. Scholarship holder : Apakah mahasiswa penerima beasiswa.
21. Age at enrollment : Umur mahasiswa saat mendaftar kuliah.
22. International : Status mahasiswa internasional (1 = ya, 0 = tidak).
23. Curricular units 1st sem (credited) : Jumlah mata kuliah semester 1 yang diakui dari tempat lain.
24. Curricular units 1st sem (enrolled) : Jumlah mata kuliah semester 1 yang diambil.
25. Curricular units 1st sem (evaluations) : Jumlah evaluasi yang diikuti pada semester 1.
26. Curricular units 1st sem (approved) : Jumlah mata kuliah semester 1 yang lulus.
27. Curricular units 1st sem (grade) : Rata-rata nilai mata kuliah semester 1.
28. Curricular units 1st sem (without evaluations) : Jumlah mata kuliah semester 1 tanpa evaluasi.
29. Curricular units 2nd sem (credited) : Jumlah mata kuliah semester 2 yang diakui.
30. Curricular units 2nd sem (enrolled) : Jumlah mata kuliah semester 2 yang diambil.
31. Curricular units 2nd sem (evaluations) : Jumlah evaluasi yang diikuti semester 2.
32. Curricular units 2nd sem (approved) : Jumlah mata kuliah semester 2 yang lulus.
33. Curricular units 2nd sem (grade) : Rata-rata nilai semester 2.
34. Curricular units 2nd sem (without evaluations) : Jumlah mata kuliah semester 2 tanpa evaluasi.
35. Unemployment rate : Tingkat pengangguran nasional pada tahun tersebut.
36. Inflation rate : Tingkat inflasi nasional.
37. GDP : Pertumbuhan Produk Domestik Bruto (PDB) nasional.
38. Target : Label hasil (Graduate = lulus, Dropout = berhenti kuliah).

# 2. agregasi antara tabel features dengan tabel labels

In [46]:
df = pd.concat([features, labels], axis=1)
df

Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,6634,single,1.0,3.0,9773.0,day,1.0,,Portuguese,...,0.0,6.0,6.0,6.0,15.333333,0.0,11.1,0.6,2.02,Graduate
1,1,6459,single,1.0,1.0,,day,1.0,136.0,Portuguese,...,0.0,0.0,0.0,0.0,0.000000,0.0,12.4,0.5,1.79,Graduate
2,2,2238,single,42.0,1.0,9500.0,day,1.0,120.0,Portuguese,...,4.0,8.0,8.0,7.0,13.285714,0.0,12.7,3.7,-1.70,Graduate
3,3,2479,single,1.0,1.0,,day,1.0,141.0,Portuguese,...,0.0,6.0,7.0,6.0,13.142857,0.0,11.1,0.6,2.02,Graduate
4,4,4260,single,44.0,1.0,,day,39.0,120.0,Spanish,...,1.0,6.0,,3.0,12.000000,0.0,12.4,0.5,1.79,Dropout
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3091,3091,6176,single,44.0,1.0,9085.0,day,1.0,140.0,Portuguese,...,1.0,6.0,9.0,4.0,11.500000,0.0,13.9,-0.3,0.79,Graduate
3092,3092,8064,single,1.0,5.0,9238.0,day,1.0,126.0,Portuguese,...,0.0,6.0,13.0,3.0,11.000000,0.0,9.4,-0.8,-3.12,Graduate
3093,3093,2103,single,1.0,2.0,9500.0,day,1.0,141.0,Portuguese,...,0.0,8.0,8.0,8.0,14.272500,0.0,15.5,2.8,-4.06,Graduate
3094,3094,8629,single,39.0,1.0,9500.0,day,4.0,150.0,Portuguese,...,2.0,8.0,,7.0,13.306250,0.0,11.1,0.6,2.02,Dropout


# 3️. Tampilkan tipe data dari setiap kolom yang terdapat dalam dataset

In [47]:
df.dtypes #menampilkan tipe data

Unnamed: 0,0
Unnamed: 0,int64
Student Id,int64
Marital status,object
Application mode,float64
Application order,float64
Course,float64
Daytime/evening attendance\t,object
Previous qualification,float64
Previous qualification (grade),float64
Nacionality,object


In [48]:
df.info() #menampilkan informasi data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3096 entries, 0 to 3095
Data columns (total 39 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Unnamed: 0                                      3096 non-null   int64  
 1   Student Id                                      3096 non-null   int64  
 2   Marital status                                  3012 non-null   object 
 3   Application mode                                2924 non-null   float64
 4   Application order                               2988 non-null   float64
 5   Course                                          2944 non-null   float64
 6   Daytime/evening attendance	                     3016 non-null   object 
 7   Previous qualification                          2973 non-null   float64
 8   Previous qualification (grade)                  3011 non-null   float64
 9   Nacionality                              

# 4. Tampilkan statistik deskriptif dari dataset dan berikan analisis singkat mengenai hasilnya

In [49]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,3096.0,1547.5,893.882543,0.0,773.75,1547.5,2321.25,3095.0
Student Id,3096.0,5470.623062,2582.483019,1000.0,3288.5,5441.5,7712.25,9997.0
Application mode,2924.0,18.785568,17.532674,1.0,1.0,17.0,39.0,53.0
Application order,2988.0,1.73494,1.327527,1.0,1.0,1.0,2.0,9.0
Course,2944.0,8817.929348,2148.407432,33.0,9085.0,9238.0,9556.0,9991.0
Previous qualification,2973.0,4.685503,10.439709,1.0,1.0,1.0,1.0,43.0
Previous qualification (grade),3011.0,132.647293,13.322662,95.0,124.0,133.1,140.0,190.0
Mother's qualification,2951.0,19.47001,15.636952,1.0,2.0,19.0,37.0,44.0
Father's qualification,3000.0,22.408667,15.291008,1.0,3.0,19.0,37.0,44.0
Mother's occupation,3034.0,11.495715,28.173386,0.0,4.0,5.0,9.0,194.0


Analisis Singkat

  Profil Mahasiswa

Rata-rata usia mahasiswa saat pertama kali mendaftar adalah 23,3 tahun, dengan rentang dari 17 hingga 70 tahun. Ini berarti sebagian besar adalah mahasiswa usia tipikal, tapi tetap ada sejumlah kecil mahasiswa non-tradisional (dewasa atau lanjut usia) yang ikut menempuh pendidikan tinggi.

  Kinerja Akademik

Nilai masuk (Admission grade) rata-rata 126,9 dari skala 190, menandakan kualitas akademik calon mahasiswa tergolong menengah ke atas, meskipun variansinya cukup besar.

Mahasiswa rata-rata mengambil 6 mata kuliah per semester, tetapi hanya 4–5 yang berhasil lulus, menunjukkan bahwa tingkat kelulusan per semester sekitar 70–80%.

Nilai rata-rata per semester (grade) berada di kisaran 10–10,7, yang bisa dikategorikan sebagai kinerja akademik moderat, bukan buruk tapi jauh dari kategori unggul.

Ada sebagian kecil mahasiswa yang tidak menyelesaikan evaluasi semester (nilai = 0), kemungkinan besar berkaitan dengan absensi tinggi atau dropout di tengah semester.

  Kondisi Sosial dan Finansial

Sekitar 24,6% mahasiswa adalah penerima beasiswa, angka yang cukup signifikan dan menunjukkan adanya dukungan finansial yang kuat dari lembaga.

Hanya sekitar 11,7% mahasiswa yang memiliki tunggakan pembayaran (Debtor = 1), namun kelompok ini bisa jadi faktor utama dalam prediksi dropout karena tekanan finansial cenderung berdampak langsung pada kelanjutan studi.

Sebaliknya, 88% mahasiswa membayar biaya kuliah tepat waktu, yang merupakan indikator kedisiplinan dan stabilitas ekonomi yang baik.

  Kualitas Data

Sebagian besar kolom tidak memiliki data penuh — hampir setiap variabel memiliki nilai kosong (missing values) dengan tingkat kehilangan antara 3%–10%.
Artinya, sebelum data ini bisa digunakan untuk pemodelan prediktif, imputasi atau pembersihan data wajib dilakukan.
Khusus kolom akademik dan sosio-ekonomi (seperti “Parent’s qualification” dan “Admission grade”) perlu perhatian ekstra karena datanya sangat berpengaruh pada akurasi model.

# 5️. Slicing Data

In [50]:
df.head(20) #20 baris awal dari dataset

Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,6634,single,1.0,3.0,9773.0,day,1.0,,Portuguese,...,0.0,6.0,6.0,6.0,15.333333,0.0,11.1,0.6,2.02,Graduate
1,1,6459,single,1.0,1.0,,day,1.0,136.0,Portuguese,...,0.0,0.0,0.0,0.0,0.0,0.0,12.4,0.5,1.79,Graduate
2,2,2238,single,42.0,1.0,9500.0,day,1.0,120.0,Portuguese,...,4.0,8.0,8.0,7.0,13.285714,0.0,12.7,3.7,-1.7,Graduate
3,3,2479,single,1.0,1.0,,day,1.0,141.0,Portuguese,...,0.0,6.0,7.0,6.0,13.142857,0.0,11.1,0.6,2.02,Graduate
4,4,4260,single,44.0,1.0,,day,39.0,120.0,Spanish,...,1.0,6.0,,3.0,12.0,0.0,12.4,0.5,1.79,Dropout
5,5,3544,married,39.0,1.0,9853.0,day,1.0,,Portuguese,...,0.0,7.0,10.0,3.0,13.0,0.0,11.1,0.6,2.02,Dropout
6,6,4956,single,39.0,1.0,9085.0,day,1.0,133.1,,...,0.0,6.0,14.0,4.0,12.0,0.0,11.1,0.6,2.02,Dropout
7,7,6758,single,1.0,4.0,9670.0,day,1.0,130.0,Portuguese,...,0.0,6.0,7.0,6.0,13.428571,0.0,8.9,1.4,3.51,Enrolled
8,8,2484,single,1.0,1.0,9773.0,day,1.0,135.0,Portuguese,...,0.0,6.0,7.0,6.0,12.166667,0.0,11.1,0.6,2.02,Graduate
9,9,9845,single,39.0,1.0,9670.0,day,1.0,120.0,Portuguese,...,0.0,5.0,8.0,4.0,11.25,0.0,7.6,2.6,0.32,Dropout


In [51]:
df.tail(20) # 20 baris terakhir dataset

Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
3076,3076,6757,single,17.0,,9254.0,day,1.0,122.0,Portuguese,...,0.0,6.0,11.0,3.0,11.333333,0.0,7.6,2.6,0.32,Enrolled
3077,3077,5750,single,1.0,2.0,9500.0,day,1.0,140.0,Portuguese,...,0.0,8.0,8.0,8.0,13.88125,0.0,9.4,-0.8,-3.12,Graduate
3078,3078,2599,single,1.0,1.0,9085.0,day,1.0,131.0,Portuguese,...,0.0,5.0,5.0,,12.8,0.0,10.8,1.4,1.74,Graduate
3079,3079,3249,single,17.0,1.0,9238.0,day,1.0,120.0,Portuguese,...,0.0,6.0,13.0,2.0,10.666667,0.0,16.2,0.3,-0.92,Dropout
3080,3080,3366,divorced,39.0,1.0,9003.0,day,12.0,133.1,Portuguese,...,10.0,13.0,14.0,13.0,14.230769,,15.5,2.8,-4.06,Graduate
3081,3081,3462,single,39.0,1.0,171.0,day,1.0,147.0,Portuguese,...,0.0,0.0,0.0,0.0,0.0,0.0,16.2,0.3,-0.92,Dropout
3082,3082,9194,single,17.0,5.0,9147.0,day,1.0,131.0,Portuguese,...,0.0,5.0,11.0,5.0,11.4,4.0,16.2,,-0.92,Graduate
3083,3083,5574,single,1.0,1.0,9500.0,day,1.0,140.0,Portuguese,...,0.0,8.0,11.0,8.0,14.881818,0.0,16.2,0.3,-0.92,Graduate
3084,3084,8311,single,1.0,2.0,9773.0,day,1.0,130.0,Portuguese,...,0.0,6.0,6.0,6.0,12.333333,0.0,12.4,0.5,1.79,Graduate
3085,3085,2304,single,17.0,1.0,9238.0,day,,139.0,Portuguese,...,0.0,6.0,7.0,6.0,14.0,0.0,15.5,2.8,-4.06,Graduate


In [52]:
df.iloc[80:101] # Baris dari index 80 - 100

Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
80,80,1238,single,17.0,1.0,9670.0,day,1.0,155.0,Portuguese,...,,6.0,10.0,4.0,13.0,0.0,12.7,3.7,-1.7,Enrolled
81,81,4121,single,39.0,1.0,9003.0,day,1.0,,Portuguese,...,0.0,,8.0,0.0,0.0,0.0,15.5,2.8,-4.06,Dropout
82,82,2890,single,1.0,1.0,171.0,day,1.0,159.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,12.4,,1.79,Dropout
83,83,4358,single,1.0,1.0,9147.0,day,1.0,150.0,Ukrainian,...,0.0,5.0,7.0,5.0,13.0,0.0,9.4,-0.8,-3.12,Graduate
84,84,4896,single,,1.0,9991.0,evening,1.0,140.0,Portuguese,...,0.0,5.0,5.0,,0.0,0.0,15.5,2.8,-4.06,Dropout
85,85,4735,single,18.0,1.0,9500.0,day,1.0,144.0,Portuguese,...,,8.0,8.0,7.0,13.542857,0.0,12.7,3.7,-1.7,Graduate
86,86,1047,single,1.0,1.0,9773.0,day,1.0,117.0,Portuguese,...,0.0,6.0,11.0,3.0,13.0,0.0,15.5,2.8,-4.06,Graduate
87,87,7430,single,39.0,2.0,9556.0,day,19.0,133.1,Portuguese,...,0.0,8.0,9.0,8.0,11.9625,0.0,13.9,-0.3,0.79,Graduate
88,88,8527,single,1.0,2.0,9500.0,day,1.0,145.0,Portuguese,...,0.0,7.0,7.0,6.0,12.95,0.0,7.6,2.6,0.32,Graduate
89,89,5420,single,43.0,1.0,,day,1.0,121.0,Portuguese,...,0.0,6.0,8.0,4.0,11.25,0.0,9.4,,-3.12,Enrolled


# 6️. Tangani nilai yang hilang (NaN)

In [53]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
Student Id,0
Marital status,84
Application mode,172
Application order,108
Course,152
Daytime/evening attendance\t,80
Previous qualification,123
Previous qualification (grade),85
Nacionality,185


In [54]:
df.head(10) # 10 baris pertama sebelum imputasi

Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,6634,single,1.0,3.0,9773.0,day,1.0,,Portuguese,...,0.0,6.0,6.0,6.0,15.333333,0.0,11.1,0.6,2.02,Graduate
1,1,6459,single,1.0,1.0,,day,1.0,136.0,Portuguese,...,0.0,0.0,0.0,0.0,0.0,0.0,12.4,0.5,1.79,Graduate
2,2,2238,single,42.0,1.0,9500.0,day,1.0,120.0,Portuguese,...,4.0,8.0,8.0,7.0,13.285714,0.0,12.7,3.7,-1.7,Graduate
3,3,2479,single,1.0,1.0,,day,1.0,141.0,Portuguese,...,0.0,6.0,7.0,6.0,13.142857,0.0,11.1,0.6,2.02,Graduate
4,4,4260,single,44.0,1.0,,day,39.0,120.0,Spanish,...,1.0,6.0,,3.0,12.0,0.0,12.4,0.5,1.79,Dropout
5,5,3544,married,39.0,1.0,9853.0,day,1.0,,Portuguese,...,0.0,7.0,10.0,3.0,13.0,0.0,11.1,0.6,2.02,Dropout
6,6,4956,single,39.0,1.0,9085.0,day,1.0,133.1,,...,0.0,6.0,14.0,4.0,12.0,0.0,11.1,0.6,2.02,Dropout
7,7,6758,single,1.0,4.0,9670.0,day,1.0,130.0,Portuguese,...,0.0,6.0,7.0,6.0,13.428571,0.0,8.9,1.4,3.51,Enrolled
8,8,2484,single,1.0,1.0,9773.0,day,1.0,135.0,Portuguese,...,0.0,6.0,7.0,6.0,12.166667,0.0,11.1,0.6,2.02,Graduate
9,9,9845,single,39.0,1.0,9670.0,day,1.0,120.0,Portuguese,...,0.0,5.0,8.0,4.0,11.25,0.0,7.6,2.6,0.32,Dropout


In [55]:
df.isnull().sum() # jumlah Nilai Kosong per Kolom

Unnamed: 0,0
Unnamed: 0,0
Student Id,0
Marital status,84
Application mode,172
Application order,108
Course,152
Daytime/evening attendance\t,80
Previous qualification,123
Previous qualification (grade),85
Nacionality,185


# miceforest

In [56]:
# ambil hanya kolom numerik
numeric_cols = df.select_dtypes(include=['number']).columns
df_numeric = df[numeric_cols].copy()

# bersihkan semua karakter non-alfanumerik di nama kolom
df_numeric.columns = [
    re.sub(r'[^A-Za-z0-9_]+', '_', c) for c in df_numeric.columns
]

# jalankan miceforest
kernel = mf.ImputationKernel(
    data=df_numeric,
    random_state=42
)
kernel.mice(3)
df_imputed_numeric = kernel.complete_data(dataset=0)

# balikin nama kolom aslinya
df_imputed_numeric.columns = numeric_cols

# gabungkan hasil imputasi numerik ke dataset asli
df_imputed = df.copy()
df_imputed[numeric_cols] = df_imputed_numeric

print("Imputasi miceforest selesai tanpa eror:")
print(df_imputed.isnull().sum())

Imputasi miceforest selesai tanpa eror:
Unnamed: 0                                          0
Student Id                                          0
Marital status                                     84
Application mode                                    0
Application order                                   0
Course                                              0
Daytime/evening attendance\t                       80
Previous qualification                              0
Previous qualification (grade)                      0
Nacionality                                       185
Mother's qualification                              0
Father's qualification                              0
Mother's occupation                                 0
Father's occupation                                 0
Admission grade                                     0
Displaced                                           0
Educational special needs                           0
Debtor                                    

# 7. encoding terhadap kolom bertipe object menjadi numerik

In [57]:
obj_cols = df_imputed.select_dtypes(include='object').columns
if len(obj_cols) > 0:
    print("\nMelakukan Encoding pada Kolom:", list(obj_cols))
    le = LabelEncoder()
    for col in obj_cols:
        df_imputed[col] = le.fit_transform(df_imputed[col].astype(str))
else:
    print("\ntidak ada kolom bertipe object untuk di-encode.")


Melakukan Encoding pada Kolom: ['Marital status', 'Daytime/evening attendance\t', 'Nacionality', 'Target']


# SimpelImputer

In [58]:
from sklearn.impute import SimpleImputer

# Cek apakah ada kolom kategorikal
cat_cols = df_imputed.select_dtypes(include='object').columns

if len(cat_cols) == 0:
    print("Tidak ada kolom kategorikal yang perlu diimputasi.")
else:
    print("Kolom kategorikal yang akan diimputasi:", list(cat_cols))
    imputer_cat = SimpleImputer(strategy='most_frequent')
    df_imputed[cat_cols] = imputer_cat.fit_transform(df_imputed[cat_cols])
    print("\nImputasi kategorikal selesai.")
    print(df_imputed.isnull().sum())


Tidak ada kolom kategorikal yang perlu diimputasi.


In [59]:
df_imputed.isnull().sum() # cek apakah masih ada missing value atau tidak

Unnamed: 0,0
Unnamed: 0,0
Student Id,0
Marital status,0
Application mode,0
Application order,0
Course,0
Daytime/evening attendance\t,0
Previous qualification,0
Previous qualification (grade),0
Nacionality,0


# 8️. Tampilkan kembali dataset yang telah melalui seluruh proses preprocessing (cleaning, imputasi, dan encoding)

In [65]:
df_imputed #menampilkan data final

Unnamed: 0.1,Unnamed: 0,Student Id,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,6634,5,1.0,3.0,9773.0,0,1.0,130.0,13,...,0.0,6.0,6.0,6.0,15.333333,0.0,11.1,0.6,2.02,2
1,1,6459,5,1.0,1.0,171.0,0,1.0,136.0,13,...,0.0,0.0,0.0,0.0,0.000000,0.0,12.4,0.5,1.79,2
2,2,2238,5,42.0,1.0,9500.0,0,1.0,120.0,13,...,4.0,8.0,8.0,7.0,13.285714,0.0,12.7,3.7,-1.70,2
3,3,2479,5,1.0,1.0,9070.0,0,1.0,141.0,13,...,0.0,6.0,7.0,6.0,13.142857,0.0,11.1,0.6,2.02,2
4,4,4260,5,44.0,1.0,9085.0,0,39.0,120.0,17,...,1.0,6.0,15.0,3.0,12.000000,0.0,12.4,0.5,1.79,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3091,3091,6176,5,44.0,1.0,9085.0,0,1.0,140.0,13,...,1.0,6.0,9.0,4.0,11.500000,0.0,13.9,-0.3,0.79,2
3092,3092,8064,5,1.0,5.0,9238.0,0,1.0,126.0,13,...,0.0,6.0,13.0,3.0,11.000000,0.0,9.4,-0.8,-3.12,2
3093,3093,2103,5,1.0,2.0,9500.0,0,1.0,141.0,13,...,0.0,8.0,8.0,8.0,14.272500,0.0,15.5,2.8,-4.06,2
3094,3094,8629,5,39.0,1.0,9500.0,0,4.0,150.0,13,...,2.0,8.0,9.0,7.0,13.306250,0.0,11.1,0.6,2.02,0
