In [5]:
import pandas as pd
import numpy as np

# Widen pandas' console/notebook rendering so wide feature tables are not truncated.
pd.set_option(
    'display.max_columns', 200,
    'display.width', 200,
)

from src.features import FeatureEngineer
# Raw inputs, read from ./data/raw relative to the notebook's working directory.
academic_df = pd.read_csv(filepath_or_buffer="./data/raw/academic_records.csv")
admission_df = pd.read_csv(filepath_or_buffer="./data/raw/admission.csv")
test_df = pd.read_csv(filepath_or_buffer="./data/raw/test.csv")


In [6]:
# Left-join the admission columns onto every academic record via the shared MA_SO_SV key.
df = academic_df.merge(right=admission_df, how='left', on='MA_SO_SV')
# Feature-engineering helper imported from src.features.
engineer = FeatureEngineer()

In [25]:
# Spot-check: all merged rows whose MA_SO_SV equals this one value.
df.loc[df['MA_SO_SV'] == '00003e092652']

Unnamed: 0,MA_SO_SV,HOC_KY,CPA,GPA,TC_DANGKY,TC_HOANTHANH,NAM_TUYENSINH,PTXT,TOHOP_XT,DIEM_TRUNGTUYEN,DIEM_CHUAN
30995,00003e092652,HK2 2023-2024,1.53,2.05,18,13,2023,100,A00,21.32,20.25
86548,00003e092652,HK1 2023-2024,1.64,1.97,18,15,2023,100,A00,21.32,20.25


In [62]:
def quick_overview(df, name="df"):
    """Show a compact profile of a DataFrame in the notebook.

    Prints a banner with ``name``, then renders the first rows, prints the
    shape, renders the transposed ``describe(include="all")`` summary and
    finally the per-column share of missing values, largest first.

    Relies on the notebook-provided ``display`` function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.
    name : str, default "df"
        Label shown in the banner line.

    Returns
    -------
    None
    """
    banner = f"===== {name} ====="
    print(banner)

    preview = df.head()
    display(preview)

    print(df.shape)

    # Transposed so that each column of `df` becomes one row of the summary.
    summary = df.describe(include="all")
    display(summary.T)

    print("\nMissing values:")
    missing_share = df.isna().mean()
    display(missing_share.sort_values(ascending=False))
# Profile the merged training frame.
quick_overview(df, name="RAW TRAIN DATA")


===== RAW TRAIN DATA =====


Unnamed: 0,MA_SO_SV,HOC_KY,CPA,GPA,TC_DANGKY,TC_HOANTHANH,NAM_TUYENSINH,PTXT,TOHOP_XT,DIEM_TRUNGTUYEN,DIEM_CHUAN
0,f022ed8d1ac1,HK2 2020-2021,2.19,2.02,18,18,2020,1,A00,23.96,21.72
1,f022ed8d1ac1,HK1 2022-2023,0.95,2.12,14,7,2020,1,A00,23.96,21.72
2,f022ed8d1ac1,HK1 2023-2024,0.81,1.89,29,16,2020,1,A00,23.96,21.72
3,f022ed8d1ac1,HK2 2022-2023,1.37,1.93,26,23,2020,1,A00,23.96,21.72
4,f022ed8d1ac1,HK2 2023-2024,1.71,1.91,16,13,2020,1,A00,23.96,21.72


(105726, 11)


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
MA_SO_SV,105726.0,20381.0,9223ee2b1299,9.0,,,,,,,
HOC_KY,105726.0,8.0,HK1 2023-2024,17099.0,,,,,,,
CPA,105726.0,,,,1.962662,0.899984,0.0,1.36,2.0,2.6,4.08
GPA,105726.0,,,,2.290328,0.479314,0.0,1.98,2.25,2.57,4.08
TC_DANGKY,105726.0,,,,19.438492,5.584857,1.0,17.0,20.0,22.0,71.0
TC_HOANTHANH,105726.0,,,,15.609916,6.610703,0.0,12.0,16.0,20.0,65.0
NAM_TUYENSINH,105726.0,,,,2020.155326,1.468706,2018.0,2019.0,2020.0,2021.0,2023.0
PTXT,105726.0,8.0,1,84294.0,,,,,,,
TOHOP_XT,105726.0,18.0,A00,65371.0,,,,,,,
DIEM_TRUNGTUYEN,105726.0,,,,22.045015,2.651439,0.0,20.16,22.38,24.08,59.06



Missing values:


MA_SO_SV           0.0
HOC_KY             0.0
CPA                0.0
GPA                0.0
TC_DANGKY          0.0
TC_HOANTHANH       0.0
NAM_TUYENSINH      0.0
PTXT               0.0
TOHOP_XT           0.0
DIEM_TRUNGTUYEN    0.0
DIEM_CHUAN         0.0
dtype: float64

In [7]:
def inspect_step(df_step, step_name, new_cols):
    """Print and display a quick sanity report for the columns added by one step.

    The report contains the step header, the number and names of the new
    columns, their descriptive statistics, a warning listing columns that
    contain nulls, and the first five rows of the new columns.

    Relies on the notebook-provided ``display`` function.

    Parameters
    ----------
    df_step : pandas.DataFrame
        Frame produced by the feature-engineering step.
    step_name : str
        Label printed in the report header.
    new_cols : list
        Column names the step is expected to have added. Names that are not
        present in ``df_step`` are reported and skipped instead of raising
        ``KeyError``.

    Returns
    -------
    None
        The report is emitted through ``print`` and ``display`` only.
    """
    print(f"--- STEP: {step_name} ---")

    # This helper is called both with and without a pre-filtered column list,
    # so absent columns are tolerated here rather than failing on the lookup.
    missing_cols = [col for col in new_cols if col not in df_step.columns]
    if missing_cols:
        print(f"Cảnh báo: Không tìm thấy cột trong DataFrame: {missing_cols}")
        new_cols = [col for col in new_cols if col in df_step.columns]

    print(f"Số lượng cột mới: {len(new_cols)}")
    print(f"Cột mới: {new_cols}")

    if len(new_cols) == 0:
        # describe() raises on a frame without columns; nothing left to report.
        print("\n" + "="*50 + "\n")
        return

    new_features = df_step[new_cols]

    # Descriptive statistics of the new columns
    display(new_features.describe().T)

    # Check for null values
    null_counts = new_features.isnull().sum()
    if null_counts.any():
        print("Cảnh báo: Có giá trị NULL trong cột mới!")
        print(null_counts[null_counts > 0])

    # First 5 rows of the new columns
    display(new_features.head())
    print("\n" + "="*50 + "\n")

In [9]:
# Step 1: admission features, computed on a copy of the merged frame.
df_step1 = engineer._create_admission_features(df.copy())
new_cols = ['diem_vuot_chuan', 'diem_ratio', 'nam_tuoi']
inspect_step(
    df_step=df_step1,
    step_name="Admission Features",
    # Report only the expected columns that are actually present in the result.
    new_cols=[col for col in new_cols if col in df_step1.columns],
)

--- STEP: Admission Features ---
Số lượng cột mới: 3
Cột mới: ['diem_vuot_chuan', 'diem_ratio', 'nam_tuoi']


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
diem_vuot_chuan,105726.0,2.023864,1.792232,0.0,0.64,1.53,2.94,10.9
diem_ratio,105726.0,1.109012,0.108006,0.0,1.02931,1.077036,1.155556,1.735811
nam_tuoi,105726.0,3.844674,1.468706,1.0,3.0,4.0,5.0,6.0


Unnamed: 0,diem_vuot_chuan,diem_ratio,nam_tuoi
0,2.24,1.102623,4
1,2.24,1.102623,4
2,2.24,1.102623,4
3,2.24,1.102623,4
4,2.24,1.102623,4






In [None]:
# Spot-check: step-1 rows whose MA_SO_SV equals this one value.
df_step1.loc[df_step1["MA_SO_SV"] == "f022ed8d1ac1"]

In [11]:
# Step 2: academic features, computed on a copy of the step-1 result.
df_step2 = engineer._create_academic_features(df_step1.copy())
new_cols = ['tc_dangky_high', 'tc_dangky_low']
inspect_step(
    df_step=df_step2,
    step_name="Academic Features",
    new_cols=new_cols,
)

--- STEP: Academic Features ---
Số lượng cột mới: 2
Cột mới: ['tc_dangky_high', 'tc_dangky_low']


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tc_dangky_high,105726.0,0.027089,0.162343,0.0,0.0,0.0,0.0,1.0
tc_dangky_low,105726.0,0.14565,0.352757,0.0,0.0,0.0,0.0,1.0


Unnamed: 0,tc_dangky_high,tc_dangky_low
0,0,0
1,0,1
2,0,0
3,0,0
4,0,0






In [18]:
# Step 3: temporal features, computed on a copy of the step-2 result.
df_step3 = engineer._create_temporal_features(df_step2.copy())
new_cols = ['semester_number', 'hoc_ky_nam', 'hoc_ky_so', 'is_semester_2']
inspect_step(
    df_step=df_step3,
    step_name="Temporal Features",
    new_cols=new_cols,
)

--- STEP: Temporal Features ---
Số lượng cột mới: 4
Cột mới: ['semester_number', 'hoc_ky_nam', 'hoc_ky_so', 'is_semester_2']


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
semester_number,105726.0,4.614012,2.660287,1.0,2.0,4.0,6.0,12.0
hoc_ky_nam,105726.0,2021.719927,1.076527,2020.0,2021.0,2022.0,2023.0,2023.0
hoc_ky_so,105726.0,1.48481,0.499772,1.0,1.0,1.0,2.0,2.0
is_semester_2,105726.0,0.48481,0.499772,0.0,0.0,0.0,1.0,1.0


Unnamed: 0,semester_number,hoc_ky_nam,hoc_ky_so,is_semester_2
0,2,2020,2,1
1,5,2022,1,0
2,7,2023,1,0
3,6,2022,2,1
4,8,2023,2,1






In [39]:
# Previously saved feature table, read relative to the notebook's working directory.
after_feat = pd.read_csv(filepath_or_buffer="./data_after_feature.csv")

In [42]:
# Spot-check: every saved feature row whose MA_SO_SV equals this one value.
# (A bare expression that is not the last one in a cell is evaluated but never
# rendered, so this cell holds only the expression meant to be shown.)
# NOTE(review): in the recorded output these rows are not in semester_order, and
# gpa_prev / cpa_prev equal the GPA / CPA of the row directly above — confirm the
# lag features are computed after sorting each student's records chronologically.
after_feat[after_feat['MA_SO_SV'] == '481436e2064d']

Unnamed: 0,MA_SO_SV,HOC_KY,CPA,GPA,TC_DANGKY,TC_HOANTHANH,NAM_TUYENSINH,PTXT,TOHOP_XT,DIEM_TRUNGTUYEN,DIEM_CHUAN,diem_vuot_chuan,nam_tuoi,tc_dangky_high,tc_dangky_low,hoc_ky_so,hoc_ky_nam,semester_order,semester_number,is_semester_1,is_semester_2,gpa_prev,gpa_mean_prev,cpa_prev,completion_rate_prev,total_tc_tich_luy,avg_completion_rate,completion_std,order_trong_truong
30049,481436e2064d,HK1 2022-2023,1.04,2.17,19,10,2019,1,A00,22.79,19.71,3.08,5,0,0,1,2022,20221,6,1,0,2.5,2.5,2.5,0.803581,0.0,0.803581,0.0,0
30050,481436e2064d,HK2 2022-2023,1.13,2.19,26,14,2019,1,A00,22.79,19.71,3.08,5,0,0,2,2022,20222,7,0,1,2.17,2.17,1.04,0.52356,381077.0,0.66357,0.198004,1
30051,481436e2064d,HK2 2021-2022,1.36,2.19,17,14,2019,1,A00,22.79,19.71,3.08,5,0,0,2,2021,20212,5,0,1,2.19,2.18,1.13,0.536398,381091.0,0.62118,0.158094,2
30052,481436e2064d,HK1 2023-2024,1.7,2.1,32,30,2019,1,A00,22.79,19.71,3.08,5,1,0,1,2023,20231,8,1,0,2.19,2.183333,1.36,0.818713,381105.0,0.670563,0.162534,3
30053,481436e2064d,HK1 2020-2021,1.42,2.21,25,15,2019,1,A00,22.79,19.71,3.08,5,0,0,1,2020,20201,2,1,0,2.1,2.1625,1.7,0.934579,381135.0,0.723366,0.183723,4
30054,481436e2064d,HK2 2023-2024,0.0,2.13,9,0,2019,1,A00,22.79,19.71,3.08,5,0,1,2,2023,20232,9,0,1,2.21,2.172,1.42,0.59761,381150.0,0.702407,0.17216,5
30055,481436e2064d,HK2 2020-2021,2.36,2.3,27,27,2019,1,A00,22.79,19.71,3.08,5,0,0,2,2020,20202,3,0,1,2.13,2.165,0.0,0.0,381150.0,0.602063,0.308515,6
30056,481436e2064d,HK1 2021-2022,1.93,2.28,22,18,2019,1,A00,22.79,19.71,3.08,5,0,0,1,2021,20211,4,1,0,2.3,2.184286,2.36,0.99631,381177.0,0.651344,0.317825,7
