#### 1. Setup & Import Libraries

In [17]:
import pandas as pd
import numpy as np
import re

In [18]:
df_train_raw = pd.read_csv("../../data/processed/light_processed_train.csv")
df_test_raw = pd.read_csv("../../data/processed/light_process_test.csv")

In [19]:
df_train_raw

Unnamed: 0,MA_SO_SV,HOC_KY,CPA,GPA,TC_DANGKY,TC_HOANTHANH,semester_order,NAM_TUYENSINH,PTXT,TOHOP_XT,DIEM_TRUNGTUYEN,DIEM_CHUAN,COMPLETION_RATE
0,00003e092652,HK1 2023-2024,1.64,1.97,18,15,20231,2023,100,A00,21.32,20.25,0.833333
1,00003e092652,HK2 2023-2024,1.53,2.05,18,13,20232,2023,100,A00,21.32,20.25,0.722222
2,000e15519006,HK1 2021-2022,3.85,3.85,9,9,20211,2021,1,D07,23.84,22.43,1.000000
3,000e15519006,HK2 2021-2022,2.77,3.12,19,19,20212,2021,1,D07,23.84,22.43,1.000000
4,000e15519006,HK1 2022-2023,2.83,2.98,21,21,20221,2021,1,D07,23.84,22.43,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
105721,fffd51317dd2,HK2 2022-2023,0.61,1.78,15,5,20222,2020,1,A00,17.61,16.10,0.333333
105722,ffff4d891f10,HK1 2022-2023,3.04,3.04,18,18,20221,2022,100,A00,25.98,19.91,1.000000
105723,ffff4d891f10,HK2 2022-2023,3.16,3.12,18,18,20222,2022,100,A00,25.98,19.91,1.000000
105724,ffff4d891f10,HK1 2023-2024,2.88,3.00,21,21,20231,2022,100,A00,25.98,19.91,1.000000


#### 2. Feature Engineering Pipeline

##### 1. Các hàm tiện ích (Utilities)

Chúng ta cần xây dựng hàm tính **Slope (Hệ số góc)** để bắt được xu hướng điểm số (đang tăng hay giảm).
Công thức tính hệ số góc $m$ cho chuỗi thời gian:

$$m = \frac{\sum(x - \bar{x})(y - \bar{y})}{\sum(x - \bar{x})^2}$$

Hàm `parse_semester_string` giúp chuyển đổi chuỗi `HK1 2023-2024` thành số nguyên `20231` để sort.

In [20]:

def parse_semester_string(sem_str):
    """
    Chuyển đổi 'HK1 2023-2024' -> 20231 (Int) để sort time-series.
    """
    s = str(sem_str).strip()
    if s.isdigit(): return int(s)
    
    digits = re.findall(r'\d+', s)
    if len(digits) >= 2:
        years = [int(d) for d in digits if len(d) == 4]
        sems = [int(d) for d in digits if len(d) == 1]
        if years and sems:
            return years[0] * 10 + sems[0]
    return 0

def fast_slope(y):
    """
    Tính hệ số góc (Trend) nhanh.
    Input: Array numpy (có thể chứa NaN).
    """
    # Lọc bỏ NaN trước khi tính (Quan trọng!)
    y_clean = y[~np.isnan(y)]
    n = len(y_clean)
    if n < 2: return 0.0
    
    x = np.arange(n)
    x_mean = np.mean(x)
    y_mean = np.mean(y_clean)
    
    numerator = np.sum((x - x_mean) * (y_clean - y_mean))
    denominator = np.sum((x - x_mean) ** 2)
    
    return numerator / (denominator + 1e-6)

In [21]:
tests = ["HK1 2023-2024", "HK2 2022-2023", "20231", "HK3 2020-2021", "abc"]
for t in tests:
    print(t, "->", parse_semester_string(t))

# Test fast_slope
print("slope tăng:", fast_slope(np.array([1, 2, 3, 4, 5], dtype=float)))
print("slope giảm:", fast_slope(np.array([5, 4, 3, 2, 1], dtype=float)))
print("slope có NaN:", fast_slope(np.array([1, np.nan, 3, 4], dtype=float)))
print("slope ít điểm:", fast_slope(np.array([np.nan], dtype=float)))

HK1 2023-2024 -> 20231
HK2 2022-2023 -> 20222
20231 -> 20231
HK3 2020-2021 -> 20203
abc -> 0
slope tăng: 0.99999990000001
slope giảm: -0.99999990000001
slope có NaN: 1.499999250000375
slope ít điểm: 0.0


##### 2. Feature Engineering Class

Class `FeatureEngineer` chịu trách nhiệm tạo ra 4 nhóm biến chính:

| Nhóm Feature | Mô tả | Ví dụ |
| :--- | :--- | :--- |
| **Admission** | Thông tin đầu vào | `diem_vuot_chuan`, `nam_tuoi` |
| **History** | Lịch sử kỳ trước (Lag features) | `Prev_GPA`, `load_factor` |
| **Trend** | Xu hướng học tập | `gpa_trend_slope`, `gpa_volatility` |
| **Risk** | Hành vi rủi ro | `aggressive_recovery` (Rớt môn nhưng đăng ký nhiều), `accumulated_fail_ratio` |

### Logic đặc biệt:
* **Aggressive Recovery:** Phát hiện sinh viên có dấu hiệu "gỡ gạc" (Kỳ trước rớt nhưng kỳ này đăng ký vượt quá năng lực).
* **Credit Velocity:** Tốc độ tích lũy tín chỉ trung bình mỗi kỳ.

In [None]:
class FeatureEngineer:    
    def __init__(self):
        # Các cột Category sẽ giữ lại
        self.cat_cols = ['PTXT', 'TOHOP_XT', 'MA_NGANH', 'KV_UT', 'KHOA_VIEN'] 

    def create_features(self, df):
        print("FEATURE ENGINEERING")
        df = df.copy()
        df = df.sort_values(['MA_SO_SV', 'semester_order']).reset_index(drop=True)
        
        # Convert Category
        for col in self.cat_cols:
            if col in df.columns:
                df[col] = df[col].astype(str).astype('category')
        # Groupby object
        g = df.groupby('MA_SO_SV')
        df['Prev_GPA_Raw'] = g['GPA'].shift(1)
        df['Prev_GPA'] = df['Prev_GPA_Raw'].fillna(-1)
        df['Prev_CPA'] = g['CPA'].shift(1).fillna(-1)
        df['Prev_TC_HOANTHANH'] = g['TC_HOANTHANH'].shift(1).fillna(0)
        df['Prev_TC_DANGKY'] = g['TC_DANGKY'].shift(1).fillna(0)
        
        # Cờ năm nhất (Chưa có lịch sử)
        df['is_freshman'] = (df['Prev_TC_DANGKY'] == 0).astype(int)
        # ---------------------------------------------------------
        df = self._create_admission_features(df)
        df = self._create_history_features(df)
        df = self._create_trend_features(df)  
        df = self._create_risk_features(df)
        
        # Dọn dẹp cột
        if 'Prev_GPA_Raw' in df.columns:
            df = df.drop(columns=['Prev_GPA_Raw'])
            
        return df

    def _create_admission_features(self, df):
        # Khoảng cách điểm đầu vào
        if 'DIEM_TRUNGTUYEN' in df.columns and 'DIEM_CHUAN' in df.columns:
            df['diem_vuot_chuan'] = df['DIEM_TRUNGTUYEN'] - df['DIEM_CHUAN']
        
        # Tuổi (giả định năm hiện tại 2026)
        if 'NAM_TUYENSINH' in df.columns:
            df['nam_tuoi'] = 2026 - df['NAM_TUYENSINH']
            
        # Số kỳ đã học (tính từ năm nhập học)
        df['semester_number'] = df.groupby('MA_SO_SV').cumcount() + 1
        return df

    def _create_history_features(self, df):
        # 1. Delta GPA-CPA: Đang học tốt hơn hay tệ hơn trung bình tích lũy?
        df['prev_gpa_cpa_diff'] = df['Prev_GPA'] - df['Prev_CPA']
        
        # 2. Tỷ lệ hoàn thành kỳ trước
        df['prev_completion_rate'] = df['Prev_TC_HOANTHANH'] / (df['Prev_TC_DANGKY'] + 1e-9)
        
        # 3. Load Factor (Áp lực): Đăng ký kỳ này / Sức học trung bình
        avg_capacity = df.groupby('MA_SO_SV')['Prev_TC_HOANTHANH'].transform(
            lambda x: x.rolling(window=5, min_periods=1).mean()
        ).fillna(15)
        
        df['load_factor'] = df['TC_DANGKY'] / (avg_capacity + 1e-9)
        
        # Biến cờ: Rớt môn kỳ trước
        df['failed_last_sem'] = (df['Prev_TC_HOANTHANH'] < df['Prev_TC_DANGKY']).astype(int)
        
        return df

    def _create_trend_features(self, df):
        g_raw = df.groupby('MA_SO_SV')['Prev_GPA_Raw']
        
        # 1. GPA Slope (Xu hướng điểm)
        df['gpa_trend_slope'] = g_raw.transform(
            lambda x: x.rolling(window=3, min_periods=2).apply(fast_slope, raw=True)
        ).fillna(0)
        
        # 2. GPA Volatility (Độ ổn định)
        df['gpa_volatility'] = g_raw.transform(
            lambda x: x.rolling(window=4, min_periods=2).std()
        ).fillna(0)
        
        # 3. Tích lũy tín chỉ (History)
        grouped = df.groupby('MA_SO_SV')
        cum_dangky = grouped['Prev_TC_DANGKY'].cumsum()
        cum_hoanthanh = grouped['Prev_TC_HOANTHANH'].cumsum()
        
        df['total_credits_failed'] = cum_dangky - cum_hoanthanh
        df['accumulated_fail_ratio'] = df['total_credits_failed'] / (cum_dangky + 1e-9)
        
        # 4. Credit Velocity (Tốc độ học)
        semester_count = grouped.cumcount() + 1
        df['credit_velocity'] = cum_hoanthanh / semester_count
        
        return df

    def _create_risk_features(self, df):
        # Hành vi "Gỡ gạc" (Aggressive Recovery): Rớt môn kỳ trước -> Đăng ký nhiều hơn kỳ này
        more_credits = (df['TC_DANGKY'] > df['Prev_TC_DANGKY'])
        df['aggressive_recovery'] = (df['failed_last_sem'] & more_credits).astype(int)
        
        # Kỳ vọng hoàn thành (Expected Credits) = Đăng ký * Tỷ lệ đậu toàn cục của SV đó
        df['expected_real_credits'] = df['TC_DANGKY'] * (1 - df['accumulated_fail_ratio'])
        
        return df

    def get_feature_columns(self, df):
        """Tự động lấy danh sách feature dựa trên prefix"""
        
        # Whitelist các prefix
        valid_prefixes = [
            'Prev_', 'prev_', 'sem_', 'diem_', 'nam_', 'is_', 
            'load_', 'aggressive_', 'gpa_', 'total_', 'accumulated_',
            'credit_', 'expected_', 'failed_'
        ]
        
        valid_exact = ['TC_DANGKY', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN', 'semester_number']
        valid_exact.extend(self.cat_cols)
        
        final_cols = []
        # Các cột target/meta cần loại bỏ
        ignore_cols = ['TC_HOANTHANH', 'GPA', 'CPA', 'semester_order', 'MA_SO_SV', 'HOC_KY', 'COMPLETION_RATE', 'Prev_GPA_Raw']
        
        for col in df.columns:
            if col in ignore_cols: continue
            
            is_valid = False
            if col in valid_exact: is_valid = True
            else:
                for prefix in valid_prefixes:
                    if col.startswith(prefix):
                        is_valid = True
                        break
            
            if is_valid: final_cols.append(col)
                
        return final_cols

In [26]:
# Gộp Train + Test để tạo feature
df_train_raw['set_type'] = 'TRAIN'
df_test_raw['set_type'] = 'TEST'
full_df = pd.concat([df_train_raw, df_test_raw], ignore_index=True)

In [29]:
# Feature Engineering
fe = FeatureEngineer()
full_df_fe = fe.create_features(full_df)

FEATURE ENGINEERING


In [30]:
full_df_fe

Unnamed: 0,MA_SO_SV,HOC_KY,CPA,GPA,TC_DANGKY,TC_HOANTHANH,semester_order,NAM_TUYENSINH,PTXT,TOHOP_XT,...,prev_completion_rate,load_factor,failed_last_sem,gpa_trend_slope,gpa_volatility,total_credits_failed,accumulated_fail_ratio,credit_velocity,aggressive_recovery,expected_real_credits
0,00003e092652,HK1 2023-2024,1.64,1.97,18,15,20231,2023,100,A00,...,0.000000,1.800000e+10,0,0.00,0.000000,0.0,0.000000,0.000000,0,18.000000
1,00003e092652,HK2 2023-2024,1.53,2.05,18,13,20232,2023,100,A00,...,0.833333,2.400000e+00,1,0.00,0.000000,3.0,0.166667,7.500000,0,15.000000
2,00003e092652,HK1 2024-2025,0.00,0.00,20,0,20241,2023,100,A00,...,0.722222,2.142857e+00,1,0.08,0.056569,8.0,0.222222,9.333333,1,15.555556
3,00027b0dec4c,HK1 2024-2025,0.00,0.00,19,0,20241,2024,100,A00,...,0.000000,1.900000e+10,0,0.00,0.000000,0.0,0.000000,0.000000,0,19.000000
4,000e15519006,HK1 2021-2022,3.85,3.85,9,9,20211,2021,1,D07,...,0.000000,9.000000e+09,0,0.00,0.000000,0.0,0.000000,0.000000,0,9.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122223,ffff4d891f10,HK1 2022-2023,3.04,3.04,18,18,20221,2022,100,A00,...,0.000000,1.800000e+10,0,0.00,0.000000,0.0,0.000000,0.000000,0,18.000000
122224,ffff4d891f10,HK2 2022-2023,3.16,3.12,18,18,20222,2022,100,A00,...,1.000000,2.000000e+00,0,0.00,0.000000,0.0,0.000000,9.000000,0,18.000000
122225,ffff4d891f10,HK1 2023-2024,2.88,3.00,21,21,20231,2022,100,A00,...,1.000000,1.750000e+00,0,0.08,0.056569,0.0,0.000000,12.000000,0,21.000000
122226,ffff4d891f10,HK2 2023-2024,3.16,3.06,18,18,20232,2022,100,A00,...,1.000000,1.263158e+00,0,-0.02,0.061101,0.0,0.000000,14.250000,0,18.000000


In [31]:
# 5. Tách lại Train/Test
train_final = full_df_fe[full_df_fe['set_type'] == 'TRAIN'].copy()
test_final = full_df_fe[full_df_fe['set_type'] == 'TEST'].copy()

# 6. Lấy features & Target
feature_cols = fe.get_feature_columns(train_final)
X = train_final[feature_cols]
y = train_final['TC_HOANTHANH']

print(f"Số lượng Features: {len(feature_cols)}")
print(f"Sample Features: {feature_cols[:5]}")

Số lượng Features: 24
Sample Features: ['TC_DANGKY', 'PTXT', 'TOHOP_XT', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN']


In [33]:
train_final.to_csv("../../data/processed/train_final.csv", index=False)
test_final.to_csv("../../data/processedtest_final.csv", index=False)