In [1]:
import pandas as pd
import numpy as np

ACADEMIC_PATH = r'../../data/raw/academic_records.csv'
ADMISSION_PATH = r'../../data/raw/admission.csv'
TEST_PATH = r'../../data/raw/test.csv'
academic_records = pd.read_csv(ACADEMIC_PATH)
admission = pd.read_csv(ADMISSION_PATH)

In [2]:
import pandas as pd
import numpy as np
import re

# --- H√ÄM M·ªöI: X·ª≠ l√Ω ƒë·ªãnh d·∫°ng h·ªçc k·ª≥ ph·ª©c t·∫°p ---
def parse_semester_string(sem_str):
    """
    Chuy·ªÉn ƒë·ªïi chu·ªói nh∆∞ 'HK1 2023-2024' th√†nh m√£ s·ªë 20231 ƒë·ªÉ sort ƒë∆∞·ª£c.
    Logic: NƒÉm * 10 + K·ª≥
    """
    s = str(sem_str).strip()
    
    # Tr∆∞·ªùng h·ª£p 1: D·∫°ng s·ªë s·∫µn (VD: 20231)
    if s.isdigit():
        return int(s)
    
    # Tr∆∞·ªùng h·ª£p 2: D·∫°ng ch·ªØ (VD: HK1 2023-2024 ho·∫∑c H·ªçc k·ª≥ 1 nƒÉm 2023)
    # T√¨m t·∫•t c·∫£ c√°c con s·ªë trong chu·ªói
    digits = re.findall(r'\d+', s)
    
    if len(digits) >= 2:
        # Gi·∫£ s·ª≠ s·ªë nh·ªè l√† k·ª≥, s·ªë l·ªõn (4 ch·ªØ s·ªë) l√† nƒÉm
        # T√¨m nƒÉm (th∆∞·ªùng l√† s·ªë c√≥ 4 ch·ªØ s·ªë ƒë·∫ßu ti√™n t√¨m th·∫•y)
        years = [int(d) for d in digits if len(d) == 4]
        sems = [int(d) for d in digits if len(d) == 1]
        
        if years and sems:
            year = years[0]
            sem = sems[0]
            return year * 10 + sem
            
    return 0 # Kh√¥ng x√°c ƒë·ªãnh

def clean_data_pipeline_v3(admission, academic_records):
    print("--- üöÄ B·∫ÆT ƒê·∫¶U QUY TR√åNH L√ÄM S·∫†CH D·ªÆ LI·ªÜU (FIXED VERSION) ---")
    
    adm = admission.copy()
    acad = academic_records.copy()
    
    # 1. Chu·∫©n h√≥a ID
    adm['MA_SO_SV'] = adm['MA_SO_SV'].astype(str)
    acad['MA_SO_SV'] = acad['MA_SO_SV'].astype(str)
    
    # ---------------------------------------------------------
    # [FIX QUAN TR·ªåNG] X·ª≠ l√Ω HOC_KY th√¥ng minh h∆°n
    # ---------------------------------------------------------
    print("-> ƒêang x·ª≠ l√Ω c·ªôt HOC_KY...")
    # T·∫°o c·ªôt HOC_KY_INT d√πng ƒë·ªÉ sort (VD: 20231)
    acad['HOC_KY_INT'] = acad['HOC_KY'].apply(parse_semester_string)
    
    # Ki·ªÉm tra xem c√≥ d√≤ng n√†o b·ªã l·ªói (b·∫±ng 0) kh√¥ng
    error_count = (acad['HOC_KY_INT'] == 0).sum()
    if error_count > 0:
        print(f"   ‚ö†Ô∏è C·∫£nh b√°o: C√≥ {error_count} d√≤ng kh√¥ng ƒë·ªçc ƒë∆∞·ª£c HOC_KY.")

    # Merge d·ªØ li·ªáu
    df = pd.merge(acad, adm, on='MA_SO_SV', how='left')
    
    # S·∫Øp x·∫øp theo Time-series chu·∫©n x√°c d·ª±a tr√™n c·ªôt v·ª´a t·∫°o
    df = df.sort_values(by=['MA_SO_SV', 'HOC_KY_INT']).reset_index(drop=True)
    
    # ---------------------------------------------------------
    # 2. X·ª¨ L√ù S·ªê LI·ªÜU & LOGIC
    # ---------------------------------------------------------
    cols_float = ['GPA', 'CPA', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN']
    cols_int = ['TC_DANGKY', 'TC_HOANTHANH']
    
    for col in cols_float:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        
    for col in cols_int:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # Logic: Ho√†n th√†nh <= ƒêƒÉng k√Ω
    df['TC_HOANTHANH'] = np.minimum(df['TC_HOANTHANH'], df['TC_DANGKY'])
    
    # Target Transformation
    df['COMPLETION_RATE'] = df['TC_HOANTHANH'] / (df['TC_DANGKY'] + 1e-9)
    df['COMPLETION_RATE'] = df['COMPLETION_RATE'].clip(0, 1)

    # Clip ƒëi·ªÉm s·ªë
    df['GPA'] = df['GPA'].clip(0, 4.0)
    df['CPA'] = df['CPA'].clip(0, 4.0)

    # Admission Gap Feature
    if 'DIEM_TRUNGTUYEN' in df.columns and 'DIEM_CHUAN' in df.columns:
        df['ADMISSION_GAP'] = df['DIEM_TRUNGTUYEN'] - df['DIEM_CHUAN']
    
    # L·ªçc r√°c
    initial_len = len(df)
    df = df[df['TC_DANGKY'] > 0]
    
    print(f"--- ‚úÖ HO√ÄN T·∫§T. K√≠ch th∆∞·ªõc data: {df.shape} ---")
    print("Sample HOC_KY_INT:", df['HOC_KY_INT'].head().tolist())
    
    return df

# Ch·∫°y th·ª≠ l·∫°i
df_clean = clean_data_pipeline_v3(admission, academic_records)
df_clean.head()

--- üöÄ B·∫ÆT ƒê·∫¶U QUY TR√åNH L√ÄM S·∫†CH D·ªÆ LI·ªÜU (FIXED VERSION) ---
-> ƒêang x·ª≠ l√Ω c·ªôt HOC_KY...
--- ‚úÖ HO√ÄN T·∫§T. K√≠ch th∆∞·ªõc data: (105726, 14) ---
Sample HOC_KY_INT: [20231, 20232, 20211, 20212, 20221]


Unnamed: 0,MA_SO_SV,HOC_KY,CPA,GPA,TC_DANGKY,TC_HOANTHANH,HOC_KY_INT,NAM_TUYENSINH,PTXT,TOHOP_XT,DIEM_TRUNGTUYEN,DIEM_CHUAN,COMPLETION_RATE,ADMISSION_GAP
0,00003e092652,HK1 2023-2024,1.64,1.97,18,15,20231,2023,100,A00,21.32,20.25,0.833333,1.07
1,00003e092652,HK2 2023-2024,1.53,2.05,18,13,20232,2023,100,A00,21.32,20.25,0.722222,1.07
2,000e15519006,HK1 2021-2022,3.85,3.85,9,9,20211,2021,1,D07,23.84,22.43,1.0,1.41
3,000e15519006,HK2 2021-2022,2.77,3.12,19,19,20212,2021,1,D07,23.84,22.43,1.0,1.41
4,000e15519006,HK1 2022-2023,2.83,2.98,21,21,20221,2021,1,D07,23.84,22.43,1.0,1.41


In [3]:
import pandas as pd
import numpy as np
import re

# ==============================================================================
# 1. C√ÅC H√ÄM TI·ªÜN √çCH (UTILS)
# ==============================================================================

def parse_semester_string(sem_str):
    """
    Chuy·ªÉn ƒë·ªïi 'HK1 2023-2024' -> 20231 (Int) ƒë·ªÉ sort time-series.
    """
    s = str(sem_str).strip()
    if s.isdigit(): return int(s)
    
    digits = re.findall(r'\d+', s)
    if len(digits) >= 2:
        years = [int(d) for d in digits if len(d) == 4]
        sems = [int(d) for d in digits if len(d) == 1]
        if years and sems:
            return years[0] * 10 + sems[0]
    return 0

def fast_slope(y):
    """
    T√≠nh h·ªá s·ªë g√≥c (Trend) nhanh.
    Input: Array numpy (c√≥ th·ªÉ ch·ª©a NaN).
    """
    # L·ªçc b·ªè NaN tr∆∞·ªõc khi t√≠nh (Quan tr·ªçng!)
    y_clean = y[~np.isnan(y)]
    n = len(y_clean)
    if n < 2: return 0.0
    
    x = np.arange(n)
    x_mean = np.mean(x)
    y_mean = np.mean(y_clean)
    
    numerator = np.sum((x - x_mean) * (y_clean - y_mean))
    denominator = np.sum((x - x_mean) ** 2)
    
    return numerator / (denominator + 1e-6)

# ==============================================================================
# 2. DATA PREPROCESSING PIPELINE
# ==============================================================================

def clean_data_pipeline(admission, academic_records, is_test=False):
    print("--- üöÄ 1. PREPROCESSING DATA ---")
    adm = admission.copy()
    acad = academic_records.copy()
    
    # Chu·∫©n h√≥a ID
    adm['MA_SO_SV'] = adm['MA_SO_SV'].astype(str)
    acad['MA_SO_SV'] = acad['MA_SO_SV'].astype(str)
    
    # T·∫°o Time-Index
    acad['semester_order'] = acad['HOC_KY'].apply(parse_semester_string)
    
    # Merge
    df = pd.merge(acad, adm, on='MA_SO_SV', how='left')
    
    # Sort Time-Series (C·ª∞C K·ª≤ QUAN TR·ªåNG)
    df = df.sort_values(by=['MA_SO_SV', 'semester_order']).reset_index(drop=True)
    
    # Numeric conversion
    cols_float = ['GPA', 'CPA', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN']
    cols_int = ['TC_DANGKY', 'TC_HOANTHANH']
    
    for col in cols_float:
        if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce')
    for col in cols_int:
        if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # Logic clean
    if not is_test:
        df['TC_HOANTHANH'] = np.minimum(df['TC_HOANTHANH'], df['TC_DANGKY'])
        df['GPA'] = df['GPA'].clip(0, 4.0)
        df['CPA'] = df['CPA'].clip(0, 4.0)
    
    # Target Transformation (Ch·ªâ d√πng cho train)
    if not is_test:
        df['COMPLETION_RATE'] = df['TC_HOANTHANH'] / (df['TC_DANGKY'] + 1e-9)
        df['COMPLETION_RATE'] = df['COMPLETION_RATE'].clip(0, 1)

    return df

# ==============================================================================
# 3. FEATURE ENGINEERING CLASS (CORE)
# ==============================================================================

class FeatureEngineer:    
    def __init__(self):
        # C√°c c·ªôt Category s·∫Ω gi·ªØ l·∫°i
        self.cat_cols = ['PTXT', 'TOHOP_XT', 'MA_NGANH', 'KV_UT', 'KHOA_VIEN'] 

    def create_features(self, df):
        print("--- ‚ö° 2. FEATURE ENGINEERING (FULL POWER) ---")
        df = df.copy()
        
        # Sort l·∫°i cho ch·∫Øc ch·∫Øn
        df = df.sort_values(['MA_SO_SV', 'semester_order']).reset_index(drop=True)
        
        # Convert Category
        for col in self.cat_cols:
            if col in df.columns:
                df[col] = df[col].astype(str).astype('category')

        # Groupby object
        g = df.groupby('MA_SO_SV')

        # ---------------------------------------------------------
        # A. BASE LAGS (C·ª±c k·ª≥ quan tr·ªçng: D√πng shift(1) ƒë·ªÉ tr√°nh Leakage)
        # ---------------------------------------------------------
        # T·∫°o bi·∫øn Raw (ch·ª©a NaN) ƒë·ªÉ t√≠nh to√°n th·ªëng k√™ ch√≠nh x√°c h∆°n
        df['Prev_GPA_Raw'] = g['GPA'].shift(1)
        
        # T·∫°o bi·∫øn Fill (ƒë·ªÉ model d√πng tr·ª±c ti·∫øp)
        df['Prev_GPA'] = df['Prev_GPA_Raw'].fillna(-1)
        df['Prev_CPA'] = g['CPA'].shift(1).fillna(-1)
        df['Prev_TC_HOANTHANH'] = g['TC_HOANTHANH'].shift(1).fillna(0)
        df['Prev_TC_DANGKY'] = g['TC_DANGKY'].shift(1).fillna(0)
        
        # C·ªù nƒÉm nh·∫•t (Ch∆∞a c√≥ l·ªãch s·ª≠)
        df['is_freshman'] = (df['Prev_TC_DANGKY'] == 0).astype(int)

        # ---------------------------------------------------------
        # B. G·ªåI C√ÅC NH√ìM FEATURE
        # ---------------------------------------------------------
        df = self._create_admission_features(df)
        df = self._create_history_features(df)
        df = self._create_trend_features(df)  # <--- Slope, Volatility n·∫±m ·ªü ƒë√¢y
        df = self._create_risk_features(df)
        
        # D·ªçn d·∫πp c·ªôt t·∫°m
        if 'Prev_GPA_Raw' in df.columns:
            df = df.drop(columns=['Prev_GPA_Raw'])
            
        return df

    def _create_admission_features(self, df):
        # Kho·∫£ng c√°ch ƒëi·ªÉm ƒë·∫ßu v√†o
        if 'DIEM_TRUNGTUYEN' in df.columns and 'DIEM_CHUAN' in df.columns:
            df['diem_vuot_chuan'] = df['DIEM_TRUNGTUYEN'] - df['DIEM_CHUAN']
        
        # Tu·ªïi (gi·∫£ ƒë·ªãnh nƒÉm hi·ªán t·∫°i 2025/2026)
        if 'NAM_TUYENSINH' in df.columns:
            df['nam_tuoi'] = 2026 - df['NAM_TUYENSINH']
            
        # S·ªë k·ª≥ ƒë√£ h·ªçc (t√≠nh t·ª´ nƒÉm nh·∫≠p h·ªçc)
        df['semester_number'] = df.groupby('MA_SO_SV').cumcount() + 1
        return df

    def _create_history_features(self, df):
        # 1. Delta GPA-CPA: ƒêang h·ªçc t·ªët h∆°n hay t·ªá h∆°n trung b√¨nh t√≠ch l≈©y?
        df['prev_gpa_cpa_diff'] = df['Prev_GPA'] - df['Prev_CPA']
        
        # 2. T·ª∑ l·ªá ho√†n th√†nh k·ª≥ tr∆∞·ªõc
        df['prev_completion_rate'] = df['Prev_TC_HOANTHANH'] / (df['Prev_TC_DANGKY'] + 1e-9)
        
        # 3. Load Factor (√Åp l·ª±c): ƒêƒÉng k√Ω k·ª≥ n√†y / S·ª©c h·ªçc trung b√¨nh
        avg_capacity = df.groupby('MA_SO_SV')['Prev_TC_HOANTHANH'].transform(
            lambda x: x.rolling(window=5, min_periods=1).mean()
        ).fillna(15)
        
        df['load_factor'] = df['TC_DANGKY'] / (avg_capacity + 1e-9)
        
        # Bi·∫øn c·ªù: R·ªõt m√¥n k·ª≥ tr∆∞·ªõc
        df['failed_last_sem'] = (df['Prev_TC_HOANTHANH'] < df['Prev_TC_DANGKY']).astype(int)
        
        return df

    def _create_trend_features(self, df):
        # S·ª≠ d·ª•ng Prev_GPA_Raw (c√≥ NaN) ƒë·ªÉ t√≠nh Slope ch√≠nh x√°c
        # N·∫øu d√πng Prev_GPA (c√≥ -1), Slope s·∫Ω b·ªã sai l·ªách l·ªõn
        g_raw = df.groupby('MA_SO_SV')['Prev_GPA_Raw']
        
        # 1. GPA Slope (Xu h∆∞·ªõng ƒëi·ªÉm)
        df['gpa_trend_slope'] = g_raw.transform(
            lambda x: x.rolling(window=3, min_periods=2).apply(fast_slope, raw=True)
        ).fillna(0)
        
        # 2. [RECOVERED] GPA Volatility (ƒê·ªô ·ªïn ƒë·ªãnh)
        df['gpa_volatility'] = g_raw.transform(
            lambda x: x.rolling(window=4, min_periods=2).std()
        ).fillna(0)
        
        # 3. T√≠ch l≈©y t√≠n ch·ªâ (History)
        grouped = df.groupby('MA_SO_SV')
        cum_dangky = grouped['Prev_TC_DANGKY'].cumsum()
        cum_hoanthanh = grouped['Prev_TC_HOANTHANH'].cumsum()
        
        df['total_credits_failed'] = cum_dangky - cum_hoanthanh
        df['accumulated_fail_ratio'] = df['total_credits_failed'] / (cum_dangky + 1e-9)
        
        # 4. [RECOVERED] Credit Velocity (T·ªëc ƒë·ªô h·ªçc)
        semester_count = grouped.cumcount() + 1
        df['credit_velocity'] = cum_hoanthanh / semester_count
        
        return df

    def _create_risk_features(self, df):
        # H√†nh vi "G·ª° g·∫°c" (Aggressive Recovery): R·ªõt m√¥n k·ª≥ tr∆∞·ªõc -> ƒêƒÉng k√Ω nhi·ªÅu h∆°n k·ª≥ n√†y
        more_credits = (df['TC_DANGKY'] > df['Prev_TC_DANGKY'])
        df['aggressive_recovery'] = (df['failed_last_sem'] & more_credits).astype(int)
        
        # K·ª≥ v·ªçng ho√†n th√†nh (Expected Credits) = ƒêƒÉng k√Ω * T·ª∑ l·ªá ƒë·∫≠u to√†n c·ª•c c·ªßa SV ƒë√≥
        df['expected_real_credits'] = df['TC_DANGKY'] * (1 - df['accumulated_fail_ratio'])
        
        return df

    def get_feature_columns(self, df):
        """T·ª± ƒë·ªông l·∫•y danh s√°ch feature d·ª±a tr√™n prefix"""
        
        # Whitelist c√°c prefix
        valid_prefixes = [
            'Prev_', 'prev_', 'sem_', 'diem_', 'nam_', 'is_', 
            'load_', 'aggressive_', 'gpa_', 'total_', 'accumulated_',
            'credit_', 'expected_', 'failed_'
        ]
        
        valid_exact = ['TC_DANGKY', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN', 'semester_number']
        valid_exact.extend(self.cat_cols)
        
        final_cols = []
        # C√°c c·ªôt target/meta c·∫ßn lo·∫°i b·ªè
        ignore_cols = ['TC_HOANTHANH', 'GPA', 'CPA', 'semester_order', 'MA_SO_SV', 'HOC_KY', 'COMPLETION_RATE', 'Prev_GPA_Raw']
        
        for col in df.columns:
            if col in ignore_cols: continue
            
            is_valid = False
            if col in valid_exact: is_valid = True
            else:
                for prefix in valid_prefixes:
                    if col.startswith(prefix):
                        is_valid = True
                        break
            
            if is_valid: final_cols.append(col)
                
        return final_cols

# ==============================================================================
# 4. EXECUTION BLOCK (CH·∫†Y TH·ª¨)
# ==============================================================================


test_raw = pd.read_csv(TEST_PATH)

print("--- B·∫ÆT ƒê·∫¶U PIPELINE ---")

# 1. Clean Data
df_train_raw = clean_data_pipeline(admission, academic_records, is_test=False)

# 2. Chu·∫©n b·ªã Test (G√°n dummy values ƒë·ªÉ ch·∫°y ƒë∆∞·ª£c pipeline)
test_copy = test_raw.copy()
for col in ['TC_HOANTHANH', 'GPA', 'CPA']: test_copy[col] = 0
df_test_raw = clean_data_pipeline(admission, test_copy, is_test=True)

# 3. G·ªôp Train + Test ƒë·ªÉ t·∫°o feature (Tr√°nh bi√™n gi·ªõi gi·ªØa c√°c k·ª≥ b·ªã ƒë·ª©t g√£y)
df_train_raw['set_type'] = 'TRAIN'
df_test_raw['set_type'] = 'TEST'
full_df = pd.concat([df_train_raw, df_test_raw], ignore_index=True)

# 4. Feature Engineering
fe = FeatureEngineer()
full_df_fe = fe.create_features(full_df)

# 5. T√°ch l·∫°i Train/Test
train_final = full_df_fe[full_df_fe['set_type'] == 'TRAIN'].copy()
test_final = full_df_fe[full_df_fe['set_type'] == 'TEST'].copy()

# 6. L·∫•y features & Target
feature_cols = fe.get_feature_columns(train_final)
X = train_final[feature_cols]
y = train_final['TC_HOANTHANH'] # Ho·∫∑c d√πng COMPLETION_RATE n·∫øu mu·ªën

print(f"‚úÖ Xong! S·ªë l∆∞·ª£ng Features: {len(feature_cols)}")
print(f"Sample Features: {feature_cols[:5]}")

--- B·∫ÆT ƒê·∫¶U PIPELINE ---
--- üöÄ 1. PREPROCESSING DATA ---
--- üöÄ 1. PREPROCESSING DATA ---
--- ‚ö° 2. FEATURE ENGINEERING (FULL POWER) ---
‚úÖ Xong! S·ªë l∆∞·ª£ng Features: 24
Sample Features: ['TC_DANGKY', 'PTXT', 'TOHOP_XT', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN']


In [4]:
# ==============================================================================
# 5. CHU·∫®N B·ªä D·ªÆ LI·ªÜU CHO TRAINING (THE BRIDGE)
# ==============================================================================
print("\n--- ‚úÇÔ∏è 3. SPLITTING TRAIN/VALID/TEST ---")

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# ƒê·ªãnh nghƒ©a m·ªëc th·ªùi gian (Theo ƒë·ªÅ b√†i)
SPLIT_SEM = 20231  # Train ƒë·∫øn h·∫øt HK1 2023-2024
VALID_SEM = 20232  # Valid l√† HK2 2023-2024

# T·∫°o Mask ƒë·ªÉ l·ªçc d·ªØ li·ªáu
# L∆∞u √Ω: train_final c·ªßa b·∫°n ƒëang ch·ª©a c·∫£ Train v√† Valid
mask_train = train_final['semester_order'] <= SPLIT_SEM
mask_valid = train_final['semester_order'] == VALID_SEM

# 1. ƒê·ªãnh nghƒ©a t·∫≠p TRAIN (D√πng ƒë·ªÉ d·∫°y model)
X_train = train_final[mask_train][feature_cols]
# Model h·ªçc T·ª∑ l·ªá (Rate) ch·ª© kh√¥ng h·ªçc s·ªë t√≠n ch·ªâ -> Quy v·ªÅ [0, 1]
y_train = train_final[mask_train]['TC_HOANTHANH'] / (train_final[mask_train]['TC_DANGKY'] + 1e-9)
y_train = y_train.clip(0, 1)

# 2. ƒê·ªãnh nghƒ©a t·∫≠p VALID (D√πng ƒë·ªÉ ki·ªÉm tra & Stacking)
X_valid = train_final[mask_valid][feature_cols]
y_valid_rate = train_final[mask_valid]['TC_HOANTHANH'] / (train_final[mask_valid]['TC_DANGKY'] + 1e-9)
y_valid_rate = y_valid_rate.clip(0, 1)
# C√°c bi·∫øn d√πng ƒë·ªÉ ƒëo l∆∞·ªùng RMSE th·ª±c t·∫ø
y_valid_credits = train_final[mask_valid]['TC_HOANTHANH'] 
valid_dangky = train_final[mask_valid]['TC_DANGKY']

# 3. ƒê·ªãnh nghƒ©a t·∫≠p TEST (D√πng ƒë·ªÉ n·ªôp b√†i)
X_test = test_final[feature_cols]
test_dangky = test_final['TC_DANGKY']

# X√°c ƒë·ªãnh danh s√°ch c·ªôt Category cho CatBoost
# (L·∫•y t·ª´ feature_cols nh·ªØng c·ªôt n·∫±m trong danh s√°ch cat_cols c·ªßa b·∫°n)
cat_features_list = ['PTXT', 'TOHOP_XT', 'MA_NGANH', 'KV_UT', 'KHOA_VIEN']
valid_cat_features = [c for c in feature_cols if c in cat_features_list]

print(f"üìå Train size: {X_train.shape}")
print(f"üìå Valid size: {X_valid.shape}")
print(f"üìå Test size:  {X_test.shape}")
print(f"üìå Categorical Features: {valid_cat_features}")

# ==============================================================================
# 6. HU·∫§N LUY·ªÜN STACKING ENSEMBLE (LAYER 1)
# ==============================================================================
print("\n--- üöÄ 4. ACTIVATING STACKING ENSEMBLE ---")

# --- MODEL 1: XGBOOST ---
print("   -> Training XGBoost...")
xgb_params = {
    'n_estimators': 2000,
    'learning_rate': 0.02,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror',
    'n_jobs': -1,
    'random_state': 42,
    'enable_categorical': True, # ƒê√£ b·∫≠t h·ªó tr·ª£ category
    'tree_method': 'hist',
    'early_stopping_rounds': 100
}
model_xgb = xgb.XGBRegressor(**xgb_params)
model_xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid_rate)], verbose=False)

# --- MODEL 2: LIGHTGBM ---
print("   -> Training LightGBM...")
lgb_params = {
    'n_estimators': 2000,
    'learning_rate': 0.02,
    'num_leaves': 31,
    'objective': 'rmse',
    'metric': 'rmse',
    'n_jobs': -1,
    'random_state': 42,
    'verbose': -1
}
model_lgb = lgb.LGBMRegressor(**lgb_params)
model_lgb.fit(
    X_train, y_train, 
    eval_set=[(X_valid, y_valid_rate)], 
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
)

# --- MODEL 3: CATBOOST ---
print("   -> Training CatBoost...")
cat_params = {
    'iterations': 2000,
    'learning_rate': 0.02,
    'depth': 6,
    'loss_function': 'RMSE',
    'verbose': 0, # T·∫Øt log cho g·ªçn
    'random_state': 42,
    'allow_writing_files': False,
    'cat_features': valid_cat_features 
}
model_cat = CatBoostRegressor(**cat_params)
model_cat.fit(X_train, y_train, eval_set=(X_valid, y_valid_rate), verbose=False)

# ==============================================================================
# 7. LAYER 2: META-LEARNER (RIDGE REGRESSION) & PREDICTION
# ==============================================================================
print("\n--- üîÑ Layer 2: Blending & Prediction ---")

# 1. T·∫°o Meta-Features (D·ª± b√°o c·ªßa 3 model tr√™n t·∫≠p Valid v√† Test)
def get_preds(model, X):
    return np.clip(model.predict(X), 0, 1)

# Predict Valid
pred_xgb_valid = get_preds(model_xgb, X_valid)
pred_lgb_valid = get_preds(model_lgb, X_valid)
pred_cat_valid = get_preds(model_cat, X_valid)

# Predict Test
pred_xgb_test = get_preds(model_xgb, X_test)
pred_lgb_test = get_preds(model_lgb, X_test)
pred_cat_test = get_preds(model_cat, X_test)

# 2. X√¢y d·ª±ng Dataset cho Layer 2
X_meta_valid = pd.DataFrame({'XGB': pred_xgb_valid, 'LGB': pred_lgb_valid, 'CAT': pred_cat_valid})
X_meta_test = pd.DataFrame({'XGB': pred_xgb_test, 'LGB': pred_lgb_test, 'CAT': pred_cat_test})

# 3. Train Meta-Learner (Ridge)
meta_model = Ridge(alpha=10.0)
meta_model.fit(X_meta_valid, y_valid_rate)

# Xem tr·ªçng s·ªë
weights = meta_model.coef_
print(f"üìä Tr·ªçng s·ªë Stacking: XGB: {weights[0]:.2f} | LGB: {weights[1]:.2f} | CAT: {weights[2]:.2f}")

# 4. D·ª± b√°o cu·ªëi c√πng (Rate)
final_rate_valid = meta_model.predict(X_meta_valid)
final_rate_test = meta_model.predict(X_meta_test)

# ==============================================================================
# 8. H·∫¨U X·ª¨ L√ù & T·∫†O FILE N·ªòP (SUBMISSION)
# ==============================================================================

def post_process_credits(pred_rate, registered_credits):
    """Chuy·ªÉn Rate -> Credits v·ªõi c√°c r√†ng bu·ªôc v·∫≠t l√Ω"""
    # Clip Rate
    pred_rate = np.clip(pred_rate, 0, 1)
    
    # Magic Trick: N·∫øu r·∫•t t·ª± tin (>96%), ƒë·∫©y l√™n 100%
    pred_rate[pred_rate >= 0.96] = 1.0
    
    # Convert ra t√≠n ch·ªâ
    pred_credits = pred_rate * registered_credits
    
    # Hard Limit: Kh√¥ng v∆∞·ª£t qu√° ƒëƒÉng k√Ω
    pred_credits = np.minimum(pred_credits, registered_credits)
    
    return pred_credits

# ƒê√°nh gi√° RMSE tr√™n t·∫≠p Valid
final_credits_valid = post_process_credits(final_rate_valid, valid_dangky)
rmse_stacking = np.sqrt(mean_squared_error(y_valid_credits, final_credits_valid))
print(f"\n>>> ‚úÖ RMSE Stacking (Valid - HK2 23/24): {rmse_stacking:.4f}")

# T·∫°o file n·ªôp b√†i cho Test (HK1 24/25)
final_credits_test = post_process_credits(final_rate_test, test_dangky)
final_submission = final_credits_test


submission_df = pd.DataFrame({
    'MA_SO_SV': test_final['MA_SO_SV'],
    'PRED_TC_HOANTHANH': final_submission
})

filename = 'submission_final_stacking.csv'
submission_df.to_csv(filename, index=False)
print(f"\nüéâ ƒê√£ xu·∫•t file th√†nh c√¥ng: {filename}")
print(submission_df.head())


--- ‚úÇÔ∏è 3. SPLITTING TRAIN/VALID/TEST ---
üìå Train size: (90582, 24)
üìå Valid size: (15144, 24)
üìå Test size:  (16502, 24)
üìå Categorical Features: ['PTXT', 'TOHOP_XT']

--- üöÄ 4. ACTIVATING STACKING ENSEMBLE ---
   -> Training XGBoost...
   -> Training LightGBM...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[266]	valid_0's rmse: 0.21248
   -> Training CatBoost...

--- üîÑ Layer 2: Blending & Prediction ---
üìä Tr·ªçng s·ªë Stacking: XGB: 0.22 | LGB: 0.29 | CAT: 0.49

>>> ‚úÖ RMSE Stacking (Valid - HK2 23/24): 3.7427

üéâ ƒê√£ xu·∫•t file th√†nh c√¥ng: submission_final_stacking.csv
        MA_SO_SV  PRED_TC_HOANTHANH
2   00003e092652          14.947369
3   00027b0dec4c          17.174156
10  000e15519006          17.446657
13  000ea6e12003          15.980199
16  00109b845a3d           5.040422


In [7]:
# ...existing code...
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# ƒê·∫£m b·∫£o final_credits_valid v√† y_valid_credits ƒë√£ s·∫µn s√†ng
final_credits_valid = post_process_credits(final_rate_valid, valid_dangky)

rmse_val = np.sqrt(mean_squared_error(y_valid_credits, final_credits_valid))
r2_val = r2_score(y_valid_credits, final_credits_valid)

# MAPE (Ch·ªâ t√≠nh tr√™n nh·ªØng d√≤ng c√≥ ƒëƒÉng k√Ω t√≠n ch·ªâ > 0)
mask = y_valid_credits > 0
mape_val = mean_absolute_percentage_error(
    y_valid_credits[mask],
    final_credits_valid[mask]
)

print(f"--- üìä K·∫æT QU·∫¢ KI·ªÇM TRA ---")
print(f"‚úÖ RMSE : {rmse_val:.4f} (t√≠n ch·ªâ)")
print(f"‚úÖ R^2  : {r2_val:.4f}")
print(f"‚úÖ MAPE : {mape_val:.2%}")
# ...existing code...

--- üìä K·∫æT QU·∫¢ KI·ªÇM TRA ---
‚úÖ RMSE : 3.7427 (t√≠n ch·ªâ)
‚úÖ R^2  : 0.7158
‚úÖ MAPE : 29.43%
