In [1]:
import pandas as pd

ACADEMIC_PATH = r'../../data/raw/academic_records.csv'
ADMISSION_PATH = r'../../data/raw/admission.csv'
TEST_PATH = r'../../data/raw/test.csv'
academic_records = pd.read_csv(ACADEMIC_PATH)
admission = pd.read_csv(ADMISSION_PATH)

In [2]:
import os
os.environ['PYTHONHASHSEED'] = '42'

import random
import numpy as np

random.seed(42)
np.random.seed(42)
np.random.set_state(np.random.RandomState(42).get_state())

In [3]:
import pandas as pd
import numpy as np
import re

# --- H√ÄM M·ªöI: X·ª≠ l√Ω ƒë·ªãnh d·∫°ng h·ªçc k·ª≥ ph·ª©c t·∫°p ---
def parse_semester_string(sem_str):
    """
    Chuy·ªÉn ƒë·ªïi chu·ªói nh∆∞ 'HK1 2023-2024' th√†nh m√£ s·ªë 20231 ƒë·ªÉ sort ƒë∆∞·ª£c.
    Logic: NƒÉm * 10 + K·ª≥
    """
    s = str(sem_str).strip()
    
    # Tr∆∞·ªùng h·ª£p 1: D·∫°ng s·ªë s·∫µn (VD: 20231)
    if s.isdigit():
        return int(s)
    
    # Tr∆∞·ªùng h·ª£p 2: D·∫°ng ch·ªØ (VD: HK1 2023-2024 ho·∫∑c H·ªçc k·ª≥ 1 nƒÉm 2023)
    # T√¨m t·∫•t c·∫£ c√°c con s·ªë trong chu·ªói
    digits = re.findall(r'\d+', s)
    
    if len(digits) >= 2:
        # Gi·∫£ s·ª≠ s·ªë nh·ªè l√† k·ª≥, s·ªë l·ªõn (4 ch·ªØ s·ªë) l√† nƒÉm
        # T√¨m nƒÉm (th∆∞·ªùng l√† s·ªë c√≥ 4 ch·ªØ s·ªë ƒë·∫ßu ti√™n t√¨m th·∫•y)
        years = [int(d) for d in digits if len(d) == 4]
        sems = [int(d) for d in digits if len(d) == 1]
        
        if years and sems:
            year = years[0]
            sem = sems[0]
            return year * 10 + sem
            
    return 0 # Kh√¥ng x√°c ƒë·ªãnh

def clean_data_pipeline_v3(admission, academic_records):
    print("--- üöÄ B·∫ÆT ƒê·∫¶U QUY TR√åNH L√ÄM S·∫†CH D·ªÆ LI·ªÜU (FIXED VERSION) ---")
    
    adm = admission.copy()
    acad = academic_records.copy()
    
    # 1. Chu·∫©n h√≥a ID
    adm['MA_SO_SV'] = adm['MA_SO_SV'].astype(str)
    acad['MA_SO_SV'] = acad['MA_SO_SV'].astype(str)
    
    # ---------------------------------------------------------
    # [FIX QUAN TR·ªåNG] X·ª≠ l√Ω HOC_KY th√¥ng minh h∆°n
    # ---------------------------------------------------------
    print("-> ƒêang x·ª≠ l√Ω c·ªôt HOC_KY...")
    # T·∫°o c·ªôt HOC_KY_INT d√πng ƒë·ªÉ sort (VD: 20231)
    acad['HOC_KY_INT'] = acad['HOC_KY'].apply(parse_semester_string)
    
    # Ki·ªÉm tra xem c√≥ d√≤ng n√†o b·ªã l·ªói (b·∫±ng 0) kh√¥ng
    error_count = (acad['HOC_KY_INT'] == 0).sum()
    if error_count > 0:
        print(f"   ‚ö†Ô∏è C·∫£nh b√°o: C√≥ {error_count} d√≤ng kh√¥ng ƒë·ªçc ƒë∆∞·ª£c HOC_KY.")

    # Merge d·ªØ li·ªáu
    df = pd.merge(acad, adm, on='MA_SO_SV', how='left')
    
    # S·∫Øp x·∫øp theo Time-series chu·∫©n x√°c d·ª±a tr√™n c·ªôt v·ª´a t·∫°o
    df = df.sort_values(by=['MA_SO_SV', 'HOC_KY_INT']).reset_index(drop=True)
    
    # ---------------------------------------------------------
    # 2. X·ª¨ L√ù S·ªê LI·ªÜU & LOGIC
    # ---------------------------------------------------------
    cols_float = ['GPA', 'CPA', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN']
    cols_int = ['TC_DANGKY', 'TC_HOANTHANH']
    
    for col in cols_float:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        
    for col in cols_int:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # Logic: Ho√†n th√†nh <= ƒêƒÉng k√Ω
    df['TC_HOANTHANH'] = np.minimum(df['TC_HOANTHANH'], df['TC_DANGKY'])
    
    # Target Transformation
    df['COMPLETION_RATE'] = df['TC_HOANTHANH'] / (df['TC_DANGKY'] + 1e-9)
    df['COMPLETION_RATE'] = df['COMPLETION_RATE'].clip(0, 1)

    # Clip ƒëi·ªÉm s·ªë
    df['GPA'] = df['GPA'].clip(0, 4.0)
    df['CPA'] = df['CPA'].clip(0, 4.0)

    # Admission Gap Feature
    if 'DIEM_TRUNGTUYEN' in df.columns and 'DIEM_CHUAN' in df.columns:
        df['ADMISSION_GAP'] = df['DIEM_TRUNGTUYEN'] - df['DIEM_CHUAN']
    
    # L·ªçc r√°c
    initial_len = len(df)
    df = df[df['TC_DANGKY'] > 0]
    
    print(f"--- ‚úÖ HO√ÄN T·∫§T. K√≠ch th∆∞·ªõc data: {df.shape} ---")
    print("Sample HOC_KY_INT:", df['HOC_KY_INT'].head().tolist())
    
    return df

# Ch·∫°y th·ª≠ l·∫°i
df_clean = clean_data_pipeline_v3(admission, academic_records)
df_clean.head()

--- üöÄ B·∫ÆT ƒê·∫¶U QUY TR√åNH L√ÄM S·∫†CH D·ªÆ LI·ªÜU (FIXED VERSION) ---
-> ƒêang x·ª≠ l√Ω c·ªôt HOC_KY...
--- ‚úÖ HO√ÄN T·∫§T. K√≠ch th∆∞·ªõc data: (105726, 14) ---
Sample HOC_KY_INT: [20231, 20232, 20211, 20212, 20221]


Unnamed: 0,MA_SO_SV,HOC_KY,CPA,GPA,TC_DANGKY,TC_HOANTHANH,HOC_KY_INT,NAM_TUYENSINH,PTXT,TOHOP_XT,DIEM_TRUNGTUYEN,DIEM_CHUAN,COMPLETION_RATE,ADMISSION_GAP
0,00003e092652,HK1 2023-2024,1.64,1.97,18,15,20231,2023,100,A00,21.32,20.25,0.833333,1.07
1,00003e092652,HK2 2023-2024,1.53,2.05,18,13,20232,2023,100,A00,21.32,20.25,0.722222,1.07
2,000e15519006,HK1 2021-2022,3.85,3.85,9,9,20211,2021,1,D07,23.84,22.43,1.0,1.41
3,000e15519006,HK2 2021-2022,2.77,3.12,19,19,20212,2021,1,D07,23.84,22.43,1.0,1.41
4,000e15519006,HK1 2022-2023,2.83,2.98,21,21,20221,2021,1,D07,23.84,22.43,1.0,1.41


In [4]:
import pandas as pd
import numpy as np
import re

# ==============================================================================
# 1. C√ÅC H√ÄM TI·ªÜN √çCH (UTILS)
# ==============================================================================

def parse_semester_string(sem_str):
    """
    Chuy·ªÉn ƒë·ªïi 'HK1 2023-2024' -> 20231 (Int) ƒë·ªÉ sort time-series.
    """
    s = str(sem_str).strip()
    if s.isdigit(): return int(s)
    
    digits = re.findall(r'\d+', s)
    if len(digits) >= 2:
        years = [int(d) for d in digits if len(d) == 4]
        sems = [int(d) for d in digits if len(d) == 1]
        if years and sems:
            return years[0] * 10 + sems[0]
    return 0

def fast_slope(y):
    """
    T√≠nh h·ªá s·ªë g√≥c (Trend) nhanh.
    Input: Array numpy (c√≥ th·ªÉ ch·ª©a NaN).
    """
    # L·ªçc b·ªè NaN tr∆∞·ªõc khi t√≠nh (Quan tr·ªçng!)
    y_clean = y[~np.isnan(y)]
    n = len(y_clean)
    if n < 2: return 0.0
    
    x = np.arange(n)
    x_mean = np.mean(x)
    y_mean = np.mean(y_clean)
    
    numerator = np.sum((x - x_mean) * (y_clean - y_mean))
    denominator = np.sum((x - x_mean) ** 2)
    
    return numerator / (denominator + 1e-6)

# ==============================================================================
# 2. DATA PREPROCESSING PIPELINE
# ==============================================================================

def clean_data_pipeline(admission, academic_records, is_test=False):
    print("--- üöÄ 1. PREPROCESSING DATA ---")
    adm = admission.copy()
    acad = academic_records.copy()
    
    # Chu·∫©n h√≥a ID
    adm['MA_SO_SV'] = adm['MA_SO_SV'].astype(str)
    acad['MA_SO_SV'] = acad['MA_SO_SV'].astype(str)
    
    # T·∫°o Time-Index
    acad['semester_order'] = acad['HOC_KY'].apply(parse_semester_string)
    
    # Merge
    df = pd.merge(acad, adm, on='MA_SO_SV', how='left')
    
    # Sort Time-Series (C·ª∞C K·ª≤ QUAN TR·ªåNG)
    df = df.sort_values(by=['MA_SO_SV', 'semester_order']).reset_index(drop=True)
    
    # Numeric conversion
    cols_float = ['GPA', 'CPA', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN']
    cols_int = ['TC_DANGKY', 'TC_HOANTHANH']
    
    for col in cols_float:
        if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce')
    for col in cols_int:
        if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # Logic clean
    if not is_test:
        df['TC_HOANTHANH'] = np.minimum(df['TC_HOANTHANH'], df['TC_DANGKY'])
        df['GPA'] = df['GPA'].clip(0, 4.0)
        df['CPA'] = df['CPA'].clip(0, 4.0)
    
    # Target Transformation (Ch·ªâ d√πng cho train)
    if not is_test:
        df['COMPLETION_RATE'] = df['TC_HOANTHANH'] / (df['TC_DANGKY'] + 1e-9)
        df['COMPLETION_RATE'] = df['COMPLETION_RATE'].clip(0, 1)

    return df

# ==============================================================================
# 3. FEATURE ENGINEERING CLASS (CORE)
# ==============================================================================

class FeatureEngineer:    
    def __init__(self):
        # C√°c c·ªôt Category s·∫Ω gi·ªØ l·∫°i
        self.cat_cols = ['PTXT', 'TOHOP_XT', 'MA_NGANH', 'KV_UT', 'KHOA_VIEN'] 

    def create_features(self, df):
        print("--- ‚ö° 2. FEATURE ENGINEERING (FULL POWER) ---")
        df = df.copy()
        
        # Sort l·∫°i cho ch·∫Øc ch·∫Øn
        df = df.sort_values(['MA_SO_SV', 'semester_order']).reset_index(drop=True)
        
        # Convert Category
        for col in self.cat_cols:
            if col in df.columns:
                df[col] = df[col].astype(str).astype('category')

        # Groupby object
        g = df.groupby('MA_SO_SV')

        # ---------------------------------------------------------
        # A. BASE LAGS (C·ª±c k·ª≥ quan tr·ªçng: D√πng shift(1) ƒë·ªÉ tr√°nh Leakage)
        # ---------------------------------------------------------
        # T·∫°o bi·∫øn Raw (ch·ª©a NaN) ƒë·ªÉ t√≠nh to√°n th·ªëng k√™ ch√≠nh x√°c h∆°n
        df['Prev_GPA_Raw'] = g['GPA'].shift(1)
        
        # T·∫°o bi·∫øn Fill (ƒë·ªÉ model d√πng tr·ª±c ti·∫øp)
        df['Prev_GPA'] = df['Prev_GPA_Raw'].fillna(-1)
        df['Prev_CPA'] = g['CPA'].shift(1).fillna(-1)
        df['Prev_TC_HOANTHANH'] = g['TC_HOANTHANH'].shift(1).fillna(0)
        df['Prev_TC_DANGKY'] = g['TC_DANGKY'].shift(1).fillna(0)
        
        # C·ªù nƒÉm nh·∫•t (Ch∆∞a c√≥ l·ªãch s·ª≠)
        df['is_freshman'] = (df['Prev_TC_DANGKY'] == 0).astype(int)

        # ---------------------------------------------------------
        # B. G·ªåI C√ÅC NH√ìM FEATURE
        # ---------------------------------------------------------
        df = self._create_admission_features(df)
        df = self._create_history_features(df)
        df = self._create_trend_features(df)  # <--- Slope, Volatility n·∫±m ·ªü ƒë√¢y
        df = self._create_risk_features(df)
        
        # D·ªçn d·∫πp c·ªôt t·∫°m
        if 'Prev_GPA_Raw' in df.columns:
            df = df.drop(columns=['Prev_GPA_Raw'])
            
        return df

    def _create_admission_features(self, df):
        # Kho·∫£ng c√°ch ƒëi·ªÉm ƒë·∫ßu v√†o
        if 'DIEM_TRUNGTUYEN' in df.columns and 'DIEM_CHUAN' in df.columns:
            df['diem_vuot_chuan'] = df['DIEM_TRUNGTUYEN'] - df['DIEM_CHUAN']
        
        # Tu·ªïi (gi·∫£ ƒë·ªãnh nƒÉm hi·ªán t·∫°i 2025/2026)
        if 'NAM_TUYENSINH' in df.columns:
            df['nam_tuoi'] = 2026 - df['NAM_TUYENSINH']
            
        # S·ªë k·ª≥ ƒë√£ h·ªçc (t√≠nh t·ª´ nƒÉm nh·∫≠p h·ªçc)
        df['semester_number'] = df.groupby('MA_SO_SV').cumcount() + 1
        return df

    def _create_history_features(self, df):
        # 1. Delta GPA-CPA: ƒêang h·ªçc t·ªët h∆°n hay t·ªá h∆°n trung b√¨nh t√≠ch l≈©y?
        df['prev_gpa_cpa_diff'] = df['Prev_GPA'] - df['Prev_CPA']
        
        # 2. T·ª∑ l·ªá ho√†n th√†nh k·ª≥ tr∆∞·ªõc
        df['prev_completion_rate'] = df['Prev_TC_HOANTHANH'] / (df['Prev_TC_DANGKY'] + 1e-9)
        
        # 3. Load Factor (√Åp l·ª±c): ƒêƒÉng k√Ω k·ª≥ n√†y / S·ª©c h·ªçc trung b√¨nh
        avg_capacity = df.groupby('MA_SO_SV')['Prev_TC_HOANTHANH'].transform(
            lambda x: x.rolling(window=5, min_periods=1).mean()
        ).fillna(15)
        
        df['load_factor'] = df['TC_DANGKY'] / (avg_capacity + 1e-9)
        
        # Bi·∫øn c·ªù: R·ªõt m√¥n k·ª≥ tr∆∞·ªõc
        df['failed_last_sem'] = (df['Prev_TC_HOANTHANH'] < df['Prev_TC_DANGKY']).astype(int)
        
        return df

    def _create_trend_features(self, df):
        # S·ª≠ d·ª•ng Prev_GPA_Raw (c√≥ NaN) ƒë·ªÉ t√≠nh Slope ch√≠nh x√°c
        # N·∫øu d√πng Prev_GPA (c√≥ -1), Slope s·∫Ω b·ªã sai l·ªách l·ªõn
        g_raw = df.groupby('MA_SO_SV')['Prev_GPA_Raw']
        
        # 1. GPA Slope (Xu h∆∞·ªõng ƒëi·ªÉm)
        df['gpa_trend_slope'] = g_raw.transform(
            lambda x: x.rolling(window=3, min_periods=2).apply(fast_slope, raw=True)
        ).fillna(0)
        
        # 2. [RECOVERED] GPA Volatility (ƒê·ªô ·ªïn ƒë·ªãnh)
        df['gpa_volatility'] = g_raw.transform(
            lambda x: x.rolling(window=4, min_periods=2).std()
        ).fillna(0)
        
        # 3. T√≠ch l≈©y t√≠n ch·ªâ (History)
        grouped = df.groupby('MA_SO_SV')
        cum_dangky = grouped['Prev_TC_DANGKY'].cumsum()
        cum_hoanthanh = grouped['Prev_TC_HOANTHANH'].cumsum()
        
        df['total_credits_failed'] = cum_dangky - cum_hoanthanh
        df['accumulated_fail_ratio'] = df['total_credits_failed'] / (cum_dangky + 1e-9)
        
        # 4. [RECOVERED] Credit Velocity (T·ªëc ƒë·ªô h·ªçc)
        semester_count = grouped.cumcount() + 1
        df['credit_velocity'] = cum_hoanthanh / semester_count
        
        return df

    def _create_risk_features(self, df):
        # H√†nh vi "G·ª° g·∫°c" (Aggressive Recovery): R·ªõt m√¥n k·ª≥ tr∆∞·ªõc -> ƒêƒÉng k√Ω nhi·ªÅu h∆°n k·ª≥ n√†y
        more_credits = (df['TC_DANGKY'] > df['Prev_TC_DANGKY'])
        df['aggressive_recovery'] = (df['failed_last_sem'] & more_credits).astype(int)
        
        # K·ª≥ v·ªçng ho√†n th√†nh (Expected Credits) = ƒêƒÉng k√Ω * T·ª∑ l·ªá ƒë·∫≠u to√†n c·ª•c c·ªßa SV ƒë√≥
        df['expected_real_credits'] = df['TC_DANGKY'] * (1 - df['accumulated_fail_ratio'])
        
        return df

    def get_feature_columns(self, df):
        """T·ª± ƒë·ªông l·∫•y danh s√°ch feature d·ª±a tr√™n prefix"""
        
        # Whitelist c√°c prefix
        valid_prefixes = [
            'Prev_', 'prev_', 'sem_', 'diem_', 'nam_', 'is_', 
            'load_', 'aggressive_', 'gpa_', 'total_', 'accumulated_',
            'credit_', 'expected_', 'failed_'
        ]
        
        valid_exact = ['TC_DANGKY', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN', 'semester_number']
        valid_exact.extend(self.cat_cols)
        
        final_cols = []
        # C√°c c·ªôt target/meta c·∫ßn lo·∫°i b·ªè
        ignore_cols = ['TC_HOANTHANH', 'GPA', 'CPA', 'semester_order', 'MA_SO_SV', 'HOC_KY', 'COMPLETION_RATE', 'Prev_GPA_Raw']
        
        for col in df.columns:
            if col in ignore_cols: continue
            
            is_valid = False
            if col in valid_exact: is_valid = True
            else:
                for prefix in valid_prefixes:
                    if col.startswith(prefix):
                        is_valid = True
                        break
            
            if is_valid: final_cols.append(col)
                
        return final_cols

# ==============================================================================
# 4. EXECUTION BLOCK (CH·∫†Y TH·ª¨)
# ==============================================================================


test_raw = pd.read_csv(TEST_PATH)

print("--- B·∫ÆT ƒê·∫¶U PIPELINE ---")

# 1. Clean Data
df_train_raw = clean_data_pipeline(admission, academic_records, is_test=False)

# 2. Chu·∫©n b·ªã Test (G√°n dummy values ƒë·ªÉ ch·∫°y ƒë∆∞·ª£c pipeline)
test_copy = test_raw.copy()
for col in ['TC_HOANTHANH', 'GPA', 'CPA']: test_copy[col] = 0
df_test_raw = clean_data_pipeline(admission, test_copy, is_test=True)

# 3. G·ªôp Train + Test ƒë·ªÉ t·∫°o feature (Tr√°nh bi√™n gi·ªõi gi·ªØa c√°c k·ª≥ b·ªã ƒë·ª©t g√£y)
df_train_raw['set_type'] = 'TRAIN'
df_test_raw['set_type'] = 'TEST'
full_df = pd.concat([df_train_raw, df_test_raw], ignore_index=True)

# 4. Feature Engineering
fe = FeatureEngineer()
full_df_fe = fe.create_features(full_df)

# 5. T√°ch l·∫°i Train/Test
train_final = full_df_fe[full_df_fe['set_type'] == 'TRAIN'].copy()
test_final = full_df_fe[full_df_fe['set_type'] == 'TEST'].copy()

# 6. L·∫•y features & Target
feature_cols = fe.get_feature_columns(train_final)
X = train_final[feature_cols]
y = train_final['TC_HOANTHANH'] # Ho·∫∑c d√πng COMPLETION_RATE n·∫øu mu·ªën

print(f"‚úÖ Xong! S·ªë l∆∞·ª£ng Features: {len(feature_cols)}")
print(f"Sample Features: {feature_cols[:5]}")

--- B·∫ÆT ƒê·∫¶U PIPELINE ---
--- üöÄ 1. PREPROCESSING DATA ---
--- üöÄ 1. PREPROCESSING DATA ---
--- ‚ö° 2. FEATURE ENGINEERING (FULL POWER) ---
‚úÖ Xong! S·ªë l∆∞·ª£ng Features: 24
Sample Features: ['TC_DANGKY', 'PTXT', 'TOHOP_XT', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN']


In [5]:
import optuna
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np

print("--- üöÄ 3. TRAINING PHASE (LIGHTGBM) ---")

SPLIT_SEM = 20231  # Train ƒë·∫øn h·∫øt HK1 2023-2024
VALID_SEM = 20232  # Valid l√† HK2 2023-2024

# L·ªçc d·ªØ li·ªáu t·ª´ full_df_fe (k·∫øt qu·∫£ c·ªßa b∆∞·ªõc Feature Engineering tr∆∞·ªõc)
# L∆∞u √Ω: Lo·∫°i b·ªè c√°c d√≤ng Test ra kh·ªèi qu√° tr√¨nh train
df_modeling = full_df_fe[full_df_fe['set_type'] == 'TRAIN'].copy()

# Chia Train/Valid
train_mask = df_modeling['semester_order'] <= SPLIT_SEM
valid_mask = df_modeling['semester_order'] == VALID_SEM

X_train = df_modeling[train_mask][feature_cols]
y_train = df_modeling[train_mask]['COMPLETION_RATE'] # Train tr√™n Rate
w_train = df_modeling[train_mask]['TC_DANGKY']       # (Optional) Tr·ªçng s·ªë: M√¥n nhi·ªÅu t√≠n quan tr·ªçng h∆°n

X_valid = df_modeling[valid_mask][feature_cols]
y_valid_rate = df_modeling[valid_mask]['COMPLETION_RATE']
y_valid_credits = df_modeling[valid_mask]['TC_HOANTHANH'] # Target th·∫≠t (s·ªë t√≠n) ƒë·ªÉ ƒëo RMSE
valid_credits_dangky = df_modeling[valid_mask]['TC_DANGKY'] # D√πng ƒë·ªÉ convert Rate -> Credits

print(f"K√≠ch th∆∞·ªõc t·∫≠p Train: {X_train.shape}")
print(f"K√≠ch th∆∞·ªõc t·∫≠p Valid: {X_valid.shape}")

print("--- üîç STARTING OPTUNA FOR LIGHTGBM ---")

def objective(trial):
    params = {
        'n_estimators': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 10),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 10),
        'objective': 'regression',
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': -1
    }
    
    model = lgb.LGBMRegressor(**params)
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid_rate)],
        callbacks=[lgb.early_stopping(100, verbose=False)],
    )
    
    pred_rate_valid = model.predict(X_valid)
    pred_credits_valid = pred_rate_valid * valid_credits_dangky
    pred_credits_valid = np.minimum(pred_credits_valid, valid_credits_dangky)
    rmse = np.sqrt(mean_squared_error(y_valid_credits, pred_credits_valid))
    
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("\n--- ‚úÖ OPTUNA FINISHED ---")
print(f"Best RMSE: {study.best_value:.4f}")
print("Best Params:", study.best_params)

best_lgb_params = study.best_params
best_lgb_params.update({
    'n_estimators': 2000,
    'objective': 'regression',
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': -1
})

model_lgb = lgb.LGBMRegressor(**best_lgb_params)
model_lgb.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid_rate)],
    callbacks=[lgb.early_stopping(100, verbose=False)],
)
print("-> Trained model_lgb with optimal parameters from Optuna!")

  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2026-02-03 16:06:33,367][0m A new study created in memory with name: no-name-a6f8797a-982d-4550-b806-b3c5e9e4750c[0m


--- üöÄ 3. TRAINING PHASE (LIGHTGBM) ---
K√≠ch th∆∞·ªõc t·∫≠p Train: (90582, 24)
K√≠ch th∆∞·ªõc t·∫≠p Valid: (15144, 24)
--- üîç STARTING OPTUNA FOR LIGHTGBM ---


[32m[I 2026-02-03 16:06:34,169][0m Trial 0 finished with value: 3.7418489418398075 and parameters: {'learning_rate': 0.04077845053878282, 'max_depth': 7, 'subsample': 0.9948559403032202, 'feature_fraction': 0.7416392855318249, 'lambda_l1': 6.915262136418355, 'lambda_l2': 2.284848345411885, 'min_child_samples': 7}. Best is trial 0 with value: 3.7418489418398075.[0m
[32m[I 2026-02-03 16:06:35,261][0m Trial 1 finished with value: 3.7594963201012543 and parameters: {'learning_rate': 0.048467353484969534, 'max_depth': 3, 'subsample': 0.8737280556090405, 'feature_fraction': 0.8848253752282301, 'lambda_l1': 3.6588981513707663, 'lambda_l2': 1.3502597234095415, 'min_child_samples': 7}. Best is trial 0 with value: 3.7418489418398075.[0m
[32m[I 2026-02-03 16:06:36,423][0m Trial 2 finished with value: 3.7492384699267847 and parameters: {'learning_rate': 0.023650440036115356, 'max_depth': 9, 'subsample': 0.8804557232923362, 'feature_fraction': 0.9846086854469294, 'lambda_l1': 6.062215147246


--- ‚úÖ OPTUNA FINISHED ---
Best RMSE: 3.7369
Best Params: {'learning_rate': 0.03509800137296282, 'max_depth': 10, 'subsample': 0.9994473627762352, 'feature_fraction': 0.65912041809177, 'lambda_l1': 9.377714369236545, 'lambda_l2': 9.694036269197362, 'min_child_samples': 9}
-> Trained model_lgb with optimal parameters from Optuna!


In [9]:
y_pred_rate = model_lgb.predict(X_valid)
y_pred = y_pred_rate * valid_credits_dangky

In [15]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# === TRY IMPORT OPTIONAL DEPENDENCIES ===
SHAP_AVAILABLE = False
LIME_AVAILABLE = False

try:
    import shap
    SHAP_AVAILABLE = True
except Exception as e:
    print(f"‚ö†Ô∏è SHAP not available: {str(e)[:80]}")

try:
    import lime
    import lime.lime_tabular
    LIME_AVAILABLE = True
except Exception as e:
    print(f"‚ö†Ô∏è LIME not available: {str(e)[:80]}")

# ========================================
# METRICS (LU√îN HO·∫†T ƒê·ªòNG)
# ========================================

def calculate_mape(y_true, y_pred, epsilon=1e-8):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    mask = np.abs(y_true) > epsilon
    if not np.any(mask):
        return np.nan
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def calculate_regression_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mape = calculate_mape(y_true, y_pred)
    return {'rmse': rmse, 'r2': r2, 'mape': mape}

def print_metrics(metrics, prefix=""):
    print(f"{prefix}RMSE: {metrics['rmse']:.4f} | R¬≤: {metrics['r2']:.4f} | MAPE: {metrics['mape']:.2f}%")

# ========================================
# FEATURE IMPORTANCE (LU√îN HO·∫†T ƒê·ªòNG)
# ========================================

def get_feature_importance(model, feature_names=None):
    if hasattr(model, 'feature_importances_'):
        importance = model.feature_importances_
    elif hasattr(model, 'get_feature_importance'):  # CatBoost
        importance = model.get_feature_importance()
    else:
        raise ValueError("Model kh√¥ng h·ªó tr·ª£ feature importance")
    
    if feature_names is None:
        feature_names = [f"Feature_{i}" for i in range(len(importance))]
    
    df = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    }).sort_values('importance', ascending=False).reset_index(drop=True)
    df['importance_pct'] = (df['importance'] / df['importance'].sum() * 100).round(2)
    return df

def plot_feature_importance(importance_df, top_n=20, title="Feature Importance"):
    df_plot = importance_df.head(top_n).copy()
    df_plot['feature'] = df_plot['feature'].astype(str)
    
    fig = px.bar(
        df_plot, x='importance', y='feature', orientation='h',
        color='importance', color_continuous_scale='Viridis',
        text=df_plot['importance_pct'].apply(lambda x: f"{x}%"),
        title=title
    )
    fig.update_layout(
        yaxis=dict(autorange="reversed"),
        xaxis_title="Importance",
        yaxis_title="Feature",
        height=max(400, top_n * 25),
        template="plotly_white"
    )
    fig.update_traces(textposition='outside')
    return fig

# ========================================
# DATA PREPROCESSING FOR EXPLAINABILITY
# ========================================

def prepare_data_for_explainers(X, fit_encoders=False, encoders=None):
    """
    Chuy·ªÉn ƒë·ªïi categorical features ‚Üí numeric cho SHAP/LIME.
    Tr·∫£ v·ªÅ: (X_numeric, feature_names, categorical_info, encoders)
    """
    if isinstance(X, pd.DataFrame):
        X = X.copy()
    else:
        X = pd.DataFrame(X)
    
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    
    new_encoders = {} if encoders is None else encoders.copy()
    X_processed = X.copy()
    
    if categorical_cols:
        if fit_encoders:
            # Fit m·ªõi encoders
            for col in categorical_cols:
                le = LabelEncoder()
                # X·ª≠ l√Ω missing values tr∆∞·ªõc khi encode
                X_processed[col] = X_processed[col].fillna('MISSING').astype(str)
                le.fit(X_processed[col])
                X_processed[col] = le.transform(X_processed[col])
                new_encoders[col] = le
        else:
            # D√πng encoders c√≥ s·∫µn
            for col in categorical_cols:
                if col in new_encoders:
                    X_processed[col] = X_processed[col].fillna('MISSING').astype(str)
                    X_processed[col] = new_encoders[col].transform(X_processed[col])
                else:
                    # Fallback: g√°n gi√° tr·ªã m·∫∑c ƒë·ªãnh
                    X_processed[col] = 0
                    warnings.warn(f"‚ö†Ô∏è Categorical encoder missing for '{col}', using default value 0")
    
    # ƒê·∫£m b·∫£o t·∫•t c·∫£ c·ªôt l√† numeric
    X_processed = X_processed.astype(float)
    
    return (
        X_processed.values,
        X.columns.tolist(),
        {'categorical_cols': categorical_cols, 'numeric_cols': numeric_cols},
        new_encoders
    )

# ========================================
# SHAP (X·ª¨ L√ù CATEGORICAL T·ª∞ ƒê·ªòNG)
# ========================================

def get_shap_values(model, X_sample, model_type=None):
    if not SHAP_AVAILABLE:
        raise RuntimeError("SHAP ch∆∞a ƒë∆∞·ª£c c√†i ƒë·∫∑t")
    
    # Chu·∫©n b·ªã d·ªØ li·ªáu numeric
    X_num, feature_names, _, _ = prepare_data_for_explainers(X_sample, fit_encoders=False)
    
    if model_type == 'catboost':
        explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
    else:
        explainer = shap.TreeExplainer(model)
    
    shap_values = explainer.shap_values(X_num)
    return shap_values, feature_names, X_num

def plot_shap_summary(shap_values, X_sample, feature_names, max_display=20, title="SHAP Summary Plot"):
    if not SHAP_AVAILABLE:
        print("‚ö†Ô∏è SHAP kh√¥ng kh·∫£ d·ª•ng - skip plot")
        return None
    
    # T·∫°o DataFrame SHAP
    shap_df = pd.DataFrame(shap_values, columns=feature_names)
    mean_abs_shap = shap_df.abs().mean().sort_values(ascending=False)
    top_features = mean_abs_shap.head(max_display).index.tolist()
    
    # Chu·∫©n b·ªã d·ªØ li·ªáu cho beeswarm-style plot
    plot_data = []
    for feature in top_features[::-1]:  # reverse ƒë·ªÉ feature quan tr·ªçng nh·∫•t ·ªü tr√™n
        shap_vals = shap_df[feature].values
        feat_vals = X_sample[:, feature_names.index(feature)]
        colors = feat_vals  # d√πng gi√° tr·ªã feature ƒë·ªÉ t√¥ m√†u
        
        for i in range(len(shap_vals)):
            plot_data.append({
                'feature': feature,
                'shap_value': shap_vals[i],
                'feature_value': colors[i]
            })
    
    df_plot = pd.DataFrame(plot_data)
    
    # T·∫°o beeswarm plot b·∫±ng Plotly
    fig = px.scatter(
        df_plot,
        x='shap_value',
        y='feature',
        color='feature_value',
        color_continuous_scale='RdBu',
        title=title,
        height=max(400, len(top_features) * 30),
        hover_data=['feature_value']
    )
    
    fig.update_layout(
        xaxis_title="SHAP Value (impact on model output)",
        yaxis_title="Feature",
        template="plotly_white",
        showlegend=True
    )
    fig.add_vline(x=0, line_dash="dash", line_color="gray", line_width=1)
    return fig

# ========================================
# LIME (X·ª¨ L√ù CATEGORICAL ƒê√öNG C√ÅCH)
# ========================================

def explain_instance_lime(model, X_train, X_instance, feature_names=None, 
                         categorical_features=None, categorical_names=None, 
                         num_features=10, num_samples=5000):
    if not LIME_AVAILABLE:
        raise RuntimeError("LIME ch∆∞a ƒë∆∞·ª£c c√†i ƒë·∫∑t")
    
    # Chu·∫©n b·ªã training data cho LIME (numeric)
    X_train_num, train_feat_names, cat_info, encoders = prepare_data_for_explainers(
        X_train, fit_encoders=True
    )
    
    # Chu·∫©n b·ªã instance c·∫ßn gi·∫£i th√≠ch
    if isinstance(X_instance, pd.DataFrame):
        X_instance = X_instance.copy()
    else:
        X_instance = pd.DataFrame(X_instance, columns=train_feat_names)
    
    X_instance_num, _, _, _ = prepare_data_for_explainers(
        X_instance, fit_encoders=False, encoders=encoders
    )
    
    # X√°c ƒë·ªãnh categorical feature indices
    if categorical_features is None:
        categorical_features = [
            i for i, col in enumerate(train_feat_names) 
            if col in cat_info['categorical_cols']
        ]
    
    # T·∫°o categorical_names mapping
    if categorical_names is None and cat_info['categorical_cols']:
        categorical_names = {}
        for col in cat_info['categorical_cols']:
            idx = train_feat_names.index(col)
            le = encoders.get(col)
            if le:
                categorical_names[idx] = list(le.classes_)
    
    # T·∫°o explainer v·ªõi categorical info
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train_num,
        feature_names=train_feat_names,
        categorical_features=categorical_features,
        categorical_names=categorical_names,
        mode='regression',
        verbose=False,
        random_state=42,
        kernel_width=3,
        discretize_continuous=False  # Gi·ªØ nguy√™n gi√° tr·ªã li√™n t·ª•c
    )
    
    # Gi·∫£i th√≠ch instance
    exp = explainer.explain_instance(
        X_instance_num[0],
        model.predict,
        num_features=num_features,
        num_samples=num_samples
    )
    
    return exp, encoders

def plot_lime_explanation(lime_explanation, title="LIME Explanation"):
    if not LIME_AVAILABLE:
        print("‚ö†Ô∏è LIME kh√¥ng kh·∫£ d·ª•ng - skip plot")
        return None
    
    exp_list = lime_explanation.as_list()
    df = pd.DataFrame(exp_list, columns=['feature', 'weight'])
    df = df.sort_values('weight', key=abs, ascending=True).reset_index(drop=True)
    colors = ['green' if w > 0 else 'red' for w in df['weight']]
    
    fig = go.Figure(go.Bar(
        y=df['feature'], 
        x=df['weight'], 
        orientation='h',
        marker_color=colors,
        text=df['weight'].apply(lambda x: f"{x:+.3f}"),
        textposition='auto',
    ))
    
    fig.update_layout(
        title=title,
        xaxis_title="Feature Contribution to Prediction",
        yaxis_title="Feature",
        template="plotly_white",
        height=300 + len(df) * 35,
        xaxis=dict(showgrid=True, zeroline=True, zerolinewidth=2, zerolinecolor='gray')
    )
    fig.add_vline(x=0, line_dash="dash", line_color="gray", line_width=1)
    return fig

# ========================================
# HELPER: SHOW ALL METRICS & PLOTS
# ========================================

def show_all(model, X_valid, y_valid, y_pred, 
             X_train=None, model_type=None, 
             instance_idx=0, top_n_features=15):
    """
    Hi·ªÉn th·ªã ƒë·∫ßy ƒë·ªß metrics + visualizations cho model.
    
    Parameters:
    -----------
    model : model ƒë√£ train (XGBoost/LightGBM/CatBoost)
    X_valid : pd.DataFrame - validation features
    y_valid : array-like - true labels (credits)
    y_pred : array-like - predicted credits
    X_train : pd.DataFrame - training features (c·∫ßn cho LIME)
    model_type : str - 'xgboost', 'lightgbm', 'catboost'
    instance_idx : int - index c·ªßa instance ƒë·ªÉ gi·∫£i th√≠ch b·∫±ng SHAP/LIME
    top_n_features : int - s·ªë feature hi·ªÉn th·ªã trong plots
    """
    # 1. Metrics
    print("="*60)
    print("üìä REGRESSION METRICS")
    print("="*60)
    metrics = calculate_regression_metrics(y_valid, y_pred)
    print_metrics(metrics, "Validation: ")
    print()
    
    # 2. Feature Importance
    print("="*60)
    print("üìà FEATURE IMPORTANCE")
    print("="*60)
    imp_df = get_feature_importance(model, feature_names=X_valid.columns.tolist())
    fig_imp = plot_feature_importance(imp_df, top_n=top_n_features)
    fig_imp.show()
    print()
    
    # 3. SHAP Summary (n·∫øu c√≥)
    if SHAP_AVAILABLE and X_valid.shape[0] > 0:
        print("="*60)
        print("üîç SHAP SUMMARY PLOT")
        print("="*60)
        try:
            # Sample 100 instances ƒë·ªÉ t√≠nh SHAP nhanh h∆°n
            sample_size = min(100, len(X_valid))
            X_sample = X_valid.iloc[:sample_size] if isinstance(X_valid, pd.DataFrame) else X_valid[:sample_size]
            
            shap_vals, feat_names, X_num = get_shap_values(model, X_sample, model_type=model_type)
            fig_shap = plot_shap_summary(shap_vals, X_num, feat_names, max_display=top_n_features)
            if fig_shap:
                fig_shap.show()
        except Exception as e:
            print(f"‚ö†Ô∏è SHAP error: {str(e)[:100]}")
    print()
    
    # 4. LIME Explanation (n·∫øu c√≥)
    if LIME_AVAILABLE and X_train is not None and len(X_valid) > instance_idx:
        print("="*60)
        print(f"üî¨ LIME EXPLANATION (Instance #{instance_idx})")
        print("="*60)
        try:
            instance = X_valid.iloc[[instance_idx]] if isinstance(X_valid, pd.DataFrame) else X_valid[instance_idx:instance_idx+1]
            lime_exp, _ = explain_instance_lime(
                model, 
                X_train=X_train, 
                X_instance=instance,
                feature_names=X_valid.columns.tolist(),
                num_features=top_n_features
            )
            fig_lime = plot_lime_explanation(lime_exp, title=f"LIME: Instance #{instance_idx}")
            if fig_lime:
                fig_lime.show()
        except Exception as e:
            print(f"‚ö†Ô∏è LIME error: {str(e)[:100]}")
    print()
    print("="*60)
    print("‚úÖ DONE")
    print("="*60)

In [16]:
show_all(
    model=model_lgb,
    X_valid=X_valid,
    y_valid=y_valid_credits,
    y_pred=y_pred,
    X_train=X_train,          # C·∫ßn cho LIME
    model_type='lightgbm',    # 'xgboost', 'lightgbm', 'catboost'
    instance_idx=0,           # Instance n√†o ƒë·ªÉ gi·∫£i th√≠ch
    top_n_features=15
)

üìä REGRESSION METRICS
Validation: RMSE: 3.7369 | R¬≤: 0.7167 | MAPE: 29.99%

üìà FEATURE IMPORTANCE



üîç SHAP SUMMARY PLOT



üî¨ LIME EXPLANATION (Instance #0)



‚úÖ DONE


In [None]:
def submission(model, output_filename):
    print("\n--- 4. PREDICTING TEST SET (HK1 2024-2025) ---")
    test_final = full_df_fe[full_df_fe['set_type'] == 'TEST'].copy()
    X_test = test_final[feature_cols]
    test_dangky = test_final['TC_DANGKY']

    # D·ª± b√°o
    pred_rate_test = model.predict(X_test)

    # H·∫≠u x·ª≠ l√Ω t∆∞∆°ng t·ª±
    pred_rate_test = np.clip(pred_rate_test, 0, 1)
    pred_credits_test = pred_rate_test * test_dangky
    pred_credits_test = np.minimum(pred_credits_test, test_dangky)

    final_submission = pred_credits_test

    # T·∫°o DataFrame n·ªôp b√†i
    submission_df = pd.DataFrame({
        'MA_SO_SV': test_final['MA_SO_SV'],
        'PRED_TC_HOANTHANH': final_submission
    })

    # L∆∞u file
    submission_df.to_csv(output_filename, index=False)
    print(f"‚úÖ ƒê√£ l∆∞u file k·∫øt qu·∫£: {output_filename}")
    print(submission_df.head())

In [20]:
submission(model=model_lgb, output_filename="submission_lightgbm_v1.csv")


--- 4. PREDICTING TEST SET (HK1 2024-2025) ---
‚úÖ ƒê√£ l∆∞u file k·∫øt qu·∫£: submission_lightgbm_v1.csv
        MA_SO_SV  PRED_TC_HOANTHANH
2   00003e092652          15.788996
3   00027b0dec4c          17.560963
10  000e15519006          17.641771
13  000ea6e12003          16.617705
16  00109b845a3d           5.089775
