In [1]:
import pandas as pd
import numpy as np
import sys

from pathlib import Path

ROOT = Path().resolve().parents[1]
sys.path.append(str(ROOT))

In [2]:
ACADEMIC_PATH = r'../../data/raw/academic_records.csv'
ADMISSION_PATH = r'../../data/raw/admission.csv'
TEST_PATH = r'../../data/raw/test.csv'
academic_records = pd.read_csv(ACADEMIC_PATH)
admission = pd.read_csv(ADMISSION_PATH)

In [3]:
import os
import random
import numpy as np
import pandas as pd
import joblib
from datetime import datetime
from src.config import MODELS_DIR, OUTPUT_DIR


def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    

def save_model(model, filename, directory='models'):
    filepath = MODELS_DIR / filename
    joblib.dump(model, filepath)
    print(f"Model saved to: {filepath}")
    return str(filepath)


def load_model(filename, directory='models'):
    filepath = MODELS_DIR / filename
    model = joblib.load(filepath)
    print(f"Model loaded from: {filepath}")
    return model


def save_submission(predictions, student_ids, team_name, directory='output'):
    submission = pd.DataFrame({
        'MA_SO_SV': student_ids,
        'PRED_TC_HOANTHANH': predictions.astype(int)
    })
    
    filename = f"{team_name}.csv"
    filepath = OUTPUT_DIR / filename
    submission.to_csv(filepath, index=False)
    print(f"Submission saved to: {filepath}")
    return str(filepath)


def create_semester_code(year, semester):
    next_year = year + 1
    return f"HK{semester} {year}-{next_year}"


def parse_semester_code(semester_code):
    parts = semester_code.strip().split()
    semester = int(parts[0].replace('HK', ''))
    year_range = parts[1].split('-')
    year = int(year_range[0])
    return year, semester


def get_semester_order(semester_code):
    year, semester = parse_semester_code(semester_code)
    return year * 10 + semester


def calculate_semester_from_admission(admission_year, current_semester_code):
    current_year, current_sem = parse_semester_code(current_semester_code)
    years_diff = current_year - admission_year
    return years_diff * 2 + current_sem


def log_experiment(experiment_name, metrics, params, directory='output'):
    log_file = OUTPUT_DIR / 'experiment_log.csv'

    log_entry = {
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'experiment': experiment_name,
        **metrics,
        **{f'param_{k}': v for k, v in params.items()}
    }

    if log_file.exists() and log_file.stat().st_size > 0:
        log_df = pd.read_csv(log_file)
        log_df = pd.concat(
            [log_df, pd.DataFrame([log_entry])],
            ignore_index=True
        )
    else:
        log_df = pd.DataFrame([log_entry])

    log_df.to_csv(log_file, index=False)
    print(f"Experiment logged to: {log_file}")

    return log_file

def memory_usage(df):
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    return f"{memory_mb:.2f} MB"

In [4]:
import pandas as pd
import numpy as np

def clean_data_pipeline(admission, academic_records):
    print("--- BẮT ĐẦU QUY TRÌNH LÀM SẠCH DỮ LIỆU ---")
    
    # =========================================================================
    # BƯỚC 1: XỬ LÝ ĐỊNH DẠNG & KIỂU DỮ LIỆU (Formatting)
    # =========================================================================
    print("Step 1: Formatting & Merging...")
    
    # 1.1. Chuẩn hóa MA_SO_SV về dạng String
    admission['MA_SO_SV'] = admission['MA_SO_SV'].astype(str)
    academic_records['MA_SO_SV'] = academic_records['MA_SO_SV'].astype(str)
    
    # 1.2. Chuyển đổi HOC_KY sang số nguyên (giả định hàm hoc_ky_to_code đã được định nghĩa bên ngoài)
    # Nếu chưa có hàm này, bạn cần định nghĩa logic convert (ví dụ: '20231' -> int)
    # academic_records['HOC_KY'] = academic_records['HOC_KY'].apply(hoc_ky_to_code)
    
    # Merge dữ liệu (Inner Join để chỉ lấy sinh viên có thông tin ở cả 2 bảng)
    df = pd.merge(academic_records, admission, on='MA_SO_SV', how='inner')
    
    # 1.3. Định dạng lại các cột số (Ép kiểu float/int)
    numeric_floats = ['GPA', 'CPA', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN']
    for col in numeric_floats:
        df[col] = pd.to_numeric(df[col], errors='coerce') 
        
    numeric_ints = ['TC_DANGKY', 'TC_HOANTHANH']
    for col in numeric_ints:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # Sắp xếp dữ liệu
    df = df.sort_values(by=['MA_SO_SV', 'HOC_KY']).reset_index(drop=True)

    # =========================================================================
    # BƯỚC 2: KIỂM TRA LOGIC & LÀM SẠCH NHIỄU (Sanity Checks)
    # =========================================================================
    print("Step 2: Sanity Checks & Logic Cleaning...")
    
    initial_rows = len(df)

    # 3.1. Logic Tín chỉ: Hoàn thành không được lớn hơn Đăng ký
    df['TC_HOANTHANH'] = np.minimum(df['TC_HOANTHANH'], df['TC_DANGKY'])
    
    # 3.2. Logic Điểm số (0 <= GPA/CPA <= 4.0)
    df['GPA'] = df['GPA'].clip(lower=0.0, upper=4.0)
    df['CPA'] = df['CPA'].clip(lower=0.0, upper=4.0)
    
    # -----------------------------------------------------------
    # [MỚI] 3.3. Logic Tuyển sinh: DIEM_TRUNGTUYEN >= DIEM_CHUAN
    # -----------------------------------------------------------
    # Loại bỏ các dòng mà điểm trúng tuyển nhỏ hơn điểm chuẩn.
    # Lưu ý: Các giá trị NaN (do lỗi format ở B1) cũng sẽ bị loại bỏ trong phép so sánh này.
    rows_before_score_filter = len(df)
    df = df[df['DIEM_TRUNGTUYEN'] >= df['DIEM_CHUAN']]
    dropped_score_rows = rows_before_score_filter - len(df)
    print(f" -> Đã loại bỏ {dropped_score_rows} dòng do Điểm trúng tuyển < Điểm chuẩn.")

    # 3.4. Xóa dữ liệu rác (TC_DANGKY = 0)
    rows_before_credit_filter = len(df)
    df = df[df['TC_DANGKY'] > 0].copy()
    dropped_credit_rows = rows_before_credit_filter - len(df)
    print(f" -> Đã loại bỏ {dropped_credit_rows} dòng rác (TC_DANGKY=0).")
    
    # Tổng kết
    total_dropped = initial_rows - len(df)
    print(f"--- HOÀN TẤT: Tổng cộng đã loại bỏ {total_dropped} dòng nhiễu. Kích thước data cuối: {df.shape} ---")
    
    return df

In [5]:
df = clean_data_pipeline(admission, academic_records)

--- BẮT ĐẦU QUY TRÌNH LÀM SẠCH DỮ LIỆU ---
Step 1: Formatting & Merging...
Step 2: Sanity Checks & Logic Cleaning...
 -> Đã loại bỏ 0 dòng do Điểm trúng tuyển < Điểm chuẩn.
 -> Đã loại bỏ 0 dòng rác (TC_DANGKY=0).
--- HOÀN TẤT: Tổng cộng đã loại bỏ 0 dòng nhiễu. Kích thước data cuối: (105726, 11) ---


In [6]:
df.head()

Unnamed: 0,MA_SO_SV,HOC_KY,CPA,GPA,TC_DANGKY,TC_HOANTHANH,NAM_TUYENSINH,PTXT,TOHOP_XT,DIEM_TRUNGTUYEN,DIEM_CHUAN
0,00003e092652,HK1 2023-2024,1.64,1.97,18,15,2023,100,A00,21.32,20.25
1,00003e092652,HK2 2023-2024,1.53,2.05,18,13,2023,100,A00,21.32,20.25
2,000e15519006,HK1 2021-2022,3.85,3.85,9,9,2021,1,D07,23.84,22.43
3,000e15519006,HK1 2022-2023,2.83,2.98,21,21,2021,1,D07,23.84,22.43
4,000e15519006,HK1 2023-2024,1.5,2.73,20,14,2021,1,D07,23.84,22.43


In [7]:
def split_data(merged_df, train_end='HK1 2023-2024', valid_semester='HK2 2023-2024'):
    print("Splitting data into train and validation sets...")
    merged_df['semester_order'] = merged_df['HOC_KY'].apply(get_semester_order)
    train_end_order = get_semester_order(train_end)
    valid_order = get_semester_order(valid_semester)
    train_df = merged_df[merged_df['semester_order'] <= train_end_order].copy()
    valid_df = merged_df[merged_df['semester_order'] == valid_order].copy()    
    print(f"Train data: {train_df.shape}")
    print(f"Valid data: {valid_df.shape}")
        
    return train_df, valid_df

train_df, valid_df = split_data(df)

Splitting data into train and validation sets...
Train data: (90582, 12)
Valid data: (15144, 12)


In [8]:
from src.features import FeatureEngineer

engineer = FeatureEngineer()

test_df = pd.read_csv(TEST_PATH)

X_train, y_train, X_valid, y_valid, X_test, feature_cols = engineer.prepare_full_pipeline(
    train_df=train_df,
    valid_df=valid_df,
    test_df=test_df,
    target_col='TC_HOANTHANH' 
)

# 3. Kiểm tra kết quả
print(f"Số lượng features: {len(feature_cols)}")
print("Train shape:", X_train.shape)
print("Valid shape:", X_valid.shape)
if X_test is not None:
    print("Test shape:", X_test.shape)

# List các feature để bạn xem qua
print("\nDanh sách features:", feature_cols)

Số lượng features: 28
Train shape: (90582, 28)
Valid shape: (15144, 28)
Test shape: (16502, 28)

Danh sách features: ['TC_DANGKY', 'NAM_TUYENSINH', 'PTXT', 'TOHOP_XT', 'DIEM_TRUNGTUYEN', 'DIEM_CHUAN', 'hoc_ky_nam', 'hoc_ky_so', 'is_semester_2', 'semester_number', 'diem_vuot_chuan', 'diem_ratio', 'student_age_academic', 'gpa_lag1', 'avg_gpa_historical', 'cpa_lag1', 'num_previous_semesters', 'tc_dangky_high', 'tc_dangky_low', 'gpa_cpa_lagged_diff', 'gpa_volatility_hist', 'gpa_trend_slope', 'prev_cpa_excellent', 'prev_cpa_good', 'prev_cpa_poor', 'momentum_load_index', 'is_natural_science_entry', 'is_overdue_student']


In [9]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical_features(X, categorical_cols, label_encoders=None, is_training=True):
    X = X.copy()
    if label_encoders is None:
        label_encoders = {}
    for col in categorical_cols:
        if col in X.columns:
            if is_training:
                le = LabelEncoder()
                X[col] = le.fit_transform(X[col].astype(str))
                label_encoders[col] = le
            else:
                le = label_encoders.get(col)
                if le is not None:
                    # Xử lý giá trị mới chưa từng xuất hiện
                    X[col] = X[col].astype(str).apply(lambda x: x if x in le.classes_ else 'Unknown')
                    if 'Unknown' not in le.classes_:
                        le.classes_ = np.append(le.classes_, 'Unknown')
                    X[col] = le.transform(X[col])
    return X, label_encoders

In [10]:
categorical_cols = ['PTXT', 'TOHOP_XT']

# Encode train
X_train_enc, label_encoders = encode_categorical_features(X_train, categorical_cols, is_training=True)
# Encode valid
X_valid_enc, _ = encode_categorical_features(X_valid, categorical_cols, label_encoders, is_training=False)
# Encode test
X_test_enc, _ = encode_categorical_features(X_test, categorical_cols, label_encoders, is_training=False)


In [11]:
X_train_enc.info()

<class 'pandas.DataFrame'>
Index: 90582 entries, 0 to 90580
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   TC_DANGKY                 90582 non-null  int64  
 1   NAM_TUYENSINH             90582 non-null  float64
 2   PTXT                      90582 non-null  int64  
 3   TOHOP_XT                  90582 non-null  int64  
 4   DIEM_TRUNGTUYEN           90582 non-null  float64
 5   DIEM_CHUAN                90582 non-null  float64
 6   hoc_ky_nam                90582 non-null  int64  
 7   hoc_ky_so                 90582 non-null  int64  
 8   is_semester_2             90582 non-null  int64  
 9   semester_number           90582 non-null  float64
 10  diem_vuot_chuan           90582 non-null  float64
 11  diem_ratio                90582 non-null  float64
 12  student_age_academic      90582 non-null  float64
 13  gpa_lag1                  90582 non-null  float64
 14  avg_gpa_historical    

In [12]:
X_valid_enc.info()

<class 'pandas.DataFrame'>
Index: 15144 entries, 90582 to 105725
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   TC_DANGKY                 15144 non-null  int64  
 1   NAM_TUYENSINH             15144 non-null  float64
 2   PTXT                      15144 non-null  int64  
 3   TOHOP_XT                  15144 non-null  int64  
 4   DIEM_TRUNGTUYEN           15144 non-null  float64
 5   DIEM_CHUAN                15144 non-null  float64
 6   hoc_ky_nam                15144 non-null  int64  
 7   hoc_ky_so                 15144 non-null  int64  
 8   is_semester_2             15144 non-null  int64  
 9   semester_number           15144 non-null  float64
 10  diem_vuot_chuan           15144 non-null  float64
 11  diem_ratio                15144 non-null  float64
 12  student_age_academic      15144 non-null  float64
 13  gpa_lag1                  15144 non-null  float64
 14  avg_gpa_historica

In [13]:
X_test_enc.info()

<class 'pandas.DataFrame'>
Index: 16502 entries, 116794 to 120854
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   TC_DANGKY                 16502 non-null  int64  
 1   NAM_TUYENSINH             0 non-null      float64
 2   PTXT                      16502 non-null  int64  
 3   TOHOP_XT                  16502 non-null  int64  
 4   DIEM_TRUNGTUYEN           0 non-null      float64
 5   DIEM_CHUAN                0 non-null      float64
 6   hoc_ky_nam                16502 non-null  int64  
 7   hoc_ky_so                 16502 non-null  int64  
 8   is_semester_2             16502 non-null  int64  
 9   semester_number           0 non-null      float64
 10  diem_vuot_chuan           0 non-null      float64
 11  diem_ratio                0 non-null      float64
 12  student_age_academic      0 non-null      float64
 13  gpa_lag1                  16502 non-null  float64
 14  avg_gpa_historic

In [14]:
import xgboost as xgb

model = xgb.XGBRegressor()
model.fit(X_train_enc, y_train, eval_set=[(X_valid_enc, y_valid)], verbose=True)

[0]	validation_0-rmse:5.84406
[1]	validation_0-rmse:5.09224
[2]	validation_0-rmse:4.66181
[3]	validation_0-rmse:4.41383
[4]	validation_0-rmse:4.26996
[5]	validation_0-rmse:4.19297
[6]	validation_0-rmse:4.14361
[7]	validation_0-rmse:4.10796
[8]	validation_0-rmse:4.08423
[9]	validation_0-rmse:4.06532
[10]	validation_0-rmse:4.05380
[11]	validation_0-rmse:4.04251
[12]	validation_0-rmse:4.04889
[13]	validation_0-rmse:4.05765
[14]	validation_0-rmse:4.04902
[15]	validation_0-rmse:4.04390
[16]	validation_0-rmse:4.05467
[17]	validation_0-rmse:4.05347
[18]	validation_0-rmse:4.05723
[19]	validation_0-rmse:4.05459
[20]	validation_0-rmse:4.07837
[21]	validation_0-rmse:4.08008
[22]	validation_0-rmse:4.08256
[23]	validation_0-rmse:4.09056
[24]	validation_0-rmse:4.11693
[25]	validation_0-rmse:4.11768
[26]	validation_0-rmse:4.11899
[27]	validation_0-rmse:4.11787
[28]	validation_0-rmse:4.15748
[29]	validation_0-rmse:4.15696
[30]	validation_0-rmse:4.15625
[31]	validation_0-rmse:4.15164
[32]	validation_0-

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [15]:
y_pred = model.predict(X_test)

final_preds = y_pred
final_preds = np.minimum(final_preds, test_df['TC_DANGKY'])
final_preds = np.maximum(final_preds, 0) # Không được âm

# 5. Lưu kết quả
submission = pd.DataFrame({
    'MA_SO_SV': test_df['MA_SO_SV'],
    'PRED_TC_HOANTHANH': final_preds.astype(float)
})

submission.to_csv('CC_submission.csv', index=False)
print("Đã lưu file submission.csv thành công!")

Đã lưu file submission.csv thành công!


In [16]:
import optuna
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error

def objective(trial):

    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',

        'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 10.0, log=True),

        'tree_method': 'hist',
        'n_jobs': -1,
        'random_state': 42,
        'verbosity': 0
    }

    model = xgb.XGBRegressor(**params)

    # ✅ fit bình thường, KHÔNG eval_set, KHÔNG callback
    model.fit(X_train_enc, y_train)

    preds = model.predict(X_valid_enc)

    rmse = root_mean_squared_error(y_valid, preds)

    return rmse


# ---------------------------------------------------------
# QUAN TRỌNG: direction='minimize' vì RMSE càng thấp càng tốt
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print('-' * 50)
print('Best RMSE (Valid):', study.best_value)
print('Best Params:', study.best_params)

# ---------------------------------------------------------
# Train lại model cuối cùng
print("\nTraining final model with best params...")
best_params = study.best_params
best_params['objective'] = 'reg:squarederror'
best_params['eval_metric'] = 'rmse'

final_model = xgb.XGBRegressor(**best_params)
final_model.fit(
    X_train_enc, y_train,
    eval_set=[(X_valid_enc, y_valid)],
    verbose=True
)

  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2026-01-30 16:14:07,219][0m A new study created in memory with name: no-name-e077aed9-9d17-4e00-a7f6-e1993795e0d4[0m
[32m[I 2026-01-30 16:14:09,947][0m Trial 0 finished with value: 4.003534729413612 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.013842008499700745, 'subsample': 0.9366638946786671, 'colsample_bytree': 0.8338712965092596, 'min_child_weight': 4, 'reg_alpha': 0.0004381406228945771, 'reg_lambda': 0.2228010420799147}. Best is trial 0 with value: 4.003534729413612.[0m
[32m[I 2026-01-30 16:14:11,919][0m Trial 1 finished with value: 3.9797256209921943 and parameters: {'n_estimators': 925, 'max_depth': 4, 'learning_rate': 0.01678565591390516, 'subsample': 0.8802236128020262, 'colsample_bytree': 0.6682279043179191, 'min_child_weight': 7, 'reg_alpha': 7.652047446514141, 'reg_lambda': 4.857253637784816e-06}. Best is trial 1 with value: 3.9797256209921943.[0m
[32m[I 2026-01-30 16:14:12,431]

--------------------------------------------------
Best RMSE (Valid): 3.968161242665941
Best Params: {'n_estimators': 459, 'max_depth': 5, 'learning_rate': 0.017569498409052253, 'subsample': 0.6798106038996907, 'colsample_bytree': 0.6315273675914803, 'min_child_weight': 9, 'reg_alpha': 0.0001680111463276296, 'reg_lambda': 1.271622211900278}

Training final model with best params...
[0]	validation_0-rmse:7.02931
[1]	validation_0-rmse:6.95616
[2]	validation_0-rmse:6.88166
[3]	validation_0-rmse:6.81834
[4]	validation_0-rmse:6.74588
[5]	validation_0-rmse:6.68682
[6]	validation_0-rmse:6.62336
[7]	validation_0-rmse:6.55748
[8]	validation_0-rmse:6.49277
[9]	validation_0-rmse:6.42950
[10]	validation_0-rmse:6.37488
[11]	validation_0-rmse:6.31606
[12]	validation_0-rmse:6.25518
[13]	validation_0-rmse:6.19760
[14]	validation_0-rmse:6.16931
[15]	validation_0-rmse:6.11519
[16]	validation_0-rmse:6.05991
[17]	validation_0-rmse:6.01487
[18]	validation_0-rmse:5.96589
[19]	validation_0-rmse:5.93166
[20]	

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.6315273675914803
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [17]:
y_pred = final_model.predict(X_test)

final_preds = y_pred
final_preds = np.minimum(final_preds, test_df['TC_DANGKY'])
final_preds = np.maximum(final_preds, 0) # Không được âm

# 5. Lưu kết quả
submission = pd.DataFrame({
    'MA_SO_SV': test_df['MA_SO_SV'],
    'PRED_TC_HOANTHANH': final_preds.astype(float)
})

submission.to_csv('CC_submission.csv', index=False)
print("Đã lưu file submission.csv thành công!")

Đã lưu file submission.csv thành công!


In [18]:
import pandas as pd
import plotly.express as px

# =========================
# Lấy importance + tên feature
# =========================
importance_df = pd.DataFrame({
    "feature": X_test.columns,                     # tên thật
    "importance": final_model.feature_importances_
})

# sort giảm dần
importance_df = importance_df.sort_values(
    "importance",
    ascending=False
)

# =========================
# Plot bằng Plotly
# =========================
fig = px.bar(
    importance_df,
    x="importance",
    y="feature",
    orientation="h",
    height=max(600, len(importance_df) * 20),  # auto scale theo số feature
    title="Feature Importance (XGBoost)",
)

fig.update_layout(
    yaxis=dict(autorange="reversed"),  # feature quan trọng nhất lên trên
    showlegend=False
)

fig.show()
