In [1]:
# ================================================================
# Train mô hình dự đoán học viên bỏ học (IZONE_Manager)
# Dữ liệu: ML2.csv
# ---------------------------------------------------------------
# Tác giả: Nguyễn Tuấn Anh
# Ngày: 2025-10-31
# ================================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
import joblib

# ================================================================
# 1️⃣ ĐỌC DỮ LIỆU
# ================================================================
df = pd.read_csv("D:/Downloads/KLTN_Trae/IZONE_Web/Backend/ML_Models/TrainData.csv")
print(f"✅ Đọc dữ liệu thành công: {df.shape[0]} mẫu, {df.shape[1]} cột")

# ================================================================
# 2️⃣ XỬ LÝ DỮ LIỆU
# ================================================================
# Xác định cột target
target_col = "Target_BoHoc"

# Loại bỏ các cột ID
drop_cols = ["DangKyID", "HocVienID"]
X = df.drop(columns=drop_cols + [target_col], errors="ignore")
y = df[target_col]

# Xử lý giá trị thiếu (nếu có)
X = X.fillna(0)

# ================================================================
# 3️⃣ CHIA TRAIN / TEST
# ================================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"📊 Train: {X_train.shape}, Test: {X_test.shape}")
print(f"🔹 Tỷ lệ bỏ học: {y_train.mean():.2%}")

# ================================================================
# 4️⃣ TÌM SIÊU THAM SỐ TỐI ƯU (RandomForest + class_weight balanced)
# ================================================================

# Define the parameter space for hyperparameter tuning
param_dist = {
    'n_estimators': np.arange(100, 501, 100),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': np.arange(2, 11, 2),
    'min_samples_leaf': np.arange(1, 11, 2),
    'class_weight': ['balanced', 'balanced_subsample']
}

# Create a scorer based on F1-score for the positive class (dropout)
f1_scorer = make_scorer(f1_score, pos_label=1)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=50,  # Number of parameter settings that are sampled
    cv=5,  # Number of cross-validation folds
    scoring=f1_scorer,
    random_state=42,
    n_jobs=-1, # Use all available cores
    verbose=1 # Print progress messages
)

# Perform the hyperparameter search
random_search.fit(X_train, y_train)

# Print the best parameters and the corresponding best score
print("Best parameters found: ", random_search.best_params_)
print("Best F1-score obtained: ", random_search.best_score_)

# ================================================================
# 5️⃣ HUẤN LUYỆN MÔ HÌNH VỚI SIÊU THAM SỐ TỐI ƯU
# ================================================================
best_params = random_search.best_params_
model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("✅ Huấn luyện mô hình với siêu tham số tối ưu xong.")

# ================================================================
# 6️⃣ TỐI ƯU NGƯỠNG DỰ ĐOÁN (THRESHOLD) ĐỂ TĂNG RECALL (hoặc F1)
# ================================================================
y_prob = model.predict_proba(X_test)[:, 1]

# Find optimal threshold based on F1-score (as done in subsequent cells)
best_f1 = 0
best_th = 0
for th in np.arange(0.05, 0.95, 0.01):
    y_pred_th = (y_prob >= th).astype(int)
    f1 = f1_score(y_test, y_pred_th, pos_label=1)
    if f1 > best_f1:
        best_f1 = f1
        best_th = th

print(f"\n🎯 Ngưỡng tối ưu dựa trên F1-score: {best_th:.3f}")
y_pred = (y_prob >= best_th).astype(int)


# ================================================================
# 7️⃣ ĐÁNH GIÁ MÔ HÌNH
# ================================================================
print("\n=== 📈 Báo cáo đánh giá mô hình ===")
print(classification_report(y_test, y_pred, digits=3))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Calculate ROC AUC for evaluation
fpr, tpr, thresholds_roc = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print(f"ROC AUC: {roc_auc:.3f}")


# ================================================================
# 8️⃣ (Tuỳ chọn) Lưu mô hình và kết quả
# ================================================================
joblib.dump(model, "model_dropout_tuned.pkl")
print("\n💾 Đã lưu mô hình tối ưu vào file model_dropout_tuned.pkl")

✅ Đọc dữ liệu thành công: 4272 mẫu, 14 cột
📊 Train: (3417, 11), Test: (855, 11)
🔹 Tỷ lệ bỏ học: 14.43%
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters found:  {'n_estimators': np.int64(100), 'min_samples_split': np.int64(6), 'min_samples_leaf': np.int64(3), 'max_depth': 10, 'class_weight': 'balanced_subsample'}
Best F1-score obtained:  0.7233912506586726
✅ Huấn luyện mô hình với siêu tham số tối ưu xong.

🎯 Ngưỡng tối ưu dựa trên F1-score: 0.460

=== 📈 Báo cáo đánh giá mô hình ===
              precision    recall  f1-score   support

           0      0.967     0.919     0.943       732
           1      0.629     0.813     0.709       123

    accuracy                          0.904       855
   macro avg      0.798     0.866     0.826       855
weighted avg      0.918     0.904     0.909       855

Confusion Matrix:
[[673  59]
 [ 23 100]]
ROC AUC: 0.931

💾 Đã lưu mô hình tối ưu vào file model_dropout_tuned.pkl


In [2]:
import numpy as np
from sklearn.metrics import classification_report, f1_score

best_f1 = 0
best_th = 0

# Iterate through a range of thresholds to find the optimal one
for th in np.arange(0.05, 0.95, 0.01):
    y_pred = (y_prob >= th).astype(int)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    if f1 > best_f1:
        best_f1 = f1
        best_th = th

print(f"\nOptimal Threshold based on F1-score: {best_th:.3f}")
y_pred_optimal = (y_prob >= best_th).astype(int)
print("\n=== 📈 Báo cáo đánh giá mô hình với ngưỡng tối ưu F1 ===")
print(classification_report(y_test, y_pred_optimal, digits=3))


Optimal Threshold based on F1-score: 0.460

=== 📈 Báo cáo đánh giá mô hình với ngưỡng tối ưu F1 ===
              precision    recall  f1-score   support

           0      0.967     0.919     0.943       732
           1      0.629     0.813     0.709       123

    accuracy                          0.904       855
   macro avg      0.798     0.866     0.826       855
weighted avg      0.918     0.904     0.909       855



In [None]:
# Get the predicted probabilities for the test set
y_prob_dropout = model.predict_proba(X_test)[:, 1]

# Create a DataFrame to display the predicted probabilities along with the test set index for context
predicted_probabilities_df = pd.DataFrame({'Predicted_Dropout_Probability': y_prob_dropout}, index=X_test.index)

# Display the predicted probabilities for all students in the test set
print("Predicted probabilities of dropout for the test set:")
display(predicted_probabilities_df)

Predicted probabilities of dropout for the test set:


Unnamed: 0,Predicted_Dropout_Probability
1302,0.389825
3152,0.922224
3831,0.064979
2291,0.020850
567,0.048854
...,...
1471,0.015159
231,0.002522
1835,0.225971
3706,0.894320


In [None]:
# Identify necessary import statements
# import pandas as pd
# import numpy as np
# import joblib

# Locate the code that loads the saved model file
# model = joblib.load("model_dropout_tuned.pkl")

# Find the code responsible for preprocessing new data
# This would involve selecting the same columns as X_train and handling NaNs (e.g., filling with 0)
# Example: new_data_processed = new_data[X_train.columns].fillna(0)

# Identify the code that uses the loaded model to predict the probability of dropout
# y_prob_new = model.predict_proba(new_data_processed)[:, 1]

print("Identified necessary components for model deployment:")
print("- Import statements for pandas, numpy, and joblib.")
print("- Code to load the model using joblib.load('model_dropout_tuned.pkl').")
print("- Code to preprocess new data (select features and handle missing values).")
print("- Code to predict probabilities using model.predict_proba().")

Identified necessary components for model deployment:
- Import statements for pandas, numpy, and joblib.
- Code to load the model using joblib.load('model_dropout_tuned.pkl').
- Code to preprocess new data (select features and handle missing values).
- Code to predict probabilities using model.predict_proba().


In [None]:
# ================================================================
# Model Deployment for Learning Management System Integration
# ---------------------------------------------------------------
# This section contains the necessary code and instructions for
# integrating the trained Random Forest model into a learning
# management system to predict the percentage risk of a student
# dropping out.
# ================================================================

import pandas as pd
import numpy as np
import joblib

# ================================================================
# 1️⃣ Load the trained model
# ================================================================
try:
    model = joblib.load("model_dropout_tuned.pkl")
    print("✅ Loaded the trained model 'model_dropout_tuned.pkl'")
except FileNotFoundError:
    print("❌ Error: 'model_dropout_tuned.pkl' not found. Make sure the model file is in the correct directory.")
    model = None # Set model to None to prevent errors in subsequent steps

# ================================================================
# 2️⃣ Define a function for prediction on new data
# ================================================================
def predict_dropout_risk(new_student_data: pd.DataFrame) -> pd.DataFrame:
    """
    Predicts the percentage risk of dropout for new student data.

    Args:
        new_student_data: A pandas DataFrame containing new student data.
                          Must have the same columns as the training data
                          (excluding IDs and the target).

    Returns:
        A pandas DataFrame with the predicted dropout probability (as a
        percentage) for each student.
    """
    if model is None:
        print("❌ Model not loaded. Cannot make predictions.")
        return pd.DataFrame()

    # Define the expected feature columns based on the training data
    # This assumes X_train was defined in a previous step and is available
    try:
        feature_cols = X_train.columns
    except NameError:
        print("❌ Error: X_train is not defined. Cannot determine feature columns.")
        return pd.DataFrame()


    # Select the relevant feature columns and handle missing values
    # Use errors='ignore' in case the input DataFrame has extra columns
    processed_data = new_student_data[feature_cols].fillna(0)

    # Ensure the processed data has the same columns and order as the training data
    processed_data = processed_data.reindex(columns=feature_cols, fill_value=0)

    # Predict the probability of dropout (class 1)
    # The model predicts probabilities for each class. We want the probability of dropout.
    dropout_probabilities = model.predict_proba(processed_data)[:, 1]

    # Convert probabilities to percentage risk
    dropout_risk_percentage = dropout_probabilities * 100

    # Create a DataFrame to return the results
    results_df = pd.DataFrame({
        'Predicted_Dropout_Risk_Percentage': dropout_risk_percentage
        # Optionally, include the original index or a unique identifier if available in new_student_data
        # 'Original_Index': new_student_data.index
    }, index=new_student_data.index) # Keep the original index for mapping back

    return results_df

# ================================================================
# 3️⃣ (Optional) Example Usage
# ================================================================
# To test the function, you would create a sample DataFrame
# with the same columns as the training data (X_train).
# For example:
# sample_new_data = pd.DataFrame({
#     'LopID': [101, 102],
#     'TyLeChuyenCan_NuaDau': [0.7, 0.9],
#     'SoBuoiVang_NuaDau': [3, 1],
#     'SoBuoiVangDau': [1, 0],
#     'DiemGiuaKy': [5.0, 8.5],
#     'GioiTinh': [1, 0], # Assuming 1 for male, 0 for female based on original data
#     'VungMien': [1, 2], # Assuming numerical encoding
#     'SoNgayHoc': [30, 45],
#     'SoNgayDangKySom': [10, 20],
#     'TuoiHocVien': [22, 25],
#     'KhoaHocID': [1, 2],
#     'GiangVienID': [5, 8],
#     'DiaDiemID': [3, 7]
# })
#
# dropout_predictions = predict_dropout_risk(sample_new_data)
# print("\nExample Prediction Results:")
# display(dropout_predictions)

print("\n✅ Deployment section prepared.")
print("Use the 'predict_dropout_risk' function with new student data to get dropout risk percentages.")


✅ Loaded the trained model 'model_dropout_tuned.pkl'

✅ Deployment section prepared.
Use the 'predict_dropout_risk' function with new student data to get dropout risk percentages.
