In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

# Read the prepared dataset: "dropout_status" is the label, every other column is a feature.
data = pd.read_csv("Preparation_data_onehot.csv")

y = data["dropout_status"]
X = data.drop(columns=["dropout_status"])

# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Random forest with fixed hyper-parameters. fit() returns the estimator itself,
# so `rf` is bound to the fitted model.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    max_features="sqrt",
    min_samples_leaf=3,
    min_samples_split=8,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out split and score it.
rf_y_test_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_y_test_pred)

print("Accuracy:", rf_acc)




Accuracy: 0.9520624303232998


In [41]:
# 5-fold cross-validated predictions over the full dataset, scored as a single
# pooled accuracy against the full label vector.
rf_y_pred = cross_val_predict(estimator=rf, X=X, y=y, cv=5)
rf_cro_acc = accuracy_score(y_true=y, y_pred=rf_y_pred)
print(rf_cro_acc)


0.94220040169605


In [42]:
# Per-class precision / recall / F1 of the random forest on the held-out test split.
print(classification_report(y_true=y_test, y_pred=rf_y_test_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       681
           1       1.00      0.80      0.89       216

    accuracy                           0.95       897
   macro avg       0.97      0.90      0.93       897
weighted avg       0.95      0.95      0.95       897



In [34]:
# Confusion matrix and dropout-class (label 1) metrics for the random forest on the test split.
# Predictions are recomputed from the current `rf`, so re-run this cell whenever `rf` is refit.
# NOTE(review): `rf_y_pred` is also the name the cross-validation cell binds to its
# cross-validated predictions; this assignment rebinds it to test-set predictions.
rf_y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, rf_y_pred)

# ravel() flattens the matrix row by row (rows = actual, columns = predicted):
# actual 0 / predicted 0, actual 0 / predicted 1, actual 1 / predicted 0, actual 1 / predicted 1.
tn, fp, fn, tp = cm.ravel()


print("="*80)
print("Confusion Matrix of Random Forest Model (Test Set)")
print("-"*80)
print("                                            Prediction result")
print("                              Did not drop out（0）    Drop out（1）")
print(f"Actually did not drop out（0）        {tn:<10}               {fp:<10}")
print(f"Actual dropout（1）                   {fn:<10}               {tp:<10}")
print("-"*80)

total_samples = len(y_test)
accuracy = (tp + tn) / total_samples  # Accuracy

# Guard every ratio: if the model predicts no dropouts (tp + fp == 0) or the split
# contains none (tp + fn == 0), report 0.0 instead of the NaN a 0/0 would produce.
recall_dropout = tp / (tp + fn) if (tp + fn) != 0 else 0.0        # Dropout recall rate
precision_dropout = tp / (tp + fp) if (tp + fp) != 0 else 0.0     # Dropout class precision rate
f1_dropout = (
    2 * (precision_dropout * recall_dropout) / (precision_dropout + recall_dropout)
    if (precision_dropout + recall_dropout) != 0
    else 0.0
)  # Dropout F1 score

print("\nKey indicators：")
print(f"1. Recall;{recall_dropout:.4f}")
print(f"2. Precision;{precision_dropout:.4f}")
print(f"3. F1 score;{f1_dropout:.4f}")
print(f"4. Accuracy;{accuracy:.4f}")
print("="*80)

Confusion Matrix of Random Forest Model (Test Set)
--------------------------------------------------------------------------------
                                            Prediction result
                              Did not drop out（0）    Drop out（1）
Actually did not drop out（0）        681                      0         
Actual dropout（1）                   101                      115       
--------------------------------------------------------------------------------

Key indicators：
1. Recall;0.5324
2. Precision;1.0000
3. F1 score;0.6949
4. Accuracy;0.8874


In [35]:
# Gaussian Naive Bayes trained on X_train / y_train and scored on the held-out split.
# NOTE(review): the CSV name suggests one-hot encoded features, while GaussianNB models
# each feature as a Gaussian — confirm this estimator is the intended choice.
nb_model = GaussianNB().fit(X_train, y_train)

nb_y_test_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_y_test_pred)

print("Accuracy:", nb_accuracy)

Accuracy: 0.8617614269788183


In [37]:
# 5-fold cross-validated predictions from the Naive Bayes model over the full
# dataset, scored as a single pooled accuracy.
nb_y_pred = cross_val_predict(estimator=nb_model, X=X, y=y, cv=5)
nb_cro_acc = accuracy_score(y_true=y, y_pred=nb_y_pred)
print(nb_cro_acc)

0.8482481588931042


In [43]:
# Per-class precision / recall / F1 of the Naive Bayes model on the held-out test split.
print(classification_report(y_true=y_test, y_pred=nb_y_test_pred))

              precision    recall  f1-score   support

           0       0.86      0.97      0.91       681
           1       0.85      0.52      0.64       216

    accuracy                           0.86       897
   macro avg       0.86      0.74      0.78       897
weighted avg       0.86      0.86      0.85       897



In [39]:
# Test-set confusion matrix for Naive Bayes, flattened row by row into its four cells
# (rows = actual class, columns = predicted class).
nb_cm = confusion_matrix(y_true=y_test, y_pred=nb_y_test_pred)
tn_nb, fp_nb, fn_nb, tp_nb = nb_cm.ravel()

print("=" * 80)
print("Naive Bayes Confusion Matrix（Test set）")
print("-" * 80)
print("                Prediction result")
print("                          Did not drop out（0）    drop out of（1）    ")
print(f"Actually did not drop out（0）         {tn_nb:<10}       {fp_nb:<10}")
print(f"Actual dropout（1）                    {fn_nb:<10}       {tp_nb:<10}")
print(f"Total                                  {tn_nb + fn_nb:<10}       {fp_nb + tp_nb:<10}")
print("-" * 80)

# Dropout-class (label 1) metrics; each ratio falls back to 0.0 when its denominator is zero.
accuracy_nb = (tp_nb + tn_nb) / len(y_test)

if (tp_nb + fn_nb) != 0:
    recall_nb = tp_nb / (tp_nb + fn_nb)
else:
    recall_nb = 0.0

if (tp_nb + fp_nb) != 0:
    precision_nb = tp_nb / (tp_nb + fp_nb)
else:
    precision_nb = 0.0

if (precision_nb + recall_nb) != 0:
    f1_nb = 2 * (precision_nb * recall_nb) / (precision_nb + recall_nb)
else:
    f1_nb = 0.0

print("\nKey indicators：")
print(f"1. Recall rate;{recall_nb:.4f}")
print(f"2. Precision;{precision_nb:.4f}")
print(f"3. F1 score;{f1_nb:.4f}")
print(f"4. Accuracy;{accuracy_nb:.4f}")
print("=" * 80)

Naive Bayes Confusion Matrix（Test set）
--------------------------------------------------------------------------------
                Prediction result
                          Did not drop out（0）    drop out of（1）    
Actually did not drop out（0）         661              20        
Actual dropout（1）                    104              112       
Total                                  765              132       
--------------------------------------------------------------------------------

Key indicators：
1. Recall rate;0.5185
2. Precision;0.8485
3. F1 score;0.6437
4. Accuracy;0.8618
