In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

# Load the prepared dataset. "dropout_status" is the prediction target and
# every remaining column is used as a feature.
# NOTE(review): the file name suggests the features are already one-hot
# encoded — confirm against the preparation step.
data = pd.read_csv("Preparation_data_onehot.csv")

y = data["dropout_status"]
X = data.drop(columns="dropout_status")

# Hold out 20% of the rows for evaluation. stratify=y keeps the class ratio
# equal in both partitions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Random forest of 100 shallow trees: depth is capped at 4, a node needs at
# least 8 samples to split and every leaf keeps at least 3 samples.
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=4,
    min_samples_split=8,
    min_samples_leaf=3,
    max_features="sqrt",
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows and report plain accuracy.
rf_y_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_y_pred)
print("Accuracy:", rf_acc)




Accuracy: 0.8405797101449275


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluate the fitted random forest on the held-out test set.
rf_y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, rf_y_pred)

# For a binary target the 2x2 matrix (rows = actual, columns = predicted)
# flattens to (tn, fp, fn, tp).
# NOTE(review): assumes dropout_status is encoded as 0 / 1 with 1 = dropout,
# as the printed labels below state — confirm against the dataset.
tn, fp, fn, tp = cm.ravel()


print("="*80)
print("Confusion Matrix of Random Forest Model (Test Set)")
print("-"*80)
print("                                            Prediction result")
print("                              Did not drop out（0）    Drop out（1）")
print(f"Actually did not drop out（0）        {tn:<10}               {fp:<10}")
print(f"Actual dropout（1）                   {fn:<10}               {tp:<10}")
print("-"*80)

total_samples = len(y_test)
accuracy = (tp + tn) / total_samples  # Accuracy

# Guard every ratio against a zero denominator, the same way the Naive Bayes
# evaluation cell does. This forest is very conservative (it can predict few
# or no dropouts), so tp + fp may be 0; without the guard precision and F1
# would not be usable numbers.
recall_dropout = tp / (tp + fn) if (tp + fn) != 0 else 0.0        # Dropout recall rate
precision_dropout = tp / (tp + fp) if (tp + fp) != 0 else 0.0     # Dropout class precision rate
f1_dropout = (
    2 * (precision_dropout * recall_dropout) / (precision_dropout + recall_dropout)
    if (precision_dropout + recall_dropout) != 0
    else 0.0
)  # Dropout F1 score

print("\nKey indicators：")
print(f"1. Recall;{recall_dropout:.4f}")
print(f"2. Precision;{precision_dropout:.4f}")
print(f"3. F1 score;{f1_dropout:.4f}")
print(f"4. Accuracy;{accuracy:.4f}")
print("="*80)

Confusion Matrix of Random Forest Model (Test Set)
--------------------------------------------------------------------------------
                                            Prediction result
                              Did not drop out（0）    Drop out（1）
Actually did not drop out（0）        681                      0         
Actual dropout（1）                   143                      73        
--------------------------------------------------------------------------------

Key indicators：
1. Recall;0.3380
2. Precision;1.0000
3. F1 score;0.5052
4. Accuracy;0.8406


In [None]:
# Gaussian Naive Bayes trained on the same split as the random forest, so the
# two accuracies are directly comparable. fit() returns the estimator itself.
nb_model = GaussianNB().fit(X_train, y_train)

# Predict the held-out rows and report plain accuracy.
nb_y_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_y_pred)
print("Accuracy:", nb_accuracy)

Accuracy: 0.8472686733556298


In [None]:
# Confusion matrix of the Naive Bayes predictions on the test set.
# For a binary target the 2x2 matrix (rows = actual, columns = predicted)
# flattens to (tn, fp, fn, tp).
nb_cm = confusion_matrix(y_test, nb_y_pred)
tn_nb, fp_nb, fn_nb, tp_nb = nb_cm.ravel()

print("=" * 80)
print("Naive Bayes Confusion Matrix（Test set）")
print("-" * 80)
print("                Prediction result")
print("                          Did not drop out（0）    drop out of（1）    ")
print(f"Actually did not drop out（0）         {tn_nb:<10}       {fp_nb:<10}")
print(f"Actual dropout（1）                    {fn_nb:<10}       {tp_nb:<10}")
# The "Total" row holds column sums, i.e. how many rows were predicted as each class.
print(f"Total                                  {tn_nb + fn_nb:<10}       {fp_nb + tp_nb:<10}")
print("-" * 80)


# Each ratio falls back to 0.0 when its denominator is zero.
if (tp_nb + fn_nb) == 0:
    recall_nb = 0.0
else:
    recall_nb = tp_nb / (tp_nb + fn_nb)

if (tp_nb + fp_nb) == 0:
    precision_nb = 0.0
else:
    precision_nb = tp_nb / (tp_nb + fp_nb)

if (precision_nb + recall_nb) == 0:
    f1_nb = 0.0
else:
    f1_nb = 2 * (precision_nb * recall_nb) / (precision_nb + recall_nb)

# Share of test rows on the matrix diagonal (correct predictions).
accuracy_nb = (tp_nb + tn_nb) / len(y_test)

print("\nKey indicators：")
print(f"1. Recall rate;{recall_nb:.4f}")
print(f"2. Precision;{precision_nb:.4f}")
print(f"3. F1 score;{f1_nb:.4f}")
print(f"4. Accuracy;{accuracy_nb:.4f}")
print("=" * 80)

Naive Bayes Confusion Matrix（Test set）
--------------------------------------------------------------------------------
                Prediction result
                          Did not drop out（0）    drop out of（1）    
Actually did not drop out（0）         658              23        
Actual dropout（1）                    114              102       
Total                                  772              125       
--------------------------------------------------------------------------------

Key indicators：
1. Recall rate;0.4722
2. Precision;0.8160
3. F1 score;0.5982
4. Accuracy;0.8473
