# Dataset analysis

In [1]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from classifier_test_framework import ClassifierTestFramework
import pandas as pd

# Load the raw dataset from the CSV file that sits next to the notebook.
df = pd.read_csv("graduation_train.csv")

# print(df.isnull().sum())

# Column-name indexes split by dtype (numeric vs. object/string columns).
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include=["object"]).columns

# Keep only the rows in which no column holds an empty string. One vectorised
# mask replaces the former per-column loop that re-filtered the frame once per
# column. The explicit copy makes the result an independent frame, so adding
# columns below does not raise pandas' SettingWithCopyWarning.
# NOTE(review): pandas.read_csv parses empty CSV fields as NaN by default, so
# this comparison most likely removes nothing. If the goal is to drop rows with
# missing values, df.dropna() is probably what is wanted - confirm.
has_no_empty_string = (df != "").all(axis=1)
df = df.loc[has_no_empty_string].copy()

print(df)

# Whole-number versions of the semester grades, stored as additional columns.
df["curricular_units_1st_sem_grade_rounded"] = df["curricular_units_1st_sem_grade"].round(0)
df["curricular_units_2nd_sem_grade_rounded"] = df["curricular_units_2nd_sem_grade"].round(0)

      student_id  marital_status  application_mode  application_order  course  \
0              1               1                15                  1       4   
1              2               2                12                  1      17   
2              3               1                 1                  1      10   
3              4               1                 1                  4      12   
4              5               2                12                  1       3   
...          ...             ...               ...                ...     ...   
2899        2900               1                 1                  2      14   
2900        2901               1                 8                  2      15   
2901        2902               1                 4                  1      13   
2902        2903               1                 1                  1       6   
2903        2904               2                12                  1      16   

      attendance_type  prev

# Report

In [4]:
from sklearn.metrics import confusion_matrix, RocCurveDisplay
import graphviz
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    roc_curve
)

def report(title, test_items, pass_df, clf):
    """Print evaluation metrics of a fitted classifier on a labelled test set.

    Parameters
    ----------
    title
        Label shown in the ``===title===`` banner at the top of the output.
    test_items
        Feature rows handed to ``clf.predict`` and ``clf.predict_proba``.
    pass_df
        Object whose ``"target"`` entry holds the true labels for
        ``test_items``.
    clf
        Fitted estimator exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    None
        Everything is written to stdout: accuracy, ROC AUC, precision and
        recall for the labels 0 and 1, and the full classification report.
    """
    print(f"==={title}===")

    y_pred = clf.predict(test_items)
    # Second predict_proba column is used as the ranking score for the AUC.
    scores = clf.predict_proba(test_items)[:, 1]
    y_true = pass_df["target"]

    acc = accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, scores)

    # (precision, recall) per label, computed for label 0 first and then 1.
    per_label = {}
    for label in (0, 1):
        per_label[label] = (
            precision_score(y_true, y_pred, pos_label=label),
            recall_score(y_true, y_pred, pos_label=label),
        )
    precision_0, recall_0 = per_label[0]
    precision_1, recall_1 = per_label[1]

    full_report_text = classification_report(y_true, y_pred)

    # All metrics are computed before anything (except the banner) is printed.
    print("=== Metrics Report ===")
    print(f"Accuracy: {acc:.4f}")
    print(f"AUC: {roc_auc:.4f}")
    print(f"Precision (Class 0): {precision_0:.4f}")
    print(f"Recall (Class 0):    {recall_0:.4f}")
    print(f"Precision (Class 1): {precision_1:.4f}")
    print(f"Recall (Class 1):    {recall_1:.4f}")
    print("\nFull Classification Report:")
    print(full_report_text)



## load train data


In [5]:
# Hand-picked feature columns: rounded grade, evaluations and approved units
# for each of the two semesters, followed by three general columns.
colums = [
    "curricular_units_1st_sem_grade_rounded",
    "curricular_units_1st_sem_evaluations",
    "curricular_units_1st_sem_approved",
    "curricular_units_2nd_sem_grade_rounded",
    "curricular_units_2nd_sem_evaluations",
    "curricular_units_2nd_sem_approved",
    "course",
    "previous_qualification",
    "special_needs",
]

# Training inputs as plain Python lists: one inner list of feature values per
# row of train_df, and the matching "target" labels.
X = train_df.loc[:, colums].values.tolist()
Y = list(train_df.loc[:, "target"])

## decision_tree

In [22]:
# Use every column of df except the label as a feature.
# NOTE(review): this also feeds student_id and both the raw and the rounded
# grade columns to the model - confirm that is intended.
colums = [col for col in df.columns if col != "target"]

X = train_df[colums].values.tolist()
Y = list(train_df["target"])

# A fixed seed makes the fitted tree, and therefore the metrics printed below,
# identical on every run of the notebook.
decision_tree_classifier = tree.DecisionTreeClassifier(random_state=42)
clf = decision_tree_classifier.fit(X, Y)


# export_graphviz reads class_names in ascending order of the class labels, so
# the first name is attached to label 0 and the second to label 1.
# NOTE(review): confirm that label 0 really means "Passed" and 1 "Drop out".
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=colums,
    class_names=["Passed", "Drop out"],
    filled=True,
    rounded=True,
    special_characters=True
)

graph = graphviz.Source(dot_data)
# Render once to 'decision_tree.pdf' and open it in the default viewer. This
# replaces the former render() + view() pair, which rendered the graph twice.
graph.render("decision_tree", view=True)

report("decision_tree", test_df[colums].values.tolist(), test_df, decision_tree_classifier)

===decision_tree===
=== Metrics Report ===
Accuracy: 0.8330
AUC: 0.8295
Precision (Class 0): 0.7265
Recall (Class 0):    0.8182
Precision (Class 1): 0.8994
Recall (Class 1):    0.8407

Full Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.82      0.77       198
           1       0.90      0.84      0.87       383

    accuracy                           0.83       581
   macro avg       0.81      0.83      0.82       581
weighted avg       0.84      0.83      0.84       581



## NearestNeighbor

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import numpy as np

# k-nearest-neighbours is distance based, so the features are standardised
# first; both steps live in one pipeline so the scaler is fitted on X only.
knn_classifier = Pipeline(
    steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=5))]
)
knn_classifier.fit(X, Y)
# The banner now names this model; it used to say "decision_tree" (copy-paste).
report("knn", test_df[colums].values.tolist(), test_df, knn_classifier)

===decision_tree===
=== Metrics Report ===
Accuracy: 0.8692
AUC: 0.9181
Precision (Class 0): 0.8877
Recall (Class 0):    0.7511
Precision (Class 1): 0.8604
Recall (Class 1):    0.9417

Full Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.75      0.81       221
           1       0.86      0.94      0.90       360

    accuracy                           0.87       581
   macro avg       0.87      0.85      0.86       581
weighted avg       0.87      0.87      0.87       581



## naive bayes

In [6]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes fitted on the same X / Y as the other models.
gnb_classifier = GaussianNB()

gnb_classifier.fit(X, Y)

# The banner now names this model; it used to say "decision_tree" (copy-paste).
report("Naive_bayes", test_df[colums].values.tolist(), test_df, gnb_classifier)

===decision_tree===
=== Metrics Report ===
Accuracy: 0.8107
AUC: 0.8873
Precision (Class 0): 0.8936
Recall (Class 0):    0.5701
Precision (Class 1): 0.7841
Recall (Class 1):    0.9583

Full Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.57      0.70       221
           1       0.78      0.96      0.86       360

    accuracy                           0.81       581
   macro avg       0.84      0.76      0.78       581
weighted avg       0.83      0.81      0.80       581



## ensemble method

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees; the fixed seed keeps the bootstrap samples and
# feature draws, and therefore the reported metrics, the same on every run.
clf_classifier = RandomForestClassifier(n_estimators=10, random_state=42)
clf_classifier.fit(X, Y)
# The banner now names this model; it used to say "decision_tree" (copy-paste).
report("Ensemble_method", test_df[colums].values.tolist(), test_df, clf_classifier)

===decision_tree===
=== Metrics Report ===
Accuracy: 0.8692
AUC: 0.9272
Precision (Class 0): 0.8469
Recall (Class 0):    0.8009
Precision (Class 1): 0.8817
Recall (Class 1):    0.9111

Full Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.80      0.82       221
           1       0.88      0.91      0.90       360

    accuracy                           0.87       581
   macro avg       0.86      0.86      0.86       581
weighted avg       0.87      0.87      0.87       581



# Classifier tester

In [8]:
from classifier_test_framework import ClassifierTestFramework
import pandas as pd

# Hand-picked feature columns handed to the test framework.
colums = ["curricular_units_1st_sem_grade_rounded", "curricular_units_1st_sem_evaluations", "curricular_units_1st_sem_approved",
          "curricular_units_2nd_sem_grade_rounded", "curricular_units_2nd_sem_evaluations", "curricular_units_2nd_sem_approved",
          "course", "previous_qualification", "special_needs"]

classifier_test_framework = ClassifierTestFramework(df, colums)

# Register the same four models that are evaluated one by one in this notebook.
classifier_test_framework.add_classifier("DecisionTree", tree.DecisionTreeClassifier())
classifier_test_framework.add_classifier("knn", Pipeline(
    steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=5))]
))
classifier_test_framework.add_classifier("Naive_bayes", GaussianNB())
classifier_test_framework.add_classifier("Ensemble_method", RandomForestClassifier(n_estimators=10))

# The loop variable is deliberately not called `report`: that name would
# rebind the notebook-level report() function and break any later call to it.
for key, result_text in classifier_test_framework.get_results().items():
    print(f"==={key}===")
    print(result_text)

===DecisionTree===
              precision    recall  f1-score   support

           0       0.78      0.76      0.77       227
           1       0.85      0.86      0.85       354

    accuracy                           0.82       581
   macro avg       0.81      0.81      0.81       581
weighted avg       0.82      0.82      0.82       581

===knn===
              precision    recall  f1-score   support

           0       0.95      0.73      0.82       227
           1       0.85      0.97      0.91       354

    accuracy                           0.88       581
   macro avg       0.90      0.85      0.86       581
weighted avg       0.89      0.88      0.87       581

===Naive_bayes===
              precision    recall  f1-score   support

           0       0.92      0.68      0.78       227
           1       0.82      0.96      0.89       354

    accuracy                           0.85       581
   macro avg       0.87      0.82      0.83       581
weighted avg       0.86    