# Dataset analysis

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.constants import precision
from sklearn.model_selection import train_test_split


# Load the raw training data. `df` is also handed to the classifier test
# framework further down, so the cleaned frame keeps this name.
df = pd.read_csv("graduation_train.csv")

# Drop every row that has a missing value in any column. `read_csv` parses empty
# fields as NaN by default, so comparing against "" alone never removes them;
# check for both representations.
has_missing_value = (df.isna() | df.eq("")).any(axis=1)
df = df.loc[~has_missing_value].copy()  # explicit copy: the assignments below must not hit a view

# Semester grades rounded to whole numbers (these are the grade features the models use).
df["curricular_units_1st_sem_grade_rounded"] = df["curricular_units_1st_sem_grade"].round(0)
df["curricular_units_2nd_sem_grade_rounded"] = df["curricular_units_2nd_sem_grade"].round(0)

# 80/20 hold-out split. The fixed seed keeps the split, and therefore every
# metric reported on `test_df`, identical across Restart & Run All.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Reference: https://scikit-learn.org/stable/modules/ensemble.html


# Report

In [12]:
from sklearn.metrics import confusion_matrix, RocCurveDisplay
import graphviz
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    roc_curve
)

def report(title, test_items, pass_df, clf):
    """Print evaluation metrics for a fitted binary classifier.

    Args:
        title: Label printed in the ``===title===`` banner above the metrics.
        test_items: Feature rows passed to ``clf.predict`` and ``clf.predict_proba``.
        pass_df: Object whose ``"target"`` entry holds the true labels for
            ``test_items``; labels 0 and 1 are each scored as the positive class.
        clf: Fitted estimator exposing ``predict`` and ``predict_proba``.

    Returns:
        None. Everything is written to stdout.
    """
    print(f"==={title}===")

    predicted_labels = clf.predict(test_items)
    # Second column of predict_proba: the score fed to the AUC computation.
    positive_scores = clf.predict_proba(test_items)[:, 1]
    true_labels = pass_df["target"]

    overall_accuracy = accuracy_score(true_labels, predicted_labels)
    area_under_curve = roc_auc_score(true_labels, positive_scores)

    # (precision, recall) with each label treated as the positive class in turn.
    scores_by_label = {
        label: (
            precision_score(true_labels, predicted_labels, pos_label=label),
            recall_score(true_labels, predicted_labels, pos_label=label),
        )
        for label in (0, 1)
    }
    precision_0, recall_0 = scores_by_label[0]
    precision_1, recall_1 = scores_by_label[1]

    # Per-class precision/recall/f1/support table as formatted text.
    summary_table = classification_report(true_labels, predicted_labels)

    output_lines = (
        "=== Metrics Report ===",
        f"Accuracy: {overall_accuracy:.4f}",
        f"AUC: {area_under_curve:.4f}",
        f"Precision (Class 0): {precision_0:.4f}",
        f"Recall (Class 0):    {recall_0:.4f}",
        f"Precision (Class 1): {precision_1:.4f}",
        f"Recall (Class 1):    {recall_1:.4f}",
        "\nFull Classification Report:",
        summary_table,
    )
    for line in output_lines:
        print(line)



## Load training data


In [13]:
# Feature columns fed to the classifiers trained below.
colums = [
    # first-semester performance
    "curricular_units_1st_sem_grade_rounded",
    "curricular_units_1st_sem_evaluations",
    "curricular_units_1st_sem_approved",
    # second-semester performance
    "curricular_units_2nd_sem_grade_rounded",
    "curricular_units_2nd_sem_evaluations",
    "curricular_units_2nd_sem_approved",
    # remaining attributes
    "course",
    "previous_qualification",
    "special_needs",
]

# Plain Python lists: one feature row per training sample, plus the matching labels.
X = train_df[colums].to_numpy().tolist()
Y = [label for label in train_df["target"]]

## Decision tree

In [14]:
# Fit a single decision tree on the training split. The seed fixes the random
# feature permutation used when choosing splits, so reruns build the same tree
# and report the same metrics.
decision_tree_classifier = tree.DecisionTreeClassifier(random_state=42)
clf = decision_tree_classifier.fit(X, Y)  # fit() returns the estimator itself

# Export the fitted tree as Graphviz DOT text (out_file=None returns a string).
# NOTE(review): export_graphviz reads class_names in ascending label order, so
# "Passed" is drawn for target == 0 and "Drop out" for target == 1 — confirm
# this matches how `target` is encoded in the dataset.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=colums,
    class_names=["Passed", "Drop out"],
    filled=True,
    rounded=True,
    special_characters=True
)

graph = graphviz.Source(dot_data)
graph.render("decision_tree")  # Saves as 'decision_tree.pdf'
#graph.view()  # Opens the tree

# Evaluate on the held-out split.
report("decision_tree", test_df[colums].values.tolist(), test_df, decision_tree_classifier)

===decision_tree===
=== Metrics Report ===
Accuracy: 0.8296
AUC: 0.8418
Precision (Class 0): 0.7763
Recall (Class 0):    0.7727
Precision (Class 1): 0.8619
Recall (Class 1):    0.8643

Full Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.77      0.77       220
           1       0.86      0.86      0.86       361

    accuracy                           0.83       581
   macro avg       0.82      0.82      0.82       581
weighted avg       0.83      0.83      0.83       581



## K-nearest neighbors

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import numpy as np

# k-nearest neighbours is distance based, so the features are standardised
# first; the pipeline applies the scaler fitted on the training data to
# whatever is passed to predict().
knn_classifier = Pipeline(
    steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=5))]
)
knn_classifier.fit(X, Y)

# Title fixed: this cell previously printed its metrics under "decision_tree".
report("knn", test_df[colums].values.tolist(), test_df, knn_classifier)

===decision_tree===
=== Metrics Report ===
Accuracy: 0.8571
AUC: 0.8931
Precision (Class 0): 0.8743
Recall (Class 0):    0.7273
Precision (Class 1): 0.8492
Recall (Class 1):    0.9363

Full Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.73      0.79       220
           1       0.85      0.94      0.89       361

    accuracy                           0.86       581
   macro avg       0.86      0.83      0.84       581
weighted avg       0.86      0.86      0.85       581



## Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes trained on the same feature rows as the other models.
gnb_classifier = GaussianNB()
gnb_classifier.fit(X, Y)

# Title fixed: this cell previously printed its metrics under "decision_tree".
report("naive_bayes", test_df[colums].values.tolist(), test_df, gnb_classifier)

===decision_tree===
=== Metrics Report ===
Accuracy: 0.8382
AUC: 0.8970
Precision (Class 0): 0.9038
Recall (Class 0):    0.6409
Precision (Class 1): 0.8141
Recall (Class 1):    0.9584

Full Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.64      0.75       220
           1       0.81      0.96      0.88       361

    accuracy                           0.84       581
   macro avg       0.86      0.80      0.82       581
weighted avg       0.85      0.84      0.83       581



## Ensemble method (random forest)

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees. The seed fixes the bootstrap samples and feature
# subsets, so reruns produce the same forest and the same metrics.
clf_classifier = RandomForestClassifier(n_estimators=10, random_state=42)
clf_classifier.fit(X, Y)

# Title fixed: this cell previously printed its metrics under "decision_tree".
report("random_forest", test_df[colums].values.tolist(), test_df, clf_classifier)

===decision_tree===
=== Metrics Report ===
Accuracy: 0.8640
AUC: 0.9078
Precision (Class 0): 0.8439
Recall (Class 0):    0.7864
Precision (Class 1): 0.8750
Recall (Class 1):    0.9114

Full Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.79      0.81       220
           1       0.88      0.91      0.89       361

    accuracy                           0.86       581
   macro avg       0.86      0.85      0.85       581
weighted avg       0.86      0.86      0.86       581



# Classifier tester

In [21]:
from classifier_test_framework import ClassifierTestFramework
import pandas as pd

# Feature columns handed to the test framework (same set used for the models above).
colums = ["curricular_units_1st_sem_grade_rounded", "curricular_units_1st_sem_evaluations", "curricular_units_1st_sem_approved",
          "curricular_units_2nd_sem_grade_rounded", "curricular_units_2nd_sem_evaluations", "curricular_units_2nd_sem_approved",
          "course", "previous_qualification", "special_needs"]

classifier_test_framework = ClassifierTestFramework(df, colums)

# Register one candidate per model family. The tree-based estimators are seeded
# so their own randomness does not change results between runs.
# NOTE(review): how the framework splits `df` is not visible here — confirm it
# is seeded too if fully reproducible numbers are required.
classifier_test_framework.add_classifier("DecisionTree", tree.DecisionTreeClassifier(random_state=42))
classifier_test_framework.add_classifier("knn", Pipeline(
    steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=5))]
))
classifier_test_framework.add_classifier("Naive_bayes", GaussianNB())
classifier_test_framework.add_classifier("Ensemble_method", RandomForestClassifier(n_estimators=10, random_state=42))

# The loop variable must not be called `report`: that would rebind the
# module-level report() function to a string and break any earlier cell that
# is re-run after this one.
for key, result_text in classifier_test_framework.get_results().items():
    print(f"==={key}===")
    print(result_text)

{'DecisionTree': '              precision    recall  f1-score   support\n\n           0       0.73      0.81      0.77       219\n           1       0.88      0.81      0.85       362\n\n    accuracy                           0.81       581\n   macro avg       0.80      0.81      0.81       581\nweighted avg       0.82      0.81      0.82       581\n', 'knn': '              precision    recall  f1-score   support\n\n           0       0.91      0.79      0.85       219\n           1       0.88      0.95      0.92       362\n\n    accuracy                           0.89       581\n   macro avg       0.90      0.87      0.88       581\nweighted avg       0.89      0.89      0.89       581\n', 'Naive_bayes': '              precision    recall  f1-score   support\n\n           0       0.87      0.59      0.70       219\n           1       0.79      0.94      0.86       362\n\n    accuracy                           0.81       581\n   macro avg       0.83      0.77      0.78       581\nweigh