## Preparing the tools



In [50]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing and exploring the dataset

In [51]:
# Read the semicolon-delimited CSV into a DataFrame and preview the first rows.
data = pd.read_csv('data.csv', delimiter=';')
data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [52]:
## Data preparation
# The last column is used as the label; every column before it is a feature.
features, label = list(data.columns[:-1]), data.columns[-1]
num_features = len(features)

# Summarise the feature set and the distinct values the label takes.
print("Number of features: ", num_features)
print("Features: ", features)
print("Label: ", label, ", Can be: ", data[label].unique())






Number of features:  36
Features:  ['Marital status', 'Application mode', 'Application order', 'Course', 'Daytime/evening attendance\t', 'Previous qualification', 'Previous qualification (grade)', 'Nacionality', "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation", 'Admission grade', 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'International', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Unemployment rate', 'Inflation 

## Data Balancing

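We use SMOTE (Synthetic Minority Over-sampling Technique) to balance the dataset. SMOTE creates synthetic samples for the minority classes by interpolating between existing minority-class samples and their nearest neighbours, so that every class ends up with as many samples as the majority class.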

In [None]:
# Partition the rows by target class to inspect the class balance
# (no resampling happens in this cell; it only counts rows per class).
Dropout = data.loc[data[label] == 'Dropout']
Graduate = data.loc[data[label] == 'Graduate']
Enrolled = data.loc[data[label] == 'Enrolled']

print("Graduate: ", len(Graduate))
print("Dropout: ", len(Dropout))
print("Enrolled: ", len(Enrolled))


Graduate:  2209
Dropout:  1421
Enrolled:  794


In [59]:
from imblearn.over_sampling import SMOTE

# Separate the feature matrix from the target column.
X, y = data[features], data[label]

# Oversample with SMOTE; the fixed random_state keeps the synthetic samples
# reproducible across runs.
# NOTE(review): resampling is applied to the full dataset here. If the
# train/test split happens after this cell, synthetic samples derived from
# test rows may leak into training — confirm against the modelling cells.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Class counts before and after resampling.
print("Original dataset shape",y.value_counts())
print("Resampled dataset shape", y_resampled.value_counts())

# Recombine the resampled features and target into a single frame,
# with the label as the last column.
data_resampled = pd.DataFrame(X_resampled, columns=features).assign(
    **{label: y_resampled}
)

print(data_resampled.head())
print(data_resampled[label].value_counts())

Original dataset shape Target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64
Resampled dataset shape Target
Dropout     2209
Graduate    2209
Enrolled    2209
Name: count, dtype: int64
   Marital status  Application mode  Application order  Course  \
0               1                17                  5     171   
1               1                15                  1    9254   
2               1                 1                  5    9070   
3               1                17                  2    9773   
4               2                39                  1    8014   

   Daytime/evening attendance\t  Previous qualification  \
0                             1                       1   
1                             1                       1   
2                             1                       1   
3                             1                       1   
4                             0                       1   

   Previous qualification (grade)

## Training and Comparing Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score,recall_score,f1_score,confusion_matrix


def model_compare(algorithm,X,y):
    
