# Data Preprocesing Notebook

In [128]:
import numpy as np
import pandas as pd

In [129]:
# Importing raw data set of student dropout and academic success
raw_data = pd.read_csv('data/data.csv', sep=";")
raw_data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,5,9,127.3,1,0,0,1,1,0,20,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,3,3,142.5,1,0,0,0,1,0,19,0,0,6,6,6,14.0,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,9,9,124.8,1,0,0,0,1,0,19,0,0,6,0,0,0.0,0,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,5,3,119.6,1,0,0,1,0,0,20,0,0,6,8,6,13.428571,0,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,9,9,141.5,0,0,0,1,0,0,45,0,0,6,9,5,12.333333,0,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [130]:
# Removing non-ordinal (nominal) features
nonordinal_features = ['Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 
                    'Scholarship holder', 'International', 'Marital Status', 'Daytime/evening attendance	', 'Application mode', 'Application order',
                    'Course', 'Previous qualification', "Mother's qualification", "Father's qualification",
                    "Mother's occupation", "Father's occupation", 'Nacionality', 'Marital status']

processed_dataset = raw_data.drop(columns=nonordinal_features, errors='ignore', axis=1)
processed_dataset.head()

Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,122.0,127.3,20,0,0,0,0,0.0,0,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,160.0,142.5,19,0,6,6,6,14.0,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,122.0,124.8,19,0,6,0,0,0.0,0,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,122.0,119.6,20,0,6,8,6,13.428571,0,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,100.0,141.5,45,0,6,9,5,12.333333,0,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [131]:
# Determining the proportion of labels for nondroput vs dropout to ensure (and assess) relatively good label balance
target = processed_dataset['Target'].to_numpy()

total_points = len(target)

nondropout = np.where((target == 'Graduate') | (target == 'Enrolled'))[0]

dropout_proportion = ((total_points - len(nondropout)) / total_points)
nondropout_proportion = 1 - dropout_proportion

print(f"Dropout proportion:     {dropout_proportion*100:.2f}%")
print(f"Non-dropout proportion: {nondropout_proportion*100:.2f}%")


Dropout proportion:     32.12%
Non-dropout proportion: 67.88%


The proportion of labels is sufficiently balanced to proceed with modeling.

In [132]:

processed_dataset['y_labels'] = np.where((processed_dataset['Target'] == 'Graduate') | (processed_dataset['Target'] == 'Enrolled'), 1, -1)
processed_dataset.head()


Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,y_labels
0,122.0,127.3,20,0,0,0,0,0.0,0,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout,-1
1,160.0,142.5,19,0,6,6,6,14.0,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate,1
2,122.0,124.8,19,0,6,0,0,0.0,0,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout,-1
3,122.0,119.6,20,0,6,8,6,13.428571,0,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate,1
4,100.0,141.5,45,0,6,9,5,12.333333,0,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate,1


In [133]:
# Feature Scaling
# Resources Used: 
#   - https://stackoverflow.com/questions/26225344/why-feature-scaling-in-svm
#   - https://scikit-learn.org/stable/modules/preprocessing.html

from sklearn.preprocessing import MinMaxScaler

features = processed_dataset.columns.drop('Target')

# Normalizes scale of features so that all data points are in range between 0 and 1
min_max_scaler = MinMaxScaler()
processed_dataset[features] = min_max_scaler.fit_transform(processed_dataset[features])

processed_dataset.head()


Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,y_labels
0,0.284211,0.34,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,0.0
1,0.684211,0.5,0.037736,0.0,0.230769,0.133333,0.230769,0.741722,0.0,0.0,0.26087,0.181818,0.3,0.735897,0.0,0.732558,0.111111,0.640687,Graduate,1.0
2,0.284211,0.313684,0.037736,0.0,0.230769,0.0,0.0,0.0,0.0,0.0,0.26087,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,0.0
3,0.284211,0.258947,0.056604,0.0,0.230769,0.177778,0.230769,0.711447,0.0,0.0,0.26087,0.30303,0.25,0.667692,0.0,0.209302,0.0,0.124174,Graduate,1.0
4,0.052632,0.489474,0.528302,0.0,0.230769,0.2,0.192308,0.653422,0.0,0.0,0.26087,0.181818,0.3,0.7,0.0,0.732558,0.111111,0.640687,Graduate,1.0


We are performing 5-fold cross-validation with a linear SVM on a fixed, pre-defined folds to quantify each feature’s significance. By fitting a purely linear model, we can collect its coefficients across folds and apply statistical tests (large t-statistics ↔ small p-values) to rigorously determine which features are truly significant.

In [134]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from scipy.stats import ttest_1samp

In [135]:
feature_columns = processed_dataset.columns.drop(['Target', 'y_labels'])
X = processed_dataset[feature_columns].to_numpy(dtype=float)
y = processed_dataset['y_labels'].to_numpy(dtype=int)

In [136]:
kf = KFold(n_splits=5, shuffle=False)   # shuffle=False ensures folds remain fixed across iterations
coefs = []    

In [137]:
# Choosing a linear model 
sgd_svm = SGDClassifier(
    loss='hinge',         
    penalty='l2',       
    learning_rate='optimal',
    max_iter=1000,
    tol=1e-4,
    random_state=0
)

In [138]:

for tr_idx, _ in kf.split(X):
    sgd_svm.fit(X[tr_idx], y[tr_idx])
    coefs.append(sgd_svm.coef_.ravel())

coefs = np.vstack(coefs) 

In [139]:
# Statistical Analysis
coef_mean = coefs.mean(axis=0)
coef_std  = coefs.std(axis=0, ddof=1)          
coef_ratio = coef_std / coef_mean
t_stats, p_vals = ttest_1samp(coefs, popmean=0.0, axis=0)

results = pd.DataFrame({
    'feature'   : feature_columns,
    'mean_coef' : coef_mean,
    'std_coef'  : coef_std,
    'ratio'     : coef_ratio,
    't_stat'    : t_stats,
    'p_value'   : p_vals
}).sort_values('p_value')

print(results)

                                           feature  mean_coef  std_coef      ratio     t_stat   p_value
6              Curricular units 1st sem (approved)   5.077315  0.231933   0.045680  48.950397  0.000001
2                                Age at enrollment  -1.216301  0.074518  -0.061266 -36.497570  0.000003
12             Curricular units 2nd sem (approved)   8.421425  0.522875   0.062089  36.014125  0.000004
10             Curricular units 2nd sem (enrolled)  -6.301770  0.412962  -0.065531 -34.122259  0.000004
7                 Curricular units 1st sem (grade)  -0.895191  0.075816  -0.084693 -26.402089  0.000012
4              Curricular units 1st sem (enrolled)  -3.216235  0.640707  -0.199210 -11.224659  0.000359
3              Curricular units 1st sem (credited)  -0.961903  0.207080  -0.215282 -10.386701  0.000485
9              Curricular units 2nd sem (credited)  -2.042352  0.449366  -0.220024 -10.162848  0.000528
13                Curricular units 2nd sem (grade)   0.614231  0

All features with a p-value less than 0.05 are statistically significant and will be kept. Features with higher p-values will be excluded from the model.

Significant Features (p < 0.05):
- Curricular units 1st sem (approved)
- Age at enrollment
- Curricular units 2nd sem (approved)
- Curricular units 2nd sem (enrolled)
- Curricular units 1st sem (grade)
- Curricular units 1st sem (enrolled)
- Curricular units 1st sem (credited)
- Curricular units 2nd sem (credited)
- Curricular units 2nd sem (grade)
- Curricular units 2nd sem (evaluations)
- Curricular units 1st sem (without evaluations)
- Admission grade
- Unemployment rate

Non-Significant Features (p ≥ 0.05):
- Inflation rate
- Curricular units 1st sem (evaluations)
- GDP
- Previous qualification (grade)
- Curricular units 2nd sem (without evaluations)

In [143]:
# Removing non-significant features
nonsignificant_features = ['Inflation rate', 'Curricular units 1st sem (evaluations)', 'GDP', 'Previous qualification (grade)', 'Curricular units 2nd sem (without evaluations)']

processed_dataset = processed_dataset.drop(columns=nonsignificant_features, errors='ignore', axis=1)
processed_dataset.head()

Unnamed: 0,Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Unemployment rate,Target,y_labels
0,0.34,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372093,Dropout,0.0
1,0.5,0.037736,0.0,0.230769,0.230769,0.741722,0.0,0.0,0.26087,0.181818,0.3,0.735897,0.732558,Graduate,1.0
2,0.313684,0.037736,0.0,0.230769,0.0,0.0,0.0,0.0,0.26087,0.0,0.0,0.0,0.372093,Dropout,0.0
3,0.258947,0.056604,0.0,0.230769,0.230769,0.711447,0.0,0.0,0.26087,0.30303,0.25,0.667692,0.209302,Graduate,1.0
4,0.489474,0.528302,0.0,0.230769,0.192308,0.653422,0.0,0.0,0.26087,0.181818,0.3,0.7,0.732558,Graduate,1.0


In [144]:
processed_dataset.to_csv('data/processed-data.csv', index=False)