In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from scipy.stats import ttest_1samp

In [None]:
# Every column except the raw target and its encoded label is a model input.
feature_columns = processed_dataset.columns.drop(['Target', 'y_labels'])

# Design matrix as floats; encoded labels as integers.
X = processed_dataset.loc[:, feature_columns].to_numpy(dtype=float)
y = processed_dataset.loc[:, 'y_labels'].to_numpy(dtype=int)

In [None]:
# Unshuffled 5-way split: without shuffling, kf.split(X) yields the same folds on every run.
kf = KFold(shuffle=False, n_splits=5)
# Accumulator for the coefficient vector learned on each fold.
coefs = list()

In [None]:
# Linear model: SGD-trained classifier with hinge loss (linear SVM objective)
# and an L2 penalty, so each feature gets exactly one coefficient to inspect.
sgd_svm = SGDClassifier(
    # objective
    loss='hinge',
    penalty='l2',
    # optimisation schedule and stopping rule
    learning_rate='optimal',
    max_iter=1000,
    tol=1e-4,
    # fixed seed so repeated fits are reproducible
    random_state=0,
)

In [None]:

# Fit the linear model on each fold's training split and record its coefficient vector.
# The rows are collected in a cell-local list that is rebuilt on every run, so
# re-executing this cell neither appends duplicate rows nor fails because `coefs`
# has already been converted to an ndarray by a previous run.
fold_coefs = []
for tr_idx, _ in kf.split(X):  # the held-out indices are not needed here
    sgd_svm.fit(X[tr_idx], y[tr_idx])
    # copy() detaches the stored row from the estimator's own coef_ buffer
    fold_coefs.append(sgd_svm.coef_.ravel().copy())

coefs = np.vstack(fold_coefs)  # one row per fold

In [None]:
# Statistical analysis: one-sample t-test of each feature's coefficient against zero.
# Each row of `coefs` is one fitted coefficient vector, so axis=0 aggregates over rows.
# NOTE(review): if the rows come from K-fold training splits (which share most of their
# samples) they are not independent, and no multiple-comparison correction is applied;
# treat the p-values as a heuristic ranking rather than exact significance levels.
coef_mean = coefs.mean(axis=0)
coef_std = coefs.std(axis=0, ddof=1)  # ddof=1 -> sample standard deviation
t_stats, p_vals = ttest_1samp(coefs, popmean=0.0, axis=0)  # H0: mean coefficient is 0

results = pd.DataFrame({
    'feature'   : feature_columns,
    'mean_coef' : coef_mean,
    'std_coef'  : coef_std,
    't_stat'    : t_stats,
    'p_value'   : p_vals
}).sort_values('p_value')  # most significant features first

# Bare last expression -> rich notebook table instead of plain-text print output.
results

Each p-value tests whether a feature's mean coefficient across the folds differs from zero. Features with p < 0.05 are treated as statistically significant and kept; features with p ≥ 0.05 are excluded from the model.

Significant Features (p < 0.05):
- Curricular units 1st sem (approved)
- Age at enrollment
- Curricular units 2nd sem (approved)
- Curricular units 2nd sem (enrolled)
- Curricular units 1st sem (grade)
- Curricular units 1st sem (enrolled)
- Curricular units 1st sem (credited)
- Curricular units 2nd sem (credited)
- Curricular units 2nd sem (grade)
- Curricular units 2nd sem (evaluations)
- Curricular units 1st sem (without evaluations)
- Admission grade
- Unemployment rate

Non-Significant Features (p ≥ 0.05):
- Inflation rate
- Curricular units 1st sem (evaluations)
- GDP
- Previous qualification (grade)
- Curricular units 2nd sem (without evaluations)

In [None]:
# Drop the features whose coefficients were not significant (p >= 0.05).
nonsignificant_features = [
    'Inflation rate',
    'Curricular units 1st sem (evaluations)',
    'GDP',
    'Previous qualification (grade)',
    'Curricular units 2nd sem (without evaluations)',
]

# errors='ignore' skips labels that are already absent, so the cell can be re-run safely.
processed_dataset = processed_dataset.drop(columns=nonsignificant_features, errors='ignore')
processed_dataset.head()