In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from scipy.stats import ttest_1samp

In [4]:
# Read the preprocessed dataset from disk and preview its first rows.
processed_data_path = 'data/processed-data.csv'
processed_dataset = pd.read_csv(processed_data_path)
processed_dataset.head()

Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,y_labels
0,0.284211,0.34,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,0.0
1,0.684211,0.5,0.037736,0.0,0.230769,0.133333,0.230769,0.741722,0.0,0.0,0.26087,0.181818,0.3,0.735897,0.0,0.732558,0.111111,0.640687,Graduate,1.0
2,0.284211,0.313684,0.037736,0.0,0.230769,0.0,0.0,0.0,0.0,0.0,0.26087,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,0.0
3,0.284211,0.258947,0.056604,0.0,0.230769,0.177778,0.230769,0.711447,0.0,0.0,0.26087,0.30303,0.25,0.667692,0.0,0.209302,0.0,0.124174,Graduate,1.0
4,0.052632,0.489474,0.528302,0.0,0.230769,0.2,0.192308,0.653422,0.0,0.0,0.26087,0.181818,0.3,0.7,0.0,0.732558,0.111111,0.640687,Graduate,1.0


In [5]:
# Every column except the two label columns is used as a model input.
label_columns = ['Target', 'y_labels']
feature_columns = processed_dataset.columns.drop(label_columns)

# Target vector: the numeric labels cast to integers.
y = processed_dataset['y_labels'].to_numpy(dtype=int)

# Design matrix: the feature columns as a float array, in `feature_columns` order.
X = processed_dataset.loc[:, feature_columns].to_numpy(dtype=float)

In [6]:
# Five folds without shuffling: the partition depends only on row order, so
# every call to kf.split() produces the same folds.
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=False)

# Accumulator for the coefficient vectors collected during cross-validation.
coefs = []

In [7]:
# Linear model: an SGD-trained classifier with hinge loss and an L2 penalty.
svm_settings = dict(
    loss='hinge',
    penalty='l2',
    learning_rate='optimal',
    max_iter=1000,       # upper bound on passes over the training data
    tol=1e-4,            # stopping tolerance
    random_state=0,      # fixed seed for repeatable fits
)
sgd_svm = SGDClassifier(**svm_settings)

In [8]:

# Fit the classifier on the training part of every fold and keep its weights.
# The per-fold vectors are gathered in a list created inside this cell, so the
# cell can be re-run on its own: appending directly to `coefs` fails on a second
# run because `coefs` is rebound to an ndarray at the end of this cell.
fold_coefs = []
for tr_idx, _ in kf.split(X):   # the held-out indices are not needed here
    sgd_svm.fit(X[tr_idx], y[tr_idx])
    # Flatten coef_ to 1-D so each fold contributes exactly one row.
    fold_coefs.append(sgd_svm.coef_.ravel())

# Stack into a 2-D array: one row per fold, one column per coefficient.
coefs = np.vstack(fold_coefs)

In [9]:
# Statistical analysis of the per-fold coefficients.
# Rows of `coefs` are folds and columns are features, so every statistic below
# is taken along axis 0 (across folds, separately for each feature).
coef_mean = coefs.mean(axis=0)
coef_std = coefs.std(axis=0, ddof=1)   # ddof=1 -> sample standard deviation

# One-sample t-test per feature; null hypothesis: the mean coefficient is 0.
# NOTE(review): the folds are drawn from one dataset, so the per-fold
# coefficients are presumably not independent samples -- treat the p-values as
# a rough screen and confirm the test's assumptions are acceptable here.
t_stats, p_vals = ttest_1samp(coefs, popmean=0.0, axis=0)

# One row per feature, smallest p-value first.
results = pd.DataFrame({
    'feature'   : feature_columns,
    'mean_coef' : coef_mean,
    'std_coef'  : coef_std,
    't_stat'    : t_stats,
    'p_value'   : p_vals
}).sort_values('p_value')

# Leave the frame as the cell's last expression so the notebook renders it as a
# table; print() wraps wide frames and splits the columns across chunks.
results

                                           feature  mean_coef  std_coef  \
6              Curricular units 1st sem (approved)   5.077315  0.231933   
2                                Age at enrollment  -1.216301  0.074518   
12             Curricular units 2nd sem (approved)   8.421425  0.522875   
10             Curricular units 2nd sem (enrolled)  -6.301770  0.412962   
7                 Curricular units 1st sem (grade)  -0.895191  0.075816   
4              Curricular units 1st sem (enrolled)  -3.216235  0.640707   
3              Curricular units 1st sem (credited)  -0.961903  0.207080   
9              Curricular units 2nd sem (credited)  -2.042352  0.449366   
13                Curricular units 2nd sem (grade)   0.614231  0.192646   
11          Curricular units 2nd sem (evaluations)   0.761693  0.266705   
8   Curricular units 1st sem (without evaluations)   0.795203  0.297378   
1                                  Admission grade   0.270322  0.139745   
15                       

Features with a p-value below 0.05 are treated as statistically significant and are kept. Features with a p-value of 0.05 or higher are excluded from the model.

Significant Features (p < 0.05):
- Curricular units 1st sem (approved)
- Age at enrollment
- Curricular units 2nd sem (approved)
- Curricular units 2nd sem (enrolled)
- Curricular units 1st sem (grade)
- Curricular units 1st sem (enrolled)
- Curricular units 1st sem (credited)
- Curricular units 2nd sem (credited)
- Curricular units 2nd sem (grade)
- Curricular units 2nd sem (evaluations)
- Curricular units 1st sem (without evaluations)
- Admission grade
- Unemployment rate

Non-Significant Features (p ≥ 0.05):
- Inflation rate
- Curricular units 1st sem (evaluations)
- GDP
- Previous qualification (grade)
- Curricular units 2nd sem (without evaluations)

In [11]:
# Drop the features listed as non-significant (p >= 0.05).
nonsignificant_features = [
    'Inflation rate',
    'Curricular units 1st sem (evaluations)',
    'GDP',
    'Previous qualification (grade)',
    'Curricular units 2nd sem (without evaluations)',
]

# errors='ignore' skips names that are no longer present, which keeps this cell
# re-runnable after the columns have already been removed.
processed_dataset = processed_dataset.drop(
    columns=nonsignificant_features,
    errors='ignore',
)
processed_dataset.head()

Unnamed: 0,Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Unemployment rate,Target,y_labels
0,0.34,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372093,Dropout,0.0
1,0.5,0.037736,0.0,0.230769,0.230769,0.741722,0.0,0.0,0.26087,0.181818,0.3,0.735897,0.732558,Graduate,1.0
2,0.313684,0.037736,0.0,0.230769,0.0,0.0,0.0,0.0,0.26087,0.0,0.0,0.0,0.372093,Dropout,0.0
3,0.258947,0.056604,0.0,0.230769,0.230769,0.711447,0.0,0.0,0.26087,0.30303,0.25,0.667692,0.209302,Graduate,1.0
4,0.489474,0.528302,0.0,0.230769,0.192308,0.653422,0.0,0.0,0.26087,0.181818,0.3,0.7,0.732558,Graduate,1.0


In [13]:
# Save the reduced dataset without the row index, then preview it.
# NOTE(review): "data-data" in the file name looks like an accidental
# repetition -- confirm before renaming, since other code may read this path.
output_csv_path = 'data/significant-features-data-data.csv'
processed_dataset.to_csv(output_csv_path, index=False)
processed_dataset.head()

Unnamed: 0,Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Unemployment rate,Target,y_labels
0,0.34,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372093,Dropout,0.0
1,0.5,0.037736,0.0,0.230769,0.230769,0.741722,0.0,0.0,0.26087,0.181818,0.3,0.735897,0.732558,Graduate,1.0
2,0.313684,0.037736,0.0,0.230769,0.0,0.0,0.0,0.0,0.26087,0.0,0.0,0.0,0.372093,Dropout,0.0
3,0.258947,0.056604,0.0,0.230769,0.230769,0.711447,0.0,0.0,0.26087,0.30303,0.25,0.667692,0.209302,Graduate,1.0
4,0.489474,0.528302,0.0,0.230769,0.192308,0.653422,0.0,0.0,0.26087,0.181818,0.3,0.7,0.732558,Graduate,1.0
