In [151]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from scipy.stats import ttest_1samp

In [152]:
# Read data/processed-data.csv into a DataFrame and preview its first rows
processed_dataset = pd.read_csv('data/processed-data.csv')
processed_dataset.head()

Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,y_labels
0,0.284211,0.34,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,-1
1,0.684211,0.5,0.037736,0.0,0.230769,0.133333,0.230769,0.741722,0.0,0.0,0.26087,0.181818,0.3,0.735897,0.0,0.732558,0.111111,0.640687,Graduate,1
2,0.284211,0.313684,0.037736,0.0,0.230769,0.0,0.0,0.0,0.0,0.0,0.26087,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,-1
3,0.284211,0.258947,0.056604,0.0,0.230769,0.177778,0.230769,0.711447,0.0,0.0,0.26087,0.30303,0.25,0.667692,0.0,0.209302,0.0,0.124174,Graduate,1
4,0.052632,0.489474,0.528302,0.0,0.230769,0.2,0.192308,0.653422,0.0,0.0,0.26087,0.181818,0.3,0.7,0.0,0.732558,0.111111,0.640687,Graduate,1


In [153]:
# Every column except the two label columns is treated as a model input.
label_columns = ['Target', 'y_labels']
feature_columns = processed_dataset.columns.drop(label_columns)

# Inputs as a float matrix; targets are the integer values stored in 'y_labels'.
X = processed_dataset.loc[:, feature_columns].to_numpy(dtype=float)
y = processed_dataset.loc[:, 'y_labels'].to_numpy(dtype=int)

In [154]:
# Hold out 20% of the rows for testing; the fixed random_state makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [155]:
# Augment X matrices with a trailing column of 1s (for the intercept).
# Slicing to the raw feature count first keeps this cell safe to re-run: without the slice,
# each additional execution would stack one more column of 1s onto the already-augmented arrays.
n_raw_features = len(feature_columns)
X_train = np.hstack([X_train[:, :n_raw_features], np.ones((X_train.shape[0], 1))])
X_test = np.hstack([X_test[:, :n_raw_features], np.ones((X_test.shape[0], 1))])

In [156]:
# Soft-margin linear SVM fitted by per-sample subgradient descent on the hinge loss
def train_linear_svm_sgd(X, y, epochs=1000, lr=0.01, C=1.0):
    """Fit a linear SVM weight vector with plain (unshuffled) per-sample SGD.

    Parameters
    ----------
    X : 2-D array; one row per sample. No intercept is added here, so any bias
        term must already be present as a column of X.
    y : sequence of labels indexed in step with the rows of X.
        Presumably the labels are -1 / +1 -- the function does not check this.
    epochs : number of full passes over the rows of X, always in row order.
    lr : step size applied to every update.
    C : weight of the hinge-loss term relative to the w regulariser.

    Returns
    -------
    1-D float array with one weight per column of X.
    """
    _, n_weights = X.shape
    weights = np.zeros(n_weights)

    for _epoch in range(epochs):
        for row_idx, sample in enumerate(X):
            label = y[row_idx]
            score = label * (weights.dot(sample))
            # Margin satisfied -> only the regulariser (w itself) contributes;
            # otherwise the hinge term adds -C * y_i * x_i to the step direction.
            step = weights if score >= 1 else weights - C * label * sample
            weights -= lr * step

    return weights

In [157]:
# Fit the linear SVM on the training split (the test split is only scored in a later cell)
w_final = train_linear_svm_sgd(X_train, y_train, epochs=1000, lr=0.01, C=1.0)

In [158]:
# The last weight is reported as the intercept; all preceding weights pair up with feature_columns.
coeffs, intercept = w_final[:-1], w_final[-1]

print(f"Intercept: {intercept:.4f}\n")
print("Feature Coefficients:")
feature_weights = zip(feature_columns, coeffs)
for name, coeff in feature_weights:
    # Name left-aligned in 50 characters, weight with a leading space for non-negative values
    print(f"- {name:50s} {coeff: .4f}")

Intercept: 0.2163

Feature Coefficients:
- Previous qualification (grade)                      0.0934
- Admission grade                                     0.0748
- Age at enrollment                                  -0.0098
- Curricular units 1st sem (credited)                -0.0036
- Curricular units 1st sem (enrolled)                 0.0492
- Curricular units 1st sem (evaluations)              0.0367
- Curricular units 1st sem (approved)                 0.0780
- Curricular units 1st sem (grade)                    0.2139
- Curricular units 1st sem (without evaluations)     -0.0064
- Curricular units 2nd sem (credited)                -0.0006
- Curricular units 2nd sem (enrolled)                 0.0573
- Curricular units 2nd sem (evaluations)              0.0510
- Curricular units 2nd sem (approved)                 0.1123
- Curricular units 2nd sem (grade)                    0.2435
- Curricular units 2nd sem (without evaluations)     -0.0084
- Unemployment rate                         

In [160]:
# Test Accuracy: share of test rows whose predicted sign equals the stored label
test_scores = np.dot(X_test, w_final)
test_preds = np.sign(test_scores)
is_correct = test_preds == y_test
test_acc = is_correct.mean()
print(f"\nTest Accuracy: {test_acc:.3f}")


Test Accuracy: 0.679


Because the linear SVM produces unsatisfactory test accuracy, we’ll switch to the dual-optimization (kernel) SVM. To ensure our feature set is robust, we’ll first run five-fold cross-validation on the linear SVM, keeping the folds fixed between runs, and evaluate each coefficient via statistical analysis. Features that fail to reach the significance threshold (i.e., features with p-value ≥ 0.05) will be removed. We expect that this will improve the performance of the subsequent kernel-based SVM.

In [161]:
# Five unshuffled folds: with shuffle=False the folds remain fixed across iterations
kf = KFold(n_splits=5, shuffle=False)

# Accumulators filled by the cross-validation loop: per-fold weights and validation accuracies
coefs = []
val_accs = []

In [162]:
# Start from fresh accumulators so this cell can be executed on its own, repeatedly.
# `coefs` ends this cell as an ndarray, so appending to it directly would raise
# AttributeError on a second run; `val_accs` is reset alongside it to keep both in sync.
fold_weights, val_accs = [], []

for tr_idx, val_idx in kf.split(X_train):
    # Rows used to fit this fold's model vs. the rows held out to score it
    X_tr, y_tr = X_train[tr_idx], y_train[tr_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]

    w_fold = train_linear_svm_sgd(
        X_tr, y_tr,
        epochs=500,
        lr=0.01,
        C=1.0
    )
    fold_weights.append(w_fold)

    # Validation accuracy: share of held-out rows whose predicted sign equals the label
    preds = np.sign(X_val.dot(w_fold))
    val_accs.append((preds == y_val).mean())

# Stack the per-fold weight vectors: one row per fold, one column per weight
coefs = np.vstack(fold_weights)

In [164]:
# Per-fold accuracies, then their mean and sample standard deviation (ddof=1)
cv_mean = np.mean(val_accs)
cv_std = np.std(val_accs, ddof=1)

print("Validation Accuracies:", val_accs)
print(f"Mean Cross-Validation Accuracy: {cv_mean:.3f} ± {cv_std:.3f}")

Validation Accuracies: [0.6624293785310734, 0.652542372881356, 0.7076271186440678, 0.7033898305084746, 0.6676096181046676]
Mean Cross-Validation Accuracy: 0.679 ± 0.025


Since the 5-fold cross-validation accuracy is very similar to the test accuracy, we will proceed with statistical analysis to identify the most significant features in an effort to improve the performance of the kernel-based SVM.

In [165]:
# Statistical Analysis on 5-fold Cross Validation
# Column-wise mean and sample standard deviation (ddof=1) of the stacked fold weights
coef_mean = np.mean(coefs, axis=0)
coef_std = np.std(coefs, axis=0, ddof=1)

# One-sample t-test per weight column against a population mean of 0
# NOTE(review): the fold models are presumably fit on overlapping training rows, so the
# per-fold weights may not be independent samples -- confirm how the p-values should be read.
t_stats, p_vals = ttest_1samp(coefs, popmean=0.0, axis=0)

# One row per weight; the extra 'bias' label names the final weight column
summary_columns = {
    'feature': [*feature_columns, 'bias'],
    'mean_w': coef_mean,
    'std_w' : coef_std,
    't_stat': t_stats,
    'p_value': p_vals,
}
results = pd.DataFrame(summary_columns)
results = results.sort_values('p_value')

results

Unnamed: 0,feature,mean_w,std_w,t_stat,p_value
13,Curricular units 2nd sem (grade),0.243052,0.001084,501.187239,9.509106e-11
0,Previous qualification (grade),0.093023,0.000893,233.005463,2.035324e-09
12,Curricular units 2nd sem (approved),0.112812,0.001107,227.884931,2.224503e-09
1,Admission grade,0.074425,0.000865,192.337208,4.383487e-09
15,Unemployment rate,0.087134,0.001797,108.410363,4.34132e-08
6,Curricular units 1st sem (approved),0.079027,0.002395,73.792838,2.020984e-07
7,Curricular units 1st sem (grade),0.217639,0.008281,58.7651,5.021522e-07
18,bias,0.220423,0.009168,53.76252,7.165257e-07
11,Curricular units 2nd sem (evaluations),0.052467,0.003298,35.572183,3.727559e-06
17,GDP,0.128679,0.008187,35.146078,3.911141e-06


All features with a p-value less than 0.05 are considered statistically significant and will be kept. Features with a p-value of 0.05 or greater will be excluded from the model.

Significant Features (p < 0.05):
- Curricular units 2nd sem (grade)
- Previous qualification (grade)
- Curricular units 2nd sem (approved)
- Admission grade
- Unemployment rate
- Curricular units 1st sem (approved)
- Curricular units 1st sem (grade)
- Curricular units 2nd sem (evaluations)
- GDP
- Inflation rate
- Curricular units 2nd sem (enrolled)
- Curricular units 1st sem (enrolled)
- Curricular units 1st sem (evaluations)
- Curricular units 2nd sem (without evaluations)
- Age at enrollment

Non-Significant Features (p ≥ 0.05):
- Curricular units 1st sem (without evaluations)
- Curricular units 2nd sem (credited)
- Curricular units 1st sem (credited)

In [166]:
# Removing non-significant features
nonsignificant_features = [
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 1st sem (credited)',
]

# `columns=` already targets the column axis; errors='ignore' skips any name that is not present
processed_dataset = processed_dataset.drop(columns=nonsignificant_features, errors='ignore')
processed_dataset.head()

Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,y_labels
0,0.284211,0.34,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,-1
1,0.684211,0.5,0.037736,0.230769,0.133333,0.230769,0.741722,0.26087,0.181818,0.3,0.735897,0.0,0.732558,0.111111,0.640687,Graduate,1
2,0.284211,0.313684,0.037736,0.230769,0.0,0.0,0.0,0.26087,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,-1
3,0.284211,0.258947,0.056604,0.230769,0.177778,0.230769,0.711447,0.26087,0.30303,0.25,0.667692,0.0,0.209302,0.0,0.124174,Graduate,1
4,0.052632,0.489474,0.528302,0.230769,0.2,0.192308,0.653422,0.26087,0.181818,0.3,0.7,0.0,0.732558,0.111111,0.640687,Graduate,1


In [167]:
# Write processed_dataset to CSV; index=False leaves the row index out of the file
processed_dataset.to_csv('data/significant-features-data.csv', index=False)