In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from scipy.stats import ttest_1samp

In [25]:
processed_dataset = pd.read_csv('data/processed-data.csv')
processed_dataset.head()

Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,y_labels
0,0.284211,0.34,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,-1
1,0.684211,0.5,0.037736,0.0,0.230769,0.133333,0.230769,0.741722,0.0,0.0,0.26087,0.181818,0.3,0.735897,0.0,0.732558,0.111111,0.640687,Graduate,1
2,0.284211,0.313684,0.037736,0.0,0.230769,0.0,0.0,0.0,0.0,0.0,0.26087,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,-1
3,0.284211,0.258947,0.056604,0.0,0.230769,0.177778,0.230769,0.711447,0.0,0.0,0.26087,0.30303,0.25,0.667692,0.0,0.209302,0.0,0.124174,Graduate,1
4,0.052632,0.489474,0.528302,0.0,0.230769,0.2,0.192308,0.653422,0.0,0.0,0.26087,0.181818,0.3,0.7,0.0,0.732558,0.111111,0.640687,Graduate,1


In [26]:
feature_columns = processed_dataset.columns.drop(['Target', 'y_labels'])
X = processed_dataset[feature_columns].to_numpy(dtype=float)
y = processed_dataset['y_labels'].to_numpy(dtype=int)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True)

In [28]:
# Augment X matrices with column of 1s (for intercept)
X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])

In [29]:
# Train a soft-margin linear SVM via SGD using hinge loss and regularization term C
def train_linear_svm(X, y, C=0.01, T=1000):
    """
    Runs T iterations of SGD on the hinge loss SVM objective.
    Returns the averaged weight vector w_bar.
    """
    n, d = X.shape
    theta = np.zeros(d)
    w_sum = np.zeros(d)

    for t in range(1, T + 1):
        # form the current predictor
        w_t = theta / (2 * C * t)

        # uniformly pick one point at random 
        i = np.random.randint(n)

        if y[i] * (w_t.dot(X[i])) < 1:
            # only on violations do we update theta
            theta = theta + y[i] * X[i]
            
        # otherwise theta stays the same

        w_sum += w_t

    # return the average of all w_t’s
    return w_sum / T


In [30]:
# Testing model on testing set
w_final = train_linear_svm(X_train, y_train, T=10000)

In [31]:
coeffs    = w_final[:-1]
intercept = w_final[-1]

print(f"Intercept: {intercept:.4f}\n")
print("Feature Coefficients:")
for name, coeff in zip(feature_columns, coeffs):
    print(f"- {name:50s} {coeff: .4f}")

Intercept: -0.5687

Feature Coefficients:
- Previous qualification (grade)                     -0.0848
- Admission grade                                     0.0008
- Age at enrollment                                  -0.4332
- Curricular units 1st sem (credited)                -0.1583
- Curricular units 1st sem (enrolled)                -0.2360
- Curricular units 1st sem (evaluations)             -0.1118
- Curricular units 1st sem (approved)                 0.5436
- Curricular units 1st sem (grade)                    0.6767
- Curricular units 1st sem (without evaluations)      0.0247
- Curricular units 2nd sem (credited)                -0.1085
- Curricular units 2nd sem (enrolled)                -0.2889
- Curricular units 2nd sem (evaluations)              0.0161
- Curricular units 2nd sem (approved)                 1.0655
- Curricular units 2nd sem (grade)                    1.7832
- Curricular units 2nd sem (without evaluations)     -0.0699
- Unemployment rate                        

In [32]:
# Test Accuracy
test_preds = np.sign(X_test.dot(w_final))
test_acc = (test_preds == y_test).mean()
print(f"\nTest Accuracy: {test_acc:.3f}")


Test Accuracy: 0.821


For further evaluation, we'll create a dual-optimization (kernel) SVM to compare. To ensure the feature set of this dual-SGD SVM is robust, we’ll first run five-fold cross-validation on this linear SVM using fixed folds each time in order to evaluate each coefficient via statistical analysis. Features that fail to reach some threshold (i.e. features with p-value >= 0.05) will be removed. We expect that this will improve the performance of the subsequent kernel-based SVM.

In [33]:
kf = KFold(n_splits=5, shuffle=False) # shuffle=False ensures folds remain fixed across iterations
coefs, val_accs = [], []

In [34]:
for tr_idx, val_idx in kf.split(X_train):
    X_tr, y_tr = X_train[tr_idx], y_train[tr_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]
    
    w_fold = train_linear_svm(
        X_tr, 
        y_tr,
    )
    coefs.append(w_fold)
    
    preds = np.sign(X_val.dot(w_fold))
    val_accs.append((preds == y_val).mean())

coefs = np.vstack(coefs)

In [35]:
print("Validation Accuracies:", val_accs)
print(f"Mean Cross-Validation Accuracy: {np.mean(val_accs):.3f} ± {np.std(val_accs, ddof=1):.3f}")

Validation Accuracies: [0.8008474576271186, 0.8050847457627118, 0.8036723163841808, 0.8107344632768362, 0.809052333804809]
Mean Cross-Validation Accuracy: 0.806 ± 0.004


In [36]:
# Statistical Analysis on 5-fold Cross Validation
coef_mean = coefs.mean(axis=0)
coef_std  = coefs.std(axis=0, ddof=1)
t_stats, p_vals = ttest_1samp(coefs, popmean=0.0, axis=0)

results = pd.DataFrame({
    'feature': list(feature_columns) + ['bias'],
    'mean_w': coef_mean,
    'std_w' : coef_std,
    't_stat': t_stats,
    'p_value': p_vals
}).sort_values('p_value')

results

Unnamed: 0,feature,mean_w,std_w,t_stat,p_value
7,Curricular units 1st sem (grade),0.965018,0.076517,28.200696,9e-06
13,Curricular units 2nd sem (grade),1.734813,0.203902,19.024629,4.5e-05
2,Age at enrollment,-0.611754,0.072229,-18.93865,4.6e-05
6,Curricular units 1st sem (approved),0.684366,0.091476,16.728839,7.5e-05
12,Curricular units 2nd sem (approved),1.170004,0.156914,16.672901,7.6e-05
18,bias,-0.558805,0.171061,-7.304551,0.001868
4,Curricular units 1st sem (enrolled),-0.16387,0.063722,-5.750372,0.004534
10,Curricular units 2nd sem (enrolled),-0.180038,0.086788,-4.638636,0.009744
5,Curricular units 1st sem (evaluations),-0.16639,0.090661,-4.103829,0.014807
1,Admission grade,0.143948,0.093407,3.44597,0.026151


All features with a p-value less than 0.05 are statistically significant and will be kept. Features with higher p-values will be excluded from the model.

Significant Features (p < 0.05):
- Curricular units 1st sem (grade)
- Curricular units 2nd sem (grade)
- Age at enrollment
- Curricular units 1st sem (approved)
- Curricular units 2nd sem (approved)
- Curricular units 1st sem (enrolled)
- Curricular units 2nd sem (enrolled)
- Curricular units 1st sem (evaluations)
- Admission grade
- Inflation rate

Non-Significant Features (p ≥ 0.05):
- Curricular units 2nd sem (credited)
- Unemployment rate
- Previous qualification (grade)
- Curricular units 2nd sem (without evaluations)
- Curricular units 1st sem (credited)
- Curricular units 2nd sem (evaluations)
- GDP
- Curricular units 1st sem (without evaluations)

In [37]:
# Removing non-significant features
nonsignificant_features = [
    'Curricular units 2nd sem (credited)',
    'Unemployment rate',
    'Previous qualification (grade)',
    'Curricular units 2nd sem (without evaluations)',
    'Curricular units 1st sem (credited)',
    'Curricular units 2nd sem (evaluations)',
    'GDP',
    'Curricular units 1st sem (without evaluations)'
]

processed_dataset = processed_dataset.drop(columns=nonsignificant_features, errors='ignore', axis=1)
processed_dataset.head()

Unnamed: 0,Admission grade,Age at enrollment,Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Inflation rate,Target,y_labels
0,0.34,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.488889,Dropout,-1
1,0.5,0.037736,0.230769,0.133333,0.230769,0.741722,0.26087,0.3,0.735897,0.111111,Graduate,1
2,0.313684,0.037736,0.230769,0.0,0.0,0.0,0.26087,0.0,0.0,0.488889,Dropout,-1
3,0.258947,0.056604,0.230769,0.177778,0.230769,0.711447,0.26087,0.25,0.667692,0.0,Graduate,1
4,0.489474,0.528302,0.230769,0.2,0.192308,0.653422,0.26087,0.3,0.7,0.111111,Graduate,1


In [38]:
processed_dataset.to_csv('data/significant-features-data.csv', index=False)