In [48]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score
from joblib import Parallel, delayed

In [49]:
data = pd.read_csv('data/significant-features-data.csv')
data.head()

Unnamed: 0,Age at enrollment,Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Unemployment rate,Target,y_labels
0,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372093,Dropout,-1
1,0.037736,0.133333,0.230769,0.741722,0.0,0.26087,0.3,0.735897,0.732558,Graduate,1
2,0.037736,0.0,0.0,0.0,0.0,0.26087,0.0,0.0,0.372093,Dropout,-1
3,0.056604,0.177778,0.230769,0.711447,0.0,0.26087,0.25,0.667692,0.209302,Graduate,1
4,0.528302,0.2,0.192308,0.653422,0.0,0.26087,0.3,0.7,0.732558,Graduate,1


In [50]:
feature_columns = data.columns.drop(['Target', 'y_labels'])
X = data[feature_columns].to_numpy(dtype=float)
y = data['y_labels'].to_numpy(dtype=int)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True)

In [52]:
# Augment X matrices with column of 1s (for intercept)
X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])

In [53]:
# Polynomial Kernel
def polynomial_kernel(A, B, *, degree=3, gamma=1., coef0=1.):
    """
    K(x, x') = (gamma * <x, x'> + coef0)^degree
    """
    return (gamma * (A @ B.T) + coef0) ** degree

In [54]:
# Gaussian Kernel
def gaussian_kernel(A, B, *, gamma=1.):
    """
    K(x, x') = exp( - gamma * ||x - x'||^2 )
    """
    A2 = (A**2).sum(1)[:, None]
    B2 = (B**2).sum(1)[None, :]
    return np.exp(-gamma * (A2 + B2 - 2 * (A @ B.T)))

In [55]:
def train_kernel_svm_sgd(X, y, kernel, T=1000, C=1.0, **kernel_params):
    """
    Dual-SGD for hinge-loss SVM using a precomputed Gram matrix.
    Returns the averaged alpha coefficients.
    """
    n = X.shape[0]
    # precompute Gram matrix
    K = kernel(X, X, **kernel_params)
    
    # betas accumulate raw counts; we'll convert to alphas each step
    beta = np.zeros(n)
    alpha_sum = np.zeros(n)
    
    for t in range(1, T + 1):
        # compute current alphas from beta
        alpha = beta / (2 * C * t)
        
        # uniformly pick one example at random
        j = np.random.randint(n)
        
        # decision function margin for j
        margin_j = y[j] * np.dot(alpha * y, K[:, j])
        
        # if margin_j < 1, we incur hinge loss → update beta_j
        if margin_j < 1:
            beta[j] += y[j]
        
        alpha_sum += alpha
    
    # return the averaged alpha over all iterations
    return alpha_sum / T

In [56]:
def predict_kernel_svm(alpha, X_train, y_train, X_eval, kernel, **kernel_params):
    """
    Predict sign( f(x) ) where
      f(x) = sum_i alpha_i * y_i * K(x_i, x)
    """
    return np.sign((alpha * y_train) @ kernel(X_train, X_eval, **kernel_params))

In [57]:
def cross_validate(params, X, y, kernel, *, n_folds=5, T=500):
    """
    Perform k-fold cross-validation for kernel SVM with given parameters
    Returns average accuracy across folds
    """
    C = params['C']
    kernel_params = {k: v for k, v in params.items() if k != 'C'}

    acc = []
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

    for train_idx, val_idx in skf.split(X, y):
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        alpha = train_kernel_svm_sgd(X_tr, y_tr, kernel, C=C, T=T, **kernel_params)
        preds = predict_kernel_svm(alpha, X_tr, y_tr, X_val, kernel, **kernel_params)

        acc.append((preds == y_val).mean())

    return np.mean(acc)

In [58]:
def grid_search_svm(X, y, kernel, param_grid, *, n_folds=5, T=500, n_jobs=-1):
    """
    Perform grid search to find best hyperparameters
    Returns best parameters and best score
    """
    grid = list(ParameterGrid(param_grid))
    scores = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(cross_validate)(p, X, y, kernel, n_folds=n_folds, T=T) for p in grid
    )
    best_i = int(np.argmax(scores))
    return grid[best_i], scores[best_i]

In [59]:
# Parameter grids for both kernels
poly_grid = dict(degree=[2,3,4], gamma=[.1,1,10], coef0=[0,1], C=[.1,1,10])
gaussian_grid  = dict(gamma=[.01,.1,1,10], C=[.1,1,10])

In [60]:
# Perform grid search for polynomial kernel
print(">> Polynomial grid-search")
best_poly, poly_cv = grid_search_svm(X_train, y_train, polynomial_kernel, poly_grid)
print("Best poly params:", best_poly, "CV =", poly_cv)

>> Polynomial grid-search


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.8s


Best poly params: {'C': 0.1, 'coef0': 0, 'degree': 2, 'gamma': 0.1} CV = 0.6787228601794804


[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    5.4s finished


In [61]:

# Perform grid search for Gaussian kernel
print("\n>> Gaussian grid-search")
best_gaussian, gaussian_cv = grid_search_svm(X_train, y_train, gaussian_kernel, gaussian_grid)
print("Best gaussian params:", best_gaussian, "CV =", gaussian_cv)


>> Gaussian grid-search


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Best gaussian params: {'C': 0.1, 'gamma': 0.01} CV = 0.6787228601794804


[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:    1.6s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.7s finished


In [65]:
def fit_and_eval(best_params, kernel, name):
    C = best_params['C']
    kernel_params = {k: v for k, v in best_params.items() if k != 'C'}

    alpha = train_kernel_svm_sgd(X_train, y_train, kernel, C=C, T=1000, **kernel_params)
    preds = predict_kernel_svm(alpha, X_train, y_train, X_test, kernel, **kernel_params)

    acc = accuracy_score(y_test, preds)
    print(f"\n{name} kernel - test accuracy: {acc:.4f}")


In [66]:
# Train Polynomial kernel model
fit_and_eval(best_poly, polynomial_kernel, "Polynomial")


Polynomial kernel - test accuracy: 0.6791


In [67]:
# Train Gaussian kernel model
fit_and_eval(best_gaussian, gaussian_kernel, "Gaussian")


Gaussian kernel - test accuracy: 0.6791
