In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import train_test_split


# Checks if feature data has been created locally
if not os.path.exists("mask_data.csv"):
    import featurize
    
data = np.array(pd.read_csv('mask_data.csv', sep=',',header=None))
X, Y = np.split(data, [-1], axis=1)
Y = Y.flatten()

# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=0)

# PCA dimensionality reduction

In [67]:
from sklearn.decomposition import PCA

pca = PCA(n_components=X.shape[1])

def binary_search_helper(left, right, X, Y, target):
    if left > right:
        raise ValueError("Binary search helper called with left > right")
    
    mid = (left + right) // 2
    pca.set_params(**{'n_components':mid})
    pca.fit(X, Y) if Y is not None else pca.fit(X)
    variance = sum(pca.explained_variance_ratio_)    
    print("Explained variance ratio when mapping to {} dimensions is {}".format(
        mid, variance
    ))
        
    # Base case
    if left == right:
        # Use None to denote that we tried exploring left path but it was too low
        return left if variance >= target else None 
    
    # Recursive case
    if variance == target:
        return mid
    elif variance >= target:
        left_result = binary_search_helper(left, mid - 1, X, Y, target)
        return left_result if left_result is not None else mid
    else:
        return binary_search_helper(mid + 1, right, X, Y, target)
        
    

def required_dimensions(target, X, Y=None) -> int:
    """
    Returns the minimum number of dimensions we can reduce the dataset given by X and Y to 
    while keeping an explained variance of no less than target.
    """
#     variance = 0
#     n_dims = 0
#     pca = PCA(n_components=n_dims)
#     while variance < target and n_dims < X.shape[0]:
#         n_dims += 1
#         pca.set_params(**{'n_components':n_dims})
#         pca.fit(X, Y) if Y is not None else pca.fit(X)
#         variance = sum(pca.explained_variance_ratio_)
#         print("Explained variance ratio when mapping to {} dimensions is {}".format(
#             n_dims, variance
#         ))
    return binary_search_helper(0, X.shape[1] - 1, X, Y, target)
        

# pca = PCA(n_components=100)
# pca.fit(X, Y)

# print(pca.explained_variance_ratio_)

print(required_dimensions(0.95, X, Y))

Explained variance ratio when mapping to 2047 dimensions is 0.9991406104596937
Explained variance ratio when mapping to 1023 dimensions is 0.9944351054826367
Explained variance ratio when mapping to 511 dimensions is 0.9838490009086879
Explained variance ratio when mapping to 255 dimensions is 0.9661279807652066
Explained variance ratio when mapping to 127 dimensions is 0.9387080996549644
Explained variance ratio when mapping to 191 dimensions is 0.9560223393412376
Explained variance ratio when mapping to 159 dimensions is 0.9486848339439383
Explained variance ratio when mapping to 175 dimensions is 0.952603272966209
Explained variance ratio when mapping to 167 dimensions is 0.9507209368195808
Explained variance ratio when mapping to 163 dimensions is 0.9497079259424271
Explained variance ratio when mapping to 165 dimensions is 0.9502179976307089
Explained variance ratio when mapping to 164 dimensions is 0.9499773774806068
165


Thus, we can try mapping the data to $\mathbb{R}^{165}$ to reduce computational cost later

In [73]:
pca.set_params(**{'n_components':165})
pca.fit(X, Y)
X = pca.transform(X)

# SVM Classification using various kernels

In [75]:
from sklearn import svm

def svm_experiment(kernel : str, C : int, k : int, degree=3):
    """
    Sample code for SVM svc from https://scikit-learn.org/stable/modules/cross_validation.html
    
    Helper function to run k-fold cross validation on a SVM with a specified kernel.
    Returns the trained classifier.
    """
    clf = svm.SVC(kernel=kernel, C=C, random_state=0, degree=degree)
    scores = cross_val_score(clf, X, Y, cv=k)
    print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
    return clf

In [None]:
linear_svm_model = svm_experiment("linear", 1, 5)

In [None]:
gaussian_svm_model = svm_experiment("rbf", 1, 5)

In [None]:
cubic_svm_model = svm_experiment("poly", 1, 5, 5)

# XGBoost parameter fitting

In [None]:
import xgboost as xgb

def xgb_experiment(rounds : int, k : int, param : dict):
    """ 
    Sample code for xgboost from:
    https://xgboost.readthedocs.io/en/latest/python/examples/cross_validation.html
    """
    dtrain = xgb.DMatrix(X, label=Y)

    return xgb.cv(param, dtrain, rounds, nfold=k,
           metrics={'merror'}, early_stopping_rounds=10, seed=0,
           callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)])

xgb_experiment(10, 5, param)

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
"""
Example tuning from 
https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
"""

param = {
    'max_depth':5, 
    'min_child_weight':1, 
    'gamma':0,
    'eta':2, 
    'num_class':4,
    'subsample':0.8,
    'colsample_bytree':0.8,
}

param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}

gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        max_depth=5, 
        min_child_weight=1, 
        gamma=0,
        eta=2, 
        num_class=4,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
    ),
    param_grid=param_test1,
)

gsearch1.fit(X, Y)


In [None]:
gsearch1.best_params_