In [1]:
import numpy as np, humanfriendly as hf, random
import time
from sklearn.model_selection import train_test_split,\
     RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

def get_scores(model, xtrain, ytrain, xtest, ytest):
    ypred = model.predict(xtest)
    train = model.score(xtrain, ytrain)
    test = model.score(xtest, y_test)
    name = model.__class__.__name__
    return (name, train, test)

def get_cross(model, data, target, groups=10):
    return cross_val_score(model, data, target, cv=groups)

def prep_data(data, target):
    d = [data[i] for i, _ in enumerate(data)]
    t = [target[i] for i, _ in enumerate(target)]
    return list(zip(d, t))

def create_sample(d, n, replace='yes'):
    if replace == 'yes': s = random.sample(d, n)
    else: s = [random.choice(d)
               for i, _ in enumerate(d) if i < n]
    Xs = [row[0] for i, row in enumerate(s)]
    ys = [row[1] for i, row in enumerate(s)]
    return np.array(Xs), np.array(ys)

def see_time(note):
    end = time.perf_counter()
    elapsed = end - start
    print (note,
           hf.format_timespan(elapsed, detailed=True))
    
# Convert binary fields to numeric boolean fields
def binary_encode(x):
    if x == 'Yes':
        return 1
    elif x == 'No':
        return 0
    else:
        return None
    
# df['x'] = df['x'].apply(binary_encode)


if __name__ == "__main__":
    br = '\n'
    X = np.load('data/X_init.npy')
    # need to add allow_pickle=True parameter
    y = np.load('data/y_init.npy', allow_pickle=True)
    print("output y:", y)
    print("max y:", y.max())
    #y = np.where(y=='Y', 1, 0)
    
    
    sample_size = 4000
    data = prep_data(X, y)

    Xs, ys = create_sample(data, sample_size, replace='no')
    Xs = StandardScaler().fit_transform(Xs.astype(np.float64))
    X_train, X_test, y_train, y_test = train_test_split(Xs, ys, random_state=0)
    
    svm = SVC(random_state=0, gamma='auto')
    print(svm, br)
    svm.fit(X_train, y_train)
    
    svm_scores = get_scores(svm, X_train, y_train,
                            X_test, y_test)
    print (svm_scores[0] + ' (train, test):')
    print (svm_scores[1], svm_scores[2], br)

    Cs = [0.0001, 0.001]
    param_grid = {'C': Cs}
    start = time.perf_counter()
    rand = RandomizedSearchCV(svm, param_grid, cv=3, n_jobs = -1,
                              random_state=0, verbose=2,
                              n_iter=2)
    rand.fit(X, y)
    see_time('RandomizedSearchCV total tuning time:')
    bp = rand.best_params_
    print (bp, br)
    svm_tuned = SVC(**bp, gamma='auto', random_state=0)
    svm_tuned.fit(X_train, y_train)
    svm_scores = get_scores(svm_tuned, X_train, y_train,
                            X_test, y_test)
    print (svm_scores[0] + ' (train, test):')
    print (svm_scores[1], svm_scores[2], br)
    print ('cross-validation score:')
    svm = SVC(gamma='auto')
    scores = get_cross(svm, Xs, ys)
    print (np.mean(scores))

output y: [0 0 1 ... 0 0 0]
max y: 1
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False) 

SVC (train, test):
0.7466666666666667 0.736 

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   12.0s remaining:   12.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   14.2s finished


RandomizedSearchCV total tuning time: 15 seconds, 400 milliseconds, 37 microseconds and 488.24 nanoseconds
{'C': 0.0001} 

SVC (train, test):
0.7386666666666667 0.734 

cross-validation score:
0.739
