In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Apply seaborn's "white" style globally for the plots drawn in this notebook.
sns.set(style="white")
# Alternative theme kept for reference:
#sns.set(style="whitegrid", color_codes=True)

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings (e.g. from
# scikit-learn or pandas); consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import matthews_corrcoef
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn import svm
from sklearn.svm import SVC

In [2]:
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that yields contiguous, non-overlapping blocks.

    The samples are cut into ``n_splits`` consecutive blocks of
    ``len(X) // n_splits`` samples each. Within every block the first half is
    the training set and the remainder (after skipping ``margin`` samples) is
    the validation set, so no block ever shares samples with another block.

    Any remainder samples at the end (when ``len(X)`` is not a multiple of
    ``n_splits``) are not part of any fold.

    Parameters
    ----------
    n_splits : int
        Number of blocks (folds) to generate.
    margin : int, default 0
        Number of samples skipped between the end of the training half and
        the start of the validation half of each block.

    Raises
    ------
    ValueError
        If ``margin`` is negative (train and validation would overlap).
    """

    def __init__(self, n_splits, margin=0):
        if margin < 0:
            raise ValueError("margin must be >= 0, got %r" % (margin,))
        self.n_splits = n_splits
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; the arguments are accepted but ignored.

        All parameters default to ``None`` so the method can be called with
        no arguments, matching the signature of ``split``.
        """
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, validation_indices)`` for each block.

        Parameters
        ----------
        X : sized
            Only ``len(X)`` is used.
        y, groups : ignored
            Present so the signature matches what callers pass.

        Yields
        ------
        tuple of numpy.ndarray
            Integer positions of the training and validation samples.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # First half of the block trains, second half validates.
            mid = start + k_fold_size // 2
            yield indices[start:mid], indices[mid + self.margin:stop]

In [3]:
def train_model(data, n_splits=16, C=1, gamma=1e-14):
    """Fit an RBF-kernel SVC on the training half of the last blocked fold.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'UpDownSign'`` column, which is used as the target;
        every other column is used as a feature.
    n_splits : int, default 16
        Number of blocks passed to ``BlockingTimeSeriesSplit``.
    C : float, default 1
        Regularisation parameter forwarded to ``svm.SVC``.
    gamma : float, default 1e-14
        RBF kernel coefficient forwarded to ``svm.SVC``.

    Returns
    -------
    sklearn.svm.SVC
        The fitted classifier.

    Raises
    ------
    ValueError
        If the splitter produces no folds or the selected training slice is
        empty (too few rows for ``n_splits`` blocks).

    Notes
    -----
    The defaults for ``C`` and ``gamma`` are the values that were hard-coded
    here; presumably they came from a ``GridSearchCV`` run over the blocked
    splitter -- TODO confirm.
    NOTE(review): ``gamma=1e-14`` is extremely small; unless the features are
    on a very large scale the RBF kernel will be close to constant -- confirm
    this is intended.
    """
    y = data['UpDownSign']
    X = data.drop(columns=['UpDownSign'])

    btss = BlockingTimeSeriesSplit(n_splits=n_splits)

    # Only the last fold was ever used for fitting (earlier folds were
    # overwritten in a loop), so take it directly instead of slicing the
    # frames once per fold.
    # NOTE(review): this trains on a single half-block of the data -- confirm
    # that discarding the earlier folds is intended.
    folds = list(btss.split(X, y))
    if not folds:
        raise ValueError("BlockingTimeSeriesSplit produced no folds for n_splits=%r" % (n_splits,))
    train_idx, _validation_idx = folds[-1]
    if len(train_idx) == 0:
        raise ValueError(
            "not enough rows (%d) to build a non-empty training block with n_splits=%r"
            % (len(X), n_splits)
        )

    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]

    svm_best = svm.SVC(kernel='rbf', C=C, gamma=gamma)
    svm_best.fit(X_train, y_train)

    return svm_best