# Grid search on Apache Spark: D1 validated with Time Series Cross Validation, D2 with Hold Out

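This notebook implements the Time Series Cross Validation + Hold Out technique and applies it to every hyperparameter configuration of the chosen classifier. As input it requires a pickled tabular dataset such as `1-SignalSelection/dataset/All.pkl`, containing an `ExpID` column, a `Label` column and one column per feature.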

Input: 
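- < Classifier >: the classifier to tune, one of {tree, forest, svm, mlp}
- < OptimizationStep >: the optimization step to test; the dataset is read from the `dataset` folder of that step
- < Selected >: the name of the dataset file to test (without the `.pkl` extension)
- < D1DatasetSize >: the size of the leading portion of the dataset assigned to D1; the remaining rows form D2
- < DeltaT >: the window size in minutes (default: 60 minutes)
- < TesingWindow >: the size of each test window used by the Time Series Cross Validation

Both `D1DatasetSize` and `TesingWindow` are multiplied by `int(62 / DeltaT)` before they are used as row counts.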
    
Output: 

- < "gridresult/"+OptimizationStep >: a semicolon-separated CSV file with the classification performance of each hyperparameter configuration

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix


from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

import pandas as pd
import numpy as np
import random
import pickle
import math  
import json
import time
import sys
import os 

%matplotlib inline
# Disable pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Placeholder for the Spark broadcast handle: main() rebinds this global to
# sc.broadcast(Dataset) and run_classification() reads the dataset from its .value.
broadcastVar = ""

In [2]:
def f1score(p,r):
    """Return the F1 score, i.e. the harmonic mean of precision ``p`` and recall ``r``.

    The result is 0 whenever either argument equals 0.
    """
    if p != 0 and r != 0:
        return (2 * p * r) / (p + r)
    return 0


def getPerformance(y_true, y_pred):
    """Compute the accuracy and the per-class precision / recall / F1 of a prediction.

    Parameters
    ----------
    y_true : sequence
        Ground-truth labels.
    y_pred : sequence
        Predicted labels, aligned element by element with ``y_true``.

    Returns
    -------
    dict
        ``{"green": {...}, "yellow": {...}, "red": {...}, "accuracy": value}``.
        Each per-class dict holds ``precision``, ``recall`` and ``f1-score``
        rounded to 4 decimals. A class that appears in neither ``y_true`` nor
        ``y_pred`` is reported with all three measures equal to 0.

    Raises
    ------
    KeyError
        If a label (converted with ``str``) is not green, yellow or red.
    """
    C = confusion_matrix(y_true, y_pred)

    # With no explicit ``labels`` argument, confusion_matrix orders its rows
    # and columns by the sorted labels found in y_true OR y_pred. Deriving the
    # names from y_true alone would shift every row whenever a class is
    # predicted but never true, so the union is used here as well.
    labels = [str(s) for s in sorted(set(y_true) | set(y_pred))]

    accuracy = round(float(C.diagonal().sum()) / C.sum(), 4)

    # Start from zeros so a class absent from both vectors still has the three
    # measures that consumers of this dict index into.
    stats = {
        name: {"precision": 0, "recall": 0, "f1-score": 0}
        for name in ("green", "yellow", "red")
    }

    for i, label in enumerate(labels):
        hits = float(C[i][i])
        predicted_total = C[:, i].sum()  # column sum: samples predicted as this class
        actual_total = C[i].sum()        # row sum: samples truly of this class

        # An empty row or column means the measure is undefined; report 0.
        p = hits / predicted_total if predicted_total else 0
        r = hits / actual_total if actual_total else 0

        stats[label]['precision'] = round(p, 4)
        stats[label]['recall'] = round(r, 4)
        stats[label]["f1-score"] = round(f1score(p, r), 4)

    stats["accuracy"] = accuracy

    return stats


def TimeSeriesValidationPerformance(clf, D1Dataset, TesingWindow, userScale = True, InitialTrainSize = 100):
    """Score ``clf`` on D1 with an expanding-window time series cross validation.

    The first ``InitialTrainSize`` rows form the first training set. At every
    fold the classifier is refit on all rows seen so far and tested on the
    next ``TesingWindow`` rows, which then join the training set. A trailing
    window shorter than ``TesingWindow`` is never tested. The predictions of
    all folds are pooled and scored once with ``getPerformance``.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refit at every fold.
    D1Dataset : pandas.DataFrame
        Rows in temporal order with an ``ExpID`` column, a ``Label`` column and
        the feature columns.
    TesingWindow : int
        Number of rows tested (and then added to the training set) per fold.
    userScale : bool
        When true, a StandardScaler is fit on each fold's training rows and
        applied to both the training and the test rows of that fold.
    InitialTrainSize : int
        Number of rows in the first training set (100, the value that was
        previously hard-coded).

    Returns
    -------
    dict
        The statistics returned by ``getPerformance`` for the pooled predictions.

    Raises
    ------
    ValueError
        If ``TesingWindow`` or ``InitialTrainSize`` is not positive, or if D1
        has fewer than ``InitialTrainSize + TesingWindow`` rows so that no fold
        can be evaluated.
    """
    if TesingWindow <= 0:
        raise ValueError("TesingWindow must be a positive number of rows, got %r" % (TesingWindow,))
    if InitialTrainSize <= 0:
        raise ValueError("InitialTrainSize must be a positive number of rows, got %r" % (InitialTrainSize,))

    features = list(D1Dataset.columns)
    features.remove("ExpID")
    features.remove("Label")

    X = np.array(D1Dataset[features])
    y = np.array(D1Dataset['Label'])

    scaler = StandardScaler() if userScale else None

    y_pred_all = []
    y_true_all = []

    # train_end is both the size of the training prefix and the first test row;
    # the stop value keeps only folds whose test window is complete.
    for train_end in range(InitialTrainSize, len(X) - TesingWindow + 1, TesingWindow):
        test_end = train_end + TesingWindow

        X_train = X[:train_end]
        X_test = X[train_end:test_end]
        if userScale:
            # Scaling statistics come from the training rows only.
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        clf.fit(X_train, y[:train_end])

        y_pred_all.extend(clf.predict(X_test))
        y_true_all.extend(y[train_end:test_end])

    if not y_true_all:
        raise ValueError(
            "No cross-validation fold could be evaluated: D1 has %d rows but at least %d are required"
            % (len(X), InitialTrainSize + TesingWindow)
        )

    return getPerformance(y_true_all, y_pred_all)


def HoldOutPerformance(clf, D1Dataset, D2Dataset, userScale = True):
    """Hold-out evaluation: fit ``clf`` on all of D1 and score its predictions on D2.

    The features are every D1 column except ``ExpID`` and ``Label``. When
    ``userScale`` equals True a StandardScaler is fit on the D1 features and
    applied to both D1 and D2. Returns the statistics produced by
    ``getPerformance`` for the D2 labels.
    """
    # Everything that is neither the experiment id nor the target is a feature.
    feature_cols = list(D1Dataset.columns)
    for excluded in ("ExpID", "Label"):
        feature_cols.remove(excluded)

    apply_scaling = (userScale == True)
    scaler = StandardScaler()

    # --- training on D1 ---
    train_X = np.array(D1Dataset[feature_cols])
    train_y = list(D1Dataset['Label'])
    if apply_scaling:
        scaler.fit(train_X)
        train_X = scaler.transform(train_X)
    clf.fit(train_X, train_y)

    # --- testing on D2, reusing the scaler fit on D1 ---
    test_X = np.array(D2Dataset[feature_cols])
    test_y = np.array(D2Dataset['Label'])
    if apply_scaling:
        test_X = scaler.transform(test_X)

    return getPerformance(test_y, clf.predict(test_X))

In [3]:
def run_classification(args):
    """Evaluate one hyperparameter configuration; this is the function mapped over the RDD.

    ``args`` is the tuple ``(params_id, clf, classifier, TesingWindow, D1DatasetSize)``.
    The dataset is read from the global ``broadcastVar`` and split by position:
    the first ``D1DatasetSize`` rows are D1, the rest are D2. Returns
    ``(params_id, {"Time": <time series CV stats on D1>, "Hold": <hold-out stats on D2>})``.
    """
    params_id, clf, classifier, TesingWindow, D1DatasetSize = args

    # Features are standardized only for the SVM and the MLP.
    Scaler = classifier in ("svm", "mlp")

    full_dataset = broadcastVar.value
    d1_rows = full_dataset.iloc[:D1DatasetSize]
    d2_rows = full_dataset.iloc[D1DatasetSize:]

    stats = {
        "Time": TimeSeriesValidationPerformance(clf, d1_rows, TesingWindow, Scaler),
        "Hold": HoldOutPerformance(clf, d1_rows, d2_rows, Scaler),
    }

    return (params_id, stats)

In [6]:
def createMLPGrid(N):
    """Build the MLPClassifier hyperparameter grid for a training set of ``N`` rows.

    Two reference layer widths are derived from ``N`` and the number of
    classes ``m = 3``::

        first  = sqrt((m + 2) * N) + 2 * sqrt(N / (m + 2))
        second = m * sqrt(N / (m + 2))

    Ten distinct widths are collected from ``first`` and ``second`` scaled by
    1 and then by 0.4, 0.3, 0.2 and 0.1 (truncated to integers). Every ordered
    pair of widths becomes a two-hidden-layer architecture, and each
    architecture is combined with 100 random states drawn after
    ``random.seed(0)``.

    Parameters
    ----------
    N : number
        Size of the training set the widths are derived from.

    Returns
    -------
    list of dict
        Keyword arguments for ``MLPClassifier``, one dict per configuration.

    Raises
    ------
    ValueError
        If ten distinct widths cannot be collected before the scale factor
        reaches zero (which would produce layers with zero or negative nodes).
    """
    m = 3  # number of classes
    first = np.sqrt((m + 2) * N) + 2 * np.sqrt(N / (m + 2))
    second = m * np.sqrt(N / (m + 2))

    n_widths = 10
    hidden_layer = set([int(first), int(second)])

    i = 6
    while len(hidden_layer) < n_widths:
        # Check BEFORE using i: the factor for i == 9 (0.1) is still valid, so
        # the grid is only unsatisfiable once i reaches 10 with widths missing.
        if i >= 10:
            raise ValueError(
                "Cannot derive %d distinct hidden layer sizes for N=%r: "
                "the number of nodes cannot be 0 or negative" % (n_widths, N)
            )
        scale = 1 - float(i) / 10
        hidden_layer.add(int(first * scale))
        if len(hidden_layer) < n_widths:
            hidden_layer.add(int(second * scale))
        i += 1

    activation_functions = ['logistic']  # , 'tanh']
    tol_array = [1e-4]
    random.seed(0)  # fixed seed: the same 100 random states on every call
    random_states = [random.randint(0, 2 ** 32 - 1) for _ in range(0, 100)]

    # All ordered pairs of widths: (first hidden layer, second hidden layer).
    tuple_layer = [(w1, w2) for w1 in hidden_layer for w2 in hidden_layer]

    conf_list = []
    for layer_sizes in tuple_layer:
        for activation_function in activation_functions:
            for tol in tol_array:
                for rs in random_states:
                    conf_list.append(dict(hidden_layer_sizes=layer_sizes, max_iter=5000, tol=tol,
                                          solver='adam', activation=activation_function,
                                          random_state=rs))

    return conf_list

def loadConfig(classifier, N = 300):
    """Return the list of hyperparameter configurations for ``classifier``.

    For ``"mlp"`` the grid is generated by ``createMLPGrid(N)``. For any other
    name it is read from ``../classes/parameters/grid_<classifier>.json``
    (relative to the current working directory).

    Parameters
    ----------
    classifier : str
        Classifier name; selects the generator or the JSON file.
    N : number
        Training set size forwarded to ``createMLPGrid``; unused otherwise.

    Returns
    -------
    The configurations: a list of dicts for ``"mlp"``, otherwise whatever the
    JSON file contains.
    """
    if classifier == "mlp":
        return createMLPGrid(N)

    grid_path = "../classes/parameters/grid_" + classifier + ".json"
    # The context manager closes the file even if the JSON is malformed.
    with open(grid_path) as grid_file:
        return json.load(grid_file)

def main(Classifier,OptimizationStep,Selected,TesingWindow,DeltaT,D1DatasetSize):
    """Run the grid search on Spark and write one CSV row per configuration.

    Steps: load ``../<OptimizationStep>/dataset/<Selected>.pkl``, broadcast it,
    build one estimator per hyperparameter configuration, evaluate each one in
    its own Spark partition with ``run_classification`` and write the results
    to ``gridresult/<OptimizationStep>/TS_<Selected>-<Classifier>_<TesingWindow>.csv``
    (semicolon separated, ``TesingWindow`` being the value after rescaling).

    Parameters
    ----------
    Classifier : str
        One of ``"forest"``, ``"tree"``, ``"svm"``, ``"mlp"``.
    OptimizationStep : str
        Folder of the step whose dataset is tested; also the output subfolder.
    Selected : str
        Name of the pickled dataset, without the ``.pkl`` extension.
    TesingWindow : int
        Test window size; multiplied by ``int(62 / DeltaT)`` before use.
    DeltaT : number
        Window size used to rescale ``TesingWindow`` and ``D1DatasetSize``.
    D1DatasetSize : int
        Size of D1; multiplied by ``int(62 / DeltaT)`` to obtain the number of
        leading rows assigned to D1.

    Raises
    ------
    ValueError
        If ``Classifier`` is not supported or ``int(62 / DeltaT)`` is below 1.

    Notes
    -----
    Relies on a SparkContext bound to the global name ``sc`` (not created in
    this notebook; presumably provided by the PySpark kernel - verify) and
    rebinds the global ``broadcastVar`` that ``run_classification`` reads.
    """
    global broadcastVar

    estimators = {
        "forest": RandomForestClassifier,
        "tree": DecisionTreeClassifier,
        "svm": SVC,
        "mlp": MLPClassifier,
    }
    # Fail on the driver instead of shipping a placeholder estimator to the workers.
    if Classifier not in estimators:
        raise ValueError("Unknown classifier %r; expected one of %s" % (Classifier, sorted(estimators)))

    # NOTE(review): 62 looks like the duration of one experiment in the same
    # unit as DeltaT, making this the number of windows per experiment - confirm.
    WinExperiment = int(float(62)/float(DeltaT))
    if WinExperiment < 1:
        raise ValueError("DeltaT=%r yields %d windows per experiment; expected at least 1" % (DeltaT, WinExperiment))
    D1DatasetSize *= WinExperiment
    TesingWindow *= WinExperiment

    # SECURITY: pickle.load can execute arbitrary code; only load datasets
    # produced by trusted steps of this pipeline.
    with open("../"+OptimizationStep+"/dataset/"+Selected+".pkl","rb") as dataset_file:
        Dataset = pickle.load(dataset_file)

    broadcastVar = sc.broadcast(Dataset)

    configurations = loadConfig(Classifier,D1DatasetSize)

    # The position in the grid is the configuration ID used in the output file.
    ID_Params = dict(enumerate(configurations))
    clfs = [
        (ID, estimators[Classifier](**params), Classifier, TesingWindow, D1DatasetSize)
        for ID, params in ID_Params.items()
    ]

    print("Run Configuration:")
    print(" Step:",OptimizationStep)
    print(" Step configuration:",Selected)
    print(" Classifier:",Classifier)
    print(" Hyperparameters configurations:",len(clfs))
    print(" D1 Size:",D1DatasetSize)
    print(" Window Shift:",TesingWindow)

    # One partition per configuration; at least one so an empty grid is accepted.
    bs_rdd = sc.parallelize(clfs, numSlices=max(1, len(clfs)))
    Performance = bs_rdd.map(run_classification).collectAsMap()

    # Creates 'gridresult' and the step subfolder in one race-free call.
    os.makedirs('gridresult/'+OptimizationStep, exist_ok=True)

    out_path = "gridresult/"+OptimizationStep+"/TS_"+Selected+"-"+Classifier+"_"+str(TesingWindow)+".csv"
    with open(out_path,"w") as fout:
        fout.write("ID;Conf;D1G-P;D1G-R;D1G-F1;D1Y-P;D1Y-R;D1Y-F1;D1R-P;D1R-R;D1R-F1;D1Accuracy;D2G-P;D2G-R;D2G-F1;D2Y-P;D2Y-R;D2Y-F1;D2R-P;D2R-R;D2R-F1;D2Accuracy\n")

        for ID in range(len(Performance)):
            PerformanceI = Performance[ID]
            fields = [str(ID), str(ID_Params[ID])]
            # Column order matches the header: D1 (time series CV) then D2 (hold out).
            for Validation in ["Time","Hold"]:
                for label in ['green','yellow','red']:
                    for measure in ['precision','recall','f1-score']:
                        fields.append("%.4f"%PerformanceI[Validation][label][measure])
                fields.append("%.4f"%PerformanceI[Validation]['accuracy'])

            fout.write(";".join(fields)+"\n")

    return

In [None]:
# --- Run parameters -------------------------------------------------------
# Classifier to tune; main() builds estimators for "forest", "tree", "svm" and "mlp".
Classifier = "tree"
# Step folder: the dataset is read from ../<OptimizationStep>/dataset/ and the
# results are written under gridresult/<OptimizationStep>/.
OptimizationStep = "2-Windowing"
# Name of the pickled dataset inside that folder, without the .pkl extension.
Selected = "60"

# main() multiplies D1DatasetSize and TesingWindow by int(62 / DeltaT);
# with DeltaT = 60 that factor is 1, so both values are used as row counts as-is.
D1DatasetSize = 300
DeltaT = 60
TesingWindow = 3

# --- Timed run ------------------------------------------------------------
start_time = time.time()
print("Start %s"%(time.strftime(" %R ")))
main(Classifier,OptimizationStep,Selected,TesingWindow,DeltaT,D1DatasetSize)
print("--- %s seconds ---" % (time.time() - start_time))

Start  15:41 
Run Configuration:
 Step: 2-Windowing
 Step configuration: 60
 Classifier: tree
 Hyperparameters configurations: 1344
 D1 Size: 300
 Window Shift: 3
