In [159]:
import pandas as pd
import stumpy
import numpy as np
import datetime as dt
import random
import math
import pickle
import sys

from statistics import mean
from tqdm.auto import tqdm
from multiprocessing import Pool

import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [160]:
## PART 3
# Hyper-parameter axes. Every combination of one value from each axis
# becomes one row of parameter_list.
shapelet_samples_list = range(18)
shapelet_size_list = [500]
clf_samples_list = [400000]

# Cartesian product of the three axes as an (n_combinations, 3) integer array;
# the columns follow the order of the lists passed to meshgrid.
parameter_list = (
    np.stack(np.meshgrid(shapelet_samples_list, shapelet_size_list, clf_samples_list))
    .T
    .reshape(-1, 3)
)

print(parameter_list)



[[     0    500 400000]
 [     1    500 400000]
 [     2    500 400000]
 [     3    500 400000]
 [     4    500 400000]
 [     5    500 400000]
 [     6    500 400000]
 [     7    500 400000]
 [     8    500 400000]
 [     9    500 400000]
 [    10    500 400000]
 [    11    500 400000]
 [    12    500 400000]
 [    13    500 400000]
 [    14    500 400000]
 [    15    500 400000]
 [    16    500 400000]
 [    17    500 400000]]


In [161]:
from sklearn.model_selection import KFold
def cross_validate(clf, X, y, topk=(1, 3, 5), n_splits=5):
    '''
    Perform cross-validation of sklearn classifier on training data samples
    and report the top-k accuracy of every fold.

    Input:

        clf: sklearn classifier object providing fit, predict_proba and classes_.
             It is re-fitted once per fold, so on return it is fitted on the
             training part of the last fold.
        X: x values; must support indexing with an integer index array
           (X[train_index]), e.g. a numpy array
        y: y values, one label per row of X
        topk: k values for evaluation metrics
        n_splits: Number of folds.
    Output:
        list of scores, one for each fold, where each score is a list of
        length len(topk) holding the top-k accuracy on that fold's validate data
    '''
    # Positional indexing regardless of the container type of y
    # (list, numpy array, pandas Series with an arbitrary index, ...).
    labels = np.asarray(y)

    # Only n_splits is set; every other KFold option keeps its default.
    kf = KFold(n_splits=n_splits)

    cv_score_list = []
    for train_index, validate_index in kf.split(X):
        clf.fit(X[train_index], labels[train_index])
        y_prob = np.asarray(clf.predict_proba(X[validate_index]))
        y_validate = labels[validate_index]

        # Column j of predict_proba belongs to clf.classes_[j], NOT to label j.
        # The two differ whenever labels are not exactly 0..n-1 or a class is
        # absent from this fold's training part, so map columns back to labels.
        classes = np.asarray(clf.classes_)
        n_classes = y_prob.shape[1]

        cv_scores = []
        for k in topk:
            # argpartition raises if k exceeds the number of columns.
            k_eff = min(k, n_classes)
            top_cols = np.argpartition(y_prob, -k_eff, axis=1)[:, -k_eff:]
            hits = (classes[top_cols] == y_validate[:, None]).any(axis=1)
            cv_scores.append(float(hits.mean()))

        cv_score_list.append(cv_scores)

    return cv_score_list
    
    
    
    
    

In [162]:

def classifier_performance(clf, X, y, topk=(1, 3, 5), bCrossValidate=True, n_splits=5, random_state=None):
    '''
    Evaluate performance of sklearn classifier on data samples - 90/10 training testing split

    Input:

        clf: sklearn classifier object providing fit, predict_proba and classes_.
             On return it is fitted on the 90% training part.
        X: x values
        y: y values
        topk: k values for evaluation metrics
        bCrossValidate: A boolean variable defining if cross-validation is required
        n_splits: Number of cross-validation folds
        random_state: forwarded to train_test_split; pass an int to make the
                      90/10 split reproducible. The default (None) leaves the
                      split unseeded.
    Output (in this order):
        list of scores, one for each fold, where each score is of length topk with
        accuracy for validate data; -1 if bCrossValidate is false
        list of length topk with accuracy for testing data
    '''
    cv_score_list = -1

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_state
    )

    # Cross-validation only ever sees the training part of the split.
    if bCrossValidate:
        cv_score_list = cross_validate(clf, X_train, y_train, topk, n_splits)

    clf.fit(X_train, y_train)
    y_prob = np.asarray(clf.predict_proba(X_test))

    # Positional indexing regardless of the container type returned by the split.
    y_true = np.asarray(y_test)

    # Column j of predict_proba belongs to clf.classes_[j], NOT to label j,
    # so the top-k columns are translated back to labels before comparing.
    classes = np.asarray(clf.classes_)
    n_classes = y_prob.shape[1]

    scores = []
    for k in topk:
        # argpartition raises if k exceeds the number of columns.
        k_eff = min(k, n_classes)
        top_cols = np.argpartition(y_prob, -k_eff, axis=1)[:, -k_eff:]
        hits = (classes[top_cols] == y_true[:, None]).any(axis=1)
        scores.append(float(hits.mean()))

    return cv_score_list, scores

In [163]:
import os

# Evaluation settings; they are the same for every parameter combination.
bCrossValidate = True
Num_Instance = 800  # only the first Num_Instance rows of X / y are evaluated
n_splits = 4

for parameters in parameter_list:

    # The X and y pickles of one parameter combination share the same file name.
    basename = 'num=' + str(parameters[0]) + 'size=' + str(parameters[1]) + 'samples=' + str(parameters[2])
    x_filename = '../results/data/X/' + basename
    y_filename = '../results/data/y/' + basename

    # Skip combinations with a missing file. Both files are checked before
    # anything is loaded, so X is not unpickled for nothing when y is absent.
    if not os.path.isfile(x_filename):
        continue
    print(x_filename)
    if not os.path.isfile(y_filename):
        continue
    print(y_filename)

    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that were produced by a trusted run of this project.
    with open(x_filename, 'rb') as f:
        X = pickle.load(f)
    with open(y_filename, 'rb') as f:
        y = pickle.load(f)

    clf = RandomForestClassifier()
    cv_score_list, scores = classifier_performance(clf, X[:Num_Instance,:], y[:Num_Instance],bCrossValidate=bCrossValidate,n_splits=n_splits)

    # To run on entire dataset, replace the above line with the following
    #cv_score_list, scores = classifier_performance(clf, X, y,bCrossValidate=bCrossValidate,n_splits=n_splits)

    print("",cv_score_list,"\n",scores)
    print("===========================================")


    #outfile_name = "../results/scores/" + 'num=' + str(parameters[0]) + 'size=' + str(parameters[1]) + 'samples=' + str(parameters[2])

    #with open(outfile_name, 'wb') as f:
    #    pickle.dump(scores, f)

../results/data/X/num=4size=500samples=400000
../results/data/y/num=4size=500samples=400000
 [[0.016666666666666666, 0.03888888888888889, 0.06666666666666667], [0.005555555555555556, 0.03888888888888889, 0.05555555555555555], [0.016666666666666666, 0.05555555555555555, 0.08888888888888889], [0.011111111111111112, 0.044444444444444446, 0.07222222222222222]] 
 [0.0, 0.0125, 0.05]
../results/data/X/num=6size=500samples=400000
../results/data/y/num=6size=500samples=400000
 [[0.0, 0.005555555555555556, 0.03333333333333333], [0.011111111111111112, 0.03333333333333333, 0.044444444444444446], [0.005555555555555556, 0.016666666666666666, 0.027777777777777776], [0.0, 0.005555555555555556, 0.011111111111111112]] 
 [0.0125, 0.025, 0.0625]
