In [2]:
import os
import csv
import time
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn import preprocessing
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from sklearn.metrics import rand_score, normalized_mutual_info_score, adjusted_rand_score

# Read Data

In [3]:
def read_from_arff(path):
    """Load one ARFF file into a 3-D sample array and integer-encoded labels.

    The first column of the parsed file is expected to hold, for every row, a
    nested record array that is indexed as ``row[j]`` and whose records can be
    expanded with ``list(row[j])``; the second column holds the class label.

    Parameters
    ----------
    path : str
        Path of the ``.arff`` file. It is opened as UTF-8 text.

    Returns
    -------
    X : numpy.ndarray
        Array of shape ``(n_rows, n_inner, n_values)``. ``n_inner`` and
        ``n_values`` are taken from the first row, so every row must have the
        same layout.
        NOTE(review): ``n_inner`` presumably corresponds to the series
        dimensions and ``n_values`` to the time steps -- confirm against the
        dataset format.
    Y : numpy.ndarray
        Labels encoded with ``LabelEncoder.fit_transform``. The encoding is
        fitted on this file only.

    Raises
    ------
    ValueError
        If the file contains no data rows.
    """
    # The context manager closes the handle even when parsing raises.
    with open(path, 'r', encoding='utf-8') as f:
        data = arff.loadarff(f)
        df = pd.DataFrame(data[0])

    series_col = df[df.columns[0]]
    if series_col.empty:
        raise ValueError('No data rows found in ARFF file: {0}'.format(path))

    # The layout of the first row defines the shape of the whole array.
    first_row = series_col.iloc[0]
    n_rows = series_col.shape[0]
    n_inner = first_row.shape[0]
    n_values = len(first_row[0])

    X = np.zeros((n_rows, n_inner, n_values))
    for i, sample in enumerate(series_col):
        for j in range(n_inner):
            X[i, j, :] = list(sample[j])

    def _label_to_str(label):
        # Decode bytes labels explicitly instead of slicing their "b'...'"
        # repr, which would corrupt any label that is not a bytes object.
        if isinstance(label, bytes):
            return label.decode('utf-8')
        return str(label)

    labels = df[df.columns[1]].map(_label_to_str)
    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(labels)

    return X, Y

In [4]:
def ArffDataset_Generate(root_path, dataset_name):
    """Build one scaled dataset from the TRAIN and TEST ARFF files of a dataset.

    Both files are looked up under ``<root_path>/<dataset_name>/`` and read
    with ``read_from_arff``. Their samples and labels are concatenated (TRAIN
    first), the last two axes of the sample array are swapped, and the result
    is passed through ``TimeSeriesScalerMeanVariance(mu=0., std=1.)``.

    NOTE(review): ``read_from_arff`` fits its label encoder per file, so the
    integer codes of the two splits only line up when both files contain the
    same set of classes -- confirm for new datasets.

    Parameters
    ----------
    root_path : str
        Folder that contains one sub-folder per dataset.
    dataset_name : str
        Name of the dataset sub-folder and prefix of its ARFF files.

    Returns
    -------
    tuple
        ``(X, Y, num_clusters)``: the scaled samples after ``np.nan_to_num``,
        the concatenated labels, and the number of distinct label values.
    """
    loaded_splits = []
    for path_template in ('{0}/{1}/{1}_TRAIN.arff', '{0}/{1}/{1}_TEST.arff'):
        split_path = path_template.format(root_path, dataset_name)
        loaded_splits.append(read_from_arff(split_path))
    (train_samples, train_labels), (test_samples, test_labels) = loaded_splits

    all_labels = np.concatenate((train_labels, test_labels), axis=0)
    all_samples = np.concatenate((train_samples, test_samples), axis=0)

    # Swap the last two axes before scaling.
    swapped = np.transpose(all_samples, (0, 2, 1))
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    scaled = scaler.fit_transform(swapped)

    distinct_labels = set(all_labels)
    return np.nan_to_num(scaled), all_labels, len(distinct_labels)

In [5]:
# Root folder holding one sub-folder per dataset, and the dataset to load.
# ArffDataset_Generate joins these as '<root>/<name>/<name>_TRAIN.arff' and
# '<root>/<name>/<name>_TEST.arff'; with the trailing slash here the resulting
# path contains '//'.
DATASET_PATH = '../data/multivariate_example/'
DATASET_NAME = 'SpokenArabicDigits'

# ts: scaled samples, labels: encoded classes, num_clusters: distinct label count.
ts, labels, num_clusters = ArffDataset_Generate(DATASET_PATH, DATASET_NAME)

# CPU Benchmark

In [7]:
from kshape.core import KShapeClusteringCPU

In [8]:
# Time 5 fits of the CPU implementation. A new KShapeClusteringCPU is built on
# every iteration, and both construction and fit() are inside the timed span.
# After the loop, `ksc` refers to the model from the last run.
cpu_times = []
for run in range(5):
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()

    ksc = KShapeClusteringCPU(n_clusters=num_clusters, n_jobs=-1)
    ksc.fit(ts)

    cpu_times.append(time.perf_counter() - start_time)

In [10]:
# Average of the per-run timings collected in cpu_times.
mean_cpu_time = np.mean(cpu_times)
print('Mean CPU Benchmark for 5 Runs:', mean_cpu_time)

Mean CPU Benchmark for 5 Runs: 1182.4650375843048


In [11]:
# Cluster labels from `ksc`, the model left over from the last benchmark run.
predictions = ksc.labels_

# Copy the fitted centroids into one array with a slot per cluster.
# NOTE(review): the trailing dimension is hard-coded to 1 -- confirm that each
# ksc.centroids_[k] has (or broadcasts to) shape (ts.shape[1], 1).
cluster_centers = np.zeros((num_clusters, ts.shape[1], 1))
for k in range(num_clusters):
    cluster_centers[k, :, :] = ksc.centroids_[k]

In [12]:
# Score the current `predictions` against the reference `labels`.
ri_ks = rand_score(predictions, labels)
ari_ks = adjusted_rand_score(predictions, labels)
nmi_ks = normalized_mutual_info_score(predictions, labels)

# Report the three scores, one line per metric.
for metric_name, score in (
    ('Rand Score:', ri_ks),
    ('Adjusted Rand Score:', ari_ks),
    ('Normalized Mutual Information:', nmi_ks),
):
    print(metric_name, score)

Rand Score: 0.8300454160386519
Adjusted Rand Score: 0.1279219457884177
Normalized Mutual Information: 0.19734751142416793


# GPU Benchmark

In [7]:
from kshape.core_gpu import KShapeClusteringGPU

In [8]:
# Time 5 fits of the GPU implementation. A new KShapeClusteringGPU is built on
# every iteration, and both construction and fit() are inside the timed span.
# After the loop, `ksg` refers to the model from the last run.
gpu_times = []
for run in range(5):
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()

    ksg = KShapeClusteringGPU(n_clusters=num_clusters)
    ksg.fit(ts)

    gpu_times.append(time.perf_counter() - start_time)

In [9]:
# Average of the per-run timings collected in gpu_times.
mean_gpu_time = np.mean(gpu_times)
print('Mean GPU Benchmark for 5 Runs:', mean_gpu_time)

Mean GPU Benchmark for 5 Runs: 5687.338460683823


In [12]:
# Cluster labels from `ksg`, the model left over from the last benchmark run.
predictions = ksg.labels_

# Gather the fitted centroids into one (num_clusters, ts.shape[1], 1) array.
cluster_centers = np.zeros(shape=(num_clusters, ts.shape[1], 1))
for k in range(num_clusters):
    cluster_centers[k] = ksg.centroids_[k]

In [14]:
# Score the current `predictions` against the reference `labels`.
ri_ks = rand_score(predictions, labels)
ari_ks = adjusted_rand_score(predictions, labels)
nmi_ks = normalized_mutual_info_score(predictions, labels)

# Report the three scores, one line per metric.
for metric_name, score in (
    ('Rand Score:', ri_ks),
    ('Adjusted Rand Score:', ari_ks),
    ('Normalized Mutual Information:', nmi_ks),
):
    print(metric_name, score)

Rand Score: 0.8260587762112686
Adjusted Rand Score: 0.11377073826893268
Normalized Mutual Information: 0.18563626255616614
