In [1]:
%matplotlib inline

In [2]:
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
### our imports

from src.utils import (get_data_from_directory, get_files_directory_list, 
                       one_hot_encoding, TimeSeriesDataset,get_device, train_clf)

from src.TFE import *

In [3]:
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(0)

In [4]:
#!wget -nc "http://www.timeseriesclassification.com/Downloads/Archives/Univariate2018_arff.zip"
#!unzip -q -n "Univariate2018_arff.zip"

In [5]:
directory_list = get_files_directory_list()
directory_list = sorted(directory_list)

random_index =  15 #6 # 5
random_path = directory_list[random_index]

X_train, X_test, y_train, y_test = get_data_from_directory(random_path)
X_train = X_train.squeeze()
y_train = y_train.squeeze()
X_test = X_test.squeeze()
y_test = y_test.squeeze()

print('Dataset: ', random_path)
print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)
print('X_test shape:  ', X_test.shape)
print('y_test shape:  ', y_test.shape)

Dataset:  Coffee
X_train shape:  (28, 286)
y_train shape:  (28,)
X_test shape:   (28, 286)
y_test shape:   (28,)


In [6]:
feature_extractor = TopologicalFeaturesExtractor(
    persistence_diagram_extractor=PersistenceDiagramsExtractor(tokens_embedding_dim=3, 
                                                               tokens_embedding_delay=10,
                                                               homology_dimensions=(0, 1, 2)),
    persistence_diagram_features=[HolesNumberFeature(),
                                  MaxHoleLifeTimeFeature(),
                                  RelevantHolesNumber(),
                                  AverageHoleLifetimeFeature(),
                                  SumHoleLifetimeFeature(),
                                  PersistenceEntropyFeature(),
                                  SimultaneousAliveHolesFeatue()])

X_train_transformed = feature_extractor.fit_transform(X_train)
X_test_transformed = feature_extractor.fit_transform(X_test)

In [7]:
print('X_train_transformed shape: ', X_train_transformed.shape)
print('X_test_transformed shape:  ', X_test_transformed.shape)

X_train_transformed shape:  (28, 21)
X_test_transformed shape:   (28, 21)


In [8]:
parameters = {"C": [10**i for i in range(-2, 5)],
              "kernel": ["linear", "rbf", "sigmoid", "poly"]}

svc_cv = GridSearchCV(SVC(random_state=42), 
                      param_grid=parameters,
                      cv=5,
                      scoring='accuracy', 
                      n_jobs=-1)
svc_cv.fit(X_train_transformed, y_train)

print("Train accuracy: ", accuracy_score(y_train, svc_cv.best_estimator_.predict(X_train_transformed)))
print("Test accuracy: ", accuracy_score(y_test, svc_cv.best_estimator_.predict(X_test_transformed)))

Train accuracy:  0.8928571428571429
Test accuracy:  0.6428571428571429


In [9]:
parameters = {"max_depth": [2, 10, 15, 20, 25, 30, 35, 40, 45, 50, 70, 100, 120, 150],
              "n_estimators": [20, 50, 100, 150, 200, 250]}
svc_cv = GridSearchCV(XGBClassifier(n_jobs=-1, random_state=42), 
                      param_grid=parameters,
                      cv=2,
                      scoring='accuracy', 
                      n_jobs=-1)
svc_cv.fit(X_train_transformed, y_train)

print("Train accuracy: ", accuracy_score(y_train, svc_cv.best_estimator_.predict(X_train_transformed)))
print("Test accuracy: ", accuracy_score(y_test, svc_cv.best_estimator_.predict(X_test_transformed)))

Train accuracy:  0.9642857142857143
Test accuracy:  0.7857142857142857


In [10]:
parameters = {"n_neighbors": [3, 5, 7, 11,]}

knn_cv = GridSearchCV(KNeighborsClassifier(n_jobs=-1), 
                      param_grid=parameters,
                      cv=5,
                      scoring='accuracy', 
                      n_jobs=-1)
knn_cv.fit(X_train_transformed, y_train)

print("Train accuracy: ", accuracy_score(y_train, knn_cv.best_estimator_.predict(X_train_transformed)))
print("Test accuracy: ", accuracy_score(y_test, knn_cv.best_estimator_.predict(X_test_transformed)))

Train accuracy:  0.8214285714285714
Test accuracy:  0.5


In [28]:
out_dim = np.unique(y_train).shape[0]
device = get_device()
handle_dim = lambda x: np.swapaxes(x[..., np.newaxis], 1, -1)

In [29]:
batch_size = 32

In [30]:
X_train_transformed_dim = handle_dim(X_train_transformed)
X_test_transformed_dim  = handle_dim(X_test_transformed)

y_hot_train = one_hot_encoding(y_train)
y_hot_test = one_hot_encoding(y_test)

dataset_train = TimeSeriesDataset(X_train_transformed_dim, y_hot_train)
dataset_test  = TimeSeriesDataset(X_test_transformed_dim, y_hot_test)

loader_train = DataLoader(dataset_train, batch_size=batch_size)
loader_test = DataLoader(dataset_test, batch_size=batch_size)

In [31]:
model = nn.Sequential(nn.Conv1d(1, 32, 3),
                      nn.MaxPool1d(2),
                      nn.ReLU(),
                      
                      nn.Conv1d(32, 32, 4),
                      nn.MaxPool1d(2),
                      nn.ReLU(),
                      
                      nn.Conv1d(32, 16, 3, 2),
                      nn.ReLU(),
                      
                      nn.Flatten(),
                      
                      nn.Linear(16, 64),
                      nn.ReLU(),
                      nn.Linear(64, 32),
                      nn.ReLU(),
                      nn.Linear(32, out_dim))

In [32]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_normal_(m.weight)
    if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))

In [33]:
model = model.apply(init_weights).to(device)
model

Sequential(
  (0): Conv1d(1, 32, kernel_size=(3,), stride=(1,))
  (1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (2): ReLU()
  (3): Conv1d(32, 32, kernel_size=(4,), stride=(1,))
  (4): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): ReLU()
  (6): Conv1d(32, 16, kernel_size=(3,), stride=(2,))
  (7): ReLU()
  (8): Flatten()
  (9): Linear(in_features=16, out_features=64, bias=True)
  (10): ReLU()
  (11): Linear(in_features=64, out_features=32, bias=True)
  (12): ReLU()
  (13): Linear(in_features=32, out_features=2, bias=True)
)

In [34]:
num_epoch = 5000
lr = 1e-2
t_max = np.var(X_train_transformed)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, )
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, t_max)

train_clf(num_epoch, model, criterion, optimizer, loader_train, loader_test, scheduler) 

Epoch 250/5000 ||	 Loss:  Train 0.2234 | Validation 0.2467
Epoch 500/5000 ||	 Loss:  Train 0.0110 | Validation 0.1176
Epoch 750/5000 ||	 Loss:  Train 0.0010 | Validation 0.1372
Epoch 1000/5000 ||	 Loss:  Train 0.0000 | Validation 0.1450
Epoch 1250/5000 ||	 Loss:  Train 0.0000 | Validation 0.1454
Epoch 1500/5000 ||	 Loss:  Train 0.0000 | Validation 0.1455
Epoch 1750/5000 ||	 Loss:  Train 0.0000 | Validation 0.1455
Epoch 2000/5000 ||	 Loss:  Train 0.0000 | Validation 0.1456
Epoch 2250/5000 ||	 Loss:  Train 0.0000 | Validation 0.1456
Epoch 2500/5000 ||	 Loss:  Train 0.0000 | Validation 0.1456
Epoch 2750/5000 ||	 Loss:  Train 0.0000 | Validation 0.1456
Epoch 3000/5000 ||	 Loss:  Train 0.0000 | Validation 0.1456
Epoch 3250/5000 ||	 Loss:  Train 0.0000 | Validation 0.1456
Epoch 3500/5000 ||	 Loss:  Train 0.0000 | Validation 0.1456
Epoch 3750/5000 ||	 Loss:  Train 0.0000 | Validation 0.1456
Epoch 4000/5000 ||	 Loss:  Train 0.0000 | Validation 0.1456
Epoch 4250/5000 ||	 Loss:  Train 0.0000 | V

In [35]:
model_pred = lambda x: model(x.dataset[:][0]).round().abs().detach().numpy()

In [36]:
print("Train accuracy: ", accuracy_score(y_hot_train, model_pred(loader_train)))
print("Test accuracy: ", accuracy_score(y_hot_test, model_pred(loader_test)))

Train accuracy:  1.0
Test accuracy:  0.8214285714285714
