In [1]:
%matplotlib inline

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from gtda.time_series import TakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import Scaler, Filtering, PersistenceEntropy
from gtda.plotting import plot_diagram

from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

### our imports

from src.utils import get_data_from_directory, get_files_directory_list
from src.TFE import *

In [22]:
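# One-time dataset download, kept commented out; uncomment both lines to fetch and unpack the archive.
# `wget -nc` and `unzip -n` skip files that already exist, so re-running does not overwrite local data.
#!wget -nc "http://www.timeseriesclassification.com/Downloads/Archives/Univariate2018_arff.zip"
#!unzip -q -n "Univariate2018_arff.zip"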

In [7]:
# Pick one dataset from the (alphabetically sorted) list of dataset directories.
# The index is fixed, so the same dataset is selected on every run.
directory_list = sorted(get_files_directory_list())

random_index = 15
random_path = directory_list[random_index]

# Load the train/test split and drop singleton axes from every returned array.
X_train, X_test, y_train, y_test = (
    split.squeeze() for split in get_data_from_directory(random_path)
)

# Report which dataset was chosen and the shape of each split.
print('Dataset: ', random_path)
for caption, split in (
    ('X_train shape: ', X_train),
    ('y_train shape: ', y_train),
    ('X_test shape:  ', X_test),
    ('y_test shape:  ', y_test),
):
    print(caption, split.shape)

Dataset:  Coffee
X_train shape:  (28, 286)
y_train shape:  (28,)
X_test shape:   (28, 286)
y_test shape:   (28,)


In [12]:
# Topological feature pipeline.
# The persistence-diagram extractor is configured with a Takens embedding
# (dimension 3, delay 10) and homology dimensions 0, 1 and 2; each entry in
# `persistence_diagram_features` contributes features computed from the diagrams.
# NOTE(review): `SimultaneousAliveHolesFeatue` is spelled exactly as the name
# resolved from `src.TFE` at runtime; do not "correct" it here.
feature_extractor = TopologicalFeaturesExtractor(
    persistence_diagram_extractor=PersistenceDiagramsExtractor(
        takens_embedding_dim=3,
        takens_embedding_delay=10,
        homology_dimensions=(0, 1, 2),
    ),
    persistence_diagram_features=[
        HolesNumberFeature(),
        MaxHoleLifeTimeFeature(),
        RelevantHolesNumber(),
        AverageHoleLifetimeFeature(),
        SumHoleLifetimeFeature(),
        PersistenceEntropyFeature(),
        SimultaneousAliveHolesFeatue(),
    ],
)

# Fit on the training split only, then apply the already-fitted extractor to
# the test split. Calling `fit_transform` on the test data would re-fit the
# extractor on unseen data, so train and test features could come from
# differently fitted transformers.
# NOTE(review): assumes the extractor follows the scikit-learn transformer API
# and exposes `transform` — confirm against src.TFE.
X_train_transformed = feature_extractor.fit_transform(X_train)
X_test_transformed = feature_extractor.transform(X_test)

In [13]:
# Hyper-parameter grid for the SVM: C spans 10^-2 .. 10^4 across four kernels.
parameters = {
    "C": [10**exponent for exponent in range(-2, 5)],
    "kernel": ["linear", "rbf", "sigmoid", "poly"],
}

# Exhaustive 5-fold cross-validated search, scored by accuracy, using all cores.
svc_cv = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
svc_cv.fit(X_train_transformed, y_train)

# Evaluate the refitted best model on both splits.
best_svc = svc_cv.best_estimator_
train_accuracy = accuracy_score(y_train, best_svc.predict(X_train_transformed))
test_accuracy = accuracy_score(y_test, best_svc.predict(X_test_transformed))

print("Train accuracy: ", train_accuracy)
print("Test accuracy: ", test_accuracy)

Train accuracy:  0.8928571428571429
Test accuracy:  0.6428571428571429


In [14]:
# Hyper-parameter grid for the gradient-boosted trees: tree depth x number of trees.
parameters = {
    "max_depth": [2, 10, 15, 20, 25, 30, 35, 40, 45, 50, 70, 100, 120, 150],
    "n_estimators": [20, 50, 100, 150, 200, 250],
}

# 2-fold cross-validated search, scored by accuracy.
# NOTE(review): the search object is bound to `svc_cv` even though it wraps an
# XGBClassifier; this rebinding replaces whatever `svc_cv` referred to before.
# The name is kept because other cells may read it.
xgb_model = XGBClassifier(n_jobs=-1, random_state=42)
svc_cv = GridSearchCV(
    xgb_model,
    param_grid=parameters,
    cv=2,
    scoring='accuracy',
    n_jobs=-1,
)
svc_cv.fit(X_train_transformed, y_train)

# Report accuracy of the refitted best model on the train split, then the test split.
for caption, features, labels in (
    ("Train accuracy: ", X_train_transformed, y_train),
    ("Test accuracy: ", X_test_transformed, y_test),
):
    print(caption, accuracy_score(labels, svc_cv.best_estimator_.predict(features)))

Train accuracy:  0.9642857142857143
Test accuracy:  0.7857142857142857
