In [15]:
import numpy as np
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
import os
import time

In [16]:
meta = pd.read_csv('synthetic-meta.txt', sep=',', header=None)
meta.columns = ['station', 'direction', 'date']
z_meta = meta[meta.direction == 'Z']

In [17]:
def process_csv(station):
    df = pd.read_csv('../Artificial_data/DOGEx_v1/' + station, header=None)

    df.columns = ['date', 'un', 'ue', 'uz', 'sn', 'se', 'sz']
    df.drop(['sn', 'se', 'sz'], axis=1, inplace=True)
    df = df[['date','uz']]

    offset_dates = z_meta[z_meta.station==station.split('/')[1][0:4]]['date']
    
    return df, offset_dates

In [18]:
interval_size = 40
offset = 10

num_files = 0
tot_ranges = 0

csv_list = []
ranges_per_csv = []

for file in os.listdir('../Artificial_data/DOGEx_v1/csv/'):
    if file[0] == '.':
        continue 
    
    num_ranges = (sum(1 for line in open('../Artificial_data/DOGEx_v1/csv/' + file)) - interval_size) // offset
        
    csv_list.append(file)
    ranges_per_csv.append(num_ranges)
    
    tot_ranges += num_ranges
    num_files +=  1
    
ranges = np.empty((tot_ranges, 3), dtype=np.ndarray)

range_row = 0
csv_idx = 0
while range_row < tot_ranges:
    df, offset_dates = process_csv('csv/'+csv_list[csv_idx])    
    i = 0
    while i < ranges_per_csv[csv_idx]:
        arr = np.array(df.iloc[i * offset:i * offset + interval_size].T)

        for date in arr[0]:
            if date in offset_dates.values:
                ranges[range_row] = (arr[0], arr[1], 1)
                break
            else:
                ranges[range_row] = (arr[0], arr[1], 0)

        i = i + 1  
        range_row = range_row + 1   
    
    csv_idx += 1
    
data = pd.DataFrame(ranges)
data.columns = ['dates', 'uz', 'label']
regular = data[data.label == 0]
offsets  = data[data.label == 1]

offsets_upsampled = resample(offsets, replace=True, n_samples=len(regular), random_state=42)
data_upsampled = pd.concat([regular, offsets_upsampled])

data_upsampled.label.value_counts()

X = np.stack(np.array(data_upsampled['uz']))
y = np.stack(np.array(data_upsampled['label']))

In [19]:
print(offsets.shape)

(120, 3)


In [20]:
print(regular.shape)
print(tot_ranges)

(23383, 3)
23503


In [22]:
interval_size = 40
offset = 10

num_files = 0
tot_ranges = 0

csv_list = []
ranges_per_csv = []

for file in os.listdir('../Artificial_data/DOGEx_v1/csv_val/'):
    if file[0] == '.':
        continue 
    num_ranges = (sum(1 for line in open('../Artificial_data/DOGEx_v1/csv_val/' + file)) - interval_size) // offset
        
    csv_list.append(file)
    ranges_per_csv.append(num_ranges)
    
    tot_ranges += num_ranges
    num_files +=  1
    
ranges = np.empty((tot_ranges, 3), dtype=np.ndarray)

range_row = 0
csv_idx = 0
while range_row < tot_ranges:
    df, offset_dates = process_csv('csv_val/'+csv_list[csv_idx]) 
    i = 0
    while i < ranges_per_csv[csv_idx]:
        arr = np.array(df.iloc[i * offset:i * offset + interval_size].T)

        for date in arr[0]:
            if date in offset_dates.values:
                ranges[range_row] = (arr[0], arr[1], 1)
                break
            else:
                ranges[range_row] = (arr[0], arr[1], 0)

        i = i + 1  
        range_row = range_row + 1   
    
    csv_idx += 1
    
data = pd.DataFrame(ranges)
data.columns = ['dates', 'uz', 'label']
regular = data[data.label == 0]
offsets  = data[data.label == 1]

offsets_upsampled = resample(offsets, replace=True, n_samples=len(regular), random_state=42)
data_upsampled = pd.concat([regular, offsets_upsampled])

data_upsampled.label.value_counts()

X_val = np.stack(np.array(data_upsampled['uz']))
y_val = np.stack(np.array(data_upsampled['label']))

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [25]:
from sktime.classification.interval_based import TimeSeriesForestClassifier

In [26]:
clf = TimeSeriesForestClassifier()

In [27]:
start = time.time()

clf.fit(X_train, y_train)

end = time.time()
print(end - start)

86.32556247711182


In [28]:
start = time.time()
                              
y_pred = clf.predict(X_test)

end = time.time()
print(end - start)

5.054145336151123


In [29]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.9999465440744106

In [30]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9368
           1       1.00      1.00      1.00      9339

    accuracy                           1.00     18707
   macro avg       1.00      1.00      1.00     18707
weighted avg       1.00      1.00      1.00     18707



In [31]:
clf.get_params()

{'min_interval': 3, 'n_estimators': 200, 'n_jobs': 1, 'random_state': None}

In [34]:
val_pred = clf.predict(X_val)

In [35]:
accuracy_score(y_val, val_pred)

0.6911427260211106