In [1]:
import numpy as np
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
import os
import time

In [2]:
meta = pd.read_csv('synthetic-meta.txt', sep=',', header=None)
meta.columns = ['station', 'direction', 'date']
z_meta = meta[meta.direction == 'Z']

In [3]:
def process_csv(station):
    df = pd.read_csv('../Artificial_data/DOGEx_v1/csv/' + station, header=None)

    df.columns = ['date', 'un', 'ue', 'uz', 'sn', 'se', 'sz']
    df.drop(['sn', 'se', 'sz'], axis=1, inplace=True)
    df = df[['date','uz']]

    offset_dates = z_meta[z_meta.station==station[0:4]]['date']
    
    return df, offset_dates

In [4]:
interval_size = 40
offset = 10

num_files = 0
tot_ranges = 0

csv_list = []
ranges_per_csv = []

for file in os.listdir('../Artificial_data/DOGEx_v1/csv/'):
    if file[0] == '.':
        continue 
    
    num_ranges = (sum(1 for line in open('../Artificial_data/DOGEx_v1/csv/' + file)) - interval_size) // offset
        
    csv_list.append(file)
    ranges_per_csv.append(num_ranges)
    
    tot_ranges += num_ranges
    num_files +=  1

In [5]:
ranges = np.empty((tot_ranges, 3), dtype=np.ndarray)

In [6]:
range_row = 0
csv_idx = 0
while range_row < tot_ranges:
    df, offset_dates = process_csv(csv_list[csv_idx])    
    i = 0
    while i < ranges_per_csv[csv_idx]:
        arr = np.array(df.iloc[i * offset:i * offset + interval_size].T)

        for date in arr[0]:
            if date in offset_dates.values:
                ranges[range_row] = (arr[0], arr[1], 1)
                break
            else:
                ranges[range_row] = (arr[0], arr[1], 0)

        i = i + 1  
        range_row = range_row + 1   
    
    csv_idx += 1

In [7]:
data = pd.DataFrame(ranges)
data.head()

Unnamed: 0,0,1,2
0,"[1991.6934, 1991.6988, 1992.0164, 1992.0192, 1...","[-0.00497095, 0.02199764, -0.00127296, 0.00747...",0
1,"[1992.0602, 1992.063, 1992.0657, 1992.0821, 19...","[0.0074101, -0.00149556, -0.00742904, -0.01239...",0
2,"[1992.104, 1992.1095, 1992.1123, 1992.115, 199...","[-0.02175648, -0.00756418, -0.01096628, -0.004...",0
3,"[1992.1588, 1992.178, 1992.1807, 1992.1834, 19...","[-0.01787595, -0.01300338, -0.00406316, -0.012...",0
4,"[1992.2245, 1992.2327, 1992.2355, 1992.2382, 1...","[-0.02349643, -0.02366129, -0.0154773, -0.0193...",0


In [8]:
data.columns = ['dates', 'uz', 'label']
regular = data[data.label == 0]
offsets  = data[data.label == 1]

In [9]:
offsets_upsampled = resample(offsets, replace=True, n_samples=len(regular), random_state=42)
data_upsampled = pd.concat([regular, offsets_upsampled])

In [10]:
data_upsampled.label.value_counts()

X = X = np.stack(np.array(data_upsampled['uz']))
y = np.stack(np.array(data_upsampled['label']))

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [13]:
from sktime.classification.interval_based import TimeSeriesForestClassifier

In [14]:
clf = TimeSeriesForestClassifier()

In [21]:
start = time.time()

clf.fit(X_train, y_train)

end = time.time()
print(end - start)

115.42716717720032


In [23]:
start = time.time()
                              
y_pred = clf.predict(X_test)

end = time.time()
print(end - start)

4.640050888061523


In [24]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

1.0

In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8458
           1       1.00      1.00      1.00      8413

    accuracy                           1.00     16871
   macro avg       1.00      1.00      1.00     16871
weighted avg       1.00      1.00      1.00     16871



In [29]:
clf.get_params()

{'min_interval': 3, 'n_estimators': 200, 'n_jobs': 1, 'random_state': None}