In [1]:
import numpy as np
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
import os
import time

# Creating data 

In [2]:
# Load the synthetic metadata table. The file has no header row, so the three
# columns are named explicitly once it has been read.
meta = pd.read_csv(
    '../../Artificial_data/DOGEx_v1/synthetic-meta.txt',
    sep=',',
    header=None,
)
meta.columns = ['station', 'direction', 'date']

# Keep only the rows whose direction is 'E'; process_csv() looks up each
# station's offset dates in this filtered frame.
e_meta = meta[meta['direction'] == 'E']

In [3]:
def process_csv(station):
    """Load one station CSV and look up that station's offset dates.

    Parameters
    ----------
    station : str
        Path relative to '../../Artificial_data/DOGEx_v1/', in the form
        '<subdir>/<file name>'. The first four characters of the file name
        are used as the station code.

    Returns
    -------
    tuple
        ``(df, offset_dates)`` where ``df`` holds only the 'date' and 'ue'
        columns of the file, and ``offset_dates`` is the 'date' column of the
        rows of the module-level ``e_meta`` frame whose station matches the
        station code.
    """
    frame = pd.read_csv('../../Artificial_data/DOGEx_v1/' + station, header=None)
    frame.columns = ['date', 'un', 'ue', 'uz', 'sn', 'se', 'sz']

    # Only the date and the 'ue' component are kept; selecting them directly
    # discards the remaining columns.
    frame = frame[['date', 'ue']]

    # '<subdir>/<file name>' -> first four characters of the file name.
    station_code = station.split('/')[1][0:4]
    station_rows = e_meta[e_meta.station == station_code]

    return frame, station_rows['date']

In [4]:
# Sliding-window settings: each window spans `interval_size` consecutive rows
# and successive windows start `offset` rows apart.
interval_size = 100
offset = 10


# Run through each file in the csv dir and record:
#      the total # of files, the total # of windows,
#      the name of each csv, and the # of windows per csv
num_files = 0
tot_ranges = 0

csv_list = []
ranges_per_csv = []

# os.listdir() returns names in arbitrary order; sorting keeps the window
# order (and therefore the seeded train/test split) reproducible across runs.
for file in sorted(os.listdir('../../Artificial_data/DOGEx_v1/csv/')):
    # Skip hidden files such as editor/OS artefacts.
    if file[0] == '.':
        continue

    # Count lines inside a context manager so the file handle is closed.
    with open('../../Artificial_data/DOGEx_v1/csv/' + file) as handle:
        num_lines = sum(1 for _ in handle)

    # Clamp at zero: a file shorter than one window would otherwise yield a
    # negative count, shrinking tot_ranges and under-allocating the array
    # that is sized from it.
    num_ranges = max(0, (num_lines - interval_size) // offset)

    csv_list.append(file)
    ranges_per_csv.append(num_ranges)

    tot_ranges += num_ranges
    num_files += 1
    
    

# Allocate one row per window: (dates, ue values, label). An object array is
# used so that each cell can hold a whole per-window array.
ranges = np.empty((tot_ranges, 3), dtype=object)

# Run through each file and place each of its windows into the ranges array.
range_row = 0
for csv_name, n_windows in zip(csv_list, ranges_per_csv):
    if n_windows <= 0:
        continue  # file too short to yield a window; nothing to read

    df, offset_dates = process_csv('csv/' + csv_name)
    offset_values = offset_dates.values  # hoisted: constant for this file

    for i in range(n_windows):
        start = i * offset
        # Transpose so that arr[0] is the date row and arr[1] the 'ue' row.
        arr = np.array(df.iloc[start:start + interval_size].T)

        # Label 1 when any date in the window is one of the station's offset
        # dates, otherwise 0.
        label = int(np.isin(arr[0], offset_values).any())
        ranges[range_row] = (arr[0], arr[1], label)

        range_row += 1

    
    
def _upsample_offsets(frame, seed=42):
    """Balance a ('ue', 'label') frame and shuffle it reproducibly.

    Rows with label 1 are resampled with replacement until there are as many
    of them as rows with label 0; the combined frame is then shuffled with a
    fixed seed so that repeated runs produce identical output.
    """
    regular = frame[frame.label == 0]
    offsets = frame[frame.label == 1]
    offsets_upsampled = resample(
        offsets, replace=True, n_samples=len(regular), random_state=seed
    )
    balanced = pd.concat([regular, offsets_upsampled])
    return balanced.sample(frac=1, random_state=seed)


# Split the ranges array into X (ue windows) and y (labels), then train/test split.
# NOTE(review): windows overlap (offset < interval_size), so a random split can
# put near-identical windows in both sets, and the test set is upsampled as
# well -- confirm that both are intended.
X = ranges[:, 1]
y = ranges[:, 2]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Re-merge the X and y components so each split can be upsampled as one frame.
train = np.column_stack((X_train, y_train))
test = np.column_stack((X_test, y_test))

train_df = pd.DataFrame(train)
train_df.columns = ['ue', 'label']
test_df = pd.DataFrame(test)
test_df.columns = ['ue', 'label']

# Upsample the training split and unpack it into dense arrays.
train_upsampled = _upsample_offsets(train_df)
X_train = np.stack(np.array(train_upsampled['ue']))
y_train = np.stack(np.array(train_upsampled['label']))

# Perform the same upsampling on the test split. The frame keeps the name
# `data_upsampled` because the following cell displays it.
data_upsampled = _upsample_offsets(test_df)
X_test = np.stack(np.array(data_upsampled['ue']))
y_test = np.stack(np.array(data_upsampled['label']))

In [5]:
# Preview the most recently built `data_upsampled` frame (the upsampled,
# shuffled test split when the cells are run in order).
data_upsampled

Unnamed: 0,ue,label
1833,"[-0.04721647, -0.05259667, -0.05010058, -0.052...",0
8828,"[0.51860788, 0.517851, 0.51942457, 0.51991983,...",0
2367,"[0.39250956, 0.39839678, 0.39634982, 0.3934890...",0
381,"[-0.18391101, -0.1894265, -0.1895975, -0.18907...",0
5279,"[0.49872118, 0.5000422, 0.49684279, 0.49472051...",1
...,...,...
8280,"[0.5013061, 0.47662414, 0.47965212, 0.47932337...",1
3732,"[-0.14850462, -0.16145072, -0.15452356, -0.155...",1
952,"[0.02083008, 0.01567958, 0.01815026, 0.0120212...",0
8413,"[0.33226844, 0.32860847, 0.33009824, 0.3277563...",1


In [6]:
# Create a sliding window dataset with a set of station data not used in the training set for testing

# Scan the validation CSV directory and record, per file, how many windows it
# yields (reuses interval_size / offset from the window settings).
num_files = 0
tot_ranges = 0

csv_list = []
ranges_per_csv = []

# os.listdir() returns names in arbitrary order; sorting keeps the window
# order (and the seeded resampling that follows) reproducible across runs.
for file in sorted(os.listdir('../../Artificial_data/DOGEx_v1/csv_val/')):
    # Skip hidden files such as editor/OS artefacts.
    if file[0] == '.':
        continue

    # Count lines inside a context manager so the file handle is closed.
    with open('../../Artificial_data/DOGEx_v1/csv_val/' + file) as handle:
        num_lines = sum(1 for _ in handle)

    # Clamp at zero: a file shorter than one window would otherwise yield a
    # negative count, shrinking tot_ranges and under-allocating the array
    # that is sized from it.
    num_ranges = max(0, (num_lines - interval_size) // offset)

    csv_list.append(file)
    ranges_per_csv.append(num_ranges)

    tot_ranges += num_ranges
    num_files += 1
    
# Allocate one row per validation window: (dates, ue values, label). An object
# array is used so that each cell can hold a whole per-window array.
ranges = np.empty((tot_ranges, 3), dtype=object)

# Run through each validation file and place each of its windows into ranges.
range_row = 0
for csv_name, n_windows in zip(csv_list, ranges_per_csv):
    if n_windows <= 0:
        continue  # file too short to yield a window; nothing to read

    df, offset_dates = process_csv('csv_val/' + csv_name)
    offset_values = offset_dates.values  # hoisted: constant for this file

    for i in range(n_windows):
        start = i * offset
        # Transpose so that arr[0] is the date row and arr[1] the 'ue' row.
        arr = np.array(df.iloc[start:start + interval_size].T)

        # Label 1 when any date in the window is one of the station's offset
        # dates, otherwise 0.
        label = int(np.isin(arr[0], offset_values).any())
        ranges[range_row] = (arr[0], arr[1], label)

        range_row += 1
    
# Wrap the validation windows in a frame: one row per window.
data = pd.DataFrame(ranges)
data.columns = ['dates', 'ue', 'label']

# Balance the classes: resample the label-1 windows with replacement until
# there are as many of them as label-0 windows.
regular = data[data.label == 0]
offsets = data[data.label == 1]

offsets_upsampled = resample(offsets, replace=True, n_samples=len(regular), random_state=42)
data_upsampled = pd.concat([regular, offsets_upsampled])

# Shuffle with a fixed seed so the saved validation files are identical
# between runs.
data_upsampled = data_upsampled.sample(frac=1, random_state=42)

# Unpack into a dense 2-D window array and a 1-D label array.
X_val = np.stack(np.array(data_upsampled['ue']))
y_val = np.stack(np.array(data_upsampled['label']))

In [8]:
# Persist the balanced train/test arrays as CSV.
# np.savetxt does not create missing directories, so make sure the target
# exists first (no-op when it already does).
# NOTE: the directory name encodes interval_size=100 / offset=10; update it if
# the window settings change.
os.makedirs('../sliced_data/window-100-step-10/ue', exist_ok=True)

np.savetxt('../sliced_data/window-100-step-10/ue/X_train.csv', X_train, delimiter=",")
np.savetxt('../sliced_data/window-100-step-10/ue/y_train.csv', y_train, delimiter=",")
np.savetxt('../sliced_data/window-100-step-10/ue/X_test.csv', X_test, delimiter=",")
np.savetxt('../sliced_data/window-100-step-10/ue/y_test.csv', y_test, delimiter=",")

In [9]:
# Persist the balanced validation arrays as CSV.
# np.savetxt does not create missing directories, so make sure the target
# exists first (no-op when it already does).
os.makedirs('../sliced_data/window-100-step-10/ue', exist_ok=True)

np.savetxt('../sliced_data/window-100-step-10/ue/X_val.csv', X_val, delimiter=',')
np.savetxt('../sliced_data/window-100-step-10/ue/y_val.csv', y_val, delimiter=',')

# All directions combined

In [None]:
# X_train sets
# Stack the un, ue and uz training windows row-wise into one array and save it.
# NOTE(review): the 'ue' files are written earlier in this notebook under
# '../sliced_data/window-100-step-10/ue/', not '../sliced_data/ue/' -- confirm
# that the inputs read here exist and are the intended ones.

# np.savetxt does not create missing directories; ensure the output directory
# exists before doing the (slower) reads.
os.makedirs('../sliced_data/combined', exist_ok=True)

un = np.genfromtxt('../sliced_data/un/X_train.csv', delimiter=',')
ue = np.genfromtxt('../sliced_data/ue/X_train.csv', delimiter=',')
uz = np.genfromtxt('../sliced_data/uz/X_train.csv', delimiter=',')

combined_X_train = np.concatenate((un, ue, uz))

np.savetxt('../sliced_data/combined/X_train.csv', combined_X_train, delimiter=',')

In [None]:
# y_train sets
# Concatenate the un, ue and uz training labels in the same order as the
# corresponding X_train files and save them.

# np.savetxt does not create missing directories; ensure the output directory
# exists before doing the reads.
os.makedirs('../sliced_data/combined', exist_ok=True)

un = np.genfromtxt('../sliced_data/un/y_train.csv', delimiter=',')
ue = np.genfromtxt('../sliced_data/ue/y_train.csv', delimiter=',')
uz = np.genfromtxt('../sliced_data/uz/y_train.csv', delimiter=',')

combined_y_train = np.concatenate((un, ue, uz))

np.savetxt('../sliced_data/combined/y_train.csv', combined_y_train, delimiter=',')

In [None]:
# X_test sets
# Stack the un, ue and uz test windows row-wise into one array and save it.

# np.savetxt does not create missing directories; ensure the output directory
# exists before doing the (slower) reads.
os.makedirs('../sliced_data/combined', exist_ok=True)

un = np.genfromtxt('../sliced_data/un/X_test.csv', delimiter=',')
ue = np.genfromtxt('../sliced_data/ue/X_test.csv', delimiter=',')
uz = np.genfromtxt('../sliced_data/uz/X_test.csv', delimiter=',')

combined_X_test = np.concatenate((un, ue, uz))

np.savetxt('../sliced_data/combined/X_test.csv', combined_X_test, delimiter=',')

In [None]:
# y_test sets
# Concatenate the un, ue and uz test labels in the same order as the
# corresponding X_test files and save them.

# np.savetxt does not create missing directories; ensure the output directory
# exists before doing the reads.
os.makedirs('../sliced_data/combined', exist_ok=True)

un = np.genfromtxt('../sliced_data/un/y_test.csv', delimiter=',')
ue = np.genfromtxt('../sliced_data/ue/y_test.csv', delimiter=',')
uz = np.genfromtxt('../sliced_data/uz/y_test.csv', delimiter=',')

combined_y_test = np.concatenate((un, ue, uz))

np.savetxt('../sliced_data/combined/y_test.csv', combined_y_test, delimiter=',')

In [None]:
# X_val sets
# Stack the un, ue and uz validation windows row-wise into one array and save it.

# np.savetxt does not create missing directories; ensure the output directory
# exists before doing the (slower) reads.
os.makedirs('../sliced_data/combined', exist_ok=True)

un = np.genfromtxt('../sliced_data/un/X_val.csv', delimiter=',')
ue = np.genfromtxt('../sliced_data/ue/X_val.csv', delimiter=',')
uz = np.genfromtxt('../sliced_data/uz/X_val.csv', delimiter=',')

combined_X_val = np.concatenate((un, ue, uz))

np.savetxt('../sliced_data/combined/X_val.csv', combined_X_val, delimiter=',')

In [None]:
# y_val sets
# Concatenate the un, ue and uz validation labels in the same order as the
# corresponding X_val files and save them.

# np.savetxt does not create missing directories; ensure the output directory
# exists before doing the reads.
os.makedirs('../sliced_data/combined', exist_ok=True)

un = np.genfromtxt('../sliced_data/un/y_val.csv', delimiter=',')
ue = np.genfromtxt('../sliced_data/ue/y_val.csv', delimiter=',')
uz = np.genfromtxt('../sliced_data/uz/y_val.csv', delimiter=',')

combined_y_val = np.concatenate((un, ue, uz))

np.savetxt('../sliced_data/combined/y_val.csv', combined_y_val, delimiter=',')