# Setup

In [1]:
%load_ext autotime

In [2]:
import pickle

time: 99.4 ms


In [3]:
import numpy as np
from tqdm import tqdm_notebook

time: 166 ms


# Load the data

In [4]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this cell on a data.pickle obtained from a trusted source.
with open("data.pickle", "rb") as f:
    data = pickle.load(f)  # later cells read its "X", "y" and "names" entries

time: 1.14 s


In [5]:
# Keep every series except the first two along axis 1 of each observation.
X = data["X"][:, 2:]
y, names = data["y"], data["names"]

time: 1.63 ms


N.B.: the first two series of each observation are dropped, as I found no useful patterns in them.

# Generate synthetic dataset

## Negative observations

(i.e. without anomalies)

Selecting only clean (non-attacked) factories:

In [6]:
# Indices of the observations labelled 0.
i_neg = np.where(y == 0)[0]
# Join the selected observations along the first axis into one array of series.
X_neg = np.concatenate(X[i_neg], axis=0)

time: 504 ms


Cut into chunks:

In [7]:
def generate_chunks(l, o, n=None):
    """Return the ``(start, end)`` bounds of overlapping fixed-length windows.

    Parameters
    ----------
    l : int
        Window length.
    o : int
        Number of samples shared by two consecutive windows; window starts
        are therefore ``l - o`` samples apart.
    n : int, optional
        Length of the axis being cut. When omitted, the last-axis length of
        the notebook-level array ``X`` is used (the original behaviour).

    Returns
    -------
    list of tuple of int
        Half-open ``[start, end)`` bounds with ``end - start == l``. The list
        is empty when ``n < l``.

    Raises
    ------
    ValueError
        If ``l - o`` is not positive: the window would never advance.
    """
    step = l - o
    if step <= 0:
        # The previous while-loop never terminated for o >= l.
        raise ValueError(
            f"overlap ({o}) must be smaller than chunk length ({l})"
        )

    if n is None:
        n = X.shape[-1]

    # Last admissible start is n - l (inclusive): a window that ends exactly
    # on the final sample is valid and was previously dropped by `<`.
    return [(start, start + l) for start in range(0, n - l + 1, step)]

time: 3.78 ms


In [8]:
# Chunk length and overlap between consecutive chunks, in samples;
# chunk starts are CHUNK_LENGTH - CHUNK_OVERLAP samples apart.
CHUNK_LENGTH, CHUNK_OVERLAP = 1000, 900

time: 122 ms


In [9]:
# (start, end) bounds of every chunk along the last axis of X.
chunks = generate_chunks(l=CHUNK_LENGTH, o=CHUNK_OVERLAP)

time: 106 ms


In [10]:
# Cut the same column window out of every clean series, one window per chunk,
# and pile the resulting blocks on top of each other.
X_chunks = np.concatenate(
    [X_neg[:, lo:hi] for lo, hi in chunks],
    axis=0,
)

time: 1.41 s


In [11]:
# Number of rows (chunks) in the negative set.
neg_size = len(X_chunks)
neg_size

409536

time: 10.9 ms


Save the negative part:

In [12]:
# The file name encodes the chunking parameters (length, then overlap).
file_name_neg = f"chunks_neg_{CHUNK_LENGTH}_{CHUNK_OVERLAP}.pickle"

# Dump the chunks now, while they are still anomaly-free: a later cell
# overwrites parts of X_chunks in place.
with open(file_name_neg, "wb") as f:
    pickle.dump(X_chunks, f)

time: 48.2 s


In [13]:
# List the file that was just written to check its size on disk.
! ls -lh $file_name_neg

-rw-r--r-- 1 mityajj mityajj 3.1G Feb 21 00:26 chunks_neg_1000_900.pickle
time: 190 ms


## Positive observations

(i.e. with anomalies)

Adding random patterns similar to the ones found during EDA.

### Peaks

In [14]:
def add_peak(x):
    """Overwrite a short run of ``x`` with a single flat value, in place.

    The run is 3, 4 or 5 samples long and starts at a random index drawn from
    ``range(len(x) - length - 1)``. When ``|mean(x)| <= std(x)`` the fill
    value is drawn uniformly from [-50, 50); otherwise it is 0.

    Parameters
    ----------
    x : numpy.ndarray
        1-D series, modified in place.

    Returns
    -------
    bool
        Always ``True``.
    """
    centred = np.abs(x.mean()) <= x.std()
    level = np.random.uniform(-50, 50, 1)[0] if centred else 0

    run = np.random.choice([3, 4, 5])
    first = np.random.choice(range(len(x) - run - 1))

    x[first:first + run] = level
    return True

time: 2.04 s


### Noises

In [15]:
def add_noise(x):
    """Replace a random window of ``x`` with uniform noise, in place.

    The noise is centred on ``mean(x)``. Its peak-to-peak amplitude is a
    random factor from [10, 50), multiplied by ``std(x)`` when that exceeds
    0.01. The window is 100-799 samples wide before clipping and extends
    either forwards or backwards from a random anchor index.

    Parameters
    ----------
    x : numpy.ndarray
        1-D series, modified in place on success.

    Returns
    -------
    bool
        ``True`` if noise was written; ``False`` (``x`` untouched) when the
        window, clipped to the series bounds, is shorter than 100 samples.
    """
    amplitude = np.random.uniform(low=10, high=50)
    centre = x.mean()

    spread = x.std()
    if spread > 0.01:
        amplitude = amplitude * spread

    n = len(x)
    anchor = np.random.choice(range(n))
    width = int(np.random.uniform(low=100, high=800))
    forward = bool(np.random.binomial(1, 0.5))

    if forward:
        lo, hi = anchor, anchor + width
    else:
        lo, hi = anchor - width, anchor

    # Clip the window to the series bounds.
    lo = max(lo, 0)
    hi = min(hi, n)

    span = hi - lo
    if span < 100:
        return False

    x[lo:hi] = np.random.uniform(
        centre - amplitude / 2,
        centre + amplitude / 2,
        span,
    )
    return True

time: 107 ms


### Constants

In [16]:
def add_constant(x):
    """Flatten a random window of ``x`` to the window's first value, in place.

    The window is 100-299 samples wide before clipping and extends either
    forwards or backwards from a random anchor index.

    Parameters
    ----------
    x : numpy.ndarray
        1-D series, modified in place on success.

    Returns
    -------
    bool
        ``True`` if a window was flattened. ``False`` (``x`` untouched) when
        ``x`` holds a single distinct value, or when the window, clipped to
        the series bounds, is shorter than 100 samples.
    """
    # A series with only one distinct value is skipped.
    if np.unique(x).size == 1:
        return False

    n = len(x)
    anchor = np.random.choice(range(n))
    width = int(np.random.uniform(low=100, high=300))
    forward = bool(np.random.binomial(1, 0.5))

    if forward:
        lo, hi = anchor, anchor + width
    else:
        lo, hi = anchor - width, anchor

    # Clip the window to the series bounds.
    lo = max(lo, 0)
    hi = min(hi, n)

    if hi - lo < 100:
        return False

    x[lo:hi] = x[lo]
    return True

time: 118 ms


### "Special cases"

In [17]:
def add_special_case(x):
    """Replace a random window of ``x`` with zeros plus a few spikes, in place.

    Only applied when ``mean(x)`` is non-negative and larger than ``std(x)``.
    The window is 300-799 samples wide before clipping and extends either
    forwards or backwards from a random anchor index. Inside it, 10-19
    positions (drawn with replacement) receive values drawn uniformly from
    ``[0, mean(x))``; every other sample becomes 0.

    Parameters
    ----------
    x : numpy.ndarray
        1-D series, modified in place on success.

    Returns
    -------
    bool
        ``True`` if the window was written. ``False`` (``x`` untouched) when
        the mean/std precondition fails, or when the window, clipped to the
        series bounds, is shorter than 100 samples.
    """
    level = x.mean()
    if np.abs(level) <= x.std() or level < 0:
        return False

    n = len(x)
    anchor = np.random.choice(range(n))
    width = int(np.random.uniform(low=300, high=800))
    forward = bool(np.random.binomial(1, 0.5))

    if forward:
        lo, hi = anchor, anchor + width
    else:
        lo, hi = anchor - width, anchor

    # Clip the window to the series bounds.
    lo = max(lo, 0)
    hi = min(hi, n)

    if hi - lo < 100:
        return False

    burst = np.zeros(hi - lo)
    n_peaks = int(np.random.uniform(10, 20))
    spikes_at = np.random.choice(range(len(burst)), size=n_peaks)
    burst[spikes_at] = np.random.uniform(low=0, high=level, size=len(spikes_at))

    x[lo:hi] = burst
    return True

time: 134 ms


### Randomly apply to chunks

In [18]:
# Seed the global NumPy RNG (the one the add_* functions draw from) so the
# synthetic positive set is identical on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Anomaly injectors: each takes one chunk, edits it in place and returns
# True on success or False when it could not be applied to that chunk.
fs = [add_peak, add_noise, add_constant, add_special_case]

# NOTE: X_chunks is modified in place, so this cell is not idempotent.
# Rebuild X_chunks from X_neg before running it a second time; otherwise
# every chunk receives an additional anomaly.
for i in tqdm_notebook(range(X_chunks.shape[0])):
    # Draw injectors until one applies. add_peak always returns True,
    # so this inner loop terminates.
    while True:
        f = np.random.choice(fs, size=1)[0]
        if f(X_chunks[i]):
            break


time: 1min 56s


Save the positive part:

In [19]:
# The file name encodes the chunking parameters (length, then overlap).
file_name_pos = f"chunks_pos_{CHUNK_LENGTH}_{CHUNK_OVERLAP}.pickle"

# X_chunks was modified in place by the injection loop, so at this point it
# holds the chunks with anomalies, not the clean ones saved earlier.
with open(file_name_pos, "wb") as f:
    pickle.dump(X_chunks, f)

time: 49 s


In [20]:
# List the file that was just written to check its size on disk.
! ls -lh $file_name_pos

-rw-r--r-- 1 mityajj mityajj 3.1G Feb 21 00:29 chunks_pos_1000_900.pickle
time: 190 ms
