In [1]:
import pandas as pd
import numpy as np

In [2]:
# Experiment configuration: everything a reader might want to tune lives here.
NUM_TIMESTEPS = 1000  # presumably the number of timestamps to sample — TODO confirm
NUM_SENSORS = 33      # number of pressure-sensor columns picked at random
NUM_ZONES = 3         # number of zone columns in the zone-hit table

# Seed NumPy's global RNG so the random sensor and timestamp draws made later
# in the notebook give the same result on every Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Prepare y data (zone hits)

In [3]:
# Leak time series: the separate Date and Timestamp text columns are joined,
# parsed into a datetime index, and then removed from the frame.
leakage = pd.read_csv("2019_leakages.csv")
leakage = leakage.set_index(
    pd.to_datetime(leakage["Date"] + " " + leakage["Timestamp"])
).drop(columns=["Date", "Timestamp"])

# Zone count comes from the config constant; the zones table is read as-is.
num_zones = NUM_ZONES
zones_data = pd.read_csv("wdn_zones_3.csv")

In [4]:
# Row-to-row change of every leakage column; the first row is sliced off
# (positionally) because it has no previous row to difference against.
leakage_diff = leakage.diff().iloc[1:]

# Timestamp at which column "p123" shows its largest single-step increase.
leakage_diff.loc[:, "p123"].idxmax()

Timestamp('2019-11-29 21:10:00')

In [5]:
ps = zones_data["link_id"].values
zones = zones_data["zone"].values

# Lookup table: link (pipe) id -> zone label, built pairwise from the two columns.
p2z = dict(zip(ps, zones))

# One accumulator column per zone (labelled 0 .. num_zones-1), one row per
# leakage timestamp, starting at zero. Built directly instead of via an empty
# frame + reindex + fillna, which leaves object-dtype columns behind.
zone_hits = pd.DataFrame(0.0, index=leakage.index, columns=list(range(num_zones)))

# Add every pipe's leakage series onto the column of the zone it belongs to.
# NOTE(review): "Date" and "Timestamp" are already dropped from `leakage` when it
# is loaded, so this [2:] skips the first two *remaining* columns. Kept unchanged
# to preserve existing results — confirm those two columns are not pipe series.
for pipe in list(leakage)[2:]:
    zone_hits[p2z[pipe]] += leakage[pipe].values

# Collapse the accumulated values to a 0/1 flag per zone and timestamp.
# Vectorised equivalent of applymap(lambda x: int(bool(x))), which is deprecated
# in recent pandas; as before, any non-zero value (including NaN) becomes 1.
zone_hits = (zone_hits != 0).astype(int)

In [6]:
# Display the zone-hit table (one row per timestamp, one column per zone).
zone_hits

Unnamed: 0,0,1,2
2019-01-01 00:00:00,1,0,1
2019-01-01 00:05:00,1,0,1
2019-01-01 00:10:00,1,0,1
2019-01-01 00:15:00,1,0,1
2019-01-01 00:20:00,1,0,1
...,...,...,...
2019-12-31 23:35:00,1,1,1
2019-12-31 23:40:00,1,1,1
2019-12-31 23:45:00,1,1,1
2019-12-31 23:50:00,1,1,1


In [7]:
# Pack the per-zone flags into one integer per row: zone j contributes 2**j,
# so flags (1, 0, 1) for zones 0, 1, 2 become 1 + 4 = 5.
zone_hits["data"] = zone_hits[list(range(num_zones))].to_numpy() @ (2 ** np.arange(num_zones))

# Prepare x data (pressure values)

In [8]:
# Pressure readings, indexed by the datetime parsed from the joined
# Date + Timestamp text columns (both columns are removed afterwards).
pressures = pd.read_csv("pressure_sensors.csv")
pressures = pressures.set_index(
    pd.to_datetime(pressures["Date"] + " " + pressures["Timestamp"])
).drop(columns=["Date", "Timestamp"])

# Pick NUM_SENSORS distinct sensor columns at random and keep only those.
sensors = np.random.choice(pressures.columns.tolist(), size=NUM_SENSORS, replace=False)
selected_pressures = pressures.loc[:, sensors]

In [9]:
# Display the pressure readings of the randomly selected sensor columns.
selected_pressures

Unnamed: 0,n516,n769,n613,n288,n31,n296,n636,n679,n429,n722,...,n495,n1,n105,n506,n229,n215,n410,n342,n740,n726
2019-01-01 00:00:00,54.76,48.38,55.95,52.64,37.00,42.35,45.37,47.17,36.59,46.03,...,51.64,28.63,50.45,53.46,52.35,39.09,30.88,46.56,43.75,46.96
2019-01-01 00:05:00,54.88,48.43,56.07,52.71,37.02,42.39,45.46,47.25,36.71,46.08,...,51.70,28.66,50.54,53.55,52.49,39.10,30.99,46.66,43.78,47.03
2019-01-01 00:10:00,54.84,48.44,56.05,52.73,37.04,42.41,45.46,47.26,36.69,46.09,...,51.68,28.67,50.51,53.52,52.47,39.09,30.97,46.66,43.79,47.05
2019-01-01 00:15:00,54.88,48.46,56.11,52.76,37.05,42.42,45.50,47.29,36.75,46.11,...,51.70,28.69,50.54,53.56,52.52,39.09,31.03,46.71,43.79,47.07
2019-01-01 00:20:00,54.77,48.38,55.96,52.65,37.05,42.36,45.39,47.19,36.61,46.03,...,51.65,28.68,50.46,53.47,52.38,39.09,30.89,46.57,43.75,46.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 23:35:00,53.97,47.28,54.74,51.22,36.85,41.56,44.18,45.88,35.12,45.06,...,51.20,28.37,49.86,52.81,50.89,39.07,29.27,45.20,43.24,45.71
2019-12-31 23:40:00,53.91,47.27,54.67,51.20,36.85,41.55,44.15,45.86,35.05,45.05,...,51.16,28.37,49.80,52.75,50.83,39.07,29.18,45.15,43.23,45.69
2019-12-31 23:45:00,53.99,47.34,54.77,51.29,36.87,41.60,44.22,45.94,35.16,45.11,...,51.22,28.40,49.87,52.82,50.95,39.08,29.30,45.23,43.26,45.77
2019-12-31 23:50:00,53.95,47.27,54.69,51.20,36.86,41.55,44.15,45.85,35.08,45.05,...,51.19,28.38,49.84,52.79,50.82,39.07,29.22,45.15,43.23,45.69


# Random Sample of Data

In [10]:
# Draw NUM_TIMESTEPS timestamps from the zone-hit index, using the config
# constant instead of a repeated literal 1000.
# np.random.choice samples with replacement by default, so a timestamp can
# appear more than once in the sample.
random_times = np.random.choice(zone_hits.index, NUM_TIMESTEPS)

In [11]:
# Sampled inputs and labels, both looked up at the same randomly drawn
# timestamps so the rows line up one-to-one.
random_data = {
    "pressure": selected_pressures.loc[random_times],
    "zone_hits": zone_hits.loc[random_times],
}

In [12]:
# Write the sampled data to data.npz.
# NOTE(review): random_data is a dict of DataFrames passed as a single positional
# argument, so it is presumably stored as one pickled object under a default key
# rather than as named numeric arrays — confirm how the consumer loads this file
# (likely needs allow_pickle=True) before changing the call.
np.savez("data.npz", random_data)

In [15]:
# Display the sampled zone-hit rows, including the packed "data" code column.
random_data["zone_hits"]

Unnamed: 0,0,1,2,data
2019-09-27 14:20:00,1,1,1,7
2019-02-02 04:20:00,1,1,1,7
2019-06-17 06:10:00,1,1,1,7
2019-09-09 03:40:00,1,1,1,7
2019-09-27 01:00:00,1,1,1,7
...,...,...,...,...
2019-08-25 07:10:00,1,1,1,7
2019-03-07 00:25:00,1,0,1,5
2019-11-10 08:05:00,1,1,1,7
2019-11-22 08:50:00,1,1,1,7
