In [5]:
import pandas as pd
import numpy as np

In [21]:
# Notebook-wide configuration.
NUM_TIMESTEPS = 1000  # intended number of randomly sampled timestamps
NUM_SENSORS = 33      # number of pressure sensors picked at random
NUM_ZONES = 5         # number of zones; also used in the output file name

# Seed NumPy's global generator so the np.random.choice draws made later in
# the notebook give the same result on every Restart Kernel -> Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Prepare y data (zone hits)

In [22]:
# Load the 2019 leakage table, index it by the combined "Date" + "Timestamp"
# columns parsed as datetimes, and drop the two source columns afterwards.
leakage = pd.read_csv("data/2019_leakages.csv")
leakage = leakage.set_axis(
    pd.to_datetime(leakage["Date"] + " " + leakage["Timestamp"]),
    axis="index",
).drop(columns=["Date", "Timestamp"])

# Table with a `link_id` and a `zone` column (read further down to map pipes
# to zones).
# NOTE(review): the "5" in the file name presumably matches NUM_ZONES — confirm
# before changing NUM_ZONES.
zones_data = pd.read_csv("data/wdn_zones_5.csv")
num_zones = NUM_ZONES

In [23]:
# Row-to-row change of every leakage column; the first row is dropped because
# diff() has no predecessor for it and leaves it as NaN.
leakage_diff = leakage.diff().iloc[1:]

# Timestamp at which column "p123" shows its largest increase between
# consecutive rows.
leakage_diff.loc[:, "p123"].idxmax()

Timestamp('2019-11-29 21:10:00')

In [24]:
# Map every pipe (link) id to the zone it is assigned to.
ps = zones_data["link_id"].values
zones = zones_data["zone"].values
p2z = dict(zip(ps, zones))

# One column per zone, one row per leakage timestamp, initialised to zero so
# the per-pipe leakage values can be accumulated with "+=".
zone_hits = pd.DataFrame(0.0, index=leakage.index, columns=list(range(num_zones)))

# NOTE(review): the first two columns of `leakage` are skipped here although
# "Date" and "Timestamp" were already dropped when the frame was loaded —
# confirm these two columns are not pipes that should be counted as well.
for pipe in list(leakage)[2:]:
    zone_hits[p2z[pipe]] += leakage[pipe].values

# Collapse the accumulated leakage to a 0/1 flag per zone and timestamp.
# Any non-zero value counts as a hit; NaN also yields 1, exactly as
# int(bool(nan)) did in the element-wise version this replaces.
zone_hits = (zone_hits != 0).astype("int64")

In [25]:
# Preview the per-zone 0/1 leak flags (rows: timestamps, columns: zone ids).
zone_hits

Unnamed: 0,0,1,2,3,4
2019-01-01 00:00:00,1,1,0,1,0
2019-01-01 00:05:00,1,1,0,1,0
2019-01-01 00:10:00,1,1,0,1,0
2019-01-01 00:15:00,1,1,0,1,0
2019-01-01 00:20:00,1,1,0,1,0
...,...,...,...,...,...
2019-12-31 23:35:00,1,1,1,1,1
2019-12-31 23:40:00,1,1,1,1,1
2019-12-31 23:45:00,1,1,1,1,1
2019-12-31 23:50:00,1,1,1,1,1


In [26]:
# Pack the per-zone flags into a single integer code per timestamp:
# zone j contributes 2**j, so the code is the sum of the flagged powers of two.
zone_hits["data"] = zone_hits[list(range(num_zones))].to_numpy() @ (
    2 ** np.arange(num_zones, dtype=np.int64)
)

# Prepare x data (pressure values)

In [27]:
# Load the pressure readings, index them by the combined "Date" + "Timestamp"
# columns parsed as datetimes, and drop the two source columns afterwards.
pressures = pd.read_csv("pressure_sensors.csv")
pressures = pressures.set_axis(
    pd.to_datetime(pressures["Date"] + " " + pressures["Timestamp"]),
    axis="index",
).drop(columns=["Date", "Timestamp"])

# Draw NUM_SENSORS distinct column names (no replacement) and keep only the
# readings of those sensors.
sensors = np.random.choice(list(pressures), size=NUM_SENSORS, replace=False)
selected_pressures = pressures.loc[:, sensors]

In [28]:
# Preview the pressure readings of the randomly chosen sensor columns.
selected_pressures

Unnamed: 0,n410,n549,n229,n114,n4,n105,n636,n495,n506,n740,...,n415,n769,n752,n644,n342,n516,n469,n458,n31,n54
2019-01-01 00:00:00,30.88,54.69,52.35,53.92,33.72,50.45,45.37,51.64,53.46,43.75,...,45.47,48.38,49.00,47.45,46.56,54.76,47.47,43.34,37.00,36.93
2019-01-01 00:05:00,30.99,54.80,52.49,54.02,33.75,50.54,45.46,51.70,53.55,43.78,...,45.56,48.43,49.07,47.54,46.66,54.88,47.56,43.45,37.02,37.05
2019-01-01 00:10:00,30.97,54.76,52.47,53.98,33.76,50.51,45.46,51.68,53.52,43.79,...,45.54,48.44,49.09,47.55,46.66,54.84,47.53,43.43,37.04,37.03
2019-01-01 00:15:00,31.03,54.82,52.52,54.01,33.78,50.54,45.50,51.70,53.56,43.79,...,45.60,48.46,49.11,47.59,46.71,54.88,47.56,43.49,37.05,37.09
2019-01-01 00:20:00,30.89,54.69,52.38,53.93,33.77,50.46,45.39,51.65,53.47,43.75,...,45.48,48.38,49.01,47.47,46.57,54.77,47.48,43.35,37.05,36.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 23:35:00,29.27,53.85,50.89,53.15,33.49,49.86,44.18,51.20,52.81,43.24,...,44.32,47.28,47.66,45.91,45.20,53.97,46.87,42.09,36.85,35.48
2019-12-31 23:40:00,29.18,53.78,50.83,53.08,33.48,49.80,44.15,51.16,52.75,43.23,...,44.25,47.27,47.65,45.87,45.15,53.91,46.81,42.02,36.85,35.41
2019-12-31 23:45:00,29.30,53.88,50.95,53.16,33.51,49.87,44.22,51.22,52.82,43.26,...,44.38,47.34,47.73,45.96,45.23,53.99,46.88,42.13,36.87,35.51
2019-12-31 23:50:00,29.22,53.83,50.82,53.13,33.50,49.84,44.15,51.19,52.79,43.23,...,44.30,47.27,47.65,45.87,45.15,53.95,46.85,42.05,36.86,35.43


# Random Sample of Data

In [29]:
# Draw NUM_TIMESTEPS timestamps from the zone_hits index. np.random.choice
# samples with replacement by default, so a timestamp can appear more than once.
# NOTE(review): pass replace=False if the sample is meant to be duplicate-free.
random_times = np.random.choice(zone_hits.index, NUM_TIMESTEPS)

In [30]:
# Collect the sampled rows of both tables, looked up by the same timestamps so
# the two frames stay aligned row by row.
random_data = {
    "pressure": selected_pressures.loc[random_times],
    "zone_hits": zone_hits.loc[random_times],
}

In [31]:
# Persist the sample. The dict is passed positionally, so np.savez stores it
# under the default key "arr_0" as a pickled object array; reading it back
# needs np.load(..., allow_pickle=True)["arr_0"].item().
output_path = f"data/data_{NUM_ZONES}.npz"
np.savez(output_path, random_data)

In [32]:
# Preview the sampled zone-hit rows: the per-zone flags plus the packed "data" code.
random_data["zone_hits"]

Unnamed: 0,0,1,2,3,4,data
2019-09-23 23:55:00,1,1,0,1,1,27
2019-06-28 03:10:00,1,1,0,1,1,27
2019-02-10 22:30:00,1,1,0,1,0,11
2019-07-24 15:35:00,1,1,0,1,1,27
2019-03-10 11:30:00,1,1,0,1,0,11
...,...,...,...,...,...,...
2019-04-28 22:55:00,1,1,0,1,1,27
2019-05-13 11:05:00,1,1,0,1,1,27
2019-03-11 07:15:00,1,1,0,1,0,11
2019-07-24 03:40:00,1,1,0,1,1,27
