In [1]:
import sys
print(f'Interpreter dir: {sys.executable}')
import os

if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../')
    
print(f'Working dir: {os.getcwd()}')
%load_ext autoreload
%autoreload 2

Interpreter dir: c:\users\sesma\water_hackathlon\.venv_waterhack\scripts\python.exe
Working dir: C:\Users\sesma\water_hackathlon


In [2]:
import pandas as pd
from pandas_profiling import ProfileReport

# Explore AW dataset

In [3]:
df_DMAleaks = pd.read_csv('data/raw/DMALeaks.csv')

# There are extra commas :/ lets merge extra column
df_DMAleaks['DMAName'] = df_DMAleaks['DMAName'].combine_first(df_DMAleaks['AUX'])
df_DMAleaks = df_DMAleaks.drop('AUX', axis=1)
df_DMAleaks

Unnamed: 0,DateRaised,LeakType,DMAName
0,2018-11-19 12:13:21,Other Repaired Leaks,NEWCENMA
1,2018-02-12 20:39:37,BurstMain,NORCRDMA
2,2020-08-13 10:43:46,BTBB,NORW37MA
3,2017-02-24 08:39:14,Other Repaired Leaks,NEWSEVMA
4,2019-02-21 12:26:52,Stop Tap/ Atplas (Fix On Site),NEWSEVMA
...,...,...,...
703,2020-05-15 07:50:10,Washout (Fixed On Site),NEWCENMA
704,2020-07-22 12:03:48,Comm Pipe (before ST),NEWSEVMA
705,2018-12-10 11:43:13,Stop Tap / Atplas,NEWCENMA
706,2019-08-09 14:37:52,BTBB,NEWSEVMA


In [4]:
df_DMAleaks.groupby('DMAName').count()

Unnamed: 0_level_0,DateRaised,LeakType
DMAName,Unnamed: 1_level_1,Unnamed: 2_level_1
"<12"" (300mm)",2,2
and above,2,2
not BTBB),2,2
BLOFLDMA,51,51
NEWCENMA,159,159
NEWSEVMA,247,247
NEWSTUMA,27,27
NORCRDMA,22,22
NORFIFMA,39,39
NORW18MA,13,13


- Something is funny with a couple of entries, due to an extra comma in the LeakType - fixed manually

# Prepare a dataset for classification problem

The target is the leakage at DMA level, which is a binary variable. We can organize the work as follows:
1. Find the times when a leakage was raised. Mark with 1
2. Add a new column, with a few hours ebfore and after that indicates is_leakage
3. We can predict that, or we can predict a (probabbly( easier target one that says "There is a leakage today"

On the features side, we need to align:
- Inflow
- Pressure
- ...
- Feature engineered columns (past values, moving average, ...)

# Prepare features dataset

In [5]:
df_p = pd.read_csv('data/raw/DMA15MinPressure.csv')
print(df_p.info())
df_p

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1005975 entries, 0 to 1005974
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   DMA        1005975 non-null  object 
 1   Timestamp  1005975 non-null  object 
 2   Value      1005975 non-null  float64
dtypes: float64(1), object(2)
memory usage: 23.0+ MB
None


Unnamed: 0,DMA,Timestamp,Value
0,BLOFLDMA,2019-03-25 00:00:00.0000000,4.952223
1,BLOFLDMA,2019-03-25 00:15:00.0000000,4.854159
2,BLOFLDMA,2019-03-25 00:30:00.0000000,4.805128
3,BLOFLDMA,2019-03-25 00:45:00.0000000,4.952223
4,BLOFLDMA,2019-03-25 01:00:00.0000000,4.903191
...,...,...,...
1005970,SWALSHMA,2020-07-31 23:00:00.0000000,0.000000
1005971,SWALSHMA,2020-07-31 23:15:00.0000000,0.000000
1005972,SWALSHMA,2020-07-31 23:30:00.0000000,0.000000
1005973,SWALSHMA,2020-07-31 23:45:00.0000000,0.000000


In [6]:
df_inf = pd.read_csv('data/raw/DMA15MinNetFLow.csv')
print(df_inf.info())
df_inf

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1348712 entries, 0 to 1348711
Data columns (total 3 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   DMA                 1348712 non-null  object 
 1   ReferenceTimestamp  1348712 non-null  object 
 2   m3Volume            1348712 non-null  float64
dtypes: float64(1), object(2)
memory usage: 30.9+ MB
None


Unnamed: 0,DMA,ReferenceTimestamp,m3Volume
0,NORW21MA,2016-12-01 00:00:00.0000000,8.094000
1,NORW21MA,2016-12-01 00:15:00.0000000,7.343000
2,NORW21MA,2016-12-01 00:30:00.0000000,6.684000
3,NORW21MA,2016-12-01 00:45:00.0000000,6.835000
4,NORW21MA,2016-12-01 01:00:00.0000000,6.220000
...,...,...,...
1348707,NEWCENMA,2020-05-20 21:45:00.0000000,21.640001
1348708,NEWCENMA,2020-05-20 22:00:00.0000000,19.190001
1348709,NEWCENMA,2020-05-20 22:15:00.0000000,18.350000
1348710,NEWCENMA,2020-05-20 22:30:00.0000000,17.230001


In [7]:
# Lets merge both
df_m = pd.merge(df_p, df_inf, left_on=['DMA','Timestamp'], right_on=['DMA','ReferenceTimestamp'])

In [8]:
df_m

Unnamed: 0,DMA,Timestamp,Value,ReferenceTimestamp,m3Volume
0,BLOFLDMA,2019-03-25 00:00:00.0000000,4.952223,2019-03-25 00:00:00.0000000,3.54
1,BLOFLDMA,2019-03-25 00:15:00.0000000,4.854159,2019-03-25 00:15:00.0000000,3.36
2,BLOFLDMA,2019-03-25 00:30:00.0000000,4.805128,2019-03-25 00:30:00.0000000,3.06
3,BLOFLDMA,2019-03-25 00:45:00.0000000,4.952223,2019-03-25 00:45:00.0000000,3.01
4,BLOFLDMA,2019-03-25 01:00:00.0000000,4.903191,2019-03-25 01:00:00.0000000,2.98
...,...,...,...,...,...
943663,SWALSHMA,2020-05-31 23:00:00.0000000,0.000000,2020-05-31 23:00:00.0000000,0.00
943664,SWALSHMA,2020-05-31 23:15:00.0000000,0.000000,2020-05-31 23:15:00.0000000,0.00
943665,SWALSHMA,2020-05-31 23:30:00.0000000,0.000000,2020-05-31 23:30:00.0000000,0.00
943666,SWALSHMA,2020-05-31 23:45:00.0000000,0.000000,2020-05-31 23:45:00.0000000,0.00


In [13]:
print(df_m.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 943668 entries, 0 to 943667
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   DMA                 943668 non-null  object 
 1   Timestamp           943668 non-null  object 
 2   Value               943668 non-null  float64
 3   ReferenceTimestamp  943668 non-null  object 
 4   m3Volume            943668 non-null  float64
dtypes: float64(2), object(3)
memory usage: 43.2+ MB
None


# Prepare target dataset

Convert the dataset df_DMAleaks into a time-series

In [16]:
df_DMAleaks.rename(columns={'DMAName':'DMA',}, inplace=True)
df_DMAleaks

Unnamed: 0,DateRaised,LeakType,DMA
0,2018-11-19 12:13:21,Other Repaired Leaks,NEWCENMA
1,2018-02-12 20:39:37,BurstMain,NORCRDMA
2,2020-08-13 10:43:46,BTBB,NORW37MA
3,2017-02-24 08:39:14,Other Repaired Leaks,NEWSEVMA
4,2019-02-21 12:26:52,Stop Tap/ Atplas (Fix On Site),NEWSEVMA
...,...,...,...
703,2020-05-15 07:50:10,Washout (Fixed On Site),NEWCENMA
704,2020-07-22 12:03:48,Comm Pipe (before ST),NEWSEVMA
705,2018-12-10 11:43:13,Stop Tap / Atplas,NEWCENMA
706,2019-08-09 14:37:52,BTBB,NEWSEVMA


In [10]:
df_leaks = pd.merge_asof(df_m, df_DMAleaks, on="DMA", by=)

In [12]:
print(df_leaks.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73615716 entries, 0 to 73615715
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   DMA                 object 
 1   Timestamp           object 
 2   Value               float64
 3   ReferenceTimestamp  object 
 4   m3Volume            float64
 5   DateRaised          object 
 6   LeakType            object 
 7   DMAName             object 
dtypes: float64(2), object(6)
memory usage: 4.9+ GB
None


In [14]:
df_leaks

Unnamed: 0,DMA,Timestamp,Value,ReferenceTimestamp,m3Volume,DateRaised,LeakType,DMAName
0,BLOFLDMA,2019-03-25 00:00:00.0000000,4.952223,2019-03-25 00:00:00.0000000,3.54,2017-10-10 08:29:29,BurstMain,BLOFLDMA
1,BLOFLDMA,2019-03-25 00:00:00.0000000,4.952223,2019-03-25 00:00:00.0000000,3.54,2017-01-05 14:45:29,BurstMain,BLOFLDMA
2,BLOFLDMA,2019-03-25 00:00:00.0000000,4.952223,2019-03-25 00:00:00.0000000,3.54,2020-07-01 10:41:46,BTBB,BLOFLDMA
3,BLOFLDMA,2019-03-25 00:00:00.0000000,4.952223,2019-03-25 00:00:00.0000000,3.54,2017-08-21 14:36:56,Other Repaired Leaks,BLOFLDMA
4,BLOFLDMA,2019-03-25 00:00:00.0000000,4.952223,2019-03-25 00:00:00.0000000,3.54,2019-10-29 06:49:57,BurstMain,BLOFLDMA
...,...,...,...,...,...,...,...,...
73615711,SWALSHMA,2020-06-01 00:00:00.0000000,0.000000,2020-06-01 00:00:00.0000000,0.00,2017-01-26 08:09:00,"Main up to 6"" (150mm)",SWALSHMA
73615712,SWALSHMA,2020-06-01 00:00:00.0000000,0.000000,2020-06-01 00:00:00.0000000,0.00,2020-07-31 07:33:03,Fire Hydrant (Fixed On Site),SWALSHMA
73615713,SWALSHMA,2020-06-01 00:00:00.0000000,0.000000,2020-06-01 00:00:00.0000000,0.00,2019-05-18 21:43:47,BurstMain,SWALSHMA
73615714,SWALSHMA,2020-06-01 00:00:00.0000000,0.000000,2020-06-01 00:00:00.0000000,0.00,2018-01-08 09:45:04,"Main up to 6"" (150mm)",SWALSHMA


In [17]:
df_leaks['Timestamp'].set_option('precision',0)

AttributeError: 'Series' object has no attribute 'set_option'