In [35]:
import os
from sys import path
path.append('..')

import pandas as pd
import numpy as np

import features.hydro
import utils

In [2]:
# Project root, given relative to the notebook's working directory.
root_dir = "../../"
# Directory that holds the CSV inputs read below and the feature file written at the end.
data_dir = os.path.join(root_dir, "working_data/")

# CSV with the water-level observations.
water_levels_path = os.path.join(data_dir, "water_levels.csv")
# Disabled: path to a correlations/distances table that is not loaded at the moment.
# corrs_dists_path = os.path.join(data_dir, "corrs_and_dists.csv")

In [3]:
# Coordinates table of the hydro posts; it is joined onto the features by "id" later on.
# The path is built with os.path.join (like the other paths in this notebook) so it
# does not rely on data_dir ending with a path separator.
hydro_coords = pd.read_csv(os.path.join(data_dir, "hydro_posts_coords.csv"))

In [4]:
# Read the water-level table and parse its "date" column as YYYY-MM-DD dates.
water_levels = pd.read_csv(water_levels_path)
water_levels["date"] = pd.to_datetime(water_levels["date"], format="%Y-%m-%d")
# Index the rows by (id, date); later cells address these two index levels.
water_levels.set_index(["id", "date"], inplace=True)
# NOTE(review): presumably downcasts column dtypes to save memory — confirm in utils.reduce_memory_usage.
water_levels = utils.reduce_memory_usage(water_levels)

water_levels.head()

# Disabled: loading of the correlations/distances table.
# corr_and_nearest = pd.read_csv(corrs_dists_path)
# corr_and_nearest.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,max_level
id,date,Unnamed: 2_level_1
5001,1984-01-01,258.0
5001,1984-01-02,255.0
5001,1984-01-03,252.0
5001,1984-01-04,248.0
5001,1984-01-05,244.0


In [5]:
def fill_missing_dates(water_levels, fill_val=np.nan):
    """Reindex `water_levels` so every id has a row for every calendar day.

    The frame is expected to carry a two-level index: ids on level 0 and dates
    on level 1. The daily range runs from the earliest to the latest date found
    anywhere in the frame, and is applied to each distinct id.

    Parameters
    ----------
    water_levels : pandas.DataFrame
        Frame indexed by (id, date).
    fill_val : scalar, default NaN
        Value written into rows that did not exist before reindexing.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by the full id x date product.
    """
    index = water_levels.index
    station_ids = index.get_level_values(0).unique()
    observed_dates = index.get_level_values(1)

    # One entry per day between the global first and last observation.
    full_range = pd.date_range(observed_dates.min(), observed_dates.max(), name="date")
    complete_index = pd.MultiIndex.from_product([station_ids, full_range])

    return water_levels.reindex(complete_index, fill_value=fill_val)

# Expand every id to the full daily date range; days without a record become NaN rows.
water_levels = fill_missing_dates(water_levels)
water_levels.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,max_level
id,date,Unnamed: 2_level_1
5001,1984-01-01,258.0
5001,1984-01-02,255.0
5001,1984-01-03,252.0
5001,1984-01-04,248.0
5001,1984-01-05,244.0


In [11]:
# Statistic specifications handed to the extractor under the "levels_stat_config" key.
# NOTE(review): each entry looks like [aggregation function, offset in days, window
# length in days] — confirm the meaning of the two integers in features.hydro.Extractor.
stat_config = [
    [np.nanmean, 1, 7],
    [np.nanmean, 1, 30],
    [np.nanstd, 1, 30],
    
    [np.nanmax, 1, 7],
    [np.nanmin, 1, 7],
    
    [np.nanmax, 1, 30],
    [np.nanmin, 1, 30]
]

def lag_from_previous(series, lag):
    """Return the observation `lag` positions back from the end of `series`.

    `lag=1` is the last element, `lag=2` the one before it, and so on. The
    lookup is positional on the underlying array (`series.values`), so index
    labels are ignored.

    Caveats: `lag=0` yields the first element (because `-0 == 0`), and a `lag`
    larger than the series length raises IndexError.
    """
    raw_values = series.values
    position = -lag
    return raw_values[position]


def lag1(series):
    """Return the last value of `series` (a lag of one position from the end)."""
    return lag_from_previous(series, lag=1)


def lag2(series):
    """Return the second-to-last value of `series` (a lag of two positions from the end)."""
    return lag_from_previous(series, lag=2)


def func_for_n_last(series, func, n_last):
    """Apply `func` to the trailing `n_last` entries of `series`.

    The tail is taken positionally with `.iloc`, so `func` receives a Series.
    When `n_last` exceeds the length, the whole series is passed. The same
    happens for `n_last=0`, because the slice `[-0:]` starts at position 0.
    """
    trailing = slice(-n_last, None)
    return func(series.iloc[trailing])


def mean_last_5_years(series):
    """Return the NaN-ignoring mean of the last five entries of `series`.

    NOTE(review): the name implies one entry per year — confirm how the
    extractor groups the values it passes in.
    """
    return func_for_n_last(series, func=np.nanmean, n_last=5)


def std_last_5_years(series):
    """Return the NaN-ignoring standard deviation of the last five entries of `series`.

    NOTE(review): the name implies one entry per year — confirm how the
    extractor groups the values it passes in.
    """
    return func_for_n_last(series, func=np.nanstd, n_last=5)

# Aggregation functions passed to the extractor under the "past_years_funcs" key.
doy_funcs = [lag1, lag2, mean_last_5_years, std_last_5_years]

In [12]:
# Feature-extraction settings given to features.hydro.Extractor.
# NOTE(review): the key semantics are defined in features.hydro — presumably "lags"
# are day offsets (1..7 here) and the "diff_*" keys configure difference features;
# confirm there.
extract_config = {
    "lags": np.arange(1, 8),
    "diff_lags": [7],
    "diff_funcs": [np.nanmean, np.nanstd],
    "levels_stat_config": stat_config,
    "past_years_funcs": doy_funcs
}


In [13]:
# Settings passed to LastDayExtractManager together with the extractor.
# NOTE(review): presumably the number of trailing days each feature group needs;
# "doy" is five 365-day years plus one day — confirm in features.hydro.
days_usage_config = {
    "lags": 9,
    "diff": 9, 
    "levels_stat": 32,
    "doy": 365 * 5 + 1
}

In [18]:
# Keep only the rows that belong to the ids of interest.
target_station_ids = [6005, 6022, 6027, 5004, 5012, 5024, 5805]
# reset_index() exposes the "id" index level as a column so isin() can build a row mask.
station_id = water_levels.reset_index()["id"]
target_station_mask = station_id.isin(target_station_ids).values
# Overwrites water_levels: every later cell sees only the filtered ids.
water_levels = water_levels[target_station_mask]

In [21]:
# Build an extractor from extract_config and wrap it in a manager that also receives
# the water levels, the coordinates table and days_usage_config.
# NOTE(review): judging by its name, this manager produces features for the last
# available date only — confirm in features.hydro.
extractor = features.hydro.Extractor(extract_config)
extr_manager = features.hydro.LastDayExtractManager(water_levels, hydro_coords, extractor, days_usage_config)

In [22]:
%%time
# Run the last-day extraction; the %%time cell magic reports how long the cell took.
last_day_features = extr_manager.extract()

start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!


  results[i] = self.f(v)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


agg func: lag1
agg func: lag2
agg func: mean_last_5_years
agg func: std_last_5_years
CPU times: user 23 s, sys: 156 ms, total: 23.2 s
Wall time: 23.3 s


In [29]:
# Spot-check a single extracted feature column.
last_day_features["diff_1"]

id    date      
5004  2020-10-01    -6.0
5012  2020-10-01   -11.0
5024  2020-10-01     4.0
5805  2020-10-01     1.0
6005  2020-10-01    -4.0
6022  2020-10-01    22.0
6027  2020-10-01     0.0
Name: diff_1, dtype: float64

In [38]:
# Extractor/manager pair used for the `all_features` extraction below.
# The manager receives `whole_extractor`, the instance created on the previous line.
# Passing the earlier `extractor` would leave `whole_extractor` unused and make this
# cell depend on the last-day cell having been executed first.
whole_extractor = features.hydro.Extractor(extract_config)
whole_extr_manager = features.hydro.ExtractManager(water_levels, hydro_coords, whole_extractor)

In [39]:
%%time
# Run the extraction through whole_extr_manager; %%time reports how long the cell took.
all_features = whole_extr_manager.extract()

start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!


  results[i] = self.f(v)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


agg func: lag1
agg func: lag2
agg func: mean_last_5_years
agg func: std_last_5_years
CPU times: user 2min 32s, sys: 414 ms, total: 2min 33s
Wall time: 2min 33s


In [56]:
def merge_features_with_coords(features, coords):
    """Left-join the columns of `coords` onto `features` by "id".

    `features` is flattened with `reset_index()` so "id" and "date" become
    ordinary columns, joined with `coords` on "id" (every feature row is kept;
    ids missing from `coords` get NaN), and then re-indexed by ("id", "date").

    Returns a new DataFrame; neither input is modified.
    """
    flat_features = features.reset_index()
    joined = flat_features.merge(coords, how="left", on="id")
    return joined.set_index(["id", "date"])

def df_doy(df):
    """Return the day-of-year (1-366) of each row's "date" as a NumPy array.

    "date" is read after `reset_index()`, so it may be an index level of `df`;
    the result follows the row order of `df`.
    """
    flattened = df.reset_index()
    return flattened["date"].dt.dayofyear.values

# Attach the coordinates, a day-of-year column and the target to the extracted features.
# NOTE: this cell reassigns all_features, so running it twice merges the coordinates a
# second time; re-run the extraction cell first to start from a fresh all_features.
all_features = merge_features_with_coords(all_features, hydro_coords)
all_features["doy"] = df_doy(all_features)
# Select the level column explicitly instead of assigning the whole DataFrame, so the
# assignment does not depend on water_levels having exactly one column. pandas aligns
# the values on the (id, date) index shared by both frames.
all_features["target"] = water_levels["max_level"]

In [58]:
# Persist the assembled feature table as CSV in data_dir (to_csv writes the index by default).
all_features.to_csv(os.path.join(data_dir, "hydro_features.csv"))