In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
import utils
import os
from collections.abc import Iterable

In [3]:
root_dir = "../"
data_dir = os.path.join(root_dir, "working_data/")

water_levels_path = os.path.join(data_dir, "water_levels.csv")
corrs_dists_path = os.path.join(data_dir, "corrs_and_dists.csv")

In [4]:
water_levels = pd.read_csv(water_levels_path)
water_levels["date"] = pd.to_datetime(water_levels["date"], format="%Y-%m-%d")
water_levels.set_index(["id", "date"], inplace=True)
water_levels = utils.reduce_memory_usage(water_levels)

water_levels.head()

corr_and_nearest = pd.read_csv(corrs_dists_path)
corr_and_nearest.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,max_level
id,date,Unnamed: 2_level_1
5001,1984-01-01,258.0
5001,1984-01-02,255.0
5001,1984-01-03,252.0
5001,1984-01-04,248.0
5001,1984-01-05,244.0


Unnamed: 0,post,best_corr_post,best_corr_shift,nearest_post,nearest_shift,best_corr_dist,best_corr_value,nearest_dist,nearest_corr
0,5001,5002.0,0.0,5036.0,-1,0.563987,0.955423,0.277389,0.483945
1,5002,6030.0,-1.0,5036.0,-1,1.444206,0.97028,0.317767,0.494123
2,5004,5008.0,0.0,5675.0,-1,2.105232,0.969865,0.20025,-0.115904
3,5008,5004.0,-2.0,5358.0,-1,2.105232,0.978104,0.254951,0.793771
4,5009,5012.0,0.0,5008.0,-1,0.407063,0.946303,0.269072,0.891186


In [5]:
def fill_missing_dates(water_levels, fill_val=np.nan):
    dates = water_levels.index.get_level_values(1)
    min_date, max_date = dates.min(), dates.max()
    
    new_index = pd.MultiIndex.from_product([water_levels.index.get_level_values(0).unique(), 
                                            pd.date_range(min_date, max_date, name="date")])
    water_levels = water_levels.reindex(new_index, fill_value=fill_val)
    return water_levels

water_levels = fill_missing_dates(water_levels)
water_levels.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,max_level
id,date,Unnamed: 2_level_1
5001,1984-01-01,258.0
5001,1984-01-02,255.0
5001,1984-01-03,252.0
5001,1984-01-04,248.0
5001,1984-01-05,244.0


In [7]:
def extract_features(df, func, func_args_list):
    id_groups = df.groupby(by="id")
    features = []
    
    for func_args in func_args_list:
        if not isinstance(func_args, Iterable):
            func_args = (func_args,)
        func_out = func(id_groups, *func_args)
            
        features.append(func_out)
        
    features_df = pd.concat(features, axis=1)
    return features_df


def lag(grouped, lag):
    feature = grouped.shift(lag)
    
    feature_name = f"lag_{lag}"
    feature = rename_df(feature, feature_name)
    
    return feature


def stat(grouped, func, lag, winsize):
    print("start extracting")
    feature = grouped.rolling(winsize, min_periods=1).agg(func).shift(lag)
    print("end!")
    
    feature = drop_redundant_agg_indexes(feature)
    
    feature_name = f"{func.__name__}_{lag}_{winsize}"
    feature = rename_df(feature, feature_name)
    
    return feature


def drop_redundant_agg_indexes(df):
    df.index = df.index.droplevel(0) # agg creates second id col
    return df


def rename_first_col(df, new_name):
    old_name = df.columns[0]
    new_name = f"{old_name}_" + new_name
    
    return df.rename(columns={old_name: new_name})

# def mean_previous_years(df):
#     feature = 

    
# def last_year_records(df)


In [9]:
def diff_lag(grouped, ndays):
    differences = grouped.diff(1).groupby("id") #group again to roll by groups not whole df
    
    feature_names = [f"diff_{i}" for i in range(1, ndays+1)]
    diff_lags = differences.rolling(ndays, min_periods=1) # need shift to get only previous values without current one
    
    diff_lags = all_rolling_lags(diff_lags, ndays)
    
    diff_lags_df = pd.DataFrame(diff_lags, columns=feature_names)
    return diff_lags_df
        
    
def all_rolling_lags(roll, ndays):
    vals = []
    for window in enumerate(roll):
        window_vals_flipped = window[1].values[:, 0]
        # window is took from earlier days to later, so we flip it to lag format (1 col is lag 1 day ago etc)
        window_vals = np.flip(window_vals_flipped)
        
        win_len = window_vals.shape[0]
        
        if win_len < ndays:
            padding_array = np.full(ndays - win_len, np.nan)
            padded_window_values = np.concatenate((window_vals, padding_array))
            
        else:
            padded_window_values = window_vals
            
        vals.append(padded_window_values)
    return np.array(vals)

diff_lags = extract_features(water_levels, diff_lag, [7])
diff_lags.head()

Unnamed: 0,diff_1,diff_2,diff_3,diff_4,diff_5,diff_6,diff_7
0,,,,,,,
1,-3.0,,,,,,
2,-3.0,-3.0,,,,,
3,-4.0,-3.0,-3.0,,,,
4,-4.0,-4.0,-3.0,-3.0,,,


In [10]:
diff_lags.index = water_levels.index
diff_lags.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,diff_1,diff_2,diff_3,diff_4,diff_5,diff_6,diff_7
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5001,1984-01-01,,,,,,,
5001,1984-01-02,-3.0,,,,,,


In [None]:
lags = extract_features(water_levels, lag, np.arange(1, 8))
lags.head()

In [None]:
stat_config = [
    [np.nanmean, 1, 7],
    [np.nanmean, 1, 30],
    [np.nanstd, 1, 30],
    
    [np.nanmax, 1, 7],
    [np.nanmin, 1, 7],
    
    [np.nanmax, 1, 30],
    [np.nanmin, 1, 30]
]

stats = extract_features(water_levels, stat, stat_config)
stats.head()

In [84]:
## Lagecy attempt to extract seasonal features

# doy = pd.Series(water_levels.index.get_level_values("date").dayofyear, name="doy")
# year = pd.Series( water_levels.index.get_level_values("date").year, name="year")

# extended_levels = pd.concat([water_levels.reset_index(), doy, year], axis=1)
# extended_levels.head()

# conc = []

# for station_id in extended_levels["id"].unique():
#     for date in sorted(extended_levels["date"].unique()):
#         date = pd.to_datetime(date)
#         doy, year = date.dayofyear, date.year

#         previous_years_same_day_mask = (extended_levels["doy"] == doy) & (extended_levels["year"] < year) & (extended_levels["id"] == station_id)

#         vals = extended_levels.loc[previous_years_same_day_mask, "max_level"]
#         if len(vals) > 0:
#             conc.append(np.nanmean(vals))
#         else:
#             conc.append(np.nan)

In [46]:
def past_years_stat(df, func):
    
    old_index = df.index
    
    months = df.index.get_level_values("date").month
    days = df.index.get_level_values("date").day
    station = df.index.get_level_values("id")
    
    same_doy = df["max_level"].groupby([months, days, station])
    mean_previous_years = same_doy.expanding(min_periods=1).agg(func)
    
    mean_previous_years.index = old_index
    return mean_previous_years
    

mean_previous_years = past_years_mean(water_levels)
mean_previous_years.head()

id    date      
5001  1984-01-01    258.000000
      1984-01-02    272.500000
      1984-01-03    288.333333
      1984-01-04    219.500000
      1984-01-05    229.200000
Name: max_level, dtype: float64

In [44]:
%%time
l = []
for i in pd.Series(np.random.rand(10^6)).rolling(7):
#     print(i)
    l.append(i.values)

CPU times: user 3.05 ms, sys: 0 ns, total: 3.05 ms
Wall time: 3.08 ms


In [48]:
len(pd.Series(np.random.rand(10^6)).rolling(7))

TypeError: object of type 'Rolling' has no len()