In [4]:
# TODO:
# возможные глоб. фичи:
# groupby: year, month, id (в различных комбинациях)
# функции: mean, std, mean_diff

In [5]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [6]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
import utils
import os
from collections.abc import Iterable

In [7]:
# Locations of the input/output data, relative to the notebook's working directory.
root_dir = "../"
data_dir = os.path.join(root_dir, "working_data/")

# CSV with the water level observations that is loaded below.
water_levels_path = os.path.join(data_dir, "water_levels.csv")
# corrs_dists_path = os.path.join(data_dir, "corrs_and_dists.csv")

In [8]:
# Table of hydrological post coordinates; it is merged onto the features by "id" later on.
# The path is built with os.path.join (like the other paths) so it does not depend on
# data_dir ending with a path separator.
hydro_coords = pd.read_csv(os.path.join(data_dir, "hydro_posts_coords.csv"))

In [9]:
# Read the observations, parse the "date" column and index the frame by (id, date);
# the frame is then passed through the project's memory-reduction helper.
water_levels = (
    pd.read_csv(water_levels_path)
    .assign(date=lambda frame: pd.to_datetime(frame["date"], format="%Y-%m-%d"))
    .set_index(["id", "date"])
    .pipe(utils.reduce_memory_usage)
)

water_levels.head()

# corr_and_nearest = pd.read_csv(corrs_dists_path)
# corr_and_nearest.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,max_level
id,date,Unnamed: 2_level_1
5001,1984-01-01,258.0
5001,1984-01-02,255.0
5001,1984-01-03,252.0
5001,1984-01-04,248.0
5001,1984-01-05,244.0


In [10]:
def fill_missing_dates(water_levels, fill_val=np.nan):
    """Reindex the frame so every id has a row for every calendar day.

    The target index is the full product of the unique values of index
    level 0 and a daily date range spanning the smallest to the largest value
    of index level 1.

    Parameters
    ----------
    water_levels : pandas.DataFrame
        Frame with a two-level index (level 0: id, level 1: date).
    fill_val : scalar, default NaN
        Value written into rows that did not exist before reindexing.

    Returns
    -------
    pandas.DataFrame
        A reindexed copy; the input frame is not modified.
    """
    all_dates = water_levels.index.get_level_values(1)
    full_range = pd.date_range(all_dates.min(), all_dates.max(), name="date")
    station_ids = water_levels.index.unique(level=0)

    complete_index = pd.MultiIndex.from_product([station_ids, full_range])
    return water_levels.reindex(complete_index, fill_value=fill_val)

# water_levels = fill_missing_dates(water_levels)
# water_levels.head()

In [11]:
def extract_features(df, func, func_args_list):
    """Run one feature function over several argument sets and join the results.

    ``df`` is grouped by "id" once; ``func`` is then called as
    ``func(groups, *args)`` for every entry of ``func_args_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that can be grouped by "id".
    func : callable
        Feature extractor taking the groupby object followed by its arguments.
    func_args_list : iterable
        One entry per call. An entry that is not itself iterable (e.g. a
        single number) is treated as a one-element argument tuple.

    Returns
    -------
    pandas.DataFrame
        All outputs concatenated column-wise.
    """
    by_station = df.groupby(by="id")

    def as_args(entry):
        # a bare scalar is wrapped so it can be star-unpacked like a tuple
        return entry if isinstance(entry, Iterable) else (entry,)

    outputs = [func(by_station, *as_args(entry)) for entry in func_args_list]
    return pd.concat(outputs, axis=1)


def lag(grouped, lag):
    """Shift every group by ``lag`` rows and tag the first column with the lag.

    Parameters
    ----------
    grouped : pandas groupby object
        Data grouped per station.
    lag : int
        Number of rows to shift by.

    Returns
    -------
    pandas.DataFrame
        Shifted frame whose first column got the suffix ``_lag_{lag}``.
    """
    shifted = grouped.shift(lag)
    return rename_first_col(shifted, f"lag_{lag}")


def stat(grouped, func, lag, winsize):
    """Rolling aggregate of the lagged values, computed per "id" group.

    Parameters
    ----------
    grouped : pandas groupby object
        Data grouped per station.
    func : callable
        Aggregation applied to each rolling window; its ``__name__`` becomes
        part of the feature name.
    lag : int
        Shift applied before rolling.
    winsize : int
        Rolling window size (``min_periods`` is 1).

    Returns
    -------
    pandas.DataFrame
        Frame whose first column got the suffix ``_{func name}_{lag}_{winsize}``.
    """
    # shift first, then regroup so the rolling window is applied per "id"
    lagged_by_station = grouped.shift(lag).groupby("id")

    print("start extracting")
    rolled = lagged_by_station.rolling(winsize, min_periods=1).agg(func)
    print("end!")

    # the rolling aggregate comes back with an extra leading index level
    rolled = drop_redundant_agg_indexes(rolled)
    return rename_first_col(rolled, f"{func.__name__}_{lag}_{winsize}")


def drop_redundant_agg_indexes(df):
    """Remove the outermost index level of ``df`` in place and return ``df``.

    Used after a grouped rolling aggregation, which adds a second id level
    in front of the original index.
    """
    trimmed_index = df.index.droplevel(0)
    df.index = trimmed_index
    return df


def rename_first_col(df, new_name):
    """Return a copy of ``df`` whose first column is renamed to ``{old}_{new_name}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least one column.
    new_name : str
        Suffix appended (after an underscore) to the current first column name.
    """
    first = df.columns[0]
    return df.rename(columns={first: f"{first}_" + new_name})

# def mean_previous_years(df):
#     feature = 

    
# def last_year_records(df)


In [12]:
def diff_lag(grouped, ndays):
    """Build a table of the ``ndays`` previous day-to-day differences per row.

    Parameters
    ----------
    grouped : pandas groupby object
        Data grouped per station.
    ndays : int
        Number of lagged differences to keep; produces columns
        ``diff_1`` ... ``diff_{ndays}``.

    Returns
    -------
    pandas.DataFrame
        Frame built from a plain array, so it carries a default integer
        index rather than the index of the input.
    """
    # diff within each group, then shift by one so a row only sees differences of
    # earlier rows; regroup by "id" so windows are rolled per group, not over the whole frame
    previous_diffs = grouped.diff(1).groupby("id").shift(1).groupby("id")
    windows = previous_diffs.rolling(ndays, min_periods=1)

    lag_matrix = all_rolling_lags(windows, ndays)
    column_names = [f"diff_{day}" for day in range(1, ndays + 1)]
    return pd.DataFrame(lag_matrix, columns=column_names)
        
    
def all_rolling_lags(roll, ndays):
    """Turn every window of a rolling object into one row of lag values.

    Each window's first column is reversed so that position 0 holds the
    most recent value (lag 1), position 1 the one before it, and so on.
    Windows shorter than ``ndays`` are right-padded with NaN.

    Parameters
    ----------
    roll : iterable of pandas.DataFrame
        Rolling object; iterating it yields one frame per window.
    ndays : int
        Target row length.

    Returns
    -------
    numpy.ndarray
        One row per window.
    """
    rows = []
    for frame in roll:
        # windows run from older to newer rows; reverse into lag order
        newest_first = frame.values[:, 0][::-1]

        missing = ndays - newest_first.shape[0]
        if missing > 0:
            newest_first = np.concatenate((newest_first, np.full(missing, np.nan)))

        rows.append(newest_first)
    return np.array(rows)

# diff_lags = extract_features(water_levels, diff_lag, [7])
# diff_lags.index = water_levels.index
# diff_lags.head(3)

In [13]:
def calc_diff_lags_stats(diff_lags, funcs):
    """Reduce every row of ``diff_lags`` with each function in ``funcs``.

    Parameters
    ----------
    diff_lags : pandas.DataFrame
        Table of lagged differences, one row per observation.
    funcs : iterable of callables
        Row-wise reducers; each result column is named ``diff_{func.__name__}``.
        The name of each function is printed as it is processed.

    Returns
    -------
    pandas.DataFrame
        One column per function, indexed like ``diff_lags``.
    """
    columns = []
    for reducer in funcs:
        print(reducer.__name__)
        row_stat = diff_lags.apply(reducer, axis=1)
        columns.append(row_stat.rename(f"diff_{reducer.__name__}"))

    return pd.concat(columns, axis=1)

In [14]:
# diff_lags_stats = calc_diff_lags_stats(diff_lags, [np.nanmean, np.nanstd])
# diff_lags_stats.head()

In [15]:
# lags = extract_features(water_levels, lag, np.arange(1, 8))
# lags.head()

In [16]:
# stat_config = [
#     [np.nanmean, 1, 7],
#     [np.nanmean, 1, 30],
#     [np.nanstd, 1, 30],
    
#     [np.nanmax, 1, 7],
#     [np.nanmin, 1, 7],
    
#     [np.nanmax, 1, 30],
#     [np.nanmin, 1, 30]
# ]

# stats = extract_features(water_levels, stat, stat_config)
# stats.head()

In [17]:
def past_years_stats(df, funcs):
    """Expanding statistics over the same calendar day in earlier rows.

    Rows of ``df["max_level"]`` are grouped by (month, day, id). Inside each
    group the values are shifted by one so that the aggregate for a row never
    includes that row's own value, then every function in ``funcs`` is applied
    over an expanding window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by ("id", "date") with a "max_level" column.
    funcs : iterable of callables
        Aggregations; each result column is named ``doy_{func.__name__}``.
        The name of each function is printed as it is processed.

    Returns
    -------
    pandas.DataFrame
        One column per function, indexed by ("id", "date") and sorted by it.
    """
    date_level = df.index.get_level_values("date")
    doy_keys = [date_level.month, date_level.day, df.index.get_level_values("id")]

    # shift inside each (month, day, id) group: the current row's level is not
    # known when its features are computed, so only earlier rows may contribute
    earlier_levels = df["max_level"].groupby(doy_keys).shift(1)
    earlier_by_doy = earlier_levels.groupby(doy_keys)

    columns = []
    for func in funcs:
        print(f"agg func: {func.__name__}")
        column = earlier_by_doy.expanding(min_periods=1).agg(func)
        column.name = f"doy_{func.__name__}"
        columns.append(column)

    merged = pd.concat(columns, axis=1)
    # the three leading levels are the group keys added by the grouped expanding window
    return merged.droplevel([0, 1, 2]).sort_index(level=["id", "date"])


In [18]:
def lag_from_previous(series, lag):
    """Return the element ``lag`` positions from the end of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Window of values, oldest first.
    lag : int
        1 selects the last element, 2 the one before it, and so on.

    Returns
    -------
    scalar
        The selected element, or NaN when ``series`` holds fewer than ``lag``
        elements (previously this raised ``IndexError``).

    Raises
    ------
    ValueError
        If ``lag`` is smaller than 1. ``iloc[-0]`` is ``iloc[0]``, so a zero
        lag would silently return the *first* element.
    """
    if lag < 1:
        raise ValueError("lag must be a positive integer")
    if lag > len(series):
        # not enough history in this window yet
        return np.nan
    return series.iloc[-lag] # use .values if breaks

def lag1(series):
    """Return the element one position from the end of ``series`` (via ``lag_from_previous``)."""
    return lag_from_previous(series, 1)

def lag2(series):
    """Return the element two positions from the end of ``series`` (via ``lag_from_previous``)."""
    return lag_from_previous(series, 2)

def func_for_n_last(series, func, n_last):
    """Apply ``func`` to the trailing ``n_last`` elements of ``series``.

    The tail is taken with ``series.iloc[-n_last:]``, so a series shorter than
    ``n_last`` is passed to ``func`` in full.
    """
    return func(series.iloc[slice(-n_last, None)])

def mean_last_5_years(series):
    """Apply ``np.nanmean`` to the last five elements of ``series`` (via ``func_for_n_last``)."""
    return func_for_n_last(series, np.nanmean, 5)

def std_last_5_years(series):
    """Apply ``np.nanstd`` to the last five elements of ``series`` (via ``func_for_n_last``)."""
    return func_for_n_last(series, np.nanstd, 5)

# doy_stats = past_years_stats(water_levels, [mean_last_5_years, std_last_5_years, np.nanmean, np.nanstd, lag1, lag2])

In [19]:
# lags.head()
# stats.head()
# doy_stats.head() #also includes lags
# diff_lags.head()
# diff_lags_stats.head()

# check indexes are same
def check_same_idxs(dfs):
    """Tell whether every frame in ``dfs`` has an index equal to the first one's.

    Parameters
    ----------
    dfs : sequence of pandas objects
        Must contain at least one element.

    Returns
    -------
    bool
        True when all indexes are equal (trivially so for a single frame).
    """
    reference = dfs[0].index
    return all(reference.equals(other.index) for other in dfs[1:])

# feature_dfs = (lags, stats, doy_stats, diff_lags, diff_lags_stats)
# if check_same_idxs(feature_dfs):
#     all_features = pd.concat(feature_dfs, axis=1)
# else:
#     raise ValueError("dfs have different indexes, cant concatenate")

# all_features.head()

In [20]:
# разумеется, нужно добавить сами предсказываемые значения

# water_levels.rename(columns={"max_level": "target"}, inplace=True)
# all_features = all_features.merge(water_levels, left_index=True, right_index=True)
# all_features.head()

### add lon lat

In [21]:
# all_features = pd.read_csv(os.path.join(data_dir, "hydro_features.csv"))
# all_features["date"] = pd.to_datetime(all_features["date"])
# all_features.set_index(["id", "date"], inplace=True)
# all_features.head()

In [22]:
# hydro_coords = pd.read_csv(data_dir + "hydro_posts_coords.csv")
# hydro_coords.head()

In [23]:
# all_features = all_features.reset_index().merge(hydro_coords, on="id", how="left")
# all_features.set_index(["id", "date"], inplace=True)

### extract calendar features 

In [24]:
# date = all_features.index.get_level_values("date")
# doy = date.dayofyear
# all_features["doy"] = doy

In [25]:
def df_doy(df):
    """Day of year of every row's "date".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime "date" either as an index level or as a column.

    Returns
    -------
    pandas.Series
        Series named "doy", indexed like ``df`` so it can be concatenated
        column-wise with frames that share that index.
    """
    dates = df.reset_index()["date"]
    # ``dates`` is a Series, so the datetime accessor is required; the values are
    # re-attached to the original index because reset_index replaced it
    return pd.Series(dates.dt.dayofyear.to_numpy(), index=df.index, name="doy")

In [26]:
def extract_diff_features(df):
    """Lagged day-to-day differences (7 lags) plus their row-wise nanmean and nanstd.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that can be grouped by "id".

    Returns
    -------
    pandas.DataFrame
        The lag columns followed by the two summary columns, indexed like ``df``.
    """
    lagged_diffs = extract_features(df, diff_lag, [7])
    # diff_lag returns a positionally indexed frame, so the input index is put back on it
    lagged_diffs.index = df.index

    summary = calc_diff_lags_stats(lagged_diffs, [np.nanmean, np.nanstd])
    return pd.concat([lagged_diffs, summary], axis=1)

In [27]:
def safe_features_concat(features_dfs):
    """Concatenate feature frames column-wise, but only if their indexes match.

    Parameters
    ----------
    features_dfs : sequence of pandas objects
        Frames/series to join side by side.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If any index differs from the first one.
    """
    # use the parameter itself; the body previously read an undefined global name
    if not check_same_idxs(features_dfs):
        raise ValueError("dfs have different indexes, cant concatenate")
    return pd.concat(features_dfs, axis=1)

In [28]:
def merge_features_with_coords(features, coords):
    """Left-join ``coords`` onto ``features`` by "id".

    Parameters
    ----------
    features : pandas.DataFrame
        Frame indexed by ("id", "date").
    coords : pandas.DataFrame
        Frame with an "id" column plus the columns to attach.

    Returns
    -------
    pandas.DataFrame
        Joined frame, indexed by ("id", "date") again.
    """
    return (
        features.reset_index()
        .merge(coords, on="id", how="left")
        .set_index(["id", "date"])
    )

In [29]:
def water_levels_features(water_levels):
    """Build every water-level based feature for all ids and dates.

    The input is first reindexed by ``fill_missing_dates``. Rolling
    statistics, plain lags, difference features, same-calendar-day statistics
    and the day of year are then computed and concatenated column-wise,
    together with the reindexed input itself.

    Parameters
    ----------
    water_levels : pandas.DataFrame
        Frame indexed by ("id", "date") with a "max_level" column.

    Returns
    -------
    pandas.DataFrame
        All features side by side.

    Raises
    ------
    ValueError
        From ``safe_features_concat`` when the parts do not share one index.
    """
    water_levels = fill_missing_dates(water_levels)

    # each entry is (aggregation, lag, window size) for ``stat``
    stat_config = [
        [np.nanmean, 1, 7],
        [np.nanmean, 1, 30],
        [np.nanstd, 1, 30],

        [np.nanmax, 1, 7],
        [np.nanmin, 1, 7],

        [np.nanmax, 1, 30],
        [np.nanmin, 1, 30],
    ]
    stats = extract_features(water_levels, stat, stat_config)
    lags = extract_features(water_levels, lag, np.arange(1, 8))

    diff_features = extract_diff_features(water_levels)
    doy_stats = past_years_stats(
        water_levels,
        [mean_last_5_years, std_last_5_years, np.nanmean, np.nanstd, lag1, lag2],
    )
    # day of year comes from the reindexed input; ``all_features`` does not exist
    # yet at this point, so reading it here raised UnboundLocalError
    doy = df_doy(water_levels)

    all_features = safe_features_concat([lags, stats, diff_features, doy_stats, water_levels, doy])

    return all_features

In [30]:
def hydro_feature_extraction(water_levels, hydro_coords):
    """Full pipeline: water-level features joined with the post coordinates.

    Parameters
    ----------
    water_levels : pandas.DataFrame
        Input passed to ``water_levels_features``.
    hydro_coords : pandas.DataFrame
        Coordinates table merged onto the features by "id".

    Returns
    -------
    pandas.DataFrame
        Result of ``merge_features_with_coords``.
    """
    level_features = water_levels_features(water_levels)
    return merge_features_with_coords(level_features, hydro_coords)

In [None]:
# Run the whole feature pipeline on the loaded data ("start extracting"/"end!" is printed per rolling statistic).
all_features = hydro_feature_extraction(water_levels, hydro_coords)

start extracting
end!
start extracting
end!
start extracting
end!
start extracting


In [None]:
# Persist the features next to the input data; a commented-out cell above reads this file back.
all_features.to_csv(os.path.join(data_dir, "hydro_features.csv"))

In [None]:
# def water_levels_features_latest_date(water_levels):
#     """Extracting features only for last date in water_levels,
#     used for step predict."""
#     water_levels = fill_missing_dates(water_levels)
    
#     stat_config = [
#     [np.nanmean, 1, 7],
#     [np.nanmean, 1, 30],
#     [np.nanstd, 1, 30],
    
#     [np.nanmax, 1, 7],
#     [np.nanmin, 1, 7],
    
#     [np.nanmax, 1, 30],
#     [np.nanmin, 1, 30]
#     ]
#     water_levels_last_31_days = n_latest_dates(water_levels, 31)
#     stats = extract_features(water_levels_last_31_days, stat, stat_config)
    
#     water_levels_last_9_days = n_latest_dates(water_levels, 9)
#     diff_features = extract_diff_features(water_levels_last_9_days)
#     lags = extract_features(water_levels, lag, np.arange(1, 8))
    
#     doy_stats = past_years_stats(water_levels, [mean_last_5_years, std_last_5_years, np.nanmean, np.nanstd, lag1, lag2])
#     doy = df_doy(all_features)
    
#     all_features = safe_features_concat([lags, stats, diff_features, doy_stats, water_levels, doy])
#     all_features = keep_latest_date(all_features)
#     return all_features
                                 
    

# def n_latest_dates(df, n):
#     dates = df.reset_index()["date"]
#     last_n_uniq_dates = sorted(dates.unique())[-n:]
    
#     mask_last_n_dates = dates.isin(last_n_uniq_dates).values
    
#     return df[mask_last_n_dates]

# def keep_latest_date(df):
#     dates = df.reset_index()["date"]
#     latest_date = dates.max()
    
#     mask_latest_date = dates.values == latest_date
    
#     return df[mask_latest_date]

In [None]:
# def last_day_hydro_feature_extraction(water_levels, hydro_coords):
#     all_features = water_levels_features_latest_date(water_levels)
#     all_features = merge_features_with_coords(all_features, hydro_coords)
    
#     return all_features

In [None]:
# last_day = last_day_hydro_feature_extraction(water_levels, hydro_coords)

#### проверяем, что всё извлеклось так, как мы хотели

In [15]:
# st_example = stats.loc[5019]
# st_example.iloc[:10] #first values
# st_example[st_example.index.year == 1990].iloc[:10] # values from n year

# st_example = water_levels.loc[5019]
# st_example.iloc[:10] #first values
# st_example[st_example.index.year == 1990].iloc[:10] # values from second year
# st_example[(st_example.index.month == 1) & (st_example.index.day == 1)].iloc[:10]