In [1]:
# TODO:
# возможные глоб. фичи:
# groupby: year, month, id (в различных коминациях)
# функции: mean, std, mean_diff

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
import utils
import os
from collections.abc import Iterable

In [4]:
root_dir = "../"
data_dir = os.path.join(root_dir, "working_data/")

water_levels_path = os.path.join(data_dir, "water_levels.csv")
corrs_dists_path = os.path.join(data_dir, "corrs_and_dists.csv")

In [5]:
water_levels = pd.read_csv(water_levels_path)
water_levels["date"] = pd.to_datetime(water_levels["date"], format="%Y-%m-%d")
water_levels.set_index(["id", "date"], inplace=True)
water_levels = utils.reduce_memory_usage(water_levels)

water_levels.head()

corr_and_nearest = pd.read_csv(corrs_dists_path)
corr_and_nearest.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,max_level
id,date,Unnamed: 2_level_1
5001,1984-01-01,258.0
5001,1984-01-02,255.0
5001,1984-01-03,252.0
5001,1984-01-04,248.0
5001,1984-01-05,244.0


Unnamed: 0,post,best_corr_post,best_corr_shift,nearest_post,nearest_shift,best_corr_dist,best_corr_value,nearest_dist,nearest_corr
0,5001,5002.0,0.0,5036.0,-1,0.563987,0.955423,0.277389,0.483945
1,5002,6030.0,-1.0,5036.0,-1,1.444206,0.97028,0.317767,0.494123
2,5004,5008.0,0.0,5675.0,-1,2.105232,0.969865,0.20025,-0.115904
3,5008,5004.0,-2.0,5358.0,-1,2.105232,0.978104,0.254951,0.793771
4,5009,5012.0,0.0,5008.0,-1,0.407063,0.946303,0.269072,0.891186


In [6]:
def fill_missing_dates(water_levels, fill_val=np.nan):
    dates = water_levels.index.get_level_values(1)
    min_date, max_date = dates.min(), dates.max()
    
    new_index = pd.MultiIndex.from_product([water_levels.index.get_level_values(0).unique(), 
                                            pd.date_range(min_date, max_date, name="date")])
    water_levels = water_levels.reindex(new_index, fill_value=fill_val)
    return water_levels

water_levels = fill_missing_dates(water_levels)
water_levels.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,max_level
id,date,Unnamed: 2_level_1
5001,1984-01-01,258.0
5001,1984-01-02,255.0
5001,1984-01-03,252.0
5001,1984-01-04,248.0
5001,1984-01-05,244.0


In [6]:
def extract_features(df, func, func_args_list):
    id_groups = df.groupby(by="id")
    features = []
    
    for func_args in func_args_list:
        if not isinstance(func_args, Iterable):
            func_args = (func_args,)
        func_out = func(id_groups, *func_args)
            
        features.append(func_out)
        
    features_df = pd.concat(features, axis=1)
    return features_df


def lag(grouped, lag):
    feature = grouped.shift(lag)
    
    feature_name = f"lag_{lag}"
    feature = rename_first_col(feature, feature_name)
    
    return feature


def stat(grouped, func, lag, winsize):
    shifted_grouped = grouped.shift(lag).groupby("id")
    
    print("start extracting")
    feature = grouped.rolling(winsize, min_periods=1).agg(func)
    print("end!")
    
    feature = drop_redundant_agg_indexes(feature)
    
    feature_name = f"{func.__name__}_{lag}_{winsize}"
    feature = rename_first_col(feature, feature_name)
    
    return feature


def drop_redundant_agg_indexes(df):
    df.index = df.index.droplevel(0) # agg creates second id col
    return df


def rename_first_col(df, new_name):
    old_name = df.columns[0]
    new_name = f"{old_name}_" + new_name
    
    return df.rename(columns={old_name: new_name})

# def mean_previous_years(df):
#     feature = 

    
# def last_year_records(df)


In [7]:
def diff_lag(grouped, ndays):
    differences = grouped.diff(1).groupby("id") #group again to roll by groups not whole df
    
    feature_names = [f"diff_{i}" for i in range(1, ndays+1)]
    diff_lags = differences.rolling(ndays, min_periods=1) # need shift to get only previous values without current one
    
    diff_lags = all_rolling_lags(diff_lags, ndays)
    
    diff_lags_df = pd.DataFrame(diff_lags, columns=feature_names)
    return diff_lags_df
        
    
def all_rolling_lags(roll, ndays):
    vals = []
    for window in enumerate(roll):
        window_vals_flipped = window[1].values[:, 0]
        # window is took from earlier days to later, so we flip it to lag format (1 col is lag 1 day ago etc)
        window_vals = np.flip(window_vals_flipped)
        
        win_len = window_vals.shape[0]
        
        if win_len < ndays:
            padding_array = np.full(ndays - win_len, np.nan)
            padded_window_values = np.concatenate((window_vals, padding_array))
            
        else:
            padded_window_values = window_vals
            
        vals.append(padded_window_values)
    return np.array(vals)

diff_lags = extract_features(water_levels, diff_lag, [7])
diff_lags.index = water_levels.index
diff_lags.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,diff_1,diff_2,diff_3,diff_4,diff_5,diff_6,diff_7
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5001,1984-01-01,,,,,,,
5001,1984-01-02,-3.0,,,,,,
5001,1984-01-03,-3.0,-3.0,,,,,


In [8]:
def calc_diff_lags_stats(diff_lags, funcs):
    
    features = []
    for func in funcs:
        print(func.__name__)
        f_name = f"diff_{func.__name__}"
        feature = diff_lags.apply(func, axis=1)
        feature.name = f_name
        features.append(feature)
    
    features_df = pd.concat(features, axis=1)
    return features_df

In [9]:
diff_lags_stats = calc_diff_lags_stats(diff_lags, [np.nanmean, np.nanstd])
diff_lags_stats.head()

nanmean


  results[i] = self.f(v)


nanstd


  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


Unnamed: 0_level_0,Unnamed: 1_level_0,diff_nanmean,diff_nanstd
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1
5001,1984-01-01,,
5001,1984-01-02,-3.0,0.0
5001,1984-01-03,-3.0,0.0
5001,1984-01-04,-3.333333,0.471405
5001,1984-01-05,-3.5,0.5


In [10]:
lags = extract_features(water_levels, lag, np.arange(1, 8))
lags.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,max_level_lag_1,max_level_lag_2,max_level_lag_3,max_level_lag_4,max_level_lag_5,max_level_lag_6,max_level_lag_7
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5001,1984-01-01,,,,,,,
5001,1984-01-02,258.0,,,,,,
5001,1984-01-03,255.0,258.0,,,,,
5001,1984-01-04,252.0,255.0,258.0,,,,
5001,1984-01-05,248.0,252.0,255.0,258.0,,,


In [11]:
stat_config = [
    [np.nanmean, 1, 7],
    [np.nanmean, 1, 30],
    [np.nanstd, 1, 30],
    
    [np.nanmax, 1, 7],
    [np.nanmin, 1, 7],
    
    [np.nanmax, 1, 30],
    [np.nanmin, 1, 30]
]

stats = extract_features(water_levels, stat, stat_config)
stats.head()

start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!


Unnamed: 0_level_0,Unnamed: 1_level_0,max_level_nanmean_1_7,max_level_nanmean_1_30,max_level_nanstd_1_30,max_level_nanmax_1_7,max_level_nanmin_1_7,max_level_nanmax_1_30,max_level_nanmin_1_30
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5001,1984-01-01,258.0,258.0,,258.0,258.0,258.0,258.0
5001,1984-01-02,256.5,256.5,2.12132,258.0,255.0,258.0,255.0
5001,1984-01-03,255.0,255.0,3.0,258.0,252.0,258.0,252.0
5001,1984-01-04,253.25,253.25,4.272002,258.0,248.0,258.0,248.0
5001,1984-01-05,251.4,251.4,5.549775,258.0,244.0,258.0,244.0


In [12]:
def past_years_stats(df, funcs):
    months = df.index.get_level_values("date").month
    days = df.index.get_level_values("date").day
    station = df.index.get_level_values("id")
    
    same_doy = df["max_level"].groupby([months, days, station])
    same_doy_previous = same_doy.shift(1).groupby([months, days, station])
    
    features = []
    for func in funcs:
        print(f"agg func: {func.__name__}")
    #agg from previous because we have no water level of x year's doy when calc features for this year's doy
        feature = same_doy_previous.expanding(min_periods=1).agg(func) 
        f_name = f"doy_{func.__name__}"
#         feature = rename_first_col(feature, f_name)
        feature.name = f_name
        features.append(feature)
        
    features_df = pd.concat(features, axis=1)
    features_df = features_df.droplevel([0, 1, 2]).sort_index(level=["id", "date"])
        
    return features_df


In [13]:
def lag_from_previous(series, lag):
    return series.iloc[-lag] # use .values if breaks

def lag1(series):
    return lag_from_previous(series, 1)

def lag2(series):
    return lag_from_previous(series, 2)

def func_for_n_last(series, func, n_last):
#     start_idx = max(len(series) - n_last, 0)
    n_last_series = series.iloc[-n_last:] # use .values if breaks
    return func(n_last_series)

def mean_last_5_years(series):
    return func_for_n_last(series, np.nanmean, 5)

def std_last_5_years(series):
    return func_for_n_last(series, np.nanstd, 5)

doy_stats = past_years_stats(water_levels, [mean_last_5_years, std_last_5_years, np.nanmean, np.nanstd, lag1, lag2])

agg func: mean_last_5_years


  return func(n_last_series)


agg func: std_last_5_years


  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


agg func: nanmean
agg func: nanstd
agg func: lag1
agg func: lag2


In [14]:
# lags.head()
# stats.head()
# doy_stats.head() #also includes lags
# diff_lags.head()
# diff_lags_stats.head()

# check indexes are same
def check_same_idxs(dfs):
    first_df = dfs[0]
    for df in dfs[1:]:
        if not first_df.index.equals(df.index):
            return False
    return True

feature_dfs = (lags, stats, doy_stats, diff_lags, diff_lags_stats)
if check_same_idxs(feature_dfs):
    all_features = pd.concat(feature_dfs, axis=1)
else:
    raise ValueError("dfs have different indexes, cant concatenate")

all_features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,max_level_lag_1,max_level_lag_2,max_level_lag_3,max_level_lag_4,max_level_lag_5,max_level_lag_6,max_level_lag_7,max_level_nanmean_1_7,max_level_nanmean_1_30,max_level_nanstd_1_30,...,doy_lag2,diff_1,diff_2,diff_3,diff_4,diff_5,diff_6,diff_7,diff_nanmean,diff_nanstd
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
5001,1984-01-01,,,,,,,,258.0,258.0,,...,,,,,,,,,,
5001,1984-01-02,258.0,,,,,,,256.5,256.5,2.12132,...,,-3.0,,,,,,,-3.0,0.0
5001,1984-01-03,255.0,258.0,,,,,,255.0,255.0,3.0,...,,-3.0,-3.0,,,,,,-3.0,0.0
5001,1984-01-04,252.0,255.0,258.0,,,,,253.25,253.25,4.272002,...,,-4.0,-3.0,-3.0,,,,,-3.333333,0.471405
5001,1984-01-05,248.0,252.0,255.0,258.0,,,,251.4,251.4,5.549775,...,,-4.0,-4.0,-3.0,-3.0,,,,-3.5,0.5


In [None]:
# разумеется, нужно добавить сами предсказываемые значения

water_levels.rename(columns={"max_level": "target"}, inplace=True)
all_features = all_features.merge(water_levels, left_index=True, right_index=True)
all_features.head()

### add lon lat

In [11]:
# all_features = pd.read_csv(os.path.join(data_dir, "hydro_features.csv"))
# all_features["date"] = pd.to_datetime(all_features["date"])
# all_features.set_index(["id", "date"], inplace=True)
# all_features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,max_level_lag_1,max_level_lag_2,max_level_lag_3,max_level_lag_4,max_level_lag_5,max_level_lag_6,max_level_lag_7,max_level_nanmean_1_7,max_level_nanmean_1_30,max_level_nanstd_1_30,...,diff_1,diff_2,diff_3,diff_4,diff_5,diff_6,diff_7,diff_nanmean,diff_nanstd,target
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
5001,1984-01-01,,,,,,,,258.0,258.0,,...,,,,,,,,,,258.0
5001,1984-01-02,258.0,,,,,,,256.5,256.5,2.12132,...,-3.0,,,,,,,-3.0,0.0,255.0
5001,1984-01-03,255.0,258.0,,,,,,255.0,255.0,3.0,...,-3.0,-3.0,,,,,,-3.0,0.0,252.0
5001,1984-01-04,252.0,255.0,258.0,,,,,253.25,253.25,4.272002,...,-4.0,-3.0,-3.0,,,,,-3.333333,0.471405,248.0
5001,1984-01-05,248.0,252.0,255.0,258.0,,,,251.4,251.4,5.549775,...,-4.0,-4.0,-3.0,-3.0,,,,-3.5,0.5,244.0


In [12]:
hydro_coords = pd.read_csv(data_dir + "hydro_posts_coords.csv")
hydro_coords.head()

Unnamed: 0,id,latitude,longitude
0,5674.0,48.72,132.8
1,5216.0,44.06,132.01
2,6256.0,52.17,126.58
3,5132.0,44.23,134.27
4,5151.0,43.58,133.04


In [13]:
all_features = all_features.reset_index().merge(hydro_coords, on="id", how="left")
all_features.set_index(["id", "date"], inplace=True)

### extract calendar features 

In [14]:
date = all_features.index.get_level_values("date")
doy = date.dayofyear
all_features["doy"] = doy

In [15]:
all_features.to_csv(os.path.join(data_dir, "hydro_features.csv"))

#### проверяем, что всё извлеклосdata_dir мы хотели

In [15]:
# st_example = stats.loc[5019]
# st_example.iloc[:10] #first values
# st_example[st_example.index.year == 1990].iloc[:10] # values from n year

# st_example = water_levels.loc[5019]
# st_example.iloc[:10] #first values
# st_example[st_example.index.year == 1990].iloc[:10] # values from second year
# st_example[(st_example.index.month == 1) & (st_example.index.day == 1)].iloc[:10]