In [1]:
import sys
sys.path.append("..")

In [2]:
import features

import numpy as np
import pandas as pd
from datetime import datetime, timedelta

from importlib import reload

In [3]:
# Input locations, all relative to the notebook's working directory.
data_dir = "../../datasets/"
s2m_path = "../../working_data/handmade_s2m.csv"

# Both of these live under data_dir; meteo_path points at a directory (trailing slash).
hydro_path = f"{data_dir}hydro_2018-2020/new_data_all.csv"
meteo_path = f"{data_dir}meteo_new/"

In [4]:
# Date window later handed to the forecast meteo loader; both bounds are parsed
# from strings in test_date_fmt and carry a midnight time component.
test_date_fmt = "%Y-%m-%d"
test_start_date = datetime.strptime("2020-11-23", test_date_fmt)
test_end_date = datetime.strptime("2020-12-03", test_date_fmt)

### hydro manager

In [5]:
def lag_from_previous(series, lag):
    """Return the value located `lag` positions back from the end of `series`.

    `lag=1` is the last element, `lag=2` the one before it, and so on.

    Parameters
    ----------
    series : pandas.Series or array-like
        Ordered values; the most recent one is last.
    lag : int
        How far back from the end to look. Must be >= 1.

    Returns
    -------
    The selected element, or ``np.nan`` when `series` holds fewer than `lag`
    values (there is no such observation, so it is reported as missing).

    Raises
    ------
    ValueError
        If `lag` is smaller than 1. Indexing with ``-0`` would otherwise
        silently return the *first* element, and a negative lag would index
        from the front.
    """
    if lag < 1:
        raise ValueError(f"lag must be a positive integer, got {lag!r}")

    # np.asarray accepts both a Series and a plain ndarray/list, so the helper
    # works regardless of how the caller hands over the window.
    values = np.asarray(series)
    if lag > len(values):
        return np.nan

    return values[-lag]


def lag1(series):
    """Return the last element of `series` (a lag of one position from the end)."""
    return lag_from_previous(series, lag=1)


def lag2(series):
    """Return the second-to-last element of `series` (a lag of two positions from the end)."""
    return lag_from_previous(series, lag=2)


def func_for_n_last(series, func, n_last):
    """Apply `func` to the trailing `n_last` entries of `series`.

    Parameters
    ----------
    series : pandas.Series
        Ordered values; positional slicing via ``.iloc`` is used.
    func : callable
        Aggregation called with the trailing slice (still a pandas object).
    n_last : int
        Number of trailing entries to keep. Must be >= 1. If the series is
        shorter than `n_last`, the whole series is passed to `func`.

    Returns
    -------
    Whatever `func` returns for the selected slice.

    Raises
    ------
    ValueError
        If `n_last` is smaller than 1. ``series.iloc[-0:]`` selects the
        *entire* series and a negative value would drop leading rows, so both
        would silently aggregate the wrong window.
    """
    if n_last < 1:
        raise ValueError(f"n_last must be a positive integer, got {n_last!r}")

    return func(series.iloc[-n_last:])


def mean_last_5_years(series):
    """NaN-ignoring mean of the last five entries of `series`.

    The name suggests each entry corresponds to one year -- presumably the
    caller groups values by day of year; confirm against the extractor.
    """
    return func_for_n_last(series, func=np.nanmean, n_last=5)


def std_last_5_years(series):
    """NaN-ignoring standard deviation of the last five entries of `series`.

    The name suggests each entry corresponds to one year -- presumably the
    caller groups values by day of year; confirm against the extractor.
    """
    return func_for_n_last(series, func=np.nanstd, n_last=5)

# Aggregations passed to the hydro extractor under the "past_years_funcs" option.
doy_funcs = [
    lag1,
    lag2,
    mean_last_5_years,
    std_last_5_years,
]

In [6]:
# Entries for the "levels_stat_config" option. Each row is [aggregation, 1, N];
# the two numbers are presumably (lag, window size), mirroring the
# func/lag/winsize dicts used for the meteo config -- TODO confirm against
# features.hydro.Extractor.
stat_params = [
    [np.nanmean, 1, 7],
    [np.nanmean, 1, 30],
    [np.nanstd, 1, 30],
    [np.nanmax, 1, 7],
    [np.nanmin, 1, 7],
    [np.nanmax, 1, 30],
    [np.nanmin, 1, 30],
]

# Options consumed by the hydro feature extractor.
hydro_extract_config = dict(
    lags=np.arange(1, 8),  # 1..7 inclusive
    diff_lags=[7],
    diff_funcs=[np.nanmean, np.nanstd],
    levels_stat_config=stat_params,
    past_years_funcs=doy_funcs,
)

hydro_extractor = features.hydro.Extractor(hydro_extract_config)

In [7]:
# Per-feature-group day counts handed to features.hydro.ExtractManager.
# Presumably the amount of history (in days) each group needs -- TODO confirm.
days_usage_config = dict(
    lags=9,
    diff=9,
    levels_stat=32,
    doy=5 * 365 + 1,  # five 365-day years plus one day
)

In [8]:
# Loader reads hydro data from the CSV at hydro_path; the extract manager pairs
# the configured extractor with the per-group day counts in days_usage_config.
hydro_loader = features.hydro.FileHydroLoader(hydro_path)
hydro_extract_manager = features.hydro.ExtractManager(hydro_extractor, days_usage_config)

In [9]:
# Bundles the hydro loader and extract manager; this object is later passed to
# features.StationFeatureManager as the hydro side of the pipeline.
hydro_manager = features.hydro.HydroManager(hydro_loader, hydro_extract_manager)

### meteo manager

In [10]:
# Loader for the meteo files under the meteo_path directory.
meteo_dir_loader = features.meteo.DirMeteoLoader(meteo_path)

In [11]:
# (forecast variable name, source variable name) pairs, one row per variable.
# Column 0 is exposed as "forecast" and column 1 as "src" in varnames_table.
name_forecast_src = np.array([
    ("Total_cloud_cover_entire_atmosphere_Mixed_intervals_Average", "cloudCoverTotal"),
    ("u-component_of_wind_height_above_ground", "windAngleX"),
    ("v-component_of_wind_height_above_ground", "windAngleY"),
    ("Wind_speed_gust_surface", "windSpeed"),
    ("Total_precipitation_surface_Mixed_intervals_Accumulation", "totalAccumulatedPrecipitation"),
    ("Temperature_height_above_ground", "airTemperature"),
    ("Maximum_temperature_height_above_ground_Mixed_intervals_Maximum", "maximumTemperatureOverPeriodSpecified"),
    ("Minimum_temperature_height_above_ground_Mixed_intervals_Minimum", "minimumTemperatureAtHeightAndOverPeriodSpecified"),
    ("Temperature_surface", "soilTemperature"),
    ("Relative_humidity_height_above_ground", "relativeHumidity"),
    ("Pressure_height_above_ground", "pressure"),
    ("Pressure_reduced_to_MSL_msl", "pressureReducedToMeanSeaLevel"),
    ("Dewpoint_temperature_height_above_ground", "dewpointTemperature"),
])

# Source-side names only (second column), in table order.
retrieved_vars = [src_name for src_name in name_forecast_src[:, 1]]
varnames_table = pd.DataFrame(data=name_forecast_src, columns=["forecast", "src"])

In [12]:
# Helper objects for the forecast side of the meteo pipeline.
forecast_parser = features.meteo.ForecastParser()
forecast_preprocessor = features.meteo.ForecastMeteoPreprocessor()
# NOTE(review): the ".pkl" extension suggests CoordsBuilder unpickles this file;
# if so, only point it at trusted data -- confirm how the path is consumed.
coords_builder = features.CoordsBuilder(data_dir + "processed_data/asunp.pkl")

In [13]:
# Forecast loader restricted to the test date window and to the variables
# listed in retrieved_vars; varnames_table supplies the forecast/src name pairs.
meteo_forecast_loader = features.meteo.ForecastMeteoLoader(
    test_start_date,
    test_end_date,
    coords_builder,
    retrieved_vars,
    varnames_table,
    forecast_parser,
)

In [14]:
meteo_extract_config_builder = features.meteo.ExtrConfigBuilder()

# Statistics (function, lag, window size) requested for the general meteo columns.
ordinal_extr_stats = [
    dict(func=np.nanmean, lag=1, winsize=7),
    dict(func=np.nanmean, lag=7, winsize=30),
    dict(func=np.nanstd, lag=7, winsize=30),
]
ordinal_cols = [
    "cloudCoverTotal",
    "windSpeed",
    "totalAccumulatedPrecipitation",
    "soilTemperature",
    "airTemperature",
    "dewpointTemperature",
    "pressure",
    "pressureReducedToMeanSeaLevel",
    "windAngleX",
    "windAngleY",
]

# NOTE: despite the "min_" prefix, this group holds both the minimum- and the
# maximum-temperature columns; they only get the 7-wide mean.
min_extr_stats = [dict(func=np.nanmean, lag=1, winsize=7)]
min_cols = [
    "minimumTemperatureAtHeightAndOverPeriodSpecified",
    "maximumTemperatureOverPeriodSpecified",
]

# Register each (stats, columns) group, then reuse the 7-wide mean as diff parameters.
meteo_extract_config_builder.update_with_config_and_cols(ordinal_extr_stats, ordinal_cols)
meteo_extract_config_builder.update_with_config_and_cols(min_extr_stats, min_cols)
meteo_extract_config_builder.set_diff_params(min_extr_stats)

In [15]:
# The manager receives the config *builder* itself (not a built config object).
meteo_extract_manager = features.meteo.MeteoExtractManager(meteo_extract_config_builder)

In [16]:
# Building blocks of the preprocessor used for directory-loaded meteo data.
dir_dropper = features.meteo.DirMeteoDropper()
dir_dt_builder = features.meteo.DtBuilder()

# Columns handed to DirMeteoPreprocessor as its third argument
# (presumably the columns that get differenced -- TODO confirm).
diff_cols = [
    "cloudCoverTotal",
    "windSpeed",
    "totalAccumulatedPrecipitation",
    "soilTemperature",
    "airTemperature",
    "relativeHumidity",
    "pressureReducedToMeanSeaLevel",
    "windAngleX",
    "windAngleY",
]

dir_preprocessor = features.meteo.DirMeteoPreprocessor(dir_dropper, dir_dt_builder, diff_cols)

# `forecast_preprocessor` is intentionally not re-created here: it is already
# instantiated in the earlier cell next to `forecast_parser`, and a second
# identical construction only shadowed that object.

In [17]:
# Wires together both meteo loaders, both preprocessors and the extract manager;
# this object is later passed to features.StationFeatureManager.
meteo_manager = features.meteo.MeteoManager(
    meteo_dir_loader,
    meteo_forecast_loader,
    dir_preprocessor,
    forecast_preprocessor,
    meteo_extract_manager,
)

### station manager

In [18]:
class DataBuilder:
    """Combine hydro-station features with the features of each station's meteo station.

    `s2m_dict` maps a hydro station id (the "id" index level of the hydro
    frame) to the meteo station number it should be joined with (matched
    against "stationNumber" on the meteo side).
    """

    # TODO: add accessor helpers for `date` and `id` -- they currently live
    # sometimes in columns and sometimes in the index, which causes sporadic errors.

    def __init__(self, s2m_dict):
        """Store the hydro-id -> meteo-station mapping used when merging."""
        self.s2m_dict = s2m_dict

    def build(self, hydro, meteo):
        """Normalise both frames, align their dates, merge them and split off the target.

        Returns
        -------
        (features, target) : tuple
            All merged columns except "target", and the "target" column.
        """
        hydro = self.prepare_df(hydro)
        meteo = self.prepare_df(meteo)

        hydro, meteo = self.fill_missing_dates(hydro, meteo)
        merged = self.merge_parts(hydro, meteo)

        return self.extract_merged_x_y(merged)

    def prepare_df(self, df):
        """Return a copy of `df` indexed by its first two columns with "date" parsed.

        The index is reset first, so index levels become leading columns; the
        first two columns after that are expected to be the station id and
        "date" (see the class-level TODO). The result is passed through
        `features.utils.reduce_memory_usage`.
        """
        df = df.reset_index()
        df["date"] = pd.to_datetime(df["date"])

        index_cols = list(df.columns[:2])
        df = df.set_index(index_cols)

        return features.utils.reduce_memory_usage(df)

    def fill_missing_dates(self, hydro, meteo):
        """Reindex both frames onto every (station, day) pair of the common date span.

        The span runs from the earliest to the latest date found in either
        frame; rows that did not exist before are filled with NaN.
        """
        min_date, max_date = self.min_max_data_date(hydro, meteo)

        full_hydro_index = self.create_all_dates_index(hydro, min_date, max_date)
        full_meteo_index = self.create_all_dates_index(meteo, min_date, max_date)

        hydro = hydro.reindex(full_hydro_index, fill_value=np.nan)
        meteo = meteo.reindex(full_meteo_index, fill_value=np.nan)

        return hydro, meteo

    def min_max_data_date(self, hydro, meteo):
        """Return the earliest and latest "date" index value across both frames."""
        dates_hydro = hydro.index.get_level_values("date")
        dates_meteo = meteo.index.get_level_values("date")

        min_date = min(dates_hydro.min(), dates_meteo.min())
        max_date = max(dates_hydro.max(), dates_meteo.max())

        return min_date, max_date

    def create_all_dates_index(self, df, min_date, max_date):
        """Build a MultiIndex of every level-0 id of `df` crossed with each day in the range."""
        station_ids = df.index.get_level_values(0).unique()
        all_days = pd.date_range(min_date, max_date, name="date")

        return pd.MultiIndex.from_product([station_ids, all_days])

    def merge_parts(self, hydro, meteo):
        """Left-join meteo rows onto hydro rows by mapped meteo station and date.

        The result is indexed by ("id", "date"); hydro rows without a matching
        meteo row are kept.
        """
        # Must be computed before reset_index(): the mapping reads the "id" index level.
        nearest_meteo_id = self.hydro_to_meteo_map_col(hydro)

        hydro = hydro.reset_index()

        merged = hydro.merge(meteo, left_on=[nearest_meteo_id, "date"], right_on=["stationNumber", "date"], how="left")

        return merged.set_index(["id", "date"])

    def hydro_to_meteo_map_col(self, hydro):
        """Map each "id" index value of `hydro` through `s2m_dict`, row by row."""
        hydro_id = hydro.index.get_level_values("id")

        return hydro_id.map(self.s2m_dict)

    def extract_merged_x_y(self, merged):
        """Split `merged` into (all columns except "target", the "target" column)."""
        feature_cols = list(merged.columns)
        feature_cols.remove("target")

        # Named feature_frame so it does not shadow the imported `features` module.
        feature_frame = merged[feature_cols]
        target = merged["target"]

        return feature_frame, target

In [19]:
# Hydro-station id (first CSV column, used as the index) -> value of the
# "meteo_id" column, as a plain dict.
s2m_table = pd.read_csv(s2m_path, index_col=0)
s2m_dict = s2m_table["meteo_id"].to_dict()

data_builder = DataBuilder(s2m_dict)

### getting features

In [20]:
# Top-level entry point: combines the hydro manager, the meteo manager and the
# DataBuilder that merges their outputs.
station_features_manager = features.StationFeatureManager(hydro_manager, meteo_manager, data_builder)

In [None]:
# NOTE(review): this cell has no execution count and its saved output stops
# mid-run, so it was presumably interrupted -- `train` may not exist on a fresh run.
train = station_features_manager.get_whole_past()

                 target
id   date              
5001 1984-01-01   258.0
     1984-01-02   255.0
     1984-01-03   252.0
     1984-01-04   248.0
     1984-01-05   244.0
...                 ...
6574 2018-12-27    21.0
     2018-12-28    21.0
     2018-12-29    21.0
     2018-12-30    21.0
     2018-12-31    21.0

[2231619 rows x 1 columns]
start hydro extraction
water_levels extract                  target
id   date              
5001 1984-01-01   258.0
     1984-01-02   255.0
     1984-01-03   252.0
     1984-01-04   248.0
     1984-01-05   244.0
...                 ...
6574 2020-09-27     NaN
     2020-09-28     NaN
     2020-09-29     NaN
     2020-09-30     NaN
     2020-10-01     NaN

[2657952 rows x 1 columns]
filled missing dates
                 target
id   date              
5001 1984-01-01   258.0
extracting levels stats
[[<function nanmean at 0x7ff1980fa0d0>, 1, 7], [<function nanmean at 0x7ff1980fa0d0>, 1, 30], [<function nanstd at 0x7ff1980faca0>, 1, 30], [<function nanmax a

  results[i] = self.f(v)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


agg func: lag1
agg func: lag2
agg func: mean_last_5_years


  return func(n_last_series)


agg func: std_last_5_years


  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  col[col == 12] = 9.5  # согласно README, это "10" с просветами
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  col[col == 11] = 0.05  # следы облаков
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting


In [21]:
# Build only the meteo features and write them to a CSV in the current working
# directory. NOTE(review): judging by the saved output this repeats the meteo
# extraction already triggered by get_whole_past() -- confirm before re-running.
meteo = station_features_manager.meteo_manager.make_past_features()
meteo.to_csv("meteo_features.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  col[col == 12] = 9.5  # согласно README, это "10" с просветами
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  col[col == 11] = 0.05  # следы облаков
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  col[col == 13] = np.nan  # облака невозможн

start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!
start extracting
end!


In [None]:
# meteo = pd.read_csv("meteo_features.csv")
# hydro = pd.read_csv("hydro_features.csv")
# meteo.set_index(["stationNumber", "date"], inplace=True)
# hydro.set_index(["id", "date"], inplace=True)

# s2m_dict = pd.read_csv(s2m_path, index_col=0).to_dict()["meteo_id"]
# data_builder = DataBuilder(s2m_dict)

# train = data_builder.build(hydro, meteo)
# train

# past_features, past_target = train
# past_features.to_csv("past_features.csv")
# past_target.to_csv("past_target.csv")