## 1: Load meteo data, water levels and station coordinates

In [1]:
# Display the value of every top-level expression in a cell, not only the last
# one, so cells that end with several `.head()` calls render each table.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Put the parent directory on the import path; the `utils` and `features`
# imports in the next cell presumably live there — TODO confirm against the
# repository layout.
import sys
sys.path.append("..")

In [3]:
import pandas as pd
# import geopandas as gpd
import numpy as np
import os

import utils
from features.meteo import DirMeteoLoader, DirMeteoPreprocessor, DirMeteoDropper, DtBuilder

from importlib import reload

In [4]:
# All locations are derived from `root_dir`, two levels above the working directory.
root_dir = "../../"
data_dir = os.path.join(root_dir, "datasets/")

# Directories.
processed_data_dir = os.path.join(data_dir, "processed_data/")
meteo_dir = os.path.join(data_dir, "meteo_new/")

# Input files.
posts_path = os.path.join(processed_data_dir, "asunp.pkl")
water_levels_path = os.path.join(data_dir, "hydro_2018-2020/new_data_all.csv")

In [5]:
# Load the meteo table through the project loader pointed at `meteo_dir`.
# NOTE(review): DirMeteoLoader comes from features.meteo; how it discovers and
# combines files is not visible in this notebook — confirm there.
meteo_loader = DirMeteoLoader(meteo_dir)
meteo = meteo_loader.load()
meteo.head(3)

Unnamed: 0,stationNumber,year,month,day,time,localYear,localMonth,localDay,localTimePeriod,timePeriodNum,...,dewpointTemperatureQuality,pressure,pressureQuality,pressureReducedToMeanSeaLevel,pressureReducedToMeanSeaLevelQuality,characteristicOfPressureTendency,characteristicOfPressureTendencyQuality,HourPressureChange3,HourPressureChange3Quality,stationId
52595,30879,1984,1,1,0,1984,1,1,3,4,...,0,951.400024,0,1038.599976,0,2.0,0,0.4,0,5131961
52596,30879,1984,1,1,3,1984,1,1,6,5,...,0,951.700012,0,1037.5,0,2.0,0,0.3,0,5131961
52597,30879,1984,1,1,6,1984,1,1,9,6,...,0,951.200012,0,1035.5,0,8.0,0,0.5,0,5131961


In [5]:
# Water levels are read from a ';'-separated CSV; the station table ("posts")
# is read from a pickle through a project helper.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
water_levels = pd.read_csv(water_levels_path, sep=";")
posts = utils.load_pickle(posts_path)

# Project helper; presumably downcasts column dtypes to save memory — TODO
# confirm in utils.
water_levels = utils.reduce_memory_usage(water_levels)
posts = utils.reduce_memory_usage(posts)

# Both previews are rendered because ast_node_interactivity is set to "all".
water_levels.head(3)
posts.head(3)

Unnamed: 0,time,max_level,identifier
0,2020-10-01 00:00:00,232.0,5116
1,1986-06-20 00:00:00,278.0,5292
2,1986-06-22 00:00:00,67.0,6022


Unnamed: 0,foId,pgid,pagr,paer,codBasin,station_id,kod1,kod2,kod3,agro,...,cgms,pn,stStatus,rv,pmet,codeSubject,ter,parentNpSn,kto,geometry
0,8.0,2.0,0.0,0.0,177,,639,445,0,,...,999,100.0,open,0,0.0,14,98,5043.0,53,POINT (141.36667 63.33333)
1,8.0,4.0,0.0,0.0,177,,639,151,0,,...,999,20.0,open,0,0.0,14,98,5049.0,53,POINT (144.30000 64.44667)
2,8.0,2.0,0.0,0.0,200,2224.0,619,0,0,,...,999,28.0,open,0,0.0,41,30,5333.0,53,POINT (160.83000 56.81000)


## 2: Build `water_levels` indexed by (id, date)

In [12]:
# Scratch check that the "time" strings parse as year-first datetimes; the
# result is displayed only and not stored.
# NOTE(review): the saved output below is int64 (it looks like day-of-month
# values), not datetime64, so it was not produced by this exact expression —
# re-run or drop this cell.
pd.to_datetime(water_levels["time"], yearfirst=True)

0           1
1          20
2          22
3          23
4          22
           ..
2231614    15
2231615    17
2231616    17
2231617    14
2231618    15
Name: time, Length: 2231619, dtype: int64

In [13]:
# Keep only the calendar date of each observation (the time of day is dropped).
water_datetime_date = pd.to_datetime(water_levels["time"], yearfirst=True).dt.date

# Build the (id, date)-indexed frame in one chain instead of a sequence of
# `inplace=True` mutations: the steps read top-down and `water_levels` is only
# rebound once every step has succeeded, so a failure midway does not leave a
# half-transformed frame behind.
water_levels = (
    water_levels
    .assign(time=water_datetime_date)
    .rename(columns={"time": "date", "identifier": "id"})
    .sort_values(by=["id", "date"])
    .set_index(["id", "date"])
)

water_levels.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,max_level
id,date,Unnamed: 2_level_1
5001,1984-01-01,258.0
5001,1984-01-02,255.0
5001,1984-01-03,252.0
5001,1984-01-04,248.0
5001,1984-01-05,244.0


## 3: get coords of hydrostations

In [62]:
def object_cols2num(df):
    """Cast object-dtype columns of ``df`` to float32 wherever possible.

    Every column whose dtype is ``object`` is passed through
    ``astype(np.float32, errors="ignore")``; a column that cannot be cast is
    left as it was. ``df`` is modified in place and is also returned.
    """
    candidates = [name for name, dtype in df.dtypes.items() if dtype == "O"]
    for name in candidates:
        converted = df[name].astype(np.float32, errors="ignore")
        df[name] = converted
    return df

# Convert whichever object columns of the station table can be cast to numbers.
posts = object_cols2num(posts)

In [63]:
def coords_by_col_val(post_df, id_colname, needed_values):
    """Return the id and coordinates of rows whose id is in ``needed_values``.

    Rows of ``post_df`` are kept when their ``id_colname`` value is contained
    in ``needed_values``. The result has a fresh 0..n-1 index and exactly the
    columns ``[id_colname, "lon", "lat"]``.
    """
    keep = post_df[id_colname].isin(needed_values).values
    matched = post_df.loc[keep].reset_index(drop=True)
    return matched[[id_colname, "lon", "lat"]]

In [66]:
# Station ids that actually have water-level records. They are read straight
# from the "id" index level, which avoids copying the whole frame with
# reset_index() just to look at one column. `.to_numpy()` keeps the result a
# plain array, as before.
needed_hydro_ids = water_levels.index.get_level_values("id").unique().to_numpy()

# Coordinates of those stations; "gidro" is the posts column matched against
# the ids.
hydro_coords = coords_by_col_val(posts, "gidro", needed_hydro_ids)
hydro_coords.head(2)

Unnamed: 0,gidro,lon,lat
0,5674.0,132.8,48.72
1,5216.0,132.01,44.06


In [67]:
# One row per station, keyed by "id" so the name matches the "id" level of the
# water_levels index. Written as a chain that rebinds the name rather than
# three `inplace=True` calls, because `hydro_coords` is a column selection
# taken from another frame and should not be mutated in place.
# NOTE(review): the preview shows gidro as float (e.g. 5674.0) while the
# water_levels ids print as integers — confirm that any later join on "id"
# tolerates the dtype difference.
hydro_coords = (
    hydro_coords
    .drop_duplicates(subset="gidro")
    .rename(columns={"gidro": "id"})
    .set_index("id")
)

## 4: meteo processing

In [6]:
# Another look at the raw meteo columns before they are preprocessed.
meteo.head(2)

Unnamed: 0,stationNumber,year,month,day,time,localYear,localMonth,localDay,localTimePeriod,timePeriodNum,...,dewpointTemperatureQuality,pressure,pressureQuality,pressureReducedToMeanSeaLevel,pressureReducedToMeanSeaLevelQuality,characteristicOfPressureTendency,characteristicOfPressureTendencyQuality,HourPressureChange3,HourPressureChange3Quality,stationId
52595,30879,1984,1,1,0,1984,1,1,3,4,...,0,951.400024,0,1038.599976,0,2.0,0,0.4,0,5131961
52596,30879,1984,1,1,3,1984,1,1,6,5,...,0,951.700012,0,1037.5,0,2.0,0,0.3,0,5131961


In [7]:
# Collaborators that are handed to DirMeteoPreprocessor.
# NOTE(review): both are project classes from features.meteo. Judging by the
# names and the processed preview, DtBuilder presumably builds the "datetime"
# column and DirMeteoDropper drops unused columns — confirm in features/meteo.
dt_builder = DtBuilder()
meteo_dropper = DirMeteoDropper()

In [9]:
# Run the project preprocessing over the raw meteo frame.
# NOTE(review): what preprocess() does is defined in features.meteo and is not
# visible here; the preview shows fewer columns plus "datetime", "windAngleX"
# and "windAngleY" — confirm the exact transformations there.
preprocessor = DirMeteoPreprocessor(meteo, meteo_dropper, dt_builder)
meteo_processed = preprocessor.preprocess()
meteo_processed.head()

Unnamed: 0,stationNumber,cloudCoverTotal,windSpeed,totalAccumulatedPrecipitation,soilTemperature,airTemperature,minimumTemperatureAtHeightAndOverPeriodSpecified,maximumTemperatureOverPeriodSpecified,relativeHumidity,dewpointTemperature,pressure,pressureReducedToMeanSeaLevel,stationId,datetime,windAngleX,windAngleY
52595,30879,0.0,0.0,0.0,,-33.700001,-33.700001,-32.099998,85.0,-35.299999,951.400024,1038.599976,5131961,1984-01-01 09:00:00,0.0,0.0
52596,30879,0.0,0.0,0.0,,-29.700001,-33.700001,-29.700001,85.0,-31.5,951.700012,1037.5,5131961,1984-01-01 12:00:00,0.0,0.0
52597,30879,0.0,0.0,0.0,,-26.0,-29.700001,-25.799999,83.0,-27.700001,951.200012,1035.5,5131961,1984-01-01 15:00:00,0.0,0.0
52598,30879,0.0,0.0,0.0,,-28.1,-28.1,-25.9,85.0,-29.799999,951.099976,1036.099976,5131961,1984-01-01 18:00:00,0.0,0.0
52599,30879,0.0,0.0,0.0,,-31.0,-31.0,-28.1,86.0,-32.799999,951.400024,1037.5,5131961,1984-01-01 21:00:00,0.0,0.0


In [11]:
# Index by (station, timestamp). The name is rebound instead of using
# `inplace=True`, so the object returned by the preprocessor is not mutated.
meteo_processed = meteo_processed.set_index(["stationNumber", "datetime"])

In [15]:
# Calendar date of every observation, in the same row order as meteo_processed
# (its values are used positionally as a grouping key below).
date = meteo_processed.reset_index()["datetime"].dt.date

# Daily mean per station. `.mean()` skips NaN by default, which is what
# `.agg(np.nanmean)` was translated to by pandas; newer pandas versions warn
# that the NumPy callable will stop being mapped to the built-in groupby mean.
# NOTE(review): the result is only displayed, not stored, and "stationId" is
# averaged along with the measurements.
meteo_processed.groupby(["stationNumber", date.values]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,cloudCoverTotal,windSpeed,totalAccumulatedPrecipitation,soilTemperature,airTemperature,minimumTemperatureAtHeightAndOverPeriodSpecified,maximumTemperatureOverPeriodSpecified,relativeHumidity,dewpointTemperature,pressure,pressureReducedToMeanSeaLevel,stationId,windAngleX,windAngleY
stationNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
30673,1984-01-01,6.800,0.200,0.0,-30.799999,-27.380001,-30.120001,-24.639999,79.800003,-29.940001,949.619995,1035.239990,5371981,-3.472964e-02,0.196962
30673,1984-01-02,1.125,0.500,0.0,-26.750000,-27.725000,-30.900000,-24.675001,77.750000,-30.687500,952.962524,1039.012451,5371981,-1.677525e-01,0.117462
30673,1984-01-03,0.750,0.375,0.0,-32.400002,-30.200001,-32.125000,-27.662500,79.750000,-32.662502,955.537476,1042.699951,5371981,1.231009e-01,-0.213217
30673,1984-01-04,0.000,0.250,0.0,-33.000000,-30.087500,-31.862499,-27.637499,79.125000,-32.674999,952.912476,1039.762451,5371981,-5.300305e-02,-0.037113
30673,1984-01-05,2.000,0.125,0.0,-31.000000,-31.512501,-33.087502,-29.375000,78.125000,-34.125000,949.974976,1037.099976,5371981,1.490610e-09,-0.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31931,2020-03-27,7.000,1.875,0.8,0.550000,1.450000,0.900000,3.387500,71.250000,-3.975000,1003.174988,1015.700012,4483311,-2.918216e-01,0.794642
31931,2020-03-28,2.375,3.000,0.0,0.375000,-1.025000,-1.912500,0.125000,49.750000,-10.812500,1004.512512,1017.174988,4483311,-8.330811e-01,0.070538
31931,2020-03-29,1.000,3.875,0.0,3.875000,3.250000,1.312500,4.250000,33.000000,-12.225000,1006.750000,1019.262512,4483311,-5.877326e-01,-0.797764
31931,2020-03-30,1.000,4.250,0.0,6.425000,7.487500,5.687500,8.862500,40.000000,-6.112500,1003.537476,1015.812500,4483311,-3.855550e-01,-0.800477
