## Extracting data for subsequent model training and prediction

### Step 0: imports and data load

In [1]:
# !pipenv shell
# !pipenv --where
# !python --version

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression of a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import math
import os
import re
import utils
from time import time
from copy import deepcopy, copy
from datetime import datetime, timedelta

import xarray 
from siphon.catalog import TDSCatalog

import warnings
warnings.simplefilter("ignore", pd.core.common.SettingWithCopyWarning)

In [3]:
# Locations used by the notebook, all relative to the project root:
# working_data/ receives intermediate results, datasets/meteo_new holds the
# source CSV files.
root_dir = "../"
data_dir = os.path.join(root_dir, "working_data/")
meteo_dir = os.path.join(root_dir, "datasets/meteo_new")

# Create the working directory up front so later writes cannot fail on it.
os.makedirs(data_dir, exist_ok=True)

In [5]:
def add_quality_each_elem(columns):
    """Return the given names followed by their "<name>Quality" companions.

    The original names come first, in input order; one quality name per
    original follows, in the same order.
    """
    originals = list(columns)
    quality_names = [name + "Quality" for name in columns]
    return originals + quality_names
        

def drop_by_regex(df, regex):
    """Return ``df`` without the columns whose name matches ``regex``.

    ``DataFrame.drop`` returns a new frame, so the input is not modified.
    """
    matching_cols = list_regex_find(df.columns, regex)
    remaining = df.drop(columns=matching_cols)
    return remaining


def list_regex_find(lst, pattern):
    """Return the elements of ``lst`` in which ``pattern`` matches anywhere."""
    matched = []
    for item in lst:
        if re.search(pattern, item) is not None:
            matched.append(item)
    return matched


def drop_nan_features(df, max_nan_freq=0.2):
    """Drop the columns that are NaN in more than ``max_nan_freq`` of the rows.

    For every dropped column ``X`` the companion ``XQuality`` column is
    dropped as well, when the frame has one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column names.
    max_nan_freq : float
        Largest tolerated share of NaN values per column (0.2 means 20 %).

    Returns
    -------
    pandas.DataFrame
        New frame without the sparse columns; ``df`` itself is not modified.
    """
    nan_counts = df.isna().sum(axis=0)
    sparse_cols = list(df.columns[nan_counts > (len(df) * max_nan_freq)])

    # Request only companions that really exist: asking ``drop`` for a missing
    # "<name>Quality" label raises KeyError.
    existing = set(df.columns)
    quality_cols = [col + "Quality" for col in sparse_cols
                    if col + "Quality" in existing]

    # A sparse "XQuality" column can appear both on its own and as the
    # companion of "X"; dict.fromkeys removes the duplicate and keeps order.
    to_drop = list(dict.fromkeys(sparse_cols + quality_cols))
    return df.drop(columns=to_drop)


def drop_nan_rows(df, max_nan_count=17):
    """Keep only the rows that contain at most ``max_nan_count`` NaN values.

    The default of 17 was checked for this dataset: a row with more NaNs than
    that carries values only in its index/time columns.
    """
    nan_per_row = df.isna().sum(axis=1)
    keep_mask = nan_per_row <= max_nan_count
    return df[keep_mask]
    
    

def drop_source_datetime_features(df, drop_start_meteo_day=False):
    """Return ``df`` without the raw date/time part columns.

    "startMeteoDay" is kept unless ``drop_start_meteo_day`` is true, so that
    features for day and night (e.g. day/night temperature) can still be
    computed from it.
    """
    source_time_cols = (
        "year", "month", "day", "time",
        "localYear", "localMonth", "localDay",
        "localTimePeriod", "timePeriodNum", "localTime", "tz",
    )
    optional_cols = ("startMeteoDay",) if drop_start_meteo_day else ()
    return df.drop(columns=[*source_time_cols, *optional_cols])


def drop_useless_data(df):
    """Run the cleaning steps in a fixed order and return the cleaned frame.

    Order: drop the source date/time columns, drop NaN-heavy features, drop
    the columns whose name matches "Sign" and then "Quality", and finally
    drop NaN-heavy rows.
    """
    cleaned = drop_nan_features(drop_source_datetime_features(df))
    for name_pattern in ("Sign", "Quality"):
        cleaned = drop_by_regex(cleaned, name_pattern)
    return drop_nan_rows(cleaned)

    
def make_df_datetime(df):
    """Build a datetime Series from the local date and hour columns of ``df``.

    Reads "localYear", "localMonth", "localDay" and "localTime" (hours added
    on top of the date). ``df`` is not modified.
    """
    # rename() without inplace returns a new frame. Renaming the column slice
    # in place is what made pandas emit SettingWithCopyWarning here.
    date_parts = df[["localYear", "localMonth", "localDay"]].rename(
        columns={"localYear": "year", "localMonth": "month", "localDay": "day"}
    )
    hours = pd.to_timedelta(df["localTime"], unit="h")
    return pd.to_datetime(date_parts) + hours


def load_meteo(meteo_dir):
    """Load every CSV file found in ``meteo_dir`` into one cleaned DataFrame.

    The parts are read and concatenated, passed through
    ``utils.reduce_memory_usage`` and then through ``drop_useless_data``.
    """
    # Only data files. sorted(): os.listdir returns names in arbitrary order,
    # a fixed order keeps the row order of the result reproducible.
    files = sorted(name for name in os.listdir(meteo_dir)
                   if name.split(".")[-1] == "csv")

    # Read from the directory that was actually listed, not from the
    # module-level default.
    meteo = build_meteo_from_parts(files, directory=meteo_dir)
    meteo = utils.reduce_memory_usage(meteo)
    meteo = drop_useless_data(meteo)

    return meteo


def build_meteo_from_parts(files, directory=None):
    """Read the given CSV file names and stack them into one DataFrame.

    ``directory`` is forwarded to ``read_meteo``; ``None`` keeps the
    module-level ``meteo_dir`` default.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    if not files:
        raise ValueError("No meteo files to load")

    # One concat instead of repeated DataFrame.append: append copied the whole
    # accumulated frame on every step and no longer exists in pandas 2.x.
    parts = [read_meteo(filename, directory=directory) for filename in files]
    return pd.concat(parts)


def read_meteo(filename, directory=None):
    """Read one station CSV file and add a corrected "datetime" column.

    ``directory`` defaults to the module-level ``meteo_dir``.
    """
    if directory is None:
        directory = meteo_dir
    file_path = os.path.join(directory, filename)
    data = pd.read_csv(file_path, index_col=0)
    data["datetime"] = make_df_datetime(data)
    data = correct_time(data)

    return data


def correct_time(df):
    """Shift "datetime" values dated before 1993-01-01 back by three hours.

    Data before 1993-01-01 was recorded in GMT+3, so it is converted to
    Greenwich time. ``df`` is modified in place and returned.
    """
    cutoff_date = datetime(1993, 1, 1).date()
    recorded_in_gmt3 = df["datetime"].dt.date < cutoff_date
    shifted = df.loc[recorded_in_gmt3, "datetime"] - timedelta(hours=3)
    df.loc[recorded_in_gmt3, "datetime"] = shifted
    return df

In [6]:
# Load and clean all station observation files, then preview the first rows.
meteo = load_meteo(meteo_dir)
meteo.head(3)

Unnamed: 0,stationNumber,startMeteoDay,cloudCoverTotal,pastWeather,presentWeather,windDirection,windSpeed,maximumWindGustSpeed,totalAccumulatedPrecipitation,soilTemperature,...,maximumTemperatureOverPeriodSpecified,relativeHumidity,vapourPressure,dewpointTemperature,pressure,pressureReducedToMeanSeaLevel,characteristicOfPressureTendency,HourPressureChange3,stationId,datetime
52595,30879,21,0.0,0.0,2.0,0.0,0.0,0.0,0.0,,...,-32.099998,85.0,0.1,-35.299999,951.400024,1038.599976,2.0,0.4,5131961,1984-01-01 06:00:00
52596,30879,21,0.0,0.0,2.0,0.0,0.0,0.0,0.0,,...,-29.700001,85.0,0.1,-31.5,951.700012,1037.5,2.0,0.3,5131961,1984-01-01 09:00:00
52597,30879,21,0.0,0.0,2.0,0.0,0.0,0.0,0.0,,...,-25.799999,83.0,0.1,-27.700001,951.200012,1035.5,8.0,0.5,5131961,1984-01-01 12:00:00


In [7]:
# Number of observation rows left after cleaning.
len(meteo)

3568758

#### Построив распределения количества none от id станции и года, не выявлено станций или периодов, которые содержат большинство пропущенных значений. 
#### Вырезать None пока не будем, тк на это нужно время, а в xgboost встроена их интеллектуальная обработка. Если потребуется строить другие модели, прикручу удаление None.

In [6]:
# meteo.isna().sum().plot()
# plt.title("Count of nan value by column")
# meteo.isna().sum()
# meteo[meteo[["totalAccumulatedPrecipitation", "soilTemperature", "airTemperature"]].isna().all(axis=1)].isna().sum()

### Сразу разбираемся, можно ли все данные получить из предсказания, иначе они становятся бесполезны

In [7]:
def preprocess_dataset(df):
    """Apply the preprocessing steps in order and return the resulting frame.

    Steps: recode cloud cover, turn wind direction into x/y components, and
    drop the columns that cannot be obtained from a forecast.
    """
    steps = (process_cloud_cover, wind_direction_to_x_y, drop_cols_unable_to_forecast)
    for step in steps:
        df = step(df)
    return df


def process_cloud_cover(df):
    column = df["cloudCoverTotal"]
    column = column.astype(np.float32)
    column[column == 12] = 9.5 # согласно README, это "10" с просветами
    column[column == 11] = 0.05 # следы облаков
    column[column == 13] = np.nan # облака невозможно определить
    df["cloudCoverTotal"] = column
    
    return df


def wind_direction_to_x_y(df):
    """Replace "windDirection" with "windAngleX" / "windAngleY" columns.

    ``df`` is modified in place and returned.
    """
    x_component, y_component = angle_to_x_y(df["windDirection"])

    del df["windDirection"]
    df["windAngleX"] = x_component
    df["windAngleY"] = y_component

    return df


def angle_to_x_y(angles):
    """Convert wind angles in degrees to (cos, sin) components.

    Angles equal to 0 or 999 are treated as "no direction" and give (0, 0).
    Every other angle is first converted from the classical wind angle to a
    geometric angle (90 - angle).

    Returns a pair of numpy arrays ``(x, y)``, each of length ``len(angles)``.
    """
    count = len(angles)
    x_out = np.zeros(count)
    y_out = np.zeros(count)

    # everything except the 0 and 999 markers carries a real direction
    has_direction = (angles != 0) & (angles != 999)

    # from classical wind angles to geometry angles, wrapped to non-negative
    geometric = 90 - angles[has_direction]
    geometric[geometric < 0] += 360

    in_radians = np.radians(geometric)
    x_out[has_direction] = np.cos(in_radians)
    y_out[has_direction] = np.sin(in_radians)

    return x_out, y_out


def drop_cols_unable_to_forecast(df):
    """Drop, in place, the listed columns that are present in ``df``.

    The function name marks them as columns that cannot be obtained from a
    forecast. Listed columns that are absent are skipped silently.
    """
    unable_to_forecast_cols = (
        "pastWeather", "presentWeather", "maximumWindGustSpeed",
        "characteristicOfPressureTendency", "HourPressureChange3", 'vapourPressure',
    )
    present = [name for name in unable_to_forecast_cols if name in df.columns]
    df.drop(columns=present, inplace=True)
    return df


In [8]:
# Recode cloud cover, convert wind direction to x/y and drop the columns
# listed in drop_cols_unable_to_forecast; then preview the result.
meteo = preprocess_dataset(meteo)
meteo.head(2)

Unnamed: 0,stationNumber,startMeteoDay,cloudCoverTotal,windSpeed,totalAccumulatedPrecipitation,soilTemperature,airTemperature,minimumTemperatureAtHeightAndOverPeriodSpecified,maximumTemperatureOverPeriodSpecified,relativeHumidity,dewpointTemperature,pressure,pressureReducedToMeanSeaLevel,stationId,datetime,windAngleX,windAngleY
52595,30879,21,0.0,0.0,0.0,,-33.700001,-33.700001,-32.099998,85.0,-35.299999,951.400024,1038.599976,5131961,1984-01-01 06:00:00,0.0,0.0
52596,30879,21,0.0,0.0,0.0,,-29.700001,-33.700001,-29.700001,85.0,-31.5,951.700012,1037.5,5131961,1984-01-01 09:00:00,0.0,0.0


#### Ниже - распределение значений, видны довольно сомнительные температуры воздуха и почвы на данных широтах. Принято решение оставить температуры как есть, тк данные завышены/занижены градусов на 5, это, скорее всего, происходит из-за естественных причин (несовершенность приборов, погрешность на критических значениях), а значит выборка не является смещённой

In [9]:
# for col in meteo.columns:
#     try:
#         sns.boxplot(y=meteo[col].values)
#     except: #not numeric feature
#         continue
        
#     print(col)
#     meteo[col].min()
#     meteo[col].max()
#     plt.show()

### Feature extraction of meteo data

#### Берём статистики и аггрегируем по разным временным промежуткам в зависимости от того, что кажется логичнее в физическом плане переменных

In [10]:
# Index the observations by (station, timestamp); later cells read the
# timestamp from index level 1.
meteo = meteo.set_index(["stationNumber", "datetime"])
meteo.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,startMeteoDay,cloudCoverTotal,windSpeed,totalAccumulatedPrecipitation,soilTemperature,airTemperature,minimumTemperatureAtHeightAndOverPeriodSpecified,maximumTemperatureOverPeriodSpecified,relativeHumidity,dewpointTemperature,pressure,pressureReducedToMeanSeaLevel,stationId,windAngleX,windAngleY
stationNumber,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
30879,1984-01-01 06:00:00,21,0.0,0.0,0.0,,-33.700001,-33.700001,-32.099998,85.0,-35.299999,951.400024,1038.599976,5131961,0.0,0.0
30879,1984-01-01 09:00:00,21,0.0,0.0,0.0,,-29.700001,-33.700001,-29.700001,85.0,-31.5,951.700012,1037.5,5131961,0.0,0.0


In [11]:
# save_meteo = meteo.copy()

In [12]:
def differential(ts, step=1):
    """Return ``ts`` minus itself shifted by ``step`` positions.

    The first ``step`` entries have no predecessor and come out as NaN.
    """
    return ts - ts.shift(step)

def add_diferential(df, col_names):
    """Add a "<col>_diff" column (difference to the previous row) per column.

    NaN differences are replaced by 0. ``df`` is modified in place and
    returned.

    NOTE(review): the difference is taken over the rows of the whole frame in
    their current order, so it is not restarted when the station changes.
    Besides the first row, NaNs caused by missing source values are also
    replaced by 0 -- confirm both are intended.
    """
    for col in col_names:
        diff_name = str(col) + "_diff"
        # NaN appears where there is no previous value to subtract
        df[diff_name] = differential(df[col]).fillna(0)

    return df


def add_is_day(df):
    """Add a boolean "isDay" column derived from the hour of index level 1.

    ``df`` is modified in place and returned.

    NOTE(review): an hour of day is always below 24, so the upper bound never
    excludes anything and "isDay" is effectively ``hour > 9`` -- confirm the
    intended day window.
    """
    hour_of_day = df.index.get_level_values(1).hour
    df["isDay"] = (hour_of_day > 9) & (hour_of_day < 24)
    return df

In [13]:
# добавляем разницу значений с предыдущим промежутком
# add the difference between each value and the value of the previous time step
meteo = add_diferential(meteo, ["cloudCoverTotal", "windSpeed", "totalAccumulatedPrecipitation", "soilTemperature", "airTemperature", 
                        "relativeHumidity", "pressureReducedToMeanSeaLevel", "windAngleX", "windAngleY"])
meteo = add_is_day(meteo)

meteo.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,startMeteoDay,cloudCoverTotal,windSpeed,totalAccumulatedPrecipitation,soilTemperature,airTemperature,minimumTemperatureAtHeightAndOverPeriodSpecified,maximumTemperatureOverPeriodSpecified,relativeHumidity,dewpointTemperature,...,cloudCoverTotal_diff,windSpeed_diff,totalAccumulatedPrecipitation_diff,soilTemperature_diff,airTemperature_diff,relativeHumidity_diff,pressureReducedToMeanSeaLevel_diff,windAngleX_diff,windAngleY_diff,isDay
stationNumber,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
30879,1984-01-01 06:00:00,21,0.0,0.0,0.0,,-33.700001,-33.700001,-32.099998,85.0,-35.299999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
30879,1984-01-01 09:00:00,21,0.0,0.0,0.0,,-29.700001,-33.700001,-29.700001,85.0,-31.5,...,0.0,0.0,0.0,0.0,4.0,0.0,-1.099976,0.0,0.0,False


In [14]:
# Columns excluded from the feature list; everything else in ``meteo`` is
# treated as a feature.
not_feature_cols = ["stationId"]
feature_cols = list(filter(lambda name: name not in not_feature_cols, meteo.columns))

In [15]:
# TODO: drop startMeteoDay

# Everything is aggregated per day so that statistics with a lag and a window
# measured in days are easy to compute later, even though this costs some
# redundant computation.
standard_stats = [np.nanmin, np.nanmax, np.nanmean] # not using sum and std

# calendar date of each observation, taken from index level 1 (the timestamp)
meteo["date"] = meteo.index.get_level_values(1).date

# one row per (station, date), and one row per (station, date, isDay)
meteo_whole_day = meteo.groupby(by=["stationNumber", "date"])[feature_cols].agg(standard_stats)
meteo_day_night = meteo.groupby(by=["stationNumber", "date", "isDay"])[feature_cols].agg(standard_stats)
meteo_whole_day.head(1)
meteo_day_night.head(3)

# name the two column levels produced by agg: (feature, statistic)
agg_level_names = ["feature", "statname"]
meteo_whole_day.columns.names = agg_level_names
meteo_day_night.columns.names = agg_level_names

Unnamed: 0_level_0,Unnamed: 1_level_0,cloudCoverTotal,cloudCoverTotal,cloudCoverTotal,windSpeed,windSpeed,windSpeed,totalAccumulatedPrecipitation,totalAccumulatedPrecipitation,totalAccumulatedPrecipitation,soilTemperature,...,pressureReducedToMeanSeaLevel_diff,windAngleX_diff,windAngleX_diff,windAngleX_diff,windAngleY_diff,windAngleY_diff,windAngleY_diff,isDay,isDay,isDay
Unnamed: 0_level_1,Unnamed: 1_level_1,nanmin,nanmax,nanmean,nanmin,nanmax,nanmean,nanmin,nanmax,nanmean,nanmin,...,nanmean,nanmin,nanmax,nanmean,nanmin,nanmax,nanmean,nanmin,nanmax,nanmean
stationNumber,date,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
30673,1984-01-01,0.0,10.0,6.666667,0.0,1.0,0.166667,0.0,0.0,0.0,-35.0,...,4.600006,-0.173648,0.898794,0.149799,-0.984808,0.984808,-0.073062,False,True,0.666667


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,cloudCoverTotal,cloudCoverTotal,cloudCoverTotal,windSpeed,windSpeed,windSpeed,totalAccumulatedPrecipitation,totalAccumulatedPrecipitation,totalAccumulatedPrecipitation,soilTemperature,...,pressureReducedToMeanSeaLevel_diff,windAngleX_diff,windAngleX_diff,windAngleX_diff,windAngleY_diff,windAngleY_diff,windAngleY_diff,isDay,isDay,isDay
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,nanmin,nanmax,nanmean,nanmin,nanmax,nanmean,nanmin,nanmax,nanmean,nanmin,...,nanmean,nanmin,nanmax,nanmean,nanmin,nanmax,nanmean,nanmin,nanmax,nanmean
stationNumber,date,isDay,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
30673,1984-01-01,False,10.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,-35.0,...,12.749969,0.0,0.898794,0.449397,-0.438371,0.0,-0.219186,False,False,False
30673,1984-01-01,True,0.0,10.0,5.0,0.0,1.0,0.25,0.0,0.0,0.0,-34.0,...,0.525024,-0.173648,0.173648,0.0,-0.984808,0.984808,0.0,True,True,True
30673,1984-01-02,False,0.0,3.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,-30.0,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False


In [16]:
# The per-observation frame is not used below; drop the reference to it.
del meteo

In [17]:
def pivot_isday(df):
    """Move the "isDay" index level into the columns as an extra column level.

    The (feature, statname) columns are temporarily replaced by integers,
    the frame is pivoted on "isDay", and the columns are then relabelled as
    (feature, statname, isDay).

    NOTE: the column labels of the frame passed in are overwritten with
    integers before the pivot, so the caller's object is modified too.

    NOTE(review): the new labels are generated as every original column
    combined with (False, True), in that order. This assumes that the pivot
    output has exactly this column order and that both isDay values occur in
    the data -- confirm.
    """
    # resetting columns
    old_cols = df.columns
    df.columns = [i for i in range(len(df.columns))]

    # every index level except "isDay" stays in the index
    new_levels = [level for level in df.index.names if level != "isDay"]
#     print(new_levels, "isDay")
    df = df.reset_index().pivot(index=new_levels, columns="isDay")

#     print(old_cols)
#     old_cols_joined = pd.Index(join_multiindex(old_cols))
#     df.columns = old_cols
    # restore the original labels, extended with the isDay level
    new_cols = add_cols_level_with_tuples(old_cols, pd.Index([False, True], name="isDay"))
    df.columns = new_cols

    return df
    

def add_cols_level_with_tuples(columns, new_idx):
    """Return a MultiIndex extending ``columns`` with one more level.

    Each entry of ``columns`` is combined with each value of ``new_idx``:
    entries vary slowest, ``new_idx`` values fastest. The new level takes
    its name from ``new_idx.name``.
    """
    level_names = [*columns.names, new_idx.name]
    extended = [
        (*column_key, level_value)
        for column_key in columns
        for level_value in new_idx.values
    ]
    return pd.MultiIndex.from_tuples(extended, names=level_names)
    
    
    
def join_multiindex(mul_idx):
    """Return one "_"-joined string per entry of ``mul_idx``, as a list."""
    return ["_".join(parts) for parts in mul_idx]


In [18]:
# Turn the day/night rows into columns: one row per (station, date).
meteo_day_night = pivot_isday(meteo_day_night)
meteo_day_night.head()

Unnamed: 0_level_0,feature,cloudCoverTotal,cloudCoverTotal,cloudCoverTotal,cloudCoverTotal,cloudCoverTotal,cloudCoverTotal,windSpeed,windSpeed,windSpeed,windSpeed,...,windAngleY_diff,windAngleY_diff,windAngleY_diff,windAngleY_diff,isDay,isDay,isDay,isDay,isDay,isDay
Unnamed: 0_level_1,statname,nanmin,nanmin,nanmax,nanmax,nanmean,nanmean,nanmin,nanmin,nanmax,nanmax,...,nanmax,nanmax,nanmean,nanmean,nanmin,nanmin,nanmax,nanmax,nanmean,nanmean
Unnamed: 0_level_2,isDay,False,True,False,True,False,True,False,True,False,True,...,False,True,False,True,False,True,False,True,False,True
stationNumber,date,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
30673,1984-01-01,10.0,0.0,10.0,10.0,10.0,5.0,0.0,0.0,0.0,1.0,...,0.0,0.984808,-0.219186,0.0,False,True,False,True,False,True
30673,1984-01-02,0.0,0.0,3.0,6.0,0.75,1.5,0.0,0.0,0.0,3.0,...,0.0,0.939693,0.0,-0.191511,False,True,False,True,False,True
30673,1984-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.939693,0.0,0.191511,0.0,False,True,False,True,False,True
30673,1984-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.642788,0.939693,0.0,0.0,False,True,False,True,False,True
30673,1984-01-05,0.0,0.0,0.0,10.0,0.0,4.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,False,True,False,True,False,True


In [21]:
def add_isday_level_whole_day(cols):
    """Extend ``cols`` with an "isDay" level whose only value is NaN."""
    whole_day_marker = pd.Index([np.nan], name="isDay")
    return add_cols_level_with_tuples(cols, whole_day_marker)

# Give the whole-day frame the same three column levels as the day/night frame.
meteo_whole_day.columns = add_isday_level_whole_day(meteo_whole_day.columns)

In [23]:
# Put the day/night and whole-day statistics side by side (column-wise),
# then drop the references to the two intermediate frames.
meteo = pd.concat((meteo_day_night, meteo_whole_day), axis=1)
del meteo_day_night, meteo_whole_day

In [25]:
# Preview the combined daily feature frame.
meteo.head(2)

Unnamed: 0_level_0,feature,cloudCoverTotal,cloudCoverTotal,cloudCoverTotal,cloudCoverTotal,cloudCoverTotal,cloudCoverTotal,windSpeed,windSpeed,windSpeed,windSpeed,...,pressureReducedToMeanSeaLevel_diff,windAngleX_diff,windAngleX_diff,windAngleX_diff,windAngleY_diff,windAngleY_diff,windAngleY_diff,isDay,isDay,isDay
Unnamed: 0_level_1,statname,nanmin,nanmin,nanmax,nanmax,nanmean,nanmean,nanmin,nanmin,nanmax,nanmax,...,nanmean,nanmin,nanmax,nanmean,nanmin,nanmax,nanmean,nanmin,nanmax,nanmean
Unnamed: 0_level_2,isDay,False,True,False,True,False,True,False,True,False,True,...,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
stationNumber,date,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
30673,1984-01-01,10.0,0.0,10.0,10.0,10.0,5.0,0.0,0.0,0.0,1.0,...,4.600006,-0.173648,0.898794,0.149799,-0.984808,0.984808,-0.073062,False,True,0.666667
30673,1984-01-01,10.0,0.0,10.0,10.0,10.0,5.0,0.0,0.0,0.0,1.0,...,4.600006,-0.173648,0.898794,0.149799,-0.984808,0.984808,-0.073062,False,True,0.666667


In [33]:
# Distinct values of the first column level (the feature names).
meteo.columns.get_level_values(0).unique()

Index(['cloudCoverTotal', 'windSpeed', 'totalAccumulatedPrecipitation',
       'soilTemperature', 'airTemperature',
       'minimumTemperatureAtHeightAndOverPeriodSpecified',
       'maximumTemperatureOverPeriodSpecified', 'relativeHumidity',
       'dewpointTemperature', 'pressure', 'pressureReducedToMeanSeaLevel',
       'windAngleX', 'windAngleY', 'cloudCoverTotal_diff', 'windSpeed_diff',
       'totalAccumulatedPrecipitation_diff', 'soilTemperature_diff',
       'airTemperature_diff', 'relativeHumidity_diff',
       'pressureReducedToMeanSeaLevel_diff', 'windAngleX_diff',
       'windAngleY_diff', 'isDay'],
      dtype='object', name='feature')

In [29]:
# meteo.loc[:, meteo.columns.get_level_values(1) == "nanmin"]
# Aggregation settings: feature -> daily statistic -> three parallel lists.
# Entry i of "funcs", "shifts" and "windows" belongs together.
# NOTE(review): "shifts" and "windows" are presumably a lag and a rolling
# window length in days -- confirm against the code that consumes this dict.
agg_cols_settings = {
    "cloudCoverTotal": {
        "nanmin": {
            "funcs": [np.nanmin],
            "shifts": [1],
            "windows": [7],
        },
        "nanmax": {
            "funcs": [np.nanmax, np.nanmean],
            "shifts": [1, 1],
            "windows": [7, 30],
        },
        "nanmean": {
            "funcs": [np.nanmean, np.nanstd, np.nanmean, np.nanstd],
            "shifts": [1, 1, 1, 1],
            "windows": [7, 7, 30, 30],
        },
    },

    "windSpeed": {
        "nanmin": {
            "funcs": [np.nanmean],  # mean, not min!
            "shifts": [1],
            "windows": [7],
        },
        "nanmax": {
            "funcs": [np.nanmax, np.nanmean],
            "shifts": [1, 1],
            "windows": [7, 30],
        },
        "nanmean": {
            "funcs": [np.nanmean, np.nanstd, np.nanmean, np.nanstd],
            "shifts": [1, 1, 1, 1],
            "windows": [7, 7, 30, 30],
        },
    },

    "totalAccumulatedPrecipitation": {
        "nanmin": {
            "funcs": [np.nanmin],
            "shifts": [1],
            "windows": [7],
        },
        "nanmax": {
            "funcs": [np.nanmax, np.nanmean],
            "shifts": [1, 1],
            "windows": [7, 30],
        },
        "nanmean": {
            "funcs": [np.nanmean, np.nanstd, np.nanmean, np.nanstd],
            "shifts": [1, 1, 1, 1],
            "windows": [7, 7, 30, 30],
        },
    },
}

330

### Get forecast and convert to dataset form

In [9]:
# Forecast variable name -> column name used in the station dataset.
# NOTE(review): the gust variable is mapped to "windSpeed" -- confirm that this
# is the intended counterpart of the observed wind speed.
name_forecast_to_dataset = {
    "Total_cloud_cover_entire_atmosphere_Mixed_intervals_Average": "cloudCoverTotal",
    "u-component_of_wind_height_above_ground": "windAngleX",
    "v-component_of_wind_height_above_ground": "windAngleY",
    "Wind_speed_gust_surface": "windSpeed",
    "Total_precipitation_surface_Mixed_intervals_Accumulation": "totalAccumulatedPrecipitation",
    "Temperature_height_above_ground": "airTemperature",
    "Maximum_temperature_height_above_ground_Mixed_intervals_Maximum": "maximumTemperatureOverPeriodSpecified",
    "Minimum_temperature_height_above_ground_Mixed_intervals_Minimum": "minimumTemperatureAtHeightAndOverPeriodSpecified",
    "Temperature_surface": "soilTemperature",
    "Relative_humidity_height_above_ground": "relativeHumidity",
    "Pressure_height_above_ground": "pressure",
    "Pressure_reduced_to_MSL_msl": "pressureReducedToMeanSeaLevel",
    "Dewpoint_temperature_height_above_ground": "dewpointTemperature",
}

# Variables to request from the forecast service, in mapping order.
required_data = list(name_forecast_to_dataset)

In [19]:
# TODO: parallelize fetching the forecasts; try querying by region


def get_latest_dataset_client(catalog_url):
    """Open the catalog at ``catalog_url`` and return ``latest.subset()``.

    The returned object is used by the rest of this notebook to build
    queries and fetch data.
    """
    return TDSCatalog(catalog_url).latest.subset()


def build_base_query(client, required_data):
    """Create a query for ``required_data`` over all times, as "netCDF4".

    The point of interest is not set here; see ``update_query_point``.
    """
    return (
        client.query()
        .variables(*required_data)
        .all_times()
        .accept("netCDF4")
    )


def update_query_point(query, point):
    """Point ``query`` at ``point`` and return the updated query.

    ``point`` is unpacked into the arguments of ``query.lonlat_point``.
    Per the original author's note, setting a point also deletes the old one.
    """
    # query = query.vertical_level(10)
    return query.lonlat_point(*point)


def get_weather_forecast(points: list, required_data):
    """Fetch the latest forecast for every point and return the raw results.

    One request is sent per element of ``points``, reusing the same client
    and base query; the elapsed time of each request is printed. The results
    are returned as a list, in the order of ``points``.
    """
    url = "http://thredds.ucar.edu/thredds/catalog/grib/NCEP/GFS/Global_0p5deg/catalog.xml"

    client = get_latest_dataset_client(url)
    query = build_base_query(client, required_data)

    point_outputs = []
    for point in points:
        query = update_query_point(query, point)

        print("get data")
        started_at = time()
        response = client.get_data(query)
        print("Got data, time:", time() - started_at)

        point_outputs.append(response)

    return point_outputs

# One tuple per location; each tuple is unpacked into lonlat_point by
# update_query_point.
forecast_coords = [(135, 50), (66, 45), (-105, 40)] # TODO: rework into a dataframe of id: coords pairs
# Pass the list itself. Wrapping it in another list makes the whole list a
# single "point", so all three tuples reach one lonlat_point call.
forecast = get_weather_forecast(forecast_coords, required_data)

get data
Got data, time: 153.09250497817993
get data
Got data, time: 149.59170627593994
get data
Got data, time: 65.92767429351807


In [20]:
def parse_forecast(forecasted_dataset):
    """Turn one fetched forecast dataset into a time-indexed DataFrame.

    The result has one column per name in ``required_data``, is indexed by
    the forecast timestamps, and has its NaNs filled by ``fill_forecast_nan``.
    """
    data = xarray.backends.NetCDF4DataStore(forecasted_dataset)

    per_variable = {
        var_name: get_var_values(data, var_name)
        for var_name in required_data
    }

    frame = pd.DataFrame.from_dict(per_variable, orient="columns")
    frame.index = get_timestamps(data)

    return fill_forecast_nan(frame)
    
    
def get_timestamps(data):
    """Return the forecast timestamps as an index named "datetime".

    The "time" variable is converted with ``timedelta_from_hours`` and added
    to the "time_coverage_start" attribute. The last time step is dropped,
    as ``get_var_values`` does for the values.
    """
    coverage_start = pd.to_datetime(data.get_attrs()["time_coverage_start"])
    hours_from_start = data.get_variables()["time"]

    offsets = timedelta_from_hours(hours_from_start)[:-1]

    timestamps = coverage_start + offsets
    timestamps.name = "datetime"
    return timestamps


def timedelta_from_hours(hours):
    """Convert ``hours.values`` (flattened) into timedeltas measured in hours."""
    flat_hours = hours.values.flatten()
    return pd.to_timedelta(flat_hours, "h")


def get_var_values(data, name):
    """Return the values of variable ``name`` as a flat array without the last element.

    For a 3-dimensional variable only index 0 of the last axis is used.

    Raises
    ------
    ValueError
        If the variable has more than three dimensions.
    """
    raw = data.get_variables()[name].values
    n_dims = len(raw.shape)

    if n_dims > 3:
        raise ValueError("В Forecast размерность values > 3")

    if n_dims == 3:
        raw = raw[:, :, 0]

    return raw.flatten()[:-1]


def fill_forecast_nan(df):
    """Fill the NaNs of every column of ``df`` via ``fill_with_first_notnan``."""
    return fill_with_first_notnan(df, df.columns)


def fill_with_first_notnan(df, cols):
    """Fill the NaN entries of each listed column with its first non-NaN value.

    A column that holds no non-NaN value at all is left unchanged; indexing
    its (empty) non-NaN part would raise IndexError. ``df`` is modified in
    place and returned.
    """
    for col in list(cols):
        valid = df.loc[df[col].notnull(), col]
        if valid.empty:
            continue  # nothing to fill with
        df.loc[df[col].isna(), col] = valid.iloc[0]
    return df

In [17]:
def make_dataset_from_forecast(point_forecast):
    """Convert one parsed point forecast to the dataset layout.

    Steps, in order: rescale the values, rename the columns, aggregate the
    features.
    """
    for step in (rescale_forecast, rename_forecast, agg_forecast_features):
        point_forecast = step(point_forecast)
    return point_forecast
    
    
def rescale_forecast(forecast):
    """Apply the rescaling steps in order: cloud cover, wind, temperature, pressure."""
    rescalers = (
        rescale_cloud_cover,
        rescale_wind_vector,
        rescale_temperature,
        rescale_pressure,
    )
    for rescale in rescalers:
        forecast = rescale(forecast)
    return forecast


def rename_forecast(df):
    """Rename the forecast columns, in place, using ``name_forecast_to_dataset``."""
    # add the mapping to utils??
    df.rename(columns=name_forecast_to_dataset, inplace=True)
    return df
    


def agg_forecast_features(forecast):
    """Placeholder: currently returns ``forecast`` unchanged.

    TODO: aggregate the same way as for the regular dataset.
    """
    return forecast
    
def rescale_cloud_cover(df):
    """Divide the total cloud cover column by 10, in place (0-100 -> 0-10)."""
    cloud_col = "Total_cloud_cover_entire_atmosphere_Mixed_intervals_Average"
    df[cloud_col] = df[cloud_col] / 10
    return df


def rescale_wind_vector(df):
    """Normalise the forecast wind (u, v) components to a unit vector, in place.

    Each component is divided by the vector length, so the results lie in
    [-1, 1] and keep their sign. Rows with zero wind get (0, 0).

    NOTE(review): the station features windAngleX / windAngleY are derived
    from windDirection; whether that direction and the forecast (u, v) vector
    point the same way should be confirmed.
    """
    u_col = 'u-component_of_wind_height_above_ground'
    v_col = 'v-component_of_wind_height_above_ground'

    wind_x = df[u_col]
    wind_y = df[v_col]
    vector_module = (wind_x ** 2 + wind_y ** 2) ** 0.5

    # Plain division keeps the sign of each component. Multiplying by
    # np.sign(component) first gives |component| / module, which folds every
    # direction into the first quadrant.
    unit_x = wind_x / vector_module
    unit_y = wind_y / vector_module

    # zero wind: 0 / 0 gives NaN, report it as "no direction"
    calm = vector_module == 0
    unit_x[calm] = 0
    unit_y[calm] = 0

    df[u_col] = unit_x
    df[v_col] = unit_y

    return df


def rescale_temperature(df):
    """Convert the forecast temperature columns from Kelvin to Celsius, in place."""
    temperature_cols = ["Temperature_height_above_ground", "Temperature_surface",
                        "Maximum_temperature_height_above_ground_Mixed_intervals_Maximum",
                        "Minimum_temperature_height_above_ground_Mixed_intervals_Minimum",
                        "Dewpoint_temperature_height_above_ground"
                       ]

    # The forecast gives absolute temperature; 0 degrees Celsius is 273.15 K.
    # Subtracting 273 would leave every value 0.15 degrees too warm.
    df[temperature_cols] -= 273.15
    return df


def rescale_pressure(df):
    """Divide both forecast pressure columns by 100, in place."""
    for pressure_col in ("Pressure_height_above_ground", "Pressure_reduced_to_MSL_msl"):
        df[pressure_col] = df[pressure_col] / 100
    return df

In [None]:
def forecast_to_test_dataset(forecast):
    """Parse every point forecast and convert it to the dataset layout.

    NOTE(review): in the code visible here the converted frame is neither
    collected nor returned -- the function looks unfinished; confirm.
    """
    for point in forecast:
        forecast_data = parse_forecast(point)
        forecast_data = make_dataset_from_forecast(forecast_data)