In [14]:
import sys
sys.path.append("..")

In [15]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [16]:
import matplotlib.pyplot as plt

In [17]:
import pandas as pd
import numpy as np

import features

import lightgbm as lgb
from sklearn.metrics import mean_absolute_error as mae

from datetime import datetime, timedelta

In [18]:
data_dir = "../../datasets/"
s2m_path = "./handmade_s2m.csv"

In [19]:
X = pd.read_csv("past_features.csv", index_col=[0, 1])
target = pd.read_csv("past_target.csv", index_col=[0, 1])

X = features.utils.reduce_memory_usage(X)
target = features.utils.reduce_memory_usage(target)

In [20]:
cols = list(X.columns)
cols[8: 8+7] = [f"lag_{i}" for i in range(1, 8)]

X.columns = cols

In [21]:
X.head(2)
target.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,stationNumber,water_levels_nanmean_1_7,water_levels_nanmean_1_30,water_levels_nanstd_1_30,water_levels_nanmax_1_7,water_levels_nanmin_1_7,water_levels_nanmax_1_30,water_levels_nanmin_1_30,lag_1,lag_2,...,maximumTemperatureOverPeriodSpecified_nanmean_1_7,cloudCoverTotal_diff_nanmean_1_7,windSpeed_diff_nanmean_1_7,totalAccumulatedPrecipitation_diff_nanmean_1_7,soilTemperature_diff_nanmean_1_7,airTemperature_diff_nanmean_1_7,relativeHumidity_diff_nanmean_1_7,pressureReducedToMeanSeaLevel_diff_nanmean_1_7,windAngleX_diff_nanmean_1_7,windAngleY_diff_nanmean_1_7
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
5001,1984-01-01,31707,,,,,,,,,,...,,,,,,,,,,
5001,1984-01-02,31707,258.0,258.0,,258.0,258.0,258.0,258.0,258.0,,...,-17.424999,1.0,-1.333333,0.0,1.0,1.666667,-4.666667,-0.299988,-0.121696,-0.077529


Unnamed: 0_level_0,Unnamed: 1_level_0,target
id,date,Unnamed: 2_level_1
5001,1984-01-01,258.0
5001,1984-01-02,255.0


### find nearest and most corr points and concat

In [22]:
# global_features = features.groupby("date").agg(np.nanmean)
# global_target = target.groupby("date").agg(np.nanmean)

In [23]:
class TimeSeriesValFoldRetriever:
    def __init__(self, X, labels, nfolds=12, val_width=30):
        self.X = X
        self.labels = labels
        
        self.dates = self.X.index.get_level_values("date")
        self.uniq_dates = sorted(self.dates.unique())
        self.unique_dates_num = len(self.uniq_dates)
        
        self.nfolds = nfolds
        self.val_width = val_width
        
        self.set_folds_periods()
        
        
    def set_folds_periods(self):
        self.train_masks = []
        self.val_masks = []
        
        train_start = 0
        last_idx = self.unique_dates_num - 1
        
        for fold_idx in range(self.nfolds):
            folds_till_end = self.nfolds - fold_idx + 1
            train_end = last_idx - folds_till_end * self.val_width
            
            val_start = train_end
            val_end = val_start + self.val_width
            
            train_dates = self.uniq_dates[train_start: train_end]
            val_dates = self.uniq_dates[val_start: val_end]
            
            train_date_mask = self.dates.isin(train_dates)
            val_date_mask = self.dates.isin(val_dates)
            
            self.train_masks.append(train_date_mask)
            self.val_masks.append(val_date_mask)
        
#     def __next__(self):
        
        
        
    def __iter__(self):
        for fold_idx in range(self.nfolds):
            train_period = self.train_masks[fold_idx]
            val_period = self.val_masks[fold_idx]
            
            train_X, train_labels = self.X[train_period], self.labels[train_period]
            val_X, val_labels = self.X[val_period], self.labels[val_period]
            
            yield train_X, train_labels, val_X, val_labels

In [24]:
class CrossValidator:
    def __init__(self, folds, model, metric):
        """
        :param folds: iterable containing t_x, t_y, v_x, v_y"""
        self.folds = folds
        self.model = model
        self.metrics = metric
        
    def run_cv(self):
        self.metrics_vals = {f"{metric.__name__}": [] for metric in self.metrics}
        
        for fold_num, (xtrain, ytrain, xval, yval) in enumerate(self.folds):
            print(f"starting {fold_num} fold")
#             print(xtrain, ytrain)
            model.fit(xtrain, ytrain)
            val_preds = model.predict(xval)
            
            for metric in self.metrics:
                metric_val = metric(val_preds, yval)
                self.metrics_vals[f"{metric.__name__}"].append(metric_val)
            
        return self.metrics_vals


In [25]:
class StationModelsManager:
    """creates StattionFitters for every station_id from __init__, 
    Cat train and evaluate these models, return final metric for all stations"""
    def __init__(self, station_ids, models):
        self.station_ids = station_ids
        self.models = {}
        
        for id_stat, model in zip(self.station_ids, models):
            self.models[id_stat] = model
        
    
    def fit(self, X: pd.DataFrame, target: pd.DataFrame):
        
        for station_id, model in self.models.items():
            station_X = self._get_station_data(X, station_id)
            station_target = self._get_station_data(target, station_id)
            model.fit(station_X, station_target)
            
    def predict(self, X: pd.DataFrame):
        answers = np.zeros(len(X))
        
        for station_id, model in self.models.items():
            station_X = self._get_station_data(X, station_id)
            station_model = self.models[station_id]
            preds = station_model.predict(station_X)
            
            curr_station_mask = self._curr_station_mask(X, station_id)
            answers[curr_station_mask] = preds
            
        return answers
            
    def _get_station_data(self, df, station_id):
        station_mask = self._curr_station_mask(df, station_id)
        df_station = df.iloc[station_mask]
        return df_station
    
    def _curr_station_mask(self, df, station_id):
        id_col = df.reset_index()["id"]
        station_mask = (id_col == station_id).values
        
        return station_mask

In [26]:
class LgbModel:
    """Controlles process of training model on data from single station"""
    def __init__(self, model_config, fobj):
        self.lgb_param = model_config
        self.fobj = fobj
            
    def fit(self, x, y):
        dataset = lgb.Dataset(x, y)
        self.model = lgb.train(self.lgb_param, dataset, fobj=self.fobj)
        
    def predict(self, X):
        return self.model.predict(X)

In [27]:
def flood_mse_fobj(y_pred, y, alpha=1.5):
    y = y.get_label()
    
#     print("max y train", np.max(y))
#     print("max y pred", np.max(y_pred))
    
    deviation = (y_pred - y) ** 2
    gradient = 2 * (y_pred - y)
    hessian = np.sign(deviation) * 2
    
    gradient[gradient < 0] *= alpha
    hessian[hessian < 0] *= alpha
    
    return gradient, hessian

def flood_mse_feval(y_pred, y, alpha=1.5):
    y = y["target"].values
    deviation = (y_pred - y) ** 2
    deviation[deviation < 0] *= alpha
    
    return np.mean(deviation)

### merge with nearest and most corr stations

In [28]:
corr_and_nearest = pd.read_csv("corrs_and_dists.csv", index_col=0)

In [29]:
def preprocessor_corr_and_nearest(corr_and_nearest, id_idx):
    best_corr_nan_mask = corr_and_nearest["best_corr_post"].isna()

    fill_vals = corr_and_nearest.loc[best_corr_nan_mask].index 
    corr_and_nearest.loc[best_corr_nan_mask, "best_corr_post"] = fill_vals
    
    best_corr_map = corr_and_nearest["best_corr_post"].to_dict()
    id2bestcorr_id = id_idx.map(best_corr_map)
    
    nearest_map = corr_and_nearest["nearest_post"].to_dict()
    id2nearest_id = id_idx.map(nearest_map)
    
    return id2bestcorr_id, id2nearest_id

In [30]:
def merge_corr_nearest(X, corr_and_nearest):
    id2bestcorr_id, id2nearest_id = preprocessor_corr_and_nearest(corr_and_nearest, X.reset_index()["id"])

    X2 = X.copy()

    new_cols = [f"corr_{colname}" for colname in X2.columns]
    X2.columns = new_cols

    X.reset_index(inplace=True)
    X = X.merge(X2, left_on=[id2bestcorr_id, "date"], right_on=["id", "date"], how="left")

    new_cols = [f"nearest_{colname}" for colname in X2.columns]
    X2.columns = new_cols

    X = X.merge(X2, left_on=[id2nearest_id, "date"], right_on=["id", "date"], how="left")
    
    X.set_index(["id", "date"], inplace=True)

    return X

In [31]:
target = target.diff()

In [32]:
target_notnan_mask = target.notna().values
X, target = X[target_notnan_mask], target[target_notnan_mask]

In [34]:
# target_station_ids = [6005, 6022, 6027, 5004, 5012, 5024, 5805]
# target_stations_mask = X.reset_index()["id"].isin(target_station_ids).values

# X = X[target_stations_mask]
# target = target[target_stations_mask]

In [20]:
lgb_param = {
    "verbose": -1
}

target_station_ids = [6005, 6022, 6027, 5004, 5012, 5024, 5805]
target_stations_mask = X.reset_index()["id"].isin(target_station_ids).values

models = []
for idx in range(len(target_station_ids)):
    models.append(LgbModel(lgb_param, flood_mse_fobj))
    
model = StationModelsManager(target_station_ids, models)   

folds_retriever = TimeSeriesValFoldRetriever(X[target_stations_mask], target[target_stations_mask], nfolds=6, val_width=31)

cross_val = CrossValidator(folds_retriever, model, [flood_mse_feval, mae])

cross_val.run_cv()

starting 0 fold
starting 1 fold
starting 2 fold
starting 3 fold
starting 4 fold
starting 5 fold


{'flood_mse_feval': [12043.230755168724,
  758.6412185728067,
  413.94074889485904,
  240.77170999038665,
  250.42788422392155,
  459.7981402363564],
 'mean_absolute_error': [18.974347253693363,
  15.741925569294763,
  12.21883321020326,
  9.734233721467517,
  8.211703517986244,
  11.896107156943005]}

In [21]:
np.mean(cross_val.metrics_vals["mean_absolute_error"])
np.sqrt(np.mean(cross_val.metrics_vals["flood_mse_feval"]))

12.79619173826469

48.59151238828831

In [22]:
feature_importance = model.models[6005].model.feature_importance()
importances = pd.Series(feature_importance, index=X.columns)

n_features = 20

top_n_features = list(importances.sort_values(ascending=False).iloc[:n_features].index)

### 2 gen

In [33]:
with open("top_n_features.pkl", "rb") as f:
    top_n_features = pickle.load(f)

In [34]:
X = X[top_n_features]

In [35]:
X = merge_corr_nearest(X, corr_and_nearest)

In [36]:
target_station_ids = [6005, 6022, 6027, 5004, 5012, 5024, 5805]
target_stations_mask = X.reset_index()["id"].isin(target_station_ids).values

X = X[target_stations_mask]
target = target[target_stations_mask]

In [38]:
lgb_param = {
    "verbose": -1
}

target_station_ids = [6005, 6022, 6027, 5004, 5012, 5024, 5805]
target_stations_mask = X.reset_index()["id"].isin(target_station_ids).values

models = []
for idx in range(len(target_station_ids)):
    models.append(LgbModel(lgb_param, flood_mse_fobj))
    
model = StationModelsManager(target_station_ids, models) 

folds_retriever = TimeSeriesValFoldRetriever(X, target, nfolds=6, val_width=31)

cross_val = CrossValidator(folds_retriever, model, [flood_mse_feval, mae])

cross_val.run_cv()

starting 0 fold


KeyboardInterrupt: 

In [None]:
np.mean(cross_val.metrics_vals["mean_absolute_error"])
np.sqrt(np.mean(cross_val.metrics_vals["flood_mse_feval"]))

### test predict

In [40]:
model.fit(X, target)

In [44]:
for station, model in model.models.items():
    with open(f"../../model_{station}.pkl", "wb") as f:
        pickle.dump(model, f)

In [100]:
from importlib import reload

In [101]:
reload(features), reload(features.meteo)

(<module 'features' from '../features/__init__.py'>,
 <module 'features.meteo' from '../features/meteo/__init__.py'>)

In [103]:
# def _build_general_df(self, dfs):
#     ids = list(self.meteo_coords["station_id"])

#     for station_id, df in zip(ids, dfs):
#         df["id"] = station_id

#     general_df = pd.concat(dfs, axis=0)
#     return general_df


# meteo_forecast_loader._build_general_df = _build_general_df

In [104]:
forecast = meteo_forecast_loader.load(needed_meteo_ids) #[:2]

get data station_id     31725
lon           133.83
lat             48.6
Name: 0, dtype: object
got data 21.934844493865967
get data station_id      31735
lon           135.188
lat           48.5281
Name: 1, dtype: object
got data 22.21863842010498
parsing
parsing


TypeError: _build_general_df() missing 1 required positional argument: 'dfs'

In [108]:
with open("top_n_features.pkl", "wb") as f:
    pickle.dump(top_n_features, f)

In [109]:
with open("models_manager.pkl", "wb") as f:
    pickle.dump(model, f)

### 2 gen

In [43]:
import pickle

In [13]:
with open("models_manager.pkl", "rb") as f:
    model = pickle.load(f) 
    
with open("top_n_features.pkl", "rb") as f:
    top_n_features = pickle.load(f)

AttributeError: Can't get attribute 'flood_mse_fobj' on <module '__main__'>

In [50]:
test_date_fmt = "%Y-%m-%d"
test_start_date, test_end_date = datetime.strptime("2020-10-24", test_date_fmt), datetime.strptime("2020-11-04", test_date_fmt)

In [94]:
forecast_parser = features.meteo.ForecastParser()
forecast_preprocessor = features.meteo.ForecastMeteoPreprocessor()
coords_builder = features.CoordsBuilder(data_dir + "processed_data/asunp.pkl")

In [95]:
name_forecast_src = np.array([("Total_cloud_cover_entire_atmosphere_Mixed_intervals_Average", "cloudCoverTotal"),
                        ('u-component_of_wind_height_above_ground', "windAngleX"), 
                        ('v-component_of_wind_height_above_ground', "windAngleY"),
                        ('Wind_speed_gust_surface', "windSpeed"), 
                        ('Total_precipitation_surface_Mixed_intervals_Accumulation', "totalAccumulatedPrecipitation"), 
                        ("Temperature_height_above_ground", 'airTemperature'), 
                        ('Maximum_temperature_height_above_ground_Mixed_intervals_Maximum', 'maximumTemperatureOverPeriodSpecified'), 
                        ('Minimum_temperature_height_above_ground_Mixed_intervals_Minimum', 'minimumTemperatureAtHeightAndOverPeriodSpecified'),
                        ('Temperature_surface', 'soilTemperature'), 
                        ('Relative_humidity_height_above_ground', 'relativeHumidity'), 
                        ('Pressure_height_above_ground', 'pressure'), 
                        ('Pressure_reduced_to_MSL_msl', 'pressureReducedToMeanSeaLevel'),
                        ("Dewpoint_temperature_height_above_ground", "dewpointTemperature")
                       ])

retrieved_vars = list(name_forecast_src[:, 1])
varnames_table = pd.DataFrame(name_forecast_src, columns=["forecast", "src"])

In [96]:
meteo_forecast_loader = features.meteo.ForecastMeteoLoader(test_start_date, test_end_date, coords_builder, 
                                                          retrieved_vars, varnames_table, forecast_parser)

In [97]:
s2m = pd.read_csv(s2m_path)

In [98]:
def get_needed_meteo_ids(X):
    stations = X.reset_index()["id"].unique()
    s2m.set_index("station_id", inplace=True)
    print(s2m.reset_index().dtypes)
    
    needed_meteo = []
    for station in list(stations):
        needed_meteo.append(s2m.loc[station, "meteo_id"])
        
    return needed_meteo

In [99]:
needed_meteo_ids = get_needed_meteo_ids(X)

station_id    float64
meteo_id        int64
dtype: object
