In [1]:
#!c1.8
import pandas as pd
import os
import numpy as np
import pickle
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from catboost import CatBoostRegressor

In [2]:
#!c1.8
pd.set_option("display.max_columns", None)

In [3]:
#!c1.8

mapping = {
    1: "Туристская 2020 год.xlsx",
    2: "Коптевский бул. 2020 год.xlsx",
    3: "Останкино 0 2020 год.xlsx",
    4: "Глебовская 2020 год.xlsx",
    5: "Спиридоновка ул. 2020 год.xlsx",
    6: "Шаболовка 2020.xlsx",
    7: "Академика Анохина 2020.xlsx",
    8: "Бутлерова 2020.xlsx",
    9: "Пролетарский проспект 2020.xlsx",
    10: "Марьино 2020.xlsx"
}

In [4]:
#!c1.8
data = {}

params = {
    "skiprows": [1],
    "engine": "openpyxl"
}

for station_number, filename in mapping.items():
    path = "data/stations/" + filename
    data[station_number] = pd.read_excel(path, **params)

In [5]:
#!c1.8
profiles_dir = "data/ostankino_profile/"

daily_tables = {}

for file in os.scandir(profiles_dir):
    daily_table = pd.read_table(file.path, skiprows=19, decimal=",")
    date = file.name[4:12]
    daily_tables[date] = daily_table

In [6]:
#!c1.8
ost_profile_data = pd.concat(daily_tables)
ost_profile_data["data time"] = pd.to_datetime(ost_profile_data["data time"])
ost_profile_data.drop("Quality", axis=1, inplace = True)
ost_profile_data.rename({
    "data time": "datetime",
    "0": "t_0m",
    "50": "t_50m",
    "100": "t_100m",
    "150": "t_150m",
    "200": "t_200m",
    "250": "t_250m",
    "300": "t_300m",
    "350": "t_350m",
    "400": "t_400m",
    "450": "t_450m",
    "500": "t_500m",
    "550": "t_550m",
    "600": "t_600m",
    "OutsideTemperature": "outside_temperature"
}, axis=1, inplace =True)
ost_profile_data.reset_index(drop=True, inplace=True)
ost_profile_data = ost_profile_data.resample("20min", on="datetime").mean().reset_index()

In [7]:
#!c1.8
ost_253_meteo = pd.read_excel("data/ostankino_meteo.xls", skiprows=2, names=["datetime", "253_wind_direction", "253_wind_speed"])
ost_253_meteo = ost_253_meteo.resample("20min", on="datetime").mean().reset_index()

In [8]:
#!c1.8
ost_data = pd.merge(ost_profile_data, ost_253_meteo, how="inner", on="datetime")

In [9]:
#!c1.8
# Drop empty cols, rename then, cast datetime to datetime, join with city-level Ostankino data
for k, v in data.items():
    v = v.loc[:, [name for name in v.columns if "Unnamed" not in name]]
    v.dropna(axis=1, how="all", inplace=True)
    v.rename({
        "Дата и время": "datetime",
        "CO": "co",
        "NO2": "no2",
        "NO": "no",
        "PM10": "pm10",
        "PM2.5": "pm25",
        "-T-": "temperature",
        "| V |": "wind_speed",
        "_V_": "wind_direction",
        "Давление": "pressure",
        "Влажность": "humidity",
        "Осадки": "precipitation"
    }, axis=1, inplace=True)
    v["datetime"] = pd.to_datetime(v["datetime"])
    v = pd.merge(v, ost_data, how="inner", on="datetime")
    data[k] = v

In [10]:
#!c1.8

# Split by pollutant
pollutants = ["co", "no2", "no", "pm10", "pm25"]
for k, v in data.items():
    v_parts = {}
    for p in pollutants:
        if p in v.columns:
            cols_to_keep = ["temperature", "wind_speed", "wind_direction",\
                            "pressure", "humidity", "precipitation"] + [p] + list(ost_data.columns)
            v_part = v.loc[:, cols_to_keep]
            v_part.rename({p: "pollutant_concentration"}, axis=1,inplace=True)
            v_part = v_part.loc[v_part["pollutant_concentration"] > 0.0]
            v_parts[p] = v_part
    data[k] = v_parts

In [11]:
#!c1.8
    
# Add separate date and time variables
for n, dict_ in data.items():
    for pollutant, table in dict_.items():
        table["month"] = table["datetime"].dt.month
        table["day"] = table["datetime"].dt.day
        table["day_of_week"] = table["datetime"].dt.weekday
        table["hour"] = table["datetime"].dt.hour
        table.index = pd.Index(table.datetime)
        table.drop("datetime", axis=1, inplace=True)
    data[n][pollutant] = table

# Generate historical features
# Use rolling to capture some previuos and next measures
hist_features = ["temperature", "wind_speed", "wind_direction",\
                            "pressure", "humidity", "precipitation", "pollutant_concentration"]
for n, dict_ in data.items():
    for pollutant, table in dict_.items():
        for timeshift in [*range(1, 25)] + [168]:
            for feature in hist_features:
                if feature not in list(table.columns):
                    continue
                col_name = feature + "_prev_" + str(timeshift) + "h"
                if timeshift == 168:
                    window = 9
                else:
                    window_size = 6
                col_value = table[feature].rolling(window=window_size).mean().shift(3*timeshift)
                table[col_name] = col_value
        data[n][pollutant] = table

# Generate forecast features
forecast_features = ["temperature", "wind_speed", "wind_direction",\
                            "pressure", "humidity", "precipitation"]
for n, dict_ in data.items():
    for pollutant, table in dict_.items():
        for timeshift in range(1, 25):
            for feature in forecast_features:
                col_name = feature + "_forecast_" + str(timeshift) + "h"
                col_value = table[feature].rolling(window=6).mean().shift(-3*timeshift)
                table[col_name] = col_value
        data[n][pollutant] = table

# Generate target [pollution in 1…24 hours]
for n, dict_ in data.items():
    for pollutant, table in dict_.items():
        for timeshift in range(1, 25):
            col_name = "target_" + str(timeshift) + "h"
            col_value = table["pollutant_concentration"].shift(-3*timeshift)
            table[col_name] = col_value
        data[n][pollutant] = table

In [None]:
#!c1.8
metrics = {}
f_imp = {}
grid_search_results = {}

for station_number, station_data in data.items():
    metrics_by_pollutant = {}
    f_imp_by_pollutant = {}
    results_by_pollutant = {}
    for pollutant_name, pollutant_data in station_data.items():
        target_names = [name for name in pollutant_data.columns if "target" in name]
        train_data = pollutant_data.dropna(axis=0, subset=target_names)
        
        if train_data.shape[0] < 10000:
            print(f"Not enough data for {pollutant_name} on station {station_number}:\
            n of rows in the dataset is {train_data.shape}, skipping.")
            continue
        
        X = train_data.drop(target_names, axis=1)
        y = train_data.loc[:, target_names]
        
        param_grid = {
            "learning_rate": [0.01, 0.03, 0.05, 0.1, 0.25, 0.5, 1],
            "depth": range(2, 20, 2),
            "l2_leaf_reg": [1, 3, 5, 7, 9]}
        model =  CatBoostRegressor(loss_function="MultiRMSE", verbose=100, iterations=100)
        grid_search_results = model.grid_search(param_grid, X, y, cv=5, plot=True)
        
#         model.save_model(f"pretrained_models/{station_number}_{pollutant_name}.cbm")
        
        predictions = model.predict(X_test)
        
        predictions[predictions < 0] = 0
        
        r2 = r2_score(y_test, predictions, multioutput="raw_values")
        mse = mean_squared_error(y_test, predictions, multioutput="raw_values")
        rmse = model.get_best_score()
        metrics_by_pollutant[pollutant_name] = pd.DataFrame({"r2": r2, "mse": mse, "rmse": rmse["learn"]["MultiRMSE"]})
        
        f_imp_by_pollutant[pollutant_name] = pd.DataFrame({"features": list(X.columns),
                                                           "importance": model.get_feature_importance()})
        results_by_pollutant[pollutant_name] = results
    metrics[station_number] = pd.concat(metrics_by_pollutant)
    f_imp[station_number] = pd.concat(f_imp_by_pollutant)
    grid_search_results[statin_number] = results_by_pollutant

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.7441309	test: 1.6980393	best: 1.6980393 (0)	total: 260ms	remaining: 25.8s
99:	learn: 1.1325723	test: 1.0855494	best: 1.0855494 (99)	total: 26.8s	remaining: 0us

bestTest = 1.085549373
bestIteration = 99

0:	loss: 1.0855494	best: 1.0855494 (0)	total: 27.2s	remaining: 2h 22m 26s
0:	learn: 1.7215885	test: 1.6753146	best: 1.6753146 (0)	total: 343ms	remaining: 34s
99:	learn: 0.9063060	test: 0.8612904	best: 0.8612904 (99)	total: 23.5s	remaining: 0us

bestTest = 0.8612903547
bestIteration = 99

1:	loss: 0.8612904	best: 0.8612904 (1)	total: 50.7s	remaining: 2h 12m 19s
0:	learn: 1.6992160	test: 1.6527576	best: 1.6527576 (0)	total: 382ms	remaining: 37.8s
99:	learn: 0.8517420	test: 0.8091972	best: 0.8091972 (99)	total: 22.9s	remaining: 0us

bestTest = 0.8091971548
bestIteration = 99

2:	loss: 0.8091972	best: 0.8091972 (2)	total: 1m 13s	remaining: 2h 7m 43s
0:	learn: 1.6440739	test: 1.5971454	best: 1.5971454 (0)	total: 264ms	remaining: 26.2s
99:	learn: 0.7896187	test: 0.7604347	best: 0

99:	learn: 1.1335628	test: 1.0865432	best: 1.0865432 (99)	total: 26.6s	remaining: 0us

bestTest = 1.086543155
bestIteration = 99

28:	loss: 1.0865432	best: 0.6671421 (20)	total: 11m 49s	remaining: 1h 56m 40s
0:	learn: 1.7216646	test: 1.6753912	best: 1.6753912 (0)	total: 262ms	remaining: 25.9s
99:	learn: 0.9079782	test: 0.8634515	best: 0.8634515 (99)	total: 24.3s	remaining: 0us

bestTest = 0.8634515436
bestIteration = 99

29:	loss: 0.8634515	best: 0.6671421 (20)	total: 12m 14s	remaining: 1h 56m 14s
0:	learn: 1.6993419	test: 1.6528844	best: 1.6528844 (0)	total: 309ms	remaining: 30.6s
99:	learn: 0.8533945	test: 0.8118912	best: 0.8118912 (99)	total: 25.9s	remaining: 0us

bestTest = 0.8118912248
bestIteration = 99

30:	loss: 0.8118912	best: 0.6671421 (20)	total: 12m 40s	remaining: 1h 56m 3s
0:	learn: 1.6443205	test: 1.5973939	best: 1.5973939 (0)	total: 270ms	remaining: 26.8s
99:	learn: 0.7967889	test: 0.7660791	best: 0.7660791 (99)	total: 22.6s	remaining: 0us

bestTest = 0.76607914
bestIter

99:	learn: 1.0771440	test: 1.0325264	best: 1.0325264 (99)	total: 1m 2s	remaining: 0us

bestTest = 1.032526445
bestIteration = 99

56:	loss: 1.0325264	best: 0.5696718 (40)	total: 34m 8s	remaining: 2h 34m 32s
0:	learn: 1.7202029	test: 1.6741944	best: 1.6741944 (0)	total: 613ms	remaining: 1m
99:	learn: 0.8175270	test: 0.7884958	best: 0.7884958 (99)	total: 56.9s	remaining: 0us

bestTest = 0.7884958013
bestIteration = 99

57:	loss: 0.7884958	best: 0.5696718 (40)	total: 35m 5s	remaining: 2h 35m 29s
0:	learn: 1.6968945	test: 1.6508766	best: 1.6508766 (0)	total: 646ms	remaining: 1m 3s
99:	learn: 0.7543406	test: 0.7347851	best: 0.7347851 (99)	total: 52.5s	remaining: 0us

bestTest = 0.7347850786
bestIteration = 99

58:	loss: 0.7347851	best: 0.5696718 (40)	total: 35m 58s	remaining: 2h 36m 3s
0:	learn: 1.6393691	test: 1.5933121	best: 1.5933121 (0)	total: 585ms	remaining: 57.9s
99:	learn: 0.6885645	test: 0.6787664	best: 0.6787664 (99)	total: 48.8s	remaining: 0us

bestTest = 0.6787664406
bestIterati

In [None]:
import shutil

shutil.make_archive("models",  "zip", "pretrained_models/")

In [None]:
#!c1.8
metrics_df.groupby(level=1).mean()

In [None]:
#!c1.8
# With nulls in pollutant concentrations removed
metrics_df_2.groupby(level=1).mean()

In [None]:
#!c1.8
metrics_df_2 = pd.concat(metrics)

In [None]:
metrics_df_2.groupby(level=0).mean()

In [None]:
#!c1.8
metrics_df.groupby(level=0).mean()

In [None]:
#!c1.8
metrics_df.groupby(level=1).mean()

In [None]:
metrics_df_2.groupby(level=1).mean()

In [None]:
#!c1.8
metrics_df.groupby(level=2).mean()

In [None]:
#!c1.8
metrics_df_2.groupby(level=2).mean()

In [None]:
pd.concat(f_imp).reset_index().drop("level_2", axis=1).drop_duplicates(subset="features").sort_values("importance", ascending=False).head(40)