In [None]:
import copy
#import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import re

from catboost import CatBoostRegressor
from geopy.distance import geodesic
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
#from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
#from sklearn.svm import SVR

In [2]:
# Функция для преобразования формата скорости ветра

def extract_value(value):
    if isinstance(value, (int, float)):
        return value
    match = re.search(r'\((\d+\.\d+)\)', str(value))
    if match:
        return float(match.group(1))
    return value

In [3]:
# Функция загрузки и подготовки данных по нужным веществам из всех 6 геоточек

def load_data(num, all_substances, meteo_params):
    '''num - номер геоточки
       all_substances - названия загрязняющих веществ
    '''
    
    point = pd.read_excel(f'../data/raw/Усредненные_данные_Нижнекамск_т.{num}.xlsx')
    indexes = list(point.loc[0])
    indexes = indexes[1:]
    point = point.loc[3:].set_index('Интервал отбора')
    point.columns = indexes

    point = point.apply(lambda x: x.str.replace(',', '.'))
    point['V ветра, м/с'] = point['V ветра, м/с'].apply(extract_value)
    point[['Угол ветра, °', 'Направление ветра']] = point['D ветра, °'].str.extract(r'(\d+)\s?\((.+)\)')
    point['Направление ветра'].replace(np.NaN, 'Ш', inplace=True)
        
    # Пересечение множеств названий столбцов и substances
    substances = set(indexes) & set(all_substances)
    columns = [*substances, *meteo_params]
    point = point[columns]

    # Заполнение пропусков, если их в столбце не более 5%
    for col in point.columns:
        point[col] = pd.to_numeric(point[col], errors='ignore')
        if point[col].isna().sum().sum() / len(point) < 0.05:
            point[col].interpolate(inplace=True)
        
    return point

In [4]:
def melt_columns(df, substances, Mr, meteo_params):
    """Функция 'расплавления' столбцов с веществами"""
    
    df = pd.melt(df, id_vars=meteo_params, ignore_index=False)

    # Закодируем названия веществ их молекулярной массой
    df.replace(substances, Mr, inplace=True)
    df.rename(columns={"variable": "Mr", "value": "C, мг/м³"}, inplace=True)

    df["Mr"] = pd.to_numeric(df["Mr"], errors='coerce')

    return df

In [5]:
def make_df(df, num, target_num, lags=None):
    '''Функция предобработки датасета с предикторами
       num - номер геоточки
       target_num - номер точки, в которой модель будет считать концентрацию
    '''

    coor = locations_list[num - 1]
    target_coor = locations_list[target_num - 1]

    df = df.copy()
    df.dropna(inplace=True)
    df['lat'], df['lon'] = coor

    # Добавим расстояние от данной точки до точки c target_coor в качестве входной переменной
    df['distance, km'] = geodesic(coor, target_coor).km
    
    # Фичи c временными лагам
    if lags:
        for i in range(1, lags):
            df[f"T внеш_{i}"] = df['T внеш., °C'].shift(i)
            df[f"P атм._{i}"] = df['P атм., мм.рт.ст.'].shift(i)
            df[f"V ветра_{i}"] = df['V ветра, м/с'].shift(i)
            df[f"Угол ветра_{i}"] = df['Угол ветра, °'].shift(i)
            df[f"C, мг/м³_{i}"] = df['C, мг/м³'].shift(i)
    df.dropna(inplace=True)

    return df

In [6]:
def concat_dfs(df_melted_list, num, lags=5):

    X_list = [make_df(df, i+1, num, lags) for i, df in enumerate(df_melted_list)]
    del X_list[num - 1]    # сопоставление точки самой с собой
    X_num = pd.concat(X_list, axis=0)
    return X_num

In [7]:
def split_scale_df(X, y):
    
    # Разбиваем
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    
    # Масштабируем X
    scl = StandardScaler()
    scl.fit(X_train)
    X_train_scl = scl.transform(X_train)
    X_test_scl = scl.transform(X_test)

    return X_train_scl, X_test_scl, y_train, y_test

In [8]:
def make_X_y():
    X_list_substance = copy.deepcopy(X_num_list)
    data_with_substance = []
    
    # Добавление целевой переменной
    for (num, X_num), point in zip(enumerate(X_list_substance), points):
        if substance in list(point):
            data_with_substance.append(X_num)
            data_with_substance[-1][f'target_{substance}'] = point[substance]
            
    X = pd.concat(data_with_substance, axis=0)
    X.dropna(inplace=True)
    X.drop_duplicates(inplace=True)
    X = pd.get_dummies(X, columns=['Направление ветра'])
    y = X.pop(f'target_{substance}')
    return X, y

In [9]:
locations_list = [
    [55.539306, 051.856451],    # д. Клятле
    [55.622944, 051.825578],    # ул. Ахтубинская, 4б
    [55.650091, 051.852687],    # ул. Гагарина, 32
    [55.598983, 051.771936],    # ул. Юбилейная, 3
    [55.613193, 051.784821],    # ул. Южная, 3
    [55.654578, 051.800072]     # ул. Ямьле, 20
    ]

meteo_params = ['T внеш., °C',
                'P атм., мм.рт.ст.',
                'V ветра, м/с',
                'Угол ветра, °',
                'Направление ветра'
               ]
substances = ['CO, мг/м³',
              'NO, мг/м³',
              'NO2, мг/м³', 
              'NH3, мг/м³',
              'SO2, мг/м³',
              'H2S, мг/м³'
              ]
# Молекулярные массы веществ из substances
Mr = [28, 30, 46, 17, 64, 34]

# Временной лаг
lags = 4

## Optuna

In [10]:
# Загрузка, обработка и объединение данных

points = [load_data(i, substances, meteo_params) for i in range(1, 7)]
df_melted_list = [melt_columns(df, substances, Mr, meteo_params) for df in points]
X_num_list = [concat_dfs(df_melted_list, j+1, lags) for j in range(len(df_melted_list))]

In [11]:
for substance in substances:
    def objective(trial,
                  substance=substance,
                  points=points,
                  X_num_list=X_num_list
        ):
        
        X, y = make_X_y()
        X_train_scl, X_test_scl, y_train, y_test = split_scale_df(X, y)
        
        # CatBoostRegressor
        catboost_params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
            'depth': trial.suggest_int('depth', 3, 10),
            'n_estimators': trial.suggest_int('n_estimators', 10, 300),    # =iterations
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        }
        catboost = CatBoostRegressor(**catboost_params, random_state=0)
        catboost.fit(X_train_scl, y_train, early_stopping_rounds=200, verbose=0)
        
        # Прогнозы и ошибки
        pred = catboost.predict(X_test_scl)
        pred[pred < 0] = 0
        catboost_mae = mean_absolute_error(y_test, pred)
        
        return catboost_mae

    # Запуск оптимизации гиперпараметров
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=30)

    print('Best hyperparameters:', study.best_params)
    print('Best score:', study.best_value)
    print(substance)
    print()

[I 2023-10-27 12:21:34,168] A new study created in memory with name: no-name-db5ff638-ceef-4e9f-88bc-7d4b481f9b9e
[I 2023-10-27 12:21:35,340] Trial 0 finished with value: 0.15252137976032187 and parameters: {'learning_rate': 0.021721380120411013, 'depth': 3, 'n_estimators': 15, 'l2_leaf_reg': 9.201357757049689}. Best is trial 0 with value: 0.15252137976032187.
[I 2023-10-27 12:21:38,162] Trial 1 finished with value: 0.12680246908539577 and parameters: {'learning_rate': 0.03711872951915805, 'depth': 7, 'n_estimators': 124, 'l2_leaf_reg': 5.296569423836399}. Best is trial 1 with value: 0.12680246908539577.
[I 2023-10-27 12:21:41,568] Trial 2 finished with value: 0.13272485550427057 and parameters: {'learning_rate': 0.03312784887999468, 'depth': 4, 'n_estimators': 295, 'l2_leaf_reg': 7.711313261159572}. Best is trial 1 with value: 0.12680246908539577.
[I 2023-10-27 12:21:43,640] Trial 3 finished with value: 0.13421086654576084 and parameters: {'learning_rate': 0.09391654523976185, 'depth'

Best hyperparameters: {'learning_rate': 0.09353304901542561, 'depth': 10, 'n_estimators': 268, 'l2_leaf_reg': 5.424800121578394}
Best score: 0.08069015210079558
CO, мг/м³



[I 2023-10-27 12:25:22,010] Trial 0 finished with value: 0.0027297821155201686 and parameters: {'learning_rate': 0.08859387293378328, 'depth': 7, 'n_estimators': 242, 'l2_leaf_reg': 9.156173428061596}. Best is trial 0 with value: 0.0027297821155201686.
[I 2023-10-27 12:25:23,948] Trial 1 finished with value: 0.004056139901903261 and parameters: {'learning_rate': 0.07802285679777217, 'depth': 5, 'n_estimators': 99, 'l2_leaf_reg': 3.9766931905429383}. Best is trial 0 with value: 0.0027297821155201686.
[I 2023-10-27 12:25:25,678] Trial 2 finished with value: 0.0051031478570847805 and parameters: {'learning_rate': 0.04747682885506149, 'depth': 4, 'n_estimators': 90, 'l2_leaf_reg': 8.611383832242135}. Best is trial 0 with value: 0.0027297821155201686.
[I 2023-10-27 12:25:27,245] Trial 3 finished with value: 0.004809766107957077 and parameters: {'learning_rate': 0.06535282618787445, 'depth': 5, 'n_estimators': 62, 'l2_leaf_reg': 8.813862046873194}. Best is trial 0 with value: 0.0027297821155

Best hyperparameters: {'learning_rate': 0.09237777041884405, 'depth': 10, 'n_estimators': 225, 'l2_leaf_reg': 1.7892158913147767}
Best score: 0.0020123957063985435
NO, мг/м³



[I 2023-10-27 12:28:39,929] Trial 0 finished with value: 0.010674657240535759 and parameters: {'learning_rate': 0.01581745634250587, 'depth': 10, 'n_estimators': 84, 'l2_leaf_reg': 9.380892270595355}. Best is trial 0 with value: 0.010674657240535759.
[I 2023-10-27 12:28:45,267] Trial 1 finished with value: 0.007824605367061364 and parameters: {'learning_rate': 0.03368854294201423, 'depth': 7, 'n_estimators': 278, 'l2_leaf_reg': 8.095683249558626}. Best is trial 1 with value: 0.007824605367061364.
[I 2023-10-27 12:28:49,085] Trial 2 finished with value: 0.011747236376597837 and parameters: {'learning_rate': 0.006535597781673118, 'depth': 6, 'n_estimators': 222, 'l2_leaf_reg': 8.3169123254131}. Best is trial 1 with value: 0.007824605367061364.
[I 2023-10-27 12:28:52,124] Trial 3 finished with value: 0.00853833471684451 and parameters: {'learning_rate': 0.044909839993861486, 'depth': 7, 'n_estimators': 135, 'l2_leaf_reg': 8.99626259339622}. Best is trial 1 with value: 0.007824605367061364

Best hyperparameters: {'learning_rate': 0.09924059086275548, 'depth': 10, 'n_estimators': 255, 'l2_leaf_reg': 1.023897651393103}
Best score: 0.004642527376963059
NO2, мг/м³



[I 2023-10-27 12:33:03,225] Trial 0 finished with value: 0.0008344316584376965 and parameters: {'learning_rate': 0.07386387905942547, 'depth': 9, 'n_estimators': 117, 'l2_leaf_reg': 6.7282114757780915}. Best is trial 0 with value: 0.0008344316584376965.
[I 2023-10-27 12:33:06,286] Trial 1 finished with value: 0.0010998679147228786 and parameters: {'learning_rate': 0.04576787762707315, 'depth': 4, 'n_estimators': 229, 'l2_leaf_reg': 1.0371989251265226}. Best is trial 0 with value: 0.0008344316584376965.
[I 2023-10-27 12:33:09,158] Trial 2 finished with value: 0.004136602360878625 and parameters: {'learning_rate': 0.002411916733962227, 'depth': 6, 'n_estimators': 152, 'l2_leaf_reg': 9.719236117581627}. Best is trial 0 with value: 0.0008344316584376965.
[I 2023-10-27 12:33:10,210] Trial 3 finished with value: 0.00431846311816185 and parameters: {'learning_rate': 0.014796098909125914, 'depth': 4, 'n_estimators': 24, 'l2_leaf_reg': 6.140071378140955}. Best is trial 0 with value: 0.000834431

Best hyperparameters: {'learning_rate': 0.09775410083375517, 'depth': 10, 'n_estimators': 298, 'l2_leaf_reg': 3.575957776379207}
Best score: 0.0005551241038935759
NH3, мг/м³



[I 2023-10-27 12:37:08,963] Trial 0 finished with value: 0.009760378696830198 and parameters: {'learning_rate': 0.016705667663678675, 'depth': 8, 'n_estimators': 265, 'l2_leaf_reg': 2.7948031041574444}. Best is trial 0 with value: 0.009760378696830198.
[I 2023-10-27 12:37:10,600] Trial 1 finished with value: 0.011007786569171313 and parameters: {'learning_rate': 0.061817206486237, 'depth': 9, 'n_estimators': 25, 'l2_leaf_reg': 8.017223590354437}. Best is trial 0 with value: 0.009760378696830198.
[I 2023-10-27 12:37:16,651] Trial 2 finished with value: 0.008819811376060362 and parameters: {'learning_rate': 0.0525734231437532, 'depth': 10, 'n_estimators': 79, 'l2_leaf_reg': 1.569768727585246}. Best is trial 2 with value: 0.008819811376060362.
[I 2023-10-27 12:37:20,700] Trial 3 finished with value: 0.012344868998389472 and parameters: {'learning_rate': 0.016859954638949293, 'depth': 10, 'n_estimators': 49, 'l2_leaf_reg': 2.741175664984932}. Best is trial 2 with value: 0.00881981137606036

Best hyperparameters: {'learning_rate': 0.09983066685185675, 'depth': 9, 'n_estimators': 298, 'l2_leaf_reg': 3.8998860942858236}
Best score: 0.005992643897162339
SO2, мг/м³



[I 2023-10-27 12:40:23,552] Trial 0 finished with value: 0.0007278174420118116 and parameters: {'learning_rate': 0.018689838291331792, 'depth': 10, 'n_estimators': 162, 'l2_leaf_reg': 8.790079953287588}. Best is trial 0 with value: 0.0007278174420118116.
[I 2023-10-27 12:40:28,099] Trial 1 finished with value: 0.0006318969203167951 and parameters: {'learning_rate': 0.07604444607524286, 'depth': 9, 'n_estimators': 108, 'l2_leaf_reg': 8.044261877893739}. Best is trial 1 with value: 0.0006318969203167951.
[I 2023-10-27 12:40:48,823] Trial 2 finished with value: 0.0005005431648319136 and parameters: {'learning_rate': 0.058015085013842733, 'depth': 10, 'n_estimators': 281, 'l2_leaf_reg': 3.712242850290659}. Best is trial 2 with value: 0.0005005431648319136.
[I 2023-10-27 12:40:50,959] Trial 3 finished with value: 0.0010540482767675287 and parameters: {'learning_rate': 0.009239996393186376, 'depth': 7, 'n_estimators': 65, 'l2_leaf_reg': 4.5740596306673105}. Best is trial 2 with value: 0.0005

Best hyperparameters: {'learning_rate': 0.09924384160536547, 'depth': 10, 'n_estimators': 285, 'l2_leaf_reg': 1.0443515089959652}
Best score: 0.00041449968331349593
H2S, мг/м³



**CO, мг/м³**  
Best hyperparameters: {'learning_rate': 0.09353304901542561, 'depth': 10, 'n_estimators': 268, 'l2_leaf_reg': 5.424800121578394}  
Best score: 0.08069015210079558  м³м³м³м³м³

**NO, мг/м³**  
Best hyperparameters: {'learning_rate': 0.09237777041884405, 'depth': 10, 'n_estimators': 225, 'l2_leaf_reg': 1.7892158913147767}  
Best score: 0.0020123957063985435  

**NO2, мг/м³**  
Best hyperparameters: {'learning_rate': 0.09924059086275548, 'depth': 10, 'n_estimators': 255, 'l2_leaf_reg': 1.023897651393103}  
Best score: 0.004642527376963059  

**NH3, мг/м³**  
Best hyperparameters: {'learning_rate': 0.09775410083375517, 'depth': 10, 'n_estimators': 298, 'l2_leaf_reg': 3.575957776379207}  
Best score: 0.0005551241038935759  

**SO2, мг/м³**  
Best hyperparameters: {'learning_rate': 0.09983066685185675, 'depth': 9, 'n_estimators': 298, 'l2_leaf_reg': 3.8998860942858236}  
Best score: 0.005992643897162339  

**H2S, мг/м³**  
Best hyperparameters: {'learning_rate': 0.09924384160536547, 'depth': 10, 'n_estimators': 285, 'l2_leaf_reg': 1.0443515089959652}  
Best score: 0.00041449968331349593  