In [None]:
import copy
import json
import numpy as np
import optuna
import pandas as pd
import re

from catboost import CatBoostRegressor
from geopy.distance import geodesic
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

In [2]:
# Функция для преобразования формата скорости ветра

def extract_value(value):
    if isinstance(value, (int, float)):
        return value
    match = re.search(r'\((\d+\.\d+)\)', str(value))
    if match:
        return float(match.group(1))
    return value

In [3]:
# Функция загрузки и подготовки данных по нужным веществам из всех 6 геоточек

def load_data(num, all_substances, meteo_params):
    '''num - номер геоточки
       all_substances - названия загрязняющих веществ
    '''
    
    point = pd.read_excel(f'../data/raw/Усредненные_данные_Нижнекамск_т.{num}.xlsx')
    indexes = list(point.loc[0])
    indexes = indexes[1:]
    point = point.loc[3:].set_index('Интервал отбора')
    point.columns = indexes

    point = point.apply(lambda x: x.str.replace(',', '.'))
    point['V ветра, м/с'] = point['V ветра, м/с'].apply(extract_value)
    point[['Угол ветра, °', 'Направление ветра']] = point['D ветра, °'].str.extract(r'(\d+)\s?\((.+)\)')
    point['Направление ветра'].replace(np.NaN, 'Ш', inplace=True)
        
    # Пересечение множеств названий столбцов и substances
    substances = set(indexes) & set(all_substances)
    columns = [*substances, *meteo_params]
    point = point[columns]

    # Заполнение пропусков, если их в столбце не более 5%
    for col in point.columns:
        point[col] = pd.to_numeric(point[col], errors='ignore')
        if point[col].isna().sum().sum() / len(point) < 0.05:
            point[col].interpolate(inplace=True)
        
    return point

In [4]:
def melt_columns(df, substances, Mr, meteo_params):
    """Функция 'расплавления' столбцов с веществами"""
    
    df = pd.melt(df, id_vars=meteo_params, ignore_index=False)

    # Закодируем названия веществ их молекулярной массой
    df.replace(substances, Mr, inplace=True)
    df.rename(columns={"variable": "Mr", "value": "C, мг/м³"}, inplace=True)

    df["Mr"] = pd.to_numeric(df["Mr"], errors='coerce')

    return df

In [5]:
def make_df(df, num, target_num, lags=None):
    '''Функция предобработки датасета с предикторами
       num - номер геоточки
       target_num - номер точки, в которой модель будет считать концентрацию
    '''

    coor = locations_list[num - 1]
    target_coor = locations_list[target_num - 1]

    df = df.copy()
    df.dropna(inplace=True)
    df['lat'], df['lon'] = coor

    # Добавим расстояние от данной точки до точки c target_coor в качестве входной переменной
    df['distance, km'] = geodesic(coor, target_coor).km
    
    # Фичи c временными лагам
    if lags:
        for i in range(1, lags):
            df[f"T внеш_{i}"] = df['T внеш., °C'].shift(i)
            df[f"P атм._{i}"] = df['P атм., мм.рт.ст.'].shift(i)
            df[f"V ветра_{i}"] = df['V ветра, м/с'].shift(i)
            df[f"Угол ветра_{i}"] = df['Угол ветра, °'].shift(i)
            df[f"C, мг/м³_{i}"] = df['C, мг/м³'].shift(i)
    df.dropna(inplace=True)

    return df

In [6]:
def concat_dfs(df_melted_list, num, lags=5):

    X_list = [make_df(df, i+1, num, lags) for i, df in enumerate(df_melted_list)]
    del X_list[num - 1]    # сопоставление точки самой с собой
    X_num = pd.concat(X_list, axis=0)
    return X_num

In [7]:
def split_scale_df(X, y):
    
    # Разбиваем
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    
    # Масштабируем X
    scl = StandardScaler()
    scl.fit(X_train)
    X_train_scl = scl.transform(X_train)
    X_test_scl = scl.transform(X_test)

    return X_train_scl, X_test_scl, y_train, y_test

In [8]:
def make_X_y():
    X_list_substance = copy.deepcopy(X_num_list)
    data_with_substance = []
    
    # Добавление целевой переменной
    for (num, X_num), point in zip(enumerate(X_list_substance), points):
        if substance in list(point):
            data_with_substance.append(X_num)
            data_with_substance[-1][f'target_{substance}'] = point[substance]
            
    X = pd.concat(data_with_substance, axis=0)
    X.dropna(inplace=True)
    X.drop_duplicates(inplace=True)
    X = pd.get_dummies(X, columns=['Направление ветра'])
    y = X.pop(f'target_{substance}')
    return X, y

In [9]:
locations_list = [
    [55.539306, 051.856451],    # д. Клятле
    [55.622944, 051.825578],    # ул. Ахтубинская, 4б
    [55.650091, 051.852687],    # ул. Гагарина, 32
    [55.598983, 051.771936],    # ул. Юбилейная, 3
    [55.613193, 051.784821],    # ул. Южная, 3
    [55.654578, 051.800072]     # ул. Ямьле, 20
    ]

meteo_params = ['T внеш., °C',
                'P атм., мм.рт.ст.',
                'V ветра, м/с',
                'Угол ветра, °',
                'Направление ветра'
               ]
substances = ['CO, мг/м³',
              'NO, мг/м³',
              'NO2, мг/м³', 
              'NH3, мг/м³',
              'SO2, мг/м³',
              'H2S, мг/м³'
              ]
# Молекулярные массы веществ из substances
Mr = [28, 30, 46, 17, 64, 34]

# Временной лаг
lags = 4

SEED = 10
sampler = optuna.samplers.TPESampler(seed=SEED)

# Cловарь для гиперпараметров моделей
best_params_dict = {}

## Optuna  
Обучим несколько моделей: catboost, метод ближайших соседей, лассо-регрессию, линейную регрессию. Подберем лучшие гиперпараметры.

In [10]:
# Загрузка, обработка и объединение данных

points = [load_data(i, substances, meteo_params) for i in range(1, 7)]
df_melted_list = [melt_columns(df, substances, Mr, meteo_params) for df in points]
X_num_list = [concat_dfs(df_melted_list, j+1, lags) for j in range(len(df_melted_list))]

#### CatBoost

In [11]:
for substance in substances:
    def objective(trial,
                  substance=substance,
                  points=points,
                  X_num_list=X_num_list
        ):
        
        X, y = make_X_y()
        X_train_scl, X_test_scl, y_train, y_test = split_scale_df(X, y)
        
        # CatBoostRegressor
        catboost_params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
            'depth': trial.suggest_int('depth', 3, 10),
            'n_estimators': trial.suggest_int('n_estimators', 10, 300),    # =iterations
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        }
        catboost = CatBoostRegressor(**catboost_params, random_state=SEED)
        catboost.fit(X_train_scl, y_train, early_stopping_rounds=200, verbose=0)
        
        # Прогнозы и ошибки
        pred = catboost.predict(X_test_scl)
        pred[pred < 0] = 0
        catboost_mae = mean_absolute_error(y_test, pred)
        
        return catboost_mae

    # Запуск оптимизации гиперпараметров
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, n_trials=30)

    # Сохранение гиперпараметров
    best_params_dict[substance] = {
        'best_params': study.best_params
    }

    print('Best hyperparameters:', study.best_params)
    print('Best score:', study.best_value)
    print(substance)
    print()

# Сохранение гиперпараметров в JSON-файл
with open('../hyperparams.json', 'w') as f:
    json.dump(best_params_dict, f)

[I 2023-10-29 23:26:21,440] A new study created in memory with name: no-name-56f68da4-646c-4a13-93c1-4accec8f3820
[I 2023-10-29 23:26:25,394] Trial 0 finished with value: 0.13367068912381472 and parameters: {'learning_rate': 0.07736074368340785, 'depth': 3, 'n_estimators': 194, 'l2_leaf_reg': 7.7392349428475065}. Best is trial 0 with value: 0.13367068912381472.
[I 2023-10-29 23:26:26,935] Trial 1 finished with value: 0.13952564112932497 and parameters: {'learning_rate': 0.050352194217956454, 'depth': 4, 'n_estimators': 67, 'l2_leaf_reg': 7.8447764097906285}. Best is trial 0 with value: 0.13367068912381472.
[I 2023-10-29 23:26:29,503] Trial 2 finished with value: 0.1415292492193459 and parameters: {'learning_rate': 0.01774197281969101, 'depth': 3, 'n_estimators': 209, 'l2_leaf_reg': 9.58054011575443}. Best is trial 0 with value: 0.13367068912381472.
[I 2023-10-29 23:26:34,565] Trial 3 finished with value: 0.14999825560123425 and parameters: {'learning_rate': 0.0013908783664635307, 'dept

Best hyperparameters: {'learning_rate': 0.09867177317899258, 'depth': 10, 'n_estimators': 286, 'l2_leaf_reg': 1.0132855405292736}
Best score: 0.07223647919060053
CO, мг/м³



[I 2023-10-29 23:31:20,572] Trial 0 finished with value: 0.003749682170405919 and parameters: {'learning_rate': 0.055211029436005116, 'depth': 9, 'n_estimators': 67, 'l2_leaf_reg': 8.711652722119599}. Best is trial 0 with value: 0.003749682170405919.
[I 2023-10-29 23:31:25,106] Trial 1 finished with value: 0.003864938719048033 and parameters: {'learning_rate': 0.03581361130377671, 'depth': 9, 'n_estimators': 96, 'l2_leaf_reg': 8.955428316050677}. Best is trial 0 with value: 0.003749682170405919.
[I 2023-10-29 23:31:27,186] Trial 2 finished with value: 0.005154778542440953 and parameters: {'learning_rate': 0.03322565214539263, 'depth': 4, 'n_estimators': 124, 'l2_leaf_reg': 1.8411433710278526}. Best is trial 0 with value: 0.003749682170405919.
[I 2023-10-29 23:31:29,234] Trial 3 finished with value: 0.0043067044470911815 and parameters: {'learning_rate': 0.08228946012585592, 'depth': 4, 'n_estimators': 121, 'l2_leaf_reg': 9.49834641014921}. Best is trial 0 with value: 0.0037496821704059

Best hyperparameters: {'learning_rate': 0.08839263335791345, 'depth': 10, 'n_estimators': 252, 'l2_leaf_reg': 7.947221697201789}
Best score: 0.0021020558242820383
NO, мг/м³



[I 2023-10-29 23:35:53,724] Trial 0 finished with value: 0.007601213758727488 and parameters: {'learning_rate': 0.06095217887498192, 'depth': 7, 'n_estimators': 183, 'l2_leaf_reg': 3.3599409501875526}. Best is trial 0 with value: 0.007601213758727488.
[I 2023-10-29 23:35:55,486] Trial 1 finished with value: 0.012229793445743873 and parameters: {'learning_rate': 0.030786259585130017, 'depth': 3, 'n_estimators': 98, 'l2_leaf_reg': 3.1786828786317463}. Best is trial 0 with value: 0.007601213758727488.
[I 2023-10-29 23:35:58,980] Trial 2 finished with value: 0.00806079382810046 and parameters: {'learning_rate': 0.05620024067760178, 'depth': 7, 'n_estimators': 148, 'l2_leaf_reg': 3.6351817866055818}. Best is trial 0 with value: 0.007601213758727488.
[I 2023-10-29 23:36:08,252] Trial 3 finished with value: 0.012018937795474928 and parameters: {'learning_rate': 0.00736085500878762, 'depth': 10, 'n_estimators': 108, 'l2_leaf_reg': 5.455437677942089}. Best is trial 0 with value: 0.0076012137587

Best hyperparameters: {'learning_rate': 0.09969352033258613, 'depth': 10, 'n_estimators': 299, 'l2_leaf_reg': 7.917395978913174}
Best score: 0.004728024255884616
NO2, мг/м³



[I 2023-10-29 23:40:59,798] Trial 0 finished with value: 0.0026595585787142217 and parameters: {'learning_rate': 0.022658165568774136, 'depth': 9, 'n_estimators': 41, 'l2_leaf_reg': 7.701429166980046}. Best is trial 0 with value: 0.0026595585787142217.
[I 2023-10-29 23:41:01,600] Trial 1 finished with value: 0.001443102405366158 and parameters: {'learning_rate': 0.04750874405060895, 'depth': 7, 'n_estimators': 52, 'l2_leaf_reg': 2.6563133988383614}. Best is trial 1 with value: 0.001443102405366158.
[I 2023-10-29 23:41:03,168] Trial 2 finished with value: 0.0016166211170242644 and parameters: {'learning_rate': 0.06486214052035595, 'depth': 3, 'n_estimators': 82, 'l2_leaf_reg': 5.881676646052038}. Best is trial 1 with value: 0.001443102405366158.
[I 2023-10-29 23:41:07,876] Trial 3 finished with value: 0.0010701014190687532 and parameters: {'learning_rate': 0.02345056098373091, 'depth': 6, 'n_estimators': 278, 'l2_leaf_reg': 9.328211855809991}. Best is trial 3 with value: 0.0010701014190

Best hyperparameters: {'learning_rate': 0.09993190419743035, 'depth': 10, 'n_estimators': 300, 'l2_leaf_reg': 5.166700233930888}
Best score: 0.0005678383507616598
NH3, мг/м³



[I 2023-10-29 23:45:55,897] Trial 0 finished with value: 0.014890096068083948 and parameters: {'learning_rate': 0.0302944724598629, 'depth': 3, 'n_estimators': 18, 'l2_leaf_reg': 5.077945001811101}. Best is trial 0 with value: 0.014890096068083948.
[I 2023-10-29 23:45:58,844] Trial 1 finished with value: 0.00937724861409447 and parameters: {'learning_rate': 0.07474154361805671, 'depth': 7, 'n_estimators': 122, 'l2_leaf_reg': 2.5126551777935675}. Best is trial 1 with value: 0.00937724861409447.
[I 2023-10-29 23:46:03,863] Trial 2 finished with value: 0.008314032143006068 and parameters: {'learning_rate': 0.0839878707496656, 'depth': 7, 'n_estimators': 237, 'l2_leaf_reg': 8.636582637052431}. Best is trial 2 with value: 0.008314032143006068.
[I 2023-10-29 23:46:11,746] Trial 3 finished with value: 0.007660284107804104 and parameters: {'learning_rate': 0.06071313461034145, 'depth': 9, 'n_estimators': 189, 'l2_leaf_reg': 1.1904867205493948}. Best is trial 3 with value: 0.007660284107804104.

Best hyperparameters: {'learning_rate': 0.09986097070444452, 'depth': 10, 'n_estimators': 296, 'l2_leaf_reg': 1.0246217002904532}
Best score: 0.004830886372891995
SO2, мг/м³



[I 2023-10-29 23:51:35,066] Trial 0 finished with value: 0.000617967216325551 and parameters: {'learning_rate': 0.08151829228340961, 'depth': 9, 'n_estimators': 124, 'l2_leaf_reg': 8.780312744217579}. Best is trial 0 with value: 0.000617967216325551.
[I 2023-10-29 23:51:38,732] Trial 1 finished with value: 0.0007725572185461472 and parameters: {'learning_rate': 0.03901904604345752, 'depth': 5, 'n_estimators': 251, 'l2_leaf_reg': 7.627444334547071}. Best is trial 0 with value: 0.000617967216325551.
[I 2023-10-29 23:51:41,311] Trial 2 finished with value: 0.0007643613783205478 and parameters: {'learning_rate': 0.05125248989744098, 'depth': 8, 'n_estimators': 72, 'l2_leaf_reg': 9.061380538332022}. Best is trial 0 with value: 0.000617967216325551.
[I 2023-10-29 23:51:45,164] Trial 3 finished with value: 0.0006854204202723693 and parameters: {'learning_rate': 0.0966286788992737, 'depth': 5, 'n_estimators': 261, 'l2_leaf_reg': 3.7925533624610708}. Best is trial 0 with value: 0.00061796721632

Best hyperparameters: {'learning_rate': 0.07758038922363764, 'depth': 10, 'n_estimators': 282, 'l2_leaf_reg': 5.239522638905359}
Best score: 0.00046732997128016935
H2S, мг/м³



#### kNN

In [11]:
for substance in substances:
    def objective(trial,
                  substance=substance,
                  points=points,
                  X_num_list=X_num_list
        ):
        
        X, y = make_X_y()
        X_train_scl, X_test_scl, y_train, y_test = split_scale_df(X, y)
        
        # KNeighborsRegressor
        knn_params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 20),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance'])
        }
        knn = KNeighborsRegressor(**knn_params)
        knn.fit(X_train_scl, y_train)
        
        # Прогнозы и ошибки
        pred = knn.predict(X_test_scl)
        pred[pred < 0] = 0
        knn_mae = mean_absolute_error(y_test, pred)
        
        return knn_mae

    # Запуск оптимизации гиперпараметров
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=30)

    print('Best hyperparameters:', study.best_params)
    print('Best score:', study.best_value)
    print(substance)
    print()

[I 2023-10-27 22:00:46,441] A new study created in memory with name: no-name-98f41f39-a219-4f92-a87c-98943bf53d8d
[I 2023-10-27 22:00:55,252] Trial 0 finished with value: 0.13814959746779473 and parameters: {'n_neighbors': 1, 'weights': 'uniform'}. Best is trial 0 with value: 0.13814959746779473.
[I 2023-10-27 22:01:02,581] Trial 1 finished with value: 0.1307750349941811 and parameters: {'n_neighbors': 17, 'weights': 'uniform'}. Best is trial 1 with value: 0.1307750349941811.
[I 2023-10-27 22:01:09,974] Trial 2 finished with value: 0.1263833058028506 and parameters: {'n_neighbors': 9, 'weights': 'uniform'}. Best is trial 2 with value: 0.1263833058028506.
[I 2023-10-27 22:01:16,945] Trial 3 finished with value: 0.13814959746779473 and parameters: {'n_neighbors': 1, 'weights': 'uniform'}. Best is trial 2 with value: 0.1263833058028506.
[I 2023-10-27 22:01:24,406] Trial 4 finished with value: 0.12755210381472978 and parameters: {'n_neighbors': 11, 'weights': 'uniform'}. Best is trial 2 wi

Best hyperparameters: {'n_neighbors': 4, 'weights': 'uniform'}
Best score: 0.1221555542299583
CO, мг/м³



[I 2023-10-27 22:04:32,055] Trial 0 finished with value: 0.00618638024551094 and parameters: {'n_neighbors': 9, 'weights': 'distance'}. Best is trial 0 with value: 0.00618638024551094.
[I 2023-10-27 22:04:39,904] Trial 1 finished with value: 0.00558877219866584 and parameters: {'n_neighbors': 3, 'weights': 'uniform'}. Best is trial 1 with value: 0.00558877219866584.
[I 2023-10-27 22:04:47,715] Trial 2 finished with value: 0.006239089280463149 and parameters: {'n_neighbors': 6, 'weights': 'distance'}. Best is trial 1 with value: 0.00558877219866584.
[I 2023-10-27 22:04:55,595] Trial 3 finished with value: 0.005687629057231073 and parameters: {'n_neighbors': 20, 'weights': 'uniform'}. Best is trial 1 with value: 0.00558877219866584.
[I 2023-10-27 22:05:03,523] Trial 4 finished with value: 0.0056090603482643895 and parameters: {'n_neighbors': 15, 'weights': 'uniform'}. Best is trial 1 with value: 0.00558877219866584.
[I 2023-10-27 22:05:11,629] Trial 5 finished with value: 0.0055198309975

Best hyperparameters: {'n_neighbors': 6, 'weights': 'uniform'}
Best score: 0.005481346169886977
NO, мг/м³



[I 2023-10-27 22:08:35,176] Trial 0 finished with value: 0.013604325967730155 and parameters: {'n_neighbors': 9, 'weights': 'distance'}. Best is trial 0 with value: 0.013604325967730155.
[I 2023-10-27 22:08:43,480] Trial 1 finished with value: 0.01218651136384957 and parameters: {'n_neighbors': 4, 'weights': 'uniform'}. Best is trial 1 with value: 0.01218651136384957.
[I 2023-10-27 22:08:51,257] Trial 2 finished with value: 0.013517351965718566 and parameters: {'n_neighbors': 12, 'weights': 'distance'}. Best is trial 1 with value: 0.01218651136384957.
[I 2023-10-27 22:08:59,321] Trial 3 finished with value: 0.013517351965718566 and parameters: {'n_neighbors': 12, 'weights': 'distance'}. Best is trial 1 with value: 0.01218651136384957.
[I 2023-10-27 22:09:07,279] Trial 4 finished with value: 0.013604325967730155 and parameters: {'n_neighbors': 9, 'weights': 'distance'}. Best is trial 1 with value: 0.01218651136384957.
[I 2023-10-27 22:09:15,224] Trial 5 finished with value: 0.0134529478

Best hyperparameters: {'n_neighbors': 6, 'weights': 'uniform'}
Best score: 0.012125588256017665
NO2, мг/м³



[I 2023-10-27 22:12:35,578] Trial 0 finished with value: 0.0026876201711313512 and parameters: {'n_neighbors': 1, 'weights': 'distance'}. Best is trial 0 with value: 0.0026876201711313512.
[I 2023-10-27 22:12:43,461] Trial 1 finished with value: 0.002963766240047416 and parameters: {'n_neighbors': 17, 'weights': 'uniform'}. Best is trial 0 with value: 0.0026876201711313512.
[I 2023-10-27 22:12:51,634] Trial 2 finished with value: 0.0029503392400715986 and parameters: {'n_neighbors': 16, 'weights': 'uniform'}. Best is trial 0 with value: 0.0026876201711313512.
[I 2023-10-27 22:12:59,706] Trial 3 finished with value: 0.0026876201711313512 and parameters: {'n_neighbors': 1, 'weights': 'distance'}. Best is trial 0 with value: 0.0026876201711313512.
[I 2023-10-27 22:13:07,925] Trial 4 finished with value: 0.0029603517076688076 and parameters: {'n_neighbors': 19, 'weights': 'distance'}. Best is trial 0 with value: 0.0026876201711313512.
[I 2023-10-27 22:13:15,821] Trial 5 finished with value

Best hyperparameters: {'n_neighbors': 3, 'weights': 'uniform'}
Best score: 0.0025628162538184522
NH3, мг/м³



[I 2023-10-27 22:16:35,222] Trial 0 finished with value: 0.008533527161228568 and parameters: {'n_neighbors': 14, 'weights': 'distance'}. Best is trial 0 with value: 0.008533527161228568.
[I 2023-10-27 22:16:43,165] Trial 1 finished with value: 0.007049406238929751 and parameters: {'n_neighbors': 4, 'weights': 'distance'}. Best is trial 1 with value: 0.007049406238929751.
[I 2023-10-27 22:16:51,222] Trial 2 finished with value: 0.009512143857252305 and parameters: {'n_neighbors': 18, 'weights': 'uniform'}. Best is trial 1 with value: 0.007049406238929751.
[I 2023-10-27 22:16:59,572] Trial 3 finished with value: 0.007928633246657182 and parameters: {'n_neighbors': 6, 'weights': 'uniform'}. Best is trial 1 with value: 0.007049406238929751.
[I 2023-10-27 22:17:07,464] Trial 4 finished with value: 0.009391546812355565 and parameters: {'n_neighbors': 16, 'weights': 'uniform'}. Best is trial 1 with value: 0.007049406238929751.
[I 2023-10-27 22:17:15,458] Trial 5 finished with value: 0.008166

Best hyperparameters: {'n_neighbors': 2, 'weights': 'distance'}
Best score: 0.006443928404019076
SO2, мг/м³



[I 2023-10-27 22:20:41,358] Trial 0 finished with value: 0.0007973191615518511 and parameters: {'n_neighbors': 3, 'weights': 'uniform'}. Best is trial 0 with value: 0.0007973191615518511.
[I 2023-10-27 22:20:49,387] Trial 1 finished with value: 0.0008196410723563459 and parameters: {'n_neighbors': 8, 'weights': 'uniform'}. Best is trial 0 with value: 0.0007973191615518511.
[I 2023-10-27 22:20:57,389] Trial 2 finished with value: 0.0008638185135101596 and parameters: {'n_neighbors': 3, 'weights': 'distance'}. Best is trial 0 with value: 0.0007973191615518511.
[I 2023-10-27 22:21:05,468] Trial 3 finished with value: 0.0008602575627773101 and parameters: {'n_neighbors': 20, 'weights': 'uniform'}. Best is trial 0 with value: 0.0007973191615518511.
[I 2023-10-27 22:21:13,609] Trial 4 finished with value: 0.0008602575627773101 and parameters: {'n_neighbors': 20, 'weights': 'uniform'}. Best is trial 0 with value: 0.0007973191615518511.
[I 2023-10-27 22:21:21,570] Trial 5 finished with value: 

Best hyperparameters: {'n_neighbors': 3, 'weights': 'uniform'}
Best score: 0.0007973191615518511
H2S, мг/м³



**CO, мг/м³**  
Best hyperparameters: {'n_neighbors': 4, 'weights': 'uniform'}  
Best score: 0.122155554229958  

**NO, мг/м³**  
Best hyperparameters: {'n_neighbors': 6, 'weights': 'uniform'}  
Best score: 0.00548134616988697  
  
**NO2, мг/м³**  
Best hyperparameters: {'n_neighbors': 6, 'weights': 'uniform'}  
Best score: 0.01212558825601766  

**NH3, мг/м³**  
Best hyperparameters: {'n_neighbors': 3, 'weights': 'uniform'}  
Best score: 0.002562816253818452  

**SO2, мг/м³**  
Best hyperparameters: {'n_neighbors': 2, 'weights': 'distance'}  
Best score: 0.006443928404019076  

**H2S, мг/м³**  
Best hyperparameters: {'n_neighbors': 3, 'weights': 'uniform'}  
Best score: 0.0007973191615518511м³
м³
м³

#### Lasso

In [11]:
for substance in substances:
    def objective(trial,
              substance=substance,
              points=points,
              X_num_list=X_num_list
    ):
        X, y = make_X_y()
        X_train_scl, X_test_scl, y_train, y_test = split_scale_df(X, y)
        
        # Lasso регрессия
        alpha = trial.suggest_float('alpha', 0.001, 10.0)
        
        lasso = Lasso(alpha=alpha)
        lasso.fit(X_train_scl, y_train)
        
        # Прогнозы и ошибки
        pred = lasso.predict(X_test_scl)
        pred[pred < 0] = 0
        lasso_mae = mean_absolute_error(y_test, pred)
        
        return lasso_mae

    # Запуск оптимизации гиперпараметров
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=30)

    print('Best hyperparameters:', study.best_params)
    print('Best score:', study.best_value)
    print(substance)
    print()

[I 2023-10-28 18:19:55,217] A new study created in memory with name: no-name-924570df-b049-4944-ba31-7514acf2affd
[I 2023-10-28 18:19:56,022] Trial 0 finished with value: 0.15714539048111392 and parameters: {'alpha': 8.383025124005142}. Best is trial 0 with value: 0.15714539048111392.
[I 2023-10-28 18:19:56,717] Trial 1 finished with value: 0.15714539048111392 and parameters: {'alpha': 2.654181050627008}. Best is trial 0 with value: 0.15714539048111392.
[I 2023-10-28 18:19:57,475] Trial 2 finished with value: 0.15714539048111392 and parameters: {'alpha': 4.764577455055466}. Best is trial 0 with value: 0.15714539048111392.
[I 2023-10-28 18:19:58,170] Trial 3 finished with value: 0.15714539048111392 and parameters: {'alpha': 4.874905254257231}. Best is trial 0 with value: 0.15714539048111392.
[I 2023-10-28 18:19:58,866] Trial 4 finished with value: 0.15714539048111392 and parameters: {'alpha': 6.883188366669951}. Best is trial 0 with value: 0.15714539048111392.
[I 2023-10-28 18:19:59,580

Best hyperparameters: {'alpha': 8.383025124005142}
Best score: 0.15714539048111392
CO, мг/м³



[I 2023-10-28 18:20:17,371] Trial 0 finished with value: 0.007346214856669681 and parameters: {'alpha': 4.243614865238524}. Best is trial 0 with value: 0.007346214856669681.
[I 2023-10-28 18:20:18,083] Trial 1 finished with value: 0.007346214856669681 and parameters: {'alpha': 3.894336337590316}. Best is trial 0 with value: 0.007346214856669681.
[I 2023-10-28 18:20:18,798] Trial 2 finished with value: 0.007346214856669681 and parameters: {'alpha': 8.238435388191402}. Best is trial 0 with value: 0.007346214856669681.
[I 2023-10-28 18:20:19,512] Trial 3 finished with value: 0.007346214856669681 and parameters: {'alpha': 7.8402562319891125}. Best is trial 0 with value: 0.007346214856669681.
[I 2023-10-28 18:20:20,225] Trial 4 finished with value: 0.007346214856669681 and parameters: {'alpha': 4.959554523387224}. Best is trial 0 with value: 0.007346214856669681.
[I 2023-10-28 18:20:20,941] Trial 5 finished with value: 0.007346214856669681 and parameters: {'alpha': 3.6158043674320974}. Best

Best hyperparameters: {'alpha': 4.243614865238524}
Best score: 0.007346214856669681
NO, мг/м³



[I 2023-10-28 18:20:38,894] Trial 0 finished with value: 0.016614101327460693 and parameters: {'alpha': 9.842043133890922}. Best is trial 0 with value: 0.016614101327460693.
[I 2023-10-28 18:20:39,605] Trial 1 finished with value: 0.016614101327460693 and parameters: {'alpha': 2.6204364158895412}. Best is trial 0 with value: 0.016614101327460693.
[I 2023-10-28 18:20:40,319] Trial 2 finished with value: 0.016614101327460693 and parameters: {'alpha': 4.279074082597729}. Best is trial 0 with value: 0.016614101327460693.
[I 2023-10-28 18:20:41,034] Trial 3 finished with value: 0.016614101327460693 and parameters: {'alpha': 7.0165181208307414}. Best is trial 0 with value: 0.016614101327460693.
[I 2023-10-28 18:20:41,748] Trial 4 finished with value: 0.016614101327460693 and parameters: {'alpha': 6.045807361170839}. Best is trial 0 with value: 0.016614101327460693.
[I 2023-10-28 18:20:42,456] Trial 5 finished with value: 0.016614101327460693 and parameters: {'alpha': 7.952662206785438}. Best

Best hyperparameters: {'alpha': 9.842043133890922}
Best score: 0.016614101327460693
NO2, мг/м³



[I 2023-10-28 18:21:00,232] Trial 0 finished with value: 0.0054864883215744835 and parameters: {'alpha': 9.535006161682626}. Best is trial 0 with value: 0.0054864883215744835.
[I 2023-10-28 18:21:00,965] Trial 1 finished with value: 0.0054864883215744835 and parameters: {'alpha': 4.748799645757859}. Best is trial 0 with value: 0.0054864883215744835.
[I 2023-10-28 18:21:01,713] Trial 2 finished with value: 0.0054864883215744835 and parameters: {'alpha': 1.9610751373535773}. Best is trial 0 with value: 0.0054864883215744835.
[I 2023-10-28 18:21:02,414] Trial 3 finished with value: 0.0054864883215744835 and parameters: {'alpha': 8.355661618730522}. Best is trial 0 with value: 0.0054864883215744835.
[I 2023-10-28 18:21:03,117] Trial 4 finished with value: 0.0054864883215744835 and parameters: {'alpha': 8.149034658872957}. Best is trial 0 with value: 0.0054864883215744835.
[I 2023-10-28 18:21:03,824] Trial 5 finished with value: 0.0054864883215744835 and parameters: {'alpha': 0.685879821467

Best hyperparameters: {'alpha': 9.535006161682626}
Best score: 0.0054864883215744835
NH3, мг/м³



[I 2023-10-28 18:21:21,510] Trial 0 finished with value: 0.018028464197945175 and parameters: {'alpha': 3.227260431245439}. Best is trial 0 with value: 0.018028464197945175.
[I 2023-10-28 18:21:22,208] Trial 1 finished with value: 0.018028464197945175 and parameters: {'alpha': 2.0943305715570584}. Best is trial 0 with value: 0.018028464197945175.
[I 2023-10-28 18:21:22,902] Trial 2 finished with value: 0.018028464197945175 and parameters: {'alpha': 4.179449553856484}. Best is trial 0 with value: 0.018028464197945175.
[I 2023-10-28 18:21:23,598] Trial 3 finished with value: 0.018028464197945175 and parameters: {'alpha': 8.915791339126196}. Best is trial 0 with value: 0.018028464197945175.
[I 2023-10-28 18:21:24,301] Trial 4 finished with value: 0.018028464197945175 and parameters: {'alpha': 1.4224669602442597}. Best is trial 0 with value: 0.018028464197945175.
[I 2023-10-28 18:21:24,998] Trial 5 finished with value: 0.018028464197945175 and parameters: {'alpha': 1.630203008625273}. Best

Best hyperparameters: {'alpha': 3.227260431245439}
Best score: 0.018028464197945175
SO2, мг/м³



[I 2023-10-28 18:21:42,622] Trial 0 finished with value: 0.0012749227921536863 and parameters: {'alpha': 1.9610427224194547}. Best is trial 0 with value: 0.0012749227921536863.
[I 2023-10-28 18:21:43,327] Trial 1 finished with value: 0.0012749227921536863 and parameters: {'alpha': 8.121952755914164}. Best is trial 0 with value: 0.0012749227921536863.
[I 2023-10-28 18:21:44,031] Trial 2 finished with value: 0.0012749227921536863 and parameters: {'alpha': 7.391700137718645}. Best is trial 0 with value: 0.0012749227921536863.
[I 2023-10-28 18:21:44,740] Trial 3 finished with value: 0.0012749227921536863 and parameters: {'alpha': 3.5879828530313604}. Best is trial 0 with value: 0.0012749227921536863.
[I 2023-10-28 18:21:45,448] Trial 4 finished with value: 0.0012749227921536863 and parameters: {'alpha': 4.459292559365164}. Best is trial 0 with value: 0.0012749227921536863.
[I 2023-10-28 18:21:46,157] Trial 5 finished with value: 0.0012749227921536863 and parameters: {'alpha': 2.02793344539

Best hyperparameters: {'alpha': 1.9610427224194547}
Best score: 0.0012749227921536863
H2S, мг/м³



#### Линейная регрессия

In [13]:
for substance in substances:
    
    X, y = make_X_y()
    X_train_scl, X_test_scl, y_train, y_test = split_scale_df(X, y)
    
    linreg = LinearRegression()
    linreg.fit(X_train_scl, y_train)
    
    # Прогнозы и ошибки
    pred = linreg.predict(X_test_scl)
    pred[pred < 0] = 0
    linreg_mae = mean_absolute_error(y_test, pred)
    
    print('MAE =', linreg_mae)
    print(substance)
    print()

MAE = 0.15218398917789194
CO, мг/м³

MAE = 0.006780200482458089
NO, мг/м³

MAE = 0.014317262274034504
NO2, мг/м³

MAE = 0.003859706514124793
NH3, мг/м³

MAE = 0.017153156931728655
SO2, мг/м³

MAE = 0.0010133777635253245
H2S, мг/м³



Градиентный бустинг catboost по-прежнему остается лучшей моделью, поэтому для вычислений в приложении будем использовать его. Также можно не масштабировать входные данные, так как диапазон значений переменных не влияет на результаты градиентного бустинга.