<img src="https://github.com/sberbank-ai-lab/LightAutoML/raw/master/imgs/LightAutoML_logo_big.png" alt="LightAutoML logo" style="width:70%;"/>

##  Предсказание энергопотребления области и скорости изменения энергопотребления (в разрезе 30 мин) по параметрам погодных условий, измеряемым метеостанцией "Храброво"

In [None]:
# Mount Google Drive into the Colab runtime under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#!pip install --user lightautoml
#!pip install --user pandas-profiling
#!pip install tableformatter

In [None]:
# Standard python libraries
import os
import time
# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import torch
# LightAutoML presets, task and report generation
import lightautoml
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn import metrics

ModuleNotFoundError: ignored

In [None]:
# Resource and reproducibility settings used by this cell and the training cell.
RANDOM_STATE = 42
N_THREADS = 2
N_FOLDS = 3
TEST_SIZE = 0.1
# TIMEOUT = 2400  # seconds, i.e. 40 minutes (not used at the moment)
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

# Column to predict; rows with a missing target are discarded right after loading.
TARGET_NAME = 'speed'
full_data = pd.read_csv('pivot_data.csv', sep=',').dropna(subset=[TARGET_NAME])

# Regression task, trained and scored with mean absolute error.
# For a multiclass problem use instead:
#   task = Task('multiclass', loss='crossentropy', metric='auc')
task = Task('reg', loss='mae', metric='mae')

# Column roles: the target plus the columns listed under 'drop'.
roles = dict(
    target=TARGET_NAME,
    drop=['mean', 'Unnamed: 0'],
)


In [None]:
%%time 
# Split off TEST_SIZE of the rows as a hold-out set; the rest is used for fitting.
train_data = full_data
tr_data, te_data = train_test_split(train_data, test_size=TEST_SIZE, random_state=RANDOM_STATE)
print(f'Data splitted. Parts sizes: tr_data = {tr_data.shape}, te_data = {te_data.shape}')

# Configure the AutoML preset with the shared thread / fold / seed settings.
automl = TabularAutoML(
    task=task,
    timing_params=dict(mode=0),
    cpu_limit=N_THREADS,
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
)

# Fit on the training part (the result is presumably out-of-fold predictions,
# as the name suggests - confirm against the LightAutoML docs), print the
# model description, then predict the hold-out rows.
oof_pred = automl.fit_predict(tr_data, roles=roles, verbose=3)
print(automl.create_model_str_desc())
te_pred = automl.predict(te_data)





## Эксп.1 : предсказать энергопотребление по погодным условиям и дате

In [None]:

# Report mean / median absolute error for the training predictions (oof_pred)
# and the hold-out predictions (te_pred), then show a few predicted-vs-actual rows.
y_train = tr_data[TARGET_NAME].values
y_test = te_data[TARGET_NAME].values

# Training-part scores.
print('Train MAE score: {:.3f} '.format(metrics.mean_absolute_error(y_train, oof_pred.data)))
print('Train MedianMAE score: {:.3f} '.format(metrics.median_absolute_error(y_train, oof_pred.data)))
print()
print()

# Hold-out scores.
print('Test_MAE score: {:.3f} '.format(metrics.mean_absolute_error(y_test, te_pred.data)))
# No `squared=False` here: it was being passed to str.format (which silently
# ignores unused keyword arguments), not to the metric, so it had no effect.
print('Test_MedianMAE score: {:.3f} '.format(metrics.median_absolute_error(y_test, te_pred.data)))

result = pd.DataFrame({'Предсказание': np.round(te_pred.data[:, 0], 0), 'Реальность': y_test})
# Cap the sample size: DataFrame.sample raises ValueError when asked for more
# rows than the frame holds (sampling is without replacement by default).
print(result.sample(min(15, len(result))))

# Оценим степень влияния признаков на процесс обучения

In [None]:
%%time

# Compute feature scores in 'fast' mode and draw them as a bar chart, one bar per feature.
fast_fi = automl.get_feature_scores('fast')
importance_by_feature = fast_fi.set_index('Feature')['Importance']
importance_by_feature.plot.bar(figsize=(30, 10), grid=True)

# Предсказанное энергопотребление в разрезе "год"

In [None]:
# Plot the model's dependence on the 'time' feature over te_data.
# datetime_level is not passed, so the library default applies
# (presumably yearly granularity - confirm against the LightAutoML docs).
automl.plot_pdp(te_data, feature_name='time')

# Предсказанное энергопотребление в разрезе "месяц"

In [None]:
%%time

# Plot the model's dependence on the 'time' feature over te_data, grouped by month.
automl.plot_pdp(te_data, feature_name='time', datetime_level='month')

# Предсказанное энергопотребление в разрезе "день недели"

In [None]:
# Plot the model's dependence on the 'time' feature over te_data, grouped by day of week.
automl.plot_pdp(te_data, feature_name='time', datetime_level='dayofweek')

## Эксп.2: Предсказать скорость изменения энергопотребления по погодным условиям и дате

In [None]:

# Report mean / median absolute error for the training predictions (oof_pred)
# and the hold-out predictions (te_pred), then show a few predicted-vs-actual rows.
y_train = tr_data[TARGET_NAME].values
y_test = te_data[TARGET_NAME].values

# Training-part scores.
print('Train MAE score: {:.3f} '.format(metrics.mean_absolute_error(y_train, oof_pred.data)))
print('Train MedianMAE score: {:.3f} '.format(metrics.median_absolute_error(y_train, oof_pred.data)))
print()
print()

# Hold-out scores.
print('Test_MAE score: {:.3f} '.format(metrics.mean_absolute_error(y_test, te_pred.data)))
# No `squared=False` here: it was being passed to str.format (which silently
# ignores unused keyword arguments), not to the metric, so it had no effect.
print('Test_MedianMAE score: {:.3f} '.format(metrics.median_absolute_error(y_test, te_pred.data)))

result = pd.DataFrame({'Предсказание': np.round(te_pred.data[:, 0], 0), 'Реальность': y_test})
# Cap the sample size: DataFrame.sample raises ValueError when asked for more
# rows than the frame holds (sampling is without replacement by default).
print(result.sample(min(15, len(result))))

In [None]:
%%time

# Compute feature scores in 'fast' mode and draw them as a bar chart, one bar per feature.
fast_fi = automl.get_feature_scores('fast')
importance_by_feature = fast_fi.set_index('Feature')['Importance']
importance_by_feature.plot.bar(figsize=(30, 10), grid=True)

In [None]:
%%time

#grid, ys, counts = automl.get_individual_pdp(te_data, feature_name='time', n_bins=30)

In [None]:
# `plt` is the conventional alias for the pyplot interface; binding it to the
# top-level `matplotlib` package would leave plt.plot / plt.figure / plt.show
# unavailable.
import matplotlib.pyplot as plt

In [None]:
# Plot the model's dependence on the 'time' feature over te_data.
# datetime_level is not passed, so the library default applies
# (presumably yearly granularity - confirm against the LightAutoML docs).
automl.plot_pdp(te_data, feature_name='time')

In [None]:
%%time

# Plot the model's dependence on the 'time' feature over te_data, grouped by month.
automl.plot_pdp(te_data, feature_name='time', datetime_level='month')

In [None]:
# Plot the model's dependence on the 'time' feature over te_data, grouped by day of week.
automl.plot_pdp(te_data, feature_name='time', datetime_level='dayofweek')

## Ресурсы

- [Official LightAutoML github repo](https://github.com/sb-ai-lab/LightAutoML/blob/master/examples/tutorials/Tutorial_1_basics.ipynb)
- [LightAutoML documentation](https://lightautoml.readthedocs.io/en/latest)
- [LightAutoML tutorials](https://github.com/sberbank-ai-lab/LightAutoML/tree/master/examples/tutorials)
- LightAutoML course:
    - [Part 1 - general overview](https://ods.ai/tracks/automl-course-part1) 
    - [Part 2 - LightAutoML specific applications](https://ods.ai/tracks/automl-course-part2)
- [OpenDataScience AutoML benchmark leaderboard](https://ods.ai/competitions/automl-benchmark/leaderboard)