In [1]:
# !pip install -U https://github.com/sberbank-ai-lab/LightAutoML/raw/fix/logging/LightAutoML-0.2.16.2-py3-none-any.whl
# !pip install openpyxl
!pip install -q pycaret

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fancyimpute 0.5.5 requires tensorflow, which is not installed.
scattertext 0.1.3 requires gensim>=4.0.0, but you have gensim 3.8.3 which is incompatible.
pdpbox 0.2.1 requires matplotlib==3.1.1, but you have matplotlib 3.4.2 which is incompatible.
matrixprofile 1.1.10 requires protobuf==3.11.2, but you have protobuf 3.17.3 which is incompatible.[0m


# Step 0.1. Import libraries

Here we will import the libraries we use in this kernel:
- Standard python libraries for timing, working with OS etc.
- Essential python DS libraries like numpy, pandas, scikit-learn and torch (the last we will use in the next cell)
- LightAutoML modules: presets for AutoML, task and report generation module

In [2]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
import torch

from pycaret.regression import setup, compare_models, tune_model, blend_models, finalize_model, predict_model, plot_model

# LightAutoML presets, task and report generation
# from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
# from lightautoml.tasks import Task
# from lightautoml.dataset.roles import DatetimeRole
# from lightautoml.report.report_deco import ReportDeco

import warnings
warnings.simplefilter('ignore')

# Step 0.2. Constants

Here we setup the constants to use in the kernel:
- `N_THREADS` - number of vCPUs for LightAutoML model creation
- `N_FOLDS` - number of folds in LightAutoML inner CV
- `RANDOM_STATE` - random seed for better reproducibility
- `TIMEOUT` - limit in seconds for model to train
- `TARGET_NAME` - target column name in dataset

In [3]:
CFG = {
    'carbon_monoxide': {
        'target': 'target_carbon_monoxide',
        'seed': 2001,
        'n_select': 2,
        'fold': 3,
        'fold_strategy': 'timeseries',
        'tuning': True,
        'normalize': True,
        'optimize': 'RMSLE',
    },
    'benzene': {
        'target': 'target_benzene',
        'seed': 2001,
        'n_select': 2,
        'fold': 3,
        'fold_strategy': 'timeseries',
        'tuning': True,
        'normalize': True,
        'optimize': 'RMSLE',        
    },
    'nitrogen_oxides': {
        'target': 'target_nitrogen_oxides',
        'seed': 2001,
        'n_select': 2,
        'fold': 2,
        'fold_strategy': 'timeseries',
        'tuning': True,
        'normalize': True,
        'optimize': 'RMSLE',        
    },
}

In [4]:
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 2001
TIMEOUT = 3 * 3600
TARGET_NAME = 'target'

# Step 0.3. Imported models setup

For better reproducibility we fix numpy random seed with max number of threads for Torch (which usually try to use all the threads on server):

In [5]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

# Step 0.4. Data loading
Let's check the data we have:

In [6]:
%%time

train_data = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
train_data.head()

CPU times: user 18.6 ms, sys: 5.99 ms, total: 24.6 ms
Wall time: 36.7 ms


Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,9.7,177.2
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,6.4,121.8


In [7]:
train_data.shape

(7111, 12)

In [8]:
%%time

test_data = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')
test_data.head()

CPU times: user 6.29 ms, sys: 29 µs, total: 6.32 ms
Wall time: 10.6 ms


Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5
0,2011-01-01 00:00:00,8.0,41.3,0.4375,1108.8,745.7,797.1,880.0,1273.1
1,2011-01-01 01:00:00,5.1,51.7,0.4564,1249.5,864.9,687.9,972.8,1714.0
2,2011-01-01 02:00:00,5.8,51.5,0.4689,1102.6,878.0,693.7,941.9,1300.8
3,2011-01-01 03:00:00,5.0,52.3,0.4693,1139.7,916.2,725.6,1011.0,1283.0
4,2011-01-01 04:00:00,4.5,57.5,0.465,1022.4,838.5,871.5,967.0,1142.3


In [9]:
test_data.shape

(2247, 9)

In [10]:
%%time

sample_sub = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')
sample_sub.head()

CPU times: user 4.57 ms, sys: 80 µs, total: 4.65 ms
Wall time: 7.77 ms


Unnamed: 0,date_time,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2011-01-01 00:00:00,2.0,10.0,300.0
1,2011-01-01 01:00:00,2.0,10.0,300.0
2,2011-01-01 02:00:00,2.0,10.0,300.0
3,2011-01-01 03:00:00,2.0,10.0,300.0
4,2011-01-01 04:00:00,2.0,10.0,300.0


In [11]:
sample_sub.shape

(2247, 4)

## Don't know what to do with -200? Use pseudolabelling 🧐

In [12]:
! pip install openpyxl 

Collecting openpyxl
  Downloading openpyxl-3.0.7-py2.py3-none-any.whl (243 kB)
[K     |████████████████████████████████| 243 kB 593 kB/s 
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.7


In [13]:
# Pseudolabels from true dataset 
pseudolabels_true = pd.read_excel('/kaggle/input/air-quality-time-series-data-uci/AirQualityUCI.xlsx')
pseudolabels_true = pseudolabels_true.iloc[7110:].reset_index(drop = True)
pseudolabels_true.rename({'CO(GT)': 'target_carbon_monoxide', 'C6H6(GT)': 'target_benzene', 'NOx(GT)': 'target_nitrogen_oxides'}, axis = 1, inplace = True)
pseudolabels_true

Unnamed: 0,Date,Time,target_carbon_monoxide,PT08.S1(CO),NMHC(GT),target_benzene,PT08.S2(NMHC),target_nitrogen_oxides,PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,2005-01-01,00:00:00,-200.0,1046.25,-200,4.247511,724.25,-200.0,848.00,-200.0,897.75,1201.25,8.200,40.099999,0.437451,,
1,2005-01-01,01:00:00,1.6,1274.50,-200,8.763027,930.00,214.9,648.50,106.3,1023.50,1617.00,5.325,50.725000,0.456368,,
2,2005-01-01,02:00:00,2.5,1172.75,-200,7.498693,878.25,299.8,738.00,128.6,1002.00,1355.25,5.925,50.049999,0.468928,,
3,2005-01-01,03:00:00,2.7,1162.50,-200,7.563761,881.00,-200.0,748.25,-200.0,1001.25,1295.50,4.850,53.875000,0.469339,,
4,2005-01-01,04:00:00,1.9,1054.00,-200,5.560496,791.00,253.1,829.50,126.0,966.50,1131.00,4.325,55.325000,0.465048,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2242,2005-04-04,10:00:00,3.1,1314.25,-200,13.529605,1101.25,471.7,538.50,189.8,1374.25,1728.50,21.850,29.250000,0.756824,,
2243,2005-04-04,11:00:00,2.4,1162.50,-200,11.355157,1027.00,353.3,603.75,179.2,1263.50,1269.00,24.325,23.725000,0.711864,,
2244,2005-04-04,12:00:00,2.4,1142.00,-200,12.374538,1062.50,293.0,603.25,174.7,1240.75,1092.00,26.900,18.350000,0.640649,,
2245,2005-04-04,13:00:00,2.1,1002.50,-200,9.547187,960.50,234.5,701.50,155.7,1041.00,769.75,28.325,13.550000,0.513866,,


In [14]:
pseudolabels_preds = pd.read_csv('../input/tps-lightautoml-baseline-with-pseudolabels/lightautoml_with_pseudolabelling_kernel_version_16.csv')
pseudolabels_preds

Unnamed: 0,date_time,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2011-01-01 00:00:00,1.459172,4.247511,197.915939
1,2011-01-01 01:00:00,1.600000,8.763027,214.900000
2,2011-01-01 02:00:00,2.500000,7.498693,299.800000
3,2011-01-01 03:00:00,2.700000,7.563761,329.955383
4,2011-01-01 04:00:00,1.900000,5.560496,253.100000
...,...,...,...,...
2242,2011-04-04 10:00:00,3.100000,13.529605,471.700000
2243,2011-04-04 11:00:00,2.400000,11.355157,353.300000
2244,2011-04-04 12:00:00,2.400000,12.374538,293.000000
2245,2011-04-04 13:00:00,2.100000,9.547187,234.500000


In [15]:
test_data['target_carbon_monoxide'] = np.where(pseudolabels_true['target_carbon_monoxide'].values >= 0, 
                                               pseudolabels_true['target_carbon_monoxide'].values, 
                                               pseudolabels_preds['target_carbon_monoxide'].values)
test_data['target_benzene'] = np.where(pseudolabels_true['target_benzene'].values >= 0, 
                                       pseudolabels_true['target_benzene'].values, 
                                       pseudolabels_preds['target_benzene'].values)
test_data['target_nitrogen_oxides'] = np.where(pseudolabels_true['target_nitrogen_oxides'].values >= 0, 
                                       pseudolabels_true['target_nitrogen_oxides'].values, 
                                       pseudolabels_preds['target_nitrogen_oxides'].values)
    
test_data

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2011-01-01 00:00:00,8.0,41.3,0.4375,1108.8,745.7,797.1,880.0,1273.1,1.459172,4.247511,197.915939
1,2011-01-01 01:00:00,5.1,51.7,0.4564,1249.5,864.9,687.9,972.8,1714.0,1.600000,8.763027,214.900000
2,2011-01-01 02:00:00,5.8,51.5,0.4689,1102.6,878.0,693.7,941.9,1300.8,2.500000,7.498693,299.800000
3,2011-01-01 03:00:00,5.0,52.3,0.4693,1139.7,916.2,725.6,1011.0,1283.0,2.700000,7.563761,329.955383
4,2011-01-01 04:00:00,4.5,57.5,0.4650,1022.4,838.5,871.5,967.0,1142.3,1.900000,5.560496,253.100000
...,...,...,...,...,...,...,...,...,...,...,...,...
2242,2011-04-04 10:00:00,23.2,28.7,0.7568,1340.3,1023.9,522.8,1374.0,1659.8,3.100000,13.529605,471.700000
2243,2011-04-04 11:00:00,24.5,22.5,0.7119,1232.8,955.1,616.1,1226.1,1269.0,2.400000,11.355157,353.300000
2244,2011-04-04 12:00:00,26.6,19.0,0.6406,1187.7,1052.4,572.8,1253.4,1081.1,2.400000,12.374538,293.000000
2245,2011-04-04 13:00:00,29.1,12.7,0.5139,1053.2,1009.0,702.0,1009.8,808.5,2.100000,9.547187,234.500000


In [16]:
test_data['target_carbon_monoxide'].value_counts()

1.000000    104
1.400000     98
1.100000     96
1.500000     84
1.600000     82
0.900000     82
0.800000     74
1.300000     73
0.700000     72
1.700000     67
0.600000     66
1.200000     64
2.100000     60
1.900000     59
2.200000     59
0.500000     52
2.000000     51
2.300000     50
2.400000     48
2.700000     47
1.800000     47
0.400000     43
3.100000     37
3.000000     37
2.800000     36
3.300000     36
2.500000     35
2.600000     35
3.400000     28
3.800000     26
2.900000     26
3.200000     24
0.300000     23
3.600000     23
3.900000     22
3.500000     19
0.200000     19
4.000000     18
4.300000     18
0.100000     16
4.200000     16
4.100000     16
3.700000     13
4.900000     12
4.500000     11
5.000000     11
4.800000     10
5.900000      8
5.800000      7
5.400000      7
5.600000      7
5.700000      7
4.600000      6
4.700000      6
4.400000      6
5.200000      6
5.100000      5
5.500000      5
6.100000      4
6.400000      3
6.000000      3
5.300000      3
6.500000

In [17]:
test_data['target_benzene'].value_counts()

5.151339     6
7.928930     5
5.277768     5
6.557207     4
3.512548     4
            ..
15.132196    1
15.986462    1
16.785655    1
1.965171     1
6.941043     1
Name: target_benzene, Length: 1756, dtype: int64

In [18]:
test_data['target_nitrogen_oxides'].value_counts()

156.1    4
122.0    3
221.0    3
180.4    3
260.8    3
        ..
458.5    1
55.0     1
299.3    1
185.9    1
166.7    1
Name: target_nitrogen_oxides, Length: 1892, dtype: int64

In [19]:
ALL_DF = pd.concat([train_data, test_data]).reset_index(drop = True)
print(ALL_DF.shape)

(9358, 12)


In [20]:
# Feature engineering func from Remek Kinas kernel with MLJAR (https://www.kaggle.com/remekkinas/mljar-code-minimal) - do not forget to upvote his kernel
    
import math

def pb_add(X):
    X['day'] = X.date_time.dt.weekday
    is_odd = (X['sensor_4'] < 646) & (X['absolute_humidity'] < 0.238)
    X['is_odd'] = is_odd
    diff = X['date_time'] - min(X['date_time'])
    trend = diff.dt.days
    X['f1s'] = np.sin(trend * 2 * math.pi / (365 * 1)) 
    X['f1c'] = np.cos(trend * 2 * math.pi / (365 * 1))
    X['f2s'] = np.sin(2 * math.pi * trend / (365 * 2)) 
    X['f2c'] = np.cos(2 * math.pi * trend / (365 * 2)) 
    X['f3s'] = np.sin(2 * math.pi * trend / (365 * 3)) 
    X['f3c'] = np.cos(2 * math.pi * trend / (365 * 3)) 
    X['f4s'] = np.sin(2 * math.pi * trend / (365 * 4)) 
    X['f4c'] = np.cos(2 * math.pi * trend / (365 * 4)) 
    X['fh1s'] = np.sin(diff.dt.seconds * 2 * math.pi / ( 3600 * 24 * 1))
    X['fh1c'] = np.cos(diff.dt.seconds * 2 * math.pi / ( 3600 * 24 * 1))
    X['fh2s'] = np.sin(diff.dt.seconds * 2 * math.pi / ( 3600 * 24 * 2))
    X['fh2c'] = np.cos(diff.dt.seconds * 2 * math.pi / ( 3600 * 24 * 2))
    X['fh3s'] = np.sin(diff.dt.seconds * 2 * math.pi / ( 3600 * 24 * 3))
    X['fh3c'] = np.cos(diff.dt.seconds * 2 * math.pi / ( 3600 * 24 * 3))
    
    sensor_features = [
        'deg_C', 
        'relative_humidity', 'absolute_humidity', 
        'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5' ]
    
    lags = [-1, -4, -24, -7 * 24]  
    for sensor_feature in sensor_features:
        this = X[sensor_feature]

        for lag in lags:
            feature = f'{sensor_feature}_{abs(lag)}b'
            this_f = X[sensor_feature].shift(lag)
            X[feature] = (this_f - this).fillna(0)
        # look forwards
        for lag in lags:
            feature = f'{sensor_feature}_{abs(-lag)}f'
            this_f = X[sensor_feature].shift(-lag)
            X[feature] = (this_f - this).fillna(0)
            
    return X

ALL_DF['date_time'] = pd.to_datetime(ALL_DF['date_time'])
ALL_DF["hour"] = ALL_DF["date_time"].dt.hour
ALL_DF["working_hours"] =  ALL_DF["hour"].isin(np.arange(8, 21, 1)).astype("int")
ALL_DF["is_weekend"] = (ALL_DF["date_time"].dt.dayofweek >= 5).astype("int")
ALL_DF['hr'] = ALL_DF.date_time.dt.hour * 60 + ALL_DF.date_time.dt.minute
ALL_DF['satday'] = (ALL_DF.date_time.dt.weekday==5).astype("int")
ALL_DF["SMC"] = (ALL_DF["absolute_humidity"] * 100) / ALL_DF["relative_humidity"]
ALL_DF.drop(columns = 'hour', inplace = True)

pb_add(ALL_DF)

ALL_DF['date_time'] = ALL_DF['date_time'].astype(str)

## Important - cell below works only because of the data leak. In real life you can't create features using the future data. Be careful!

In [21]:
def create_target_feats(df):
    for lag in [1, 4, 24, 7 * 24]:
        for t in ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']:
            df['{}_lag_{}'.format(t, lag)] = df[t].shift(lag)
            df['{}_lag_m{}'.format(t, lag)] = df[t].shift(-lag)
            df['diff_{}_{}'.format(t, lag)] = df['{}_lag_m{}'.format(t, lag)] - df['{}_lag_{}'.format(t, lag)]
            df['div_{}_{}'.format(t, lag)] = df['{}_lag_m{}'.format(t, lag)] / df['{}_lag_{}'.format(t, lag)]
create_target_feats(ALL_DF)

In [22]:
ALL_DF

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides,working_hours,is_weekend,hr,satday,SMC,day,is_odd,f1s,f1c,f2s,f2c,f3s,f3c,f4s,f4c,fh1s,fh1c,fh2s,fh2c,fh3s,fh3c,deg_C_1b,deg_C_4b,deg_C_24b,deg_C_168b,deg_C_1f,deg_C_4f,deg_C_24f,deg_C_168f,relative_humidity_1b,relative_humidity_4b,relative_humidity_24b,relative_humidity_168b,relative_humidity_1f,relative_humidity_4f,relative_humidity_24f,relative_humidity_168f,absolute_humidity_1b,absolute_humidity_4b,absolute_humidity_24b,absolute_humidity_168b,absolute_humidity_1f,absolute_humidity_4f,absolute_humidity_24f,absolute_humidity_168f,sensor_1_1b,sensor_1_4b,sensor_1_24b,sensor_1_168b,sensor_1_1f,sensor_1_4f,sensor_1_24f,sensor_1_168f,sensor_2_1b,sensor_2_4b,sensor_2_24b,sensor_2_168b,sensor_2_1f,sensor_2_4f,sensor_2_24f,sensor_2_168f,sensor_3_1b,sensor_3_4b,sensor_3_24b,sensor_3_168b,sensor_3_1f,sensor_3_4f,sensor_3_24f,sensor_3_168f,sensor_4_1b,sensor_4_4b,sensor_4_24b,sensor_4_168b,sensor_4_1f,sensor_4_4f,sensor_4_24f,sensor_4_168f,sensor_5_1b,sensor_5_4b,sensor_5_24b,sensor_5_168b,sensor_5_1f,sensor_5_4f,sensor_5_24f,sensor_5_168f,target_carbon_monoxide_lag_1,target_carbon_monoxide_lag_m1,diff_target_carbon_monoxide_1,div_target_carbon_monoxide_1,target_benzene_lag_1,target_benzene_lag_m1,diff_target_benzene_1,div_target_benzene_1,target_nitrogen_oxides_lag_1,target_nitrogen_oxides_lag_m1,diff_target_nitrogen_oxides_1,div_target_nitrogen_oxides_1,target_carbon_monoxide_lag_4,target_carbon_monoxide_lag_m4,diff_target_carbon_monoxide_4,div_target_carbon_monoxide_4,target_benzene_lag_4,target_benzene_lag_m4,diff_target_benzene_4,div_target_benzene_4,target_nitrogen_oxides_lag_4,target_nitrogen_oxides_lag_m4,diff_target_nitrogen_oxides_4,div_target_nitrogen_oxides_4,target_carbon_monoxide_lag_24,target_carbon_monoxide_lag_m24,diff_target_carbon_monoxide_24,div_target_carbon_monoxide_24,target_benzene_lag_24,target_benzene_lag_m24,diff_target_benzene_24,div_target_benzene_24,target_nitrogen_oxides_lag_24,target_nitrogen_oxides_lag_m24,diff_target_nitrogen_oxides_24,div_target_nitrogen_oxides_24,target_carbon_monoxide_lag_168,target_carbon_monoxide_lag_m168,diff_target_carbon_monoxide_168,div_target_carbon_monoxide_168,target_benzene_lag_168,target_benzene_lag_m168,diff_target_benzene_168,div_target_benzene_168,target_nitrogen_oxides_lag_168,target_nitrogen_oxides_lag_m168,diff_target_nitrogen_oxides_168,div_target_nitrogen_oxides_168
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.000000,167.7,1,0,1080,0,1.647391,2,False,0.000000,1.000000,0.000000,1.00000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000e+00,0.000000,1.000000,0.000000,1.000000e+00,0.1,-1.2,-2.7,13.6,0.0,0.0,0.0,0.0,-0.7,13.0,21.4,-29.5,0.0,0.0,0.0,0.0,-0.0323,0.0310,0.0487,-0.2341,0.0000,0.0000,0.0000,0.0000,-108.1,-115.2,114.7,200.9,0.0,0.0,0.0,0.0,-199.6,-235.1,178.4,390.2,0.0,0.0,0.0,0.0,141.5,124.9,-312.9,-302.7,0.0,0.0,0.0,0.0,-292.9,-327.3,194.4,159.0,0.0,0.0,0.0,0.0,-282.5,-161.2,73.3,275.6,0.0,0.0,0.0,0.0,,2.1,,,,9.900000,,,,98.9,,,,1.5,,,,6.40000,,,,121.8,,,,4.7,,,,21.6,,,,275.4,,,,5.1,,,,25.9,,,,296.0,,
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.900000,98.9,1,0,1140,0,1.601545,2,False,0.000000,1.000000,0.000000,1.00000,0.000000,1.000000,0.000000,1.000000,0.258819,9.659258e-01,0.130526,0.991445,0.087156,9.961947e-01,-0.6,-2.0,-3.7,11.1,-0.1,0.0,0.0,0.0,10.9,11.5,24.0,-19.9,0.7,0.0,0.0,0.0,0.0247,0.0593,0.1064,0.0148,0.0323,0.0000,0.0000,0.0000,52.8,-58.2,496.9,674.2,108.1,0.0,0.0,0.0,41.4,-190.7,510.5,744.6,199.6,0.0,0.0,0.0,-137.3,219.7,-474.4,-599.0,-141.5,0.0,0.0,0.0,136.2,12.7,789.8,891.7,292.9,0.0,0.0,0.0,106.1,-61.9,573.8,1410.4,282.5,0.0,0.0,0.0,2.5,2.2,-0.3,0.880000,12.000000,9.200000,-2.800000,0.766667,167.7,127.1,-40.6,0.757901,,1.2,,,,4.40000,,,,88.1,,,,7.2,,,,28.2,,,,406.0,,,,8.1,,,,39.6,,,,411.0,,
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.200000,127.1,1,0,1200,0,1.334875,2,False,0.000000,1.000000,0.000000,1.00000,0.000000,1.000000,0.000000,1.000000,0.500000,8.660254e-01,0.258819,0.965926,0.173648,9.848078e-01,-1.6,-1.9,-2.5,8.9,0.6,0.0,0.0,0.0,6.2,-0.5,7.5,-18.4,-10.9,0.0,0.0,0.0,0.0365,0.0101,0.0631,0.1631,-0.0247,0.0000,0.0000,0.0000,-10.9,-87.7,275.3,564.1,-52.8,0.0,0.0,0.0,-0.6,-260.3,460.4,720.7,-41.4,0.0,0.0,0.0,42.7,431.0,-324.6,-445.3,137.3,0.0,0.0,0.0,-49.6,-173.1,517.3,854.1,-136.2,0.0,0.0,0.0,146.2,-347.4,421.2,1171.2,-106.1,0.0,0.0,0.0,2.1,2.2,0.1,1.047619,9.900000,9.700000,-0.200000,0.979798,98.9,177.2,78.3,1.791709,,1.2,,,,3.70000,,,,59.5,,,,6.1,,,,25.2,,,,337.0,,,,6.5,,,,36.9,,,,357.4,,
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,9.700000,177.2,0,0,1260,0,1.260737,2,False,0.000000,1.000000,0.000000,1.00000,0.000000,1.000000,0.000000,1.000000,0.707107,7.071068e-01,0.382683,0.923880,0.258819,9.659258e-01,0.9,-0.7,-2.4,7.1,1.6,0.0,0.0,0.0,-3.4,-5.4,5.4,-22.8,-6.2,0.0,0.0,0.0,0.0021,-0.0165,-0.0448,0.1427,-0.0365,0.0000,0.0000,0.0000,-49.0,-139.6,57.6,492.0,10.9,0.0,0.0,0.0,-76.3,-297.3,104.0,511.4,0.6,0.0,0.0,0.0,78.0,408.2,-212.9,-476.0,-42.7,0.0,0.0,0.0,-121.0,-176.8,272.9,681.0,49.6,0.0,0.0,0.0,-131.0,-547.8,-42.4,684.3,-146.2,0.0,0.0,0.0,2.2,1.5,-0.7,0.681818,9.200000,6.400000,-2.800000,0.695652,127.1,121.8,-5.3,0.958301,,1.0,,,,3.40000,,,,63.9,,,,3.9,,,,13.6,,,,242.4,,,,5.5,,,,25.8,,,,356.2,,
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,6.400000,121.8,0,0,1320,0,1.336949,2,False,0.000000,1.000000,0.000000,1.00000,0.000000,1.000000,0.000000,1.000000,0.866025,5.000000e-01,0.500000,0.866025,0.342020,9.396926e-01,-0.7,-1.8,-4.1,5.1,-0.9,1.2,0.0,0.0,-2.2,3.7,3.8,-17.1,3.4,-13.0,0.0,0.0,-0.0040,-0.0240,-0.0983,0.0670,-0.0021,-0.0310,0.0000,0.0000,-51.1,-112.4,-278.0,200.9,49.0,115.2,0.0,0.0,-155.2,-249.8,-148.6,216.8,76.3,235.1,0.0,0.0,236.3,429.7,183.9,-441.7,-78.0,-124.9,0.0,0.0,47.1,-203.3,-95.8,277.0,121.0,327.3,0.0,0.0,-183.2,-475.0,-319.4,491.2,131.0,161.2,0.0,0.0,2.2,1.2,-1.0,0.545455,9.700000,4.400000,-5.300000,0.453608,177.2,88.1,-89.1,0.497178,2.5,0.9,-1.6,0.36,12.000000,2.20000,-9.800000,0.183333,167.7,46.4,-121.3,0.276685,,1.5,,,,4.7,,,,87.4,,,,2.9,,,,14.4,,,,187.9,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9353,2011-04-04 10:00:00,23.2,28.7,0.7568,1340.3,1023.9,522.8,1374.0,1659.8,3.1,13.529605,471.7,1,0,600,0,2.636934,0,False,0.401488,0.915864,-0.205104,-0.97874,0.789187,-0.614153,0.994671,-0.103102,-0.866025,-5.000000e-01,0.866025,-0.500000,0.984808,1.736482e-01,1.3,4.7,0.0,0.0,-4.3,-13.7,-2.7,-5.6,-6.2,-15.2,0.0,0.0,9.1,38.2,-3.8,33.0,-0.0449,-0.2540,0.0000,0.0000,-0.0081,-0.0037,-0.1693,0.4868,-107.5,-215.7,0.0,0.0,-56.3,-350.5,-312.1,-307.1,-68.8,54.5,0.0,0.0,78.1,-337.7,-326.6,-243.2,93.3,85.4,0.0,0.0,-20.9,282.8,409.6,208.6,-147.9,-312.7,0.0,0.0,-26.5,-312.7,-452.3,-136.2,-390.8,-843.8,0.0,0.0,-92.6,-818.2,-1022.3,-781.4,3.9,2.4,-1.5,0.615385,13.552393,11.355157,-2.197236,0.837871,522.7,353.3,-169.4,0.675914,1.1,2.2,1.1,2.00,4.359341,11.93206,7.572719,2.737125,182.2,265.2,83.0,1.455543,1.4,,,,3.402194,,,,166.4,,,,1.5,,,,5.191654,,,,216.0,,,
9354,2011-04-04 11:00:00,24.5,22.5,0.7119,1232.8,955.1,616.1,1226.1,1269.0,2.4,11.355157,353.3,1,0,660,0,3.164000,0,False,0.401488,0.915864,-0.205104,-0.97874,0.789187,-0.614153,0.994671,-0.103102,-0.965926,-2.588190e-01,0.793353,-0.608761,0.996195,8.715574e-02,2.1,0.0,0.0,0.0,-1.3,-15.4,-1.2,-3.9,-3.5,0.0,0.0,0.0,6.2,38.2,-2.8,29.9,-0.0713,0.0000,0.0000,0.0000,0.0449,0.0327,-0.1446,0.5094,-45.1,0.0,0.0,0.0,107.5,178.9,-212.8,-92.5,97.3,0.0,0.0,0.0,68.8,180.4,-168.3,-159.2,-43.3,0.0,0.0,0.0,-93.3,-141.4,213.1,41.3,27.3,0.0,0.0,0.0,147.9,357.9,-295.2,78.9,-187.9,0.0,0.0,0.0,390.8,246.3,-661.0,-404.2,3.1,2.4,-0.7,0.774194,13.529605,12.374538,-1.155066,0.914627,471.7,293.0,-178.7,0.621158,4.0,,,,17.364240,,,,593.7,,,,1.3,,,,5.313402,,,,171.3,,,,1.6,,,,5.493102,,,,219.8,,,
9355,2011-04-04 12:00:00,26.6,19.0,0.6406,1187.7,1052.4,572.8,1253.4,1081.1,2.4,12.374538,293.0,1,0,720,0,3.371579,0,False,0.401488,0.915864,-0.205104,-0.97874,0.789187,-0.614153,0.994671,-0.103102,-1.000000,-1.836970e-16,0.707107,-0.707107,1.000000,6.123234e-17,2.5,0.0,0.0,0.0,-2.1,-13.2,0.7,-5.0,-6.3,0.0,0.0,0.0,3.5,28.4,-3.3,29.2,-0.1267,0.0000,0.0000,0.0000,0.0713,0.1147,-0.0812,0.5553,-134.5,0.0,0.0,0.0,45.1,214.9,-141.9,-134.6,-43.4,0.0,0.0,0.0,-97.3,336.8,-326.1,-345.8,129.2,0.0,0.0,0.0,43.3,-145.4,176.8,180.7,-243.6,0.0,0.0,0.0,-27.3,399.2,-238.8,37.9,-272.6,0.0,0.0,0.0,187.9,589.8,-418.5,-249.9,2.4,2.1,-0.3,0.875000,11.355157,9.547187,-1.807971,0.840780,353.3,234.5,-118.8,0.663742,5.0,,,,22.393233,,,,586.2,,,,1.4,,,,5.349138,,,,177.0,,,,1.4,,,,4.187455,,,,182.6,,,
9356,2011-04-04 13:00:00,29.1,12.7,0.5139,1053.2,1009.0,702.0,1009.8,808.5,2.1,9.547187,234.5,1,0,780,0,4.046457,0,False,0.401488,0.915864,-0.205104,-0.97874,0.789187,-0.614153,0.994671,-0.103102,-0.965926,2.588190e-01,0.608761,-0.793353,0.996195,-8.715574e-02,-1.2,0.0,0.0,0.0,-2.5,-10.2,0.5,-5.5,0.8,0.0,0.0,0.0,6.3,25.1,0.6,27.4,-0.0111,0.0000,0.0000,0.0000,0.1267,0.2348,0.0163,0.6367,71.4,0.0,0.0,0.0,134.5,230.8,-143.8,17.0,69.4,0.0,0.0,0.0,43.4,93.0,-294.0,-359.1,-93.8,0.0,0.0,0.0,-129.2,-200.1,251.8,130.0,51.5,0.0,0.0,0.0,243.6,337.7,-161.9,237.7,7.5,0.0,0.0,0.0,272.6,758.7,-366.2,-130.5,2.4,2.2,-0.2,0.916667,12.374538,11.932060,-0.442479,0.964243,293.0,265.2,-27.8,0.905119,3.9,,,,13.552393,,,,522.7,,,,1.0,,,,3.817080,,,,118.7,,,,1.4,,,,3.297505,,,,153.7,,,


In [23]:
train_data, test_data = ALL_DF.iloc[:(len(ALL_DF) - len(test_data)), :], ALL_DF.iloc[(len(ALL_DF) - len(test_data)):, :]
print(train_data.shape, test_data.shape)

(7111, 145) (2247, 145)


In [24]:
train_data.head()

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides,working_hours,is_weekend,hr,satday,SMC,day,is_odd,f1s,f1c,f2s,f2c,f3s,f3c,f4s,f4c,fh1s,fh1c,fh2s,fh2c,fh3s,fh3c,deg_C_1b,deg_C_4b,deg_C_24b,deg_C_168b,deg_C_1f,deg_C_4f,deg_C_24f,deg_C_168f,relative_humidity_1b,relative_humidity_4b,relative_humidity_24b,relative_humidity_168b,relative_humidity_1f,relative_humidity_4f,relative_humidity_24f,relative_humidity_168f,absolute_humidity_1b,absolute_humidity_4b,absolute_humidity_24b,absolute_humidity_168b,absolute_humidity_1f,absolute_humidity_4f,absolute_humidity_24f,absolute_humidity_168f,sensor_1_1b,sensor_1_4b,sensor_1_24b,sensor_1_168b,sensor_1_1f,sensor_1_4f,sensor_1_24f,sensor_1_168f,sensor_2_1b,sensor_2_4b,sensor_2_24b,sensor_2_168b,sensor_2_1f,sensor_2_4f,sensor_2_24f,sensor_2_168f,sensor_3_1b,sensor_3_4b,sensor_3_24b,sensor_3_168b,sensor_3_1f,sensor_3_4f,sensor_3_24f,sensor_3_168f,sensor_4_1b,sensor_4_4b,sensor_4_24b,sensor_4_168b,sensor_4_1f,sensor_4_4f,sensor_4_24f,sensor_4_168f,sensor_5_1b,sensor_5_4b,sensor_5_24b,sensor_5_168b,sensor_5_1f,sensor_5_4f,sensor_5_24f,sensor_5_168f,target_carbon_monoxide_lag_1,target_carbon_monoxide_lag_m1,diff_target_carbon_monoxide_1,div_target_carbon_monoxide_1,target_benzene_lag_1,target_benzene_lag_m1,diff_target_benzene_1,div_target_benzene_1,target_nitrogen_oxides_lag_1,target_nitrogen_oxides_lag_m1,diff_target_nitrogen_oxides_1,div_target_nitrogen_oxides_1,target_carbon_monoxide_lag_4,target_carbon_monoxide_lag_m4,diff_target_carbon_monoxide_4,div_target_carbon_monoxide_4,target_benzene_lag_4,target_benzene_lag_m4,diff_target_benzene_4,div_target_benzene_4,target_nitrogen_oxides_lag_4,target_nitrogen_oxides_lag_m4,diff_target_nitrogen_oxides_4,div_target_nitrogen_oxides_4,target_carbon_monoxide_lag_24,target_carbon_monoxide_lag_m24,diff_target_carbon_monoxide_24,div_target_carbon_monoxide_24,target_benzene_lag_24,target_benzene_lag_m24,diff_target_benzene_24,div_target_benzene_24,target_nitrogen_oxides_lag_24,target_nitrogen_oxides_lag_m24,diff_target_nitrogen_oxides_24,div_target_nitrogen_oxides_24,target_carbon_monoxide_lag_168,target_carbon_monoxide_lag_m168,diff_target_carbon_monoxide_168,div_target_carbon_monoxide_168,target_benzene_lag_168,target_benzene_lag_m168,diff_target_benzene_168,div_target_benzene_168,target_nitrogen_oxides_lag_168,target_nitrogen_oxides_lag_m168,diff_target_nitrogen_oxides_168,div_target_nitrogen_oxides_168
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7,1,0,1080,0,1.647391,2,False,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.1,-1.2,-2.7,13.6,0.0,0.0,0.0,0.0,-0.7,13.0,21.4,-29.5,0.0,0.0,0.0,0.0,-0.0323,0.031,0.0487,-0.2341,0.0,0.0,0.0,0.0,-108.1,-115.2,114.7,200.9,0.0,0.0,0.0,0.0,-199.6,-235.1,178.4,390.2,0.0,0.0,0.0,0.0,141.5,124.9,-312.9,-302.7,0.0,0.0,0.0,0.0,-292.9,-327.3,194.4,159.0,0.0,0.0,0.0,0.0,-282.5,-161.2,73.3,275.6,0.0,0.0,0.0,0.0,,2.1,,,,9.9,,,,98.9,,,,1.5,,,,6.4,,,,121.8,,,,4.7,,,,21.6,,,,275.4,,,,5.1,,,,25.9,,,,296.0,,
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9,1,0,1140,0,1.601545,2,False,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.258819,0.965926,0.130526,0.991445,0.087156,0.996195,-0.6,-2.0,-3.7,11.1,-0.1,0.0,0.0,0.0,10.9,11.5,24.0,-19.9,0.7,0.0,0.0,0.0,0.0247,0.0593,0.1064,0.0148,0.0323,0.0,0.0,0.0,52.8,-58.2,496.9,674.2,108.1,0.0,0.0,0.0,41.4,-190.7,510.5,744.6,199.6,0.0,0.0,0.0,-137.3,219.7,-474.4,-599.0,-141.5,0.0,0.0,0.0,136.2,12.7,789.8,891.7,292.9,0.0,0.0,0.0,106.1,-61.9,573.8,1410.4,282.5,0.0,0.0,0.0,2.5,2.2,-0.3,0.88,12.0,9.2,-2.8,0.766667,167.7,127.1,-40.6,0.757901,,1.2,,,,4.4,,,,88.1,,,,7.2,,,,28.2,,,,406.0,,,,8.1,,,,39.6,,,,411.0,,
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1,1,0,1200,0,1.334875,2,False,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025,0.258819,0.965926,0.173648,0.984808,-1.6,-1.9,-2.5,8.9,0.6,0.0,0.0,0.0,6.2,-0.5,7.5,-18.4,-10.9,0.0,0.0,0.0,0.0365,0.0101,0.0631,0.1631,-0.0247,0.0,0.0,0.0,-10.9,-87.7,275.3,564.1,-52.8,0.0,0.0,0.0,-0.6,-260.3,460.4,720.7,-41.4,0.0,0.0,0.0,42.7,431.0,-324.6,-445.3,137.3,0.0,0.0,0.0,-49.6,-173.1,517.3,854.1,-136.2,0.0,0.0,0.0,146.2,-347.4,421.2,1171.2,-106.1,0.0,0.0,0.0,2.1,2.2,0.1,1.047619,9.9,9.7,-0.2,0.979798,98.9,177.2,78.3,1.791709,,1.2,,,,3.7,,,,59.5,,,,6.1,,,,25.2,,,,337.0,,,,6.5,,,,36.9,,,,357.4,,
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,9.7,177.2,0,0,1260,0,1.260737,2,False,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.707107,0.707107,0.382683,0.92388,0.258819,0.965926,0.9,-0.7,-2.4,7.1,1.6,0.0,0.0,0.0,-3.4,-5.4,5.4,-22.8,-6.2,0.0,0.0,0.0,0.0021,-0.0165,-0.0448,0.1427,-0.0365,0.0,0.0,0.0,-49.0,-139.6,57.6,492.0,10.9,0.0,0.0,0.0,-76.3,-297.3,104.0,511.4,0.6,0.0,0.0,0.0,78.0,408.2,-212.9,-476.0,-42.7,0.0,0.0,0.0,-121.0,-176.8,272.9,681.0,49.6,0.0,0.0,0.0,-131.0,-547.8,-42.4,684.3,-146.2,0.0,0.0,0.0,2.2,1.5,-0.7,0.681818,9.2,6.4,-2.8,0.695652,127.1,121.8,-5.3,0.958301,,1.0,,,,3.4,,,,63.9,,,,3.9,,,,13.6,,,,242.4,,,,5.5,,,,25.8,,,,356.2,,
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,6.4,121.8,0,0,1320,0,1.336949,2,False,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.866025,0.5,0.5,0.866025,0.34202,0.939693,-0.7,-1.8,-4.1,5.1,-0.9,1.2,0.0,0.0,-2.2,3.7,3.8,-17.1,3.4,-13.0,0.0,0.0,-0.004,-0.024,-0.0983,0.067,-0.0021,-0.031,0.0,0.0,-51.1,-112.4,-278.0,200.9,49.0,115.2,0.0,0.0,-155.2,-249.8,-148.6,216.8,76.3,235.1,0.0,0.0,236.3,429.7,183.9,-441.7,-78.0,-124.9,0.0,0.0,47.1,-203.3,-95.8,277.0,121.0,327.3,0.0,0.0,-183.2,-475.0,-319.4,491.2,131.0,161.2,0.0,0.0,2.2,1.2,-1.0,0.545455,9.7,4.4,-5.3,0.453608,177.2,88.1,-89.1,0.497178,2.5,0.9,-1.6,0.36,12.0,2.2,-9.8,0.183333,167.7,46.4,-121.3,0.276685,,1.5,,,,4.7,,,,87.4,,,,2.9,,,,14.4,,,,187.9,,


In [25]:
test_data.head()

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides,working_hours,is_weekend,hr,satday,SMC,day,is_odd,f1s,f1c,f2s,f2c,f3s,f3c,f4s,f4c,fh1s,fh1c,fh2s,fh2c,fh3s,fh3c,deg_C_1b,deg_C_4b,deg_C_24b,deg_C_168b,deg_C_1f,deg_C_4f,deg_C_24f,deg_C_168f,relative_humidity_1b,relative_humidity_4b,relative_humidity_24b,relative_humidity_168b,relative_humidity_1f,relative_humidity_4f,relative_humidity_24f,relative_humidity_168f,absolute_humidity_1b,absolute_humidity_4b,absolute_humidity_24b,absolute_humidity_168b,absolute_humidity_1f,absolute_humidity_4f,absolute_humidity_24f,absolute_humidity_168f,sensor_1_1b,sensor_1_4b,sensor_1_24b,sensor_1_168b,sensor_1_1f,sensor_1_4f,sensor_1_24f,sensor_1_168f,sensor_2_1b,sensor_2_4b,sensor_2_24b,sensor_2_168b,sensor_2_1f,sensor_2_4f,sensor_2_24f,sensor_2_168f,sensor_3_1b,sensor_3_4b,sensor_3_24b,sensor_3_168b,sensor_3_1f,sensor_3_4f,sensor_3_24f,sensor_3_168f,sensor_4_1b,sensor_4_4b,sensor_4_24b,sensor_4_168b,sensor_4_1f,sensor_4_4f,sensor_4_24f,sensor_4_168f,sensor_5_1b,sensor_5_4b,sensor_5_24b,sensor_5_168b,sensor_5_1f,sensor_5_4f,sensor_5_24f,sensor_5_168f,target_carbon_monoxide_lag_1,target_carbon_monoxide_lag_m1,diff_target_carbon_monoxide_1,div_target_carbon_monoxide_1,target_benzene_lag_1,target_benzene_lag_m1,diff_target_benzene_1,div_target_benzene_1,target_nitrogen_oxides_lag_1,target_nitrogen_oxides_lag_m1,diff_target_nitrogen_oxides_1,div_target_nitrogen_oxides_1,target_carbon_monoxide_lag_4,target_carbon_monoxide_lag_m4,diff_target_carbon_monoxide_4,div_target_carbon_monoxide_4,target_benzene_lag_4,target_benzene_lag_m4,diff_target_benzene_4,div_target_benzene_4,target_nitrogen_oxides_lag_4,target_nitrogen_oxides_lag_m4,diff_target_nitrogen_oxides_4,div_target_nitrogen_oxides_4,target_carbon_monoxide_lag_24,target_carbon_monoxide_lag_m24,diff_target_carbon_monoxide_24,div_target_carbon_monoxide_24,target_benzene_lag_24,target_benzene_lag_m24,diff_target_benzene_24,div_target_benzene_24,target_nitrogen_oxides_lag_24,target_nitrogen_oxides_lag_m24,diff_target_nitrogen_oxides_24,div_target_nitrogen_oxides_24,target_carbon_monoxide_lag_168,target_carbon_monoxide_lag_m168,diff_target_carbon_monoxide_168,div_target_carbon_monoxide_168,target_benzene_lag_168,target_benzene_lag_m168,diff_target_benzene_168,div_target_benzene_168,target_nitrogen_oxides_lag_168,target_nitrogen_oxides_lag_m168,diff_target_nitrogen_oxides_168,div_target_nitrogen_oxides_168
7111,2011-01-01 00:00:00,8.0,41.3,0.4375,1108.8,745.7,797.1,880.0,1273.1,1.459172,4.247511,197.915939,0,1,0,1,1.059322,5,False,-0.927542,0.37372,0.559589,-0.82877,0.991861,-0.127325,0.956235,0.2926,1.0,-1.608123e-16,0.707107,0.707107,0.5,0.866025,-2.9,-3.5,-3.4,-2.9,0.0,1.1,-1.2,1.8,10.4,16.2,20.3,43.8,0.0,-8.1,7.0,27.2,0.0189,0.0275,0.1049,0.312,0.0,-0.0609,0.0654,0.3494,140.7,-86.4,1.5,117.1,0.0,-86.1,-206.7,378.4,119.2,92.8,29.3,317.6,0.0,44.3,-1.2,298.7,-109.2,74.4,-4.3,-243.4,0.0,154.5,83.0,-278.7,92.8,87.0,162.4,488.9,0.0,32.9,101.0,501.0,440.9,-130.8,-161.7,138.1,0.0,-369.7,-428.5,817.8,1.4,1.6,0.2,1.142857,4.1,8.763027,4.663027,2.137324,186.5,214.9,28.4,1.152279,1.4,1.9,0.5,1.357143,5.8,5.560496,-0.239504,0.958706,221.3,253.1,31.8,1.143696,1.4,1.8,0.4,1.285714,4.8,5.232103,0.432103,1.090021,180.9,229.5,48.6,1.268657,5.5,3.4,-2.1,0.618182,14.2,13.039875,-1.160125,0.918301,495.4,566.7,71.3,1.143924
7112,2011-01-01 01:00:00,5.1,51.7,0.4564,1249.5,864.9,687.9,972.8,1714.0,1.6,8.763027,214.9,0,1,60,1,0.882785,5,False,-0.927542,0.37372,0.559589,-0.82877,0.991861,-0.127325,0.956235,0.2926,0.965926,-0.258819,0.793353,0.608761,0.573576,0.819152,0.7,-0.6,-0.5,1.0,2.9,4.5,1.3,5.0,-0.2,2.0,7.4,24.6,-10.4,-17.1,-1.0,14.5,0.0125,0.0195,0.0847,0.2805,-0.0189,-0.0254,0.0434,0.3318,-146.9,-245.5,-216.9,-197.1,-140.7,-205.1,-371.3,167.2,13.1,-119.4,-124.4,-13.5,-119.2,-97.6,-191.3,301.1,5.8,226.3,204.6,97.0,109.2,174.0,287.2,-129.4,-30.9,16.3,18.6,96.0,-92.8,-83.6,1.3,315.2,-413.2,-740.2,-595.9,-646.8,-440.9,-554.9,-1126.2,141.6,1.459172,2.5,1.040828,1.7133,4.247511,7.498693,3.251182,1.765432,197.915939,299.8,101.884061,1.514785,1.6,1.4,-0.2,0.875,5.2,4.794557,-0.405443,0.92203,227.4,181.0,-46.4,0.795954,0.8,1.6,0.8,2.0,2.8,4.270707,1.470707,1.525252,144.9,194.7,49.8,1.343685,5.7,2.4,-3.3,0.421053,13.1,6.691615,-6.408385,0.51081,487.9,357.4,-130.5,0.732527
7113,2011-01-01 02:00:00,5.8,51.5,0.4689,1102.6,878.0,693.7,941.9,1300.8,2.5,7.498693,299.8,0,1,120,1,0.910485,5,False,-0.927542,0.37372,0.559589,-0.82877,0.991861,-0.127325,0.956235,0.2926,0.866025,-0.5,0.866025,0.5,0.642788,0.766044,-0.8,-2.5,-1.5,1.1,-0.7,2.2,1.3,4.0,0.8,3.3,8.9,19.2,0.2,-10.8,-5.1,17.1,0.0004,-0.0053,0.0725,0.2728,-0.0125,-0.0604,0.018,0.3162,37.1,-161.7,-70.9,-101.7,146.9,-149.8,-288.3,222.4,38.2,-139.8,-241.0,-89.4,-13.1,-186.1,-329.9,210.6,31.9,122.3,194.8,117.5,-5.8,214.8,485.3,-128.2,69.1,-45.1,78.9,191.2,30.9,-24.9,-134.9,319.7,-17.8,-251.4,-327.9,-313.8,413.2,-94.5,-856.5,524.2,1.6,2.7,1.1,1.6875,8.763027,7.563761,-1.199266,0.863145,214.9,329.955383,115.055383,1.53539,1.5,1.5,0.0,1.0,4.6,5.277768,0.677768,1.147341,199.8,170.7,-29.1,0.854354,0.5,1.3,0.8,2.6,1.2,3.551096,2.351096,2.959247,92.5,167.2,74.7,1.807568,4.9,1.4,-3.5,0.285714,11.1,4.620274,-6.479726,0.416241,506.3,186.6,-319.7,0.368556
7114,2011-01-01 03:00:00,5.0,52.3,0.4693,1139.7,916.2,725.6,1011.0,1283.0,2.7,7.563761,329.955383,0,1,180,1,0.897323,5,False,-0.927542,0.37372,0.559589,-0.82877,0.991861,-0.127325,0.956235,0.2926,0.707107,-0.7071068,0.92388,0.382683,0.707107,0.707107,-0.5,-1.8,-2.1,2.6,0.8,3.0,2.6,4.0,5.2,8.4,19.8,22.2,-0.8,-11.0,-6.1,19.8,-0.0043,-0.0026,0.0695,0.2699,-0.0004,-0.0318,0.0021,0.3232,-117.3,-185.2,-145.7,-150.7,-37.1,-30.9,-425.5,258.9,-77.7,-202.3,-206.8,-208.2,-38.2,-170.5,-506.8,90.8,145.9,109.1,142.4,136.7,-31.9,71.5,844.2,-154.7,-44.0,-75.4,-77.3,69.5,-69.1,-131.0,-279.7,201.7,-140.7,-326.7,-304.5,-359.3,17.8,-9.9,-938.0,428.7,2.5,1.9,-0.6,0.76,7.498693,5.560496,-1.938198,0.741529,299.8,253.1,-46.7,0.844229,1.4,1.4,0.0,1.0,4.1,4.458207,0.358207,1.087367,186.5,168.1,-18.4,0.90134,0.3,1.3,1.0,4.333333,0.4,4.519882,4.119882,11.299706,35.7,171.796173,136.096173,4.812218,2.3,1.4,-0.9,0.608696,9.8,3.830373,-5.969627,0.390854,365.7,183.867035,-181.832965,0.502781
7115,2011-01-01 04:00:00,4.5,57.5,0.465,1022.4,838.5,871.5,967.0,1142.3,1.9,5.560496,253.1,0,1,240,1,0.808696,5,False,-0.927542,0.37372,0.559589,-0.82877,0.991861,-0.127325,0.956235,0.2926,0.5,-0.8660254,0.965926,0.258819,0.766044,0.642788,0.0,-2.0,-1.2,3.9,0.5,3.5,2.1,4.2,-3.8,8.2,13.5,12.5,-5.2,-16.2,-5.7,13.0,0.0109,0.0071,0.066,0.2763,0.0043,-0.0275,0.0263,0.3389,-18.4,-52.5,-47.0,-79.6,117.3,86.4,-299.5,245.5,-93.0,-159.4,-145.3,-211.7,77.7,-92.8,-395.5,113.2,42.7,72.3,57.5,45.2,-145.9,-74.4,774.9,-268.9,22.1,-7.7,46.0,44.4,44.0,-87.0,-175.0,208.0,-168.5,-250.3,-269.0,-216.2,140.7,130.8,-815.4,376.4,2.7,1.4,-1.3,0.518519,7.563761,4.794557,-2.769204,0.633885,329.955383,181.0,-148.955383,0.548559,1.459172,1.1,-0.359172,0.753852,4.247511,3.011833,-1.235678,0.709082,197.915939,168.7,-29.215939,0.852382,0.3,1.3,1.0,4.333333,0.4,3.339213,2.939213,8.348032,34.4,148.4,114.0,4.313953,2.1,1.2,-0.9,0.571429,8.9,3.351769,-5.548231,0.376603,257.6,169.1,-88.5,0.656444


# =============== LightAutoML model building ===============


# Step 1. Task setup

On the cell below we create Task object - the class to setup what task LightAutoML model should solve with specific loss and metric if necessary (more info can be found [here](https://lightautoml.readthedocs.io/en/latest/generated/lightautoml.tasks.base.Task.html#lightautoml.tasks.base.Task) in our documentation):

In [26]:
# %%time

# def rmsle_metric(y_true, y_pred, sample_weight, **kwargs):
#     mask = (sample_weight > 1)
#     return mean_squared_log_error(y_true[mask], np.clip(y_pred[mask], 0, None), **kwargs) ** 0.5

# task = Task('reg', loss = 'rmsle', metric = rmsle_metric, greater_is_better=False)

# Step 2. Feature roles setup

To solve the task, we need to setup columns roles. The **only role you must setup is target role**, everything else (drop, numeric, categorical, group, weights etc.) is up to user - LightAutoML models have automatic columns typization inside:

In [27]:
# ?DatetimeRole

### Checking BIZEN idea from comments - no drop for any target, another targets using as features

In [28]:
# %%time

# targets_and_drop = {
#     'target_carbon_monoxide': [],
#     'target_benzene': [],
#     'target_nitrogen_oxides': []
# }

# roles = {
#     # delete day of month from features
#     DatetimeRole(base_date=False, base_feats=True, seasonality=('d', 'wd', 'hour')): 'date_time'
# }

# Step 3. LightAutoML model creation - TabularAutoML preset

In next the cell we are going to create LightAutoML model with `TabularAutoML` class - preset with default model structure like in the image below:

<img src="https://github.com/sberbank-ai-lab/lightautoml-datafest-workshop/raw/master/imgs/tutorial_blackbox_pipeline.png" alt="TabularAutoML preset pipeline" style="width:70%;"/>

in just several lines. Let's discuss the params we can setup:
- `task` - the type of the ML task (the only **must have** parameter)
- `timeout` - time limit in seconds for model to train
- `cpu_limit` - vCPU count for model to use
- `reader_params` - parameter change for Reader object inside preset, which works on the first step of data preparation: automatic feature typization, preliminary almost-constant features, correct CV setup etc. For example, we setup `n_jobs` threads for typization algo, `cv` folds and `random_state` as inside CV seed.
- `general_params` - we use `use_algos` key to setup the model structure to work with (two LGBM models and two CatBoost models on the first level and their weighted composition creation on the second). This setup is only to speedup the kernel, you can remove this `general_params` setup if you want the whole LightAutoML model to run.

**Important note**: `reader_params` key is one of the YAML config keys, which is used inside `TabularAutoML` preset. [More details](https://github.com/sberbank-ai-lab/LightAutoML/blob/master/lightautoml/automl/presets/tabular_config.yml) on its structure with explanation comments can be found on the link attached. Each key from this config can be modified with user settings during preset object initialization. To get more info about different parameters setting (for example, ML algos which can be used in `general_params->use_algos`) please take a look at our [article on TowardsDataScience](https://towardsdatascience.com/lightautoml-preset-usage-tutorial-2cce7da6f936). 

In [29]:
# %%time 
# importances = {}
# dt = pd.to_datetime(ALL_DF['date_time'])
# for targ in targets_and_drop:
#     print('='*50, '='*50, sep = '\n')
#     automl = TabularAutoML(task = task, 
#                            timeout = TIMEOUT,
#                            cpu_limit = N_THREADS,
#                            reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
#                            general_params = {'use_algos': [['lgb', 'lgb_tuned', 'cb', 'cb_tuned']]},
#                            verbose = 3 # 0 for no output, 1 - only main steps, 2 - more detailed, 3 - show everything including model scores, optuna iterations etc.
#                           )
    
#     ALL_DF['weight'] = [1.001] * len(train_data) + list(np.where(pseudolabels_true[targ].values >= 0, 1.001, 0.999))
#     roles['weights'] = 'weight'

#     roles['target'] = targ
#     roles['drop'] = targets_and_drop[targ]

#     if targ == 'target_nitrogen_oxides':
#         oof_pred = automl.fit_predict(ALL_DF[dt >= np.datetime64('2010-09-01')], roles = roles)
#     else:
#         oof_pred = automl.fit_predict(ALL_DF, roles = roles)
#     print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))
    
#     # MODEL STRUCTURE - NEW FEATURE
#     print('\nFitted model structure:\n{}\n'.format(automl.create_model_str_desc()))
    
#     # Fast feature importances calculation
#     fast_fi = automl.get_feature_scores('fast')
#     importances[targ] = fast_fi
    
#     test_pred = automl.predict(test_data)
#     print('Prediction for te_data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))
    
#     sample_sub[targ] = np.clip(test_pred.data[:, 0], 0, None)

In [30]:
# for targ in targets_and_drop:
#     plt.figure(figsize = (30, 10))
#     importances[targ].set_index('Feature')['Importance'].plot.bar()
#     plt.title('Feature importances for {} model'.format(targ))
#     plt.grid(True)
#     plt.show()

In [31]:
def pycaret_model(train, test, config):
    print('Setup Your Data....')
    setup(
        data=train,
        target=config['target'],
        numeric_imputation='mean',
        session_id=config['seed'],
        normalize = config['normalize'],
        silent= True,
        fold_strategy=config['fold_strategy'],
        use_gpu = True
    )

    print(f"Comparing Models....")
    best = compare_models(sort=config['optimize'], n_select=config['n_select'], fold=config['fold'], exclude=['xgboost'])

    if config['tuning']:
        print(f"Tuning Models....")
        best_tuned = [tune_model(model) for model in best]

        print(f"Blending Models....")
        blended = blend_models(estimator_list=best+best_tuned, fold=config['fold'], optimize=config['optimize'])
    else:
        print(f"Blending Models....")
        blended = blend_models(estimator_list=best, fold=config['fold'], optimize=config['optimize'])
        
    pred_holdout = predict_model(blended)

    print(f"Finallizing Models....")
    final_model = finalize_model(blended)

    print('Done...!!!')
    pred = predict_model(final_model, test)
    re = pred['Label']

    return re, final_model

In [32]:
ALL_DF['date_time'] = pd.to_datetime(ALL_DF['date_time'])
# weights = [1.001] * len(train_data) + list(np.where(pseudolabels_true['target_carbon_monoxide'].values >= 0, 1.001, 0.999))
pred_CO, model_CO = pycaret_model(ALL_DF.drop(['target_benzene', 'target_nitrogen_oxides'], 1), test_data.drop(['target_benzene', 'target_nitrogen_oxides', 'target_carbon_monoxide'], 1), CFG['carbon_monoxide'])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.2104,0.1009,0.3176,0.9481,0.0964,0.1516
1,0.2046,0.0931,0.3051,0.9534,0.0847,0.1215
2,0.2054,0.0943,0.3072,0.9546,0.0848,0.1236
Mean,0.2068,0.0961,0.3099,0.9521,0.0886,0.1322
SD,0.0026,0.0034,0.0055,0.0028,0.0055,0.0137


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,0.1898,0.0791,0.2812,0.9608,0.0816,0.1233


Finallizing Models....
Done...!!!


In [33]:
pred_BE, model_BE = pycaret_model(ALL_DF.drop(['target_nitrogen_oxides'], 1), test_data.drop(['target_benzene', 'target_nitrogen_oxides'], 1), CFG['benzene'])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.7092,1.1397,1.0676,0.9798,0.0926,0.1337
1,0.6579,1.0543,1.0268,0.9814,0.0805,0.1051
2,0.6762,0.9544,0.9769,0.9829,0.0817,0.0974
Mean,0.6811,1.0495,1.0238,0.9813,0.085,0.1121
SD,0.0212,0.0757,0.0371,0.0012,0.0054,0.0156


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,0.6633,0.934,0.9665,0.9832,0.0803,0.096


Finallizing Models....
Done...!!!


In [34]:
dt = pd.to_datetime(ALL_DF['date_time'])
pred_NO, model_NO = pycaret_model(ALL_DF[dt >= np.datetime64('2010-09-01')], test_data.drop(['target_nitrogen_oxides'], 1), CFG['nitrogen_oxides'])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,33.3873,2510.7581,50.1075,0.9513,0.1654,0.1278
1,32.265,2426.665,49.2612,0.95,0.1556,0.118
Mean,32.8262,2468.7116,49.6843,0.9507,0.1605,0.1229
SD,0.5611,42.0465,0.4231,0.0007,0.0049,0.0049


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,31.4984,2442.912,49.4258,0.9513,0.1373,0.1068


Finallizing Models....
Done...!!!


# Step 4. Create submission file

In [35]:
sample_sub

Unnamed: 0,date_time,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2011-01-01 00:00:00,2.0,10.0,300.0
1,2011-01-01 01:00:00,2.0,10.0,300.0
2,2011-01-01 02:00:00,2.0,10.0,300.0
3,2011-01-01 03:00:00,2.0,10.0,300.0
4,2011-01-01 04:00:00,2.0,10.0,300.0
...,...,...,...,...
2242,2011-04-04 10:00:00,2.0,10.0,300.0
2243,2011-04-04 11:00:00,2.0,10.0,300.0
2244,2011-04-04 12:00:00,2.0,10.0,300.0
2245,2011-04-04 13:00:00,2.0,10.0,300.0


In [36]:
pseudolabels_true[['target_carbon_monoxide','target_benzene','target_nitrogen_oxides']]

Unnamed: 0,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,-200.0,4.247511,-200.0
1,1.6,8.763027,214.9
2,2.5,7.498693,299.8
3,2.7,7.563761,-200.0
4,1.9,5.560496,253.1
...,...,...,...
2242,3.1,13.529605,471.7
2243,2.4,11.355157,353.3
2244,2.4,12.374538,293.0
2245,2.1,9.547187,234.5


In [37]:
# for targ in targets_and_drop:
#     preds = sample_sub[targ].values
#     real_values = pseudolabels_true[targ].values
#     final_preds = np.where(real_values >= 0, real_values, preds)
#     print(final_preds)
#     sample_sub[targ] = final_preds

sample_sub['target_carbon_monoxide'] = pred_CO.values
sample_sub['target_benzene'] = pred_BE.values
sample_sub['target_nitrogen_oxides'] = pred_NO.values

In [38]:
sample_sub

Unnamed: 0,date_time,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2011-01-01 00:00:00,1.397667,4.643472,195.326807
1,2011-01-01 01:00:00,2.180464,8.048659,290.985514
2,2011-01-01 02:00:00,2.006343,7.593914,277.570471
3,2011-01-01 03:00:00,2.520158,8.007037,327.705703
4,2011-01-01 04:00:00,1.808858,6.084886,248.992203
...,...,...,...,...
2242,2011-04-04 10:00:00,3.134039,13.281689,486.372699
2243,2011-04-04 11:00:00,2.341147,10.856887,341.249081
2244,2011-04-04 12:00:00,2.409689,12.328235,309.934376
2245,2011-04-04 13:00:00,1.935169,10.242796,252.527778


In [39]:
sample_sub.to_csv('pycaret_psuedolabel.csv', index = False)

# Additional materials

- [Official LightAutoML github repo](https://github.com/sberbank-ai-lab/LightAutoML)
- [LightAutoML documentation](https://lightautoml.readthedocs.io/en/latest)
- [Pseudolabelling technique description post](https://www.kaggle.com/c/tabular-playground-series-apr-2021/discussion/231738#1268903)
- [Baseline LightAutoML kernel without pseudolabelling](https://www.kaggle.com/alexryzhkov/tps-july-21-lightautoml-baseline)

## Do not forget to upvote if you like the kernel 👍