# Libraries
---

In [31]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import random
import os
import gc

# from pycaret.regression import setup, compare_models, tune_model, blend_models, finalize_model, predict_model, plot_model

import statsmodels.api as sm

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.simplefilter('ignore')

from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, LeaveOneGroupOut
import optuna
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor 
import pickle
from catboost import CatBoostRegressor, Pool
from sklearn.feature_selection import SelectKBest, f_classif,chi2,SelectPercentile

In [19]:
# CFG = {
#     'carbon_monoxide': {
#         'target': 'target_carbon_monoxide',
#         'seed': 2021,
#         'n_select': 2,
#         'fold': 3,
#         'fold_strategy': 'timeseries',
#         'tuning': False,
#         'normalize': True,
#         'optimize': 'RMSE',
#     },
#     'benzene': {
#         'target': 'target_benzene',
#         'seed': 2021,
#         'n_select': 2,
#         'fold': 3,
#         'fold_strategy': 'timeseries',
#         'tuning': False,
#         'normalize': True,
#         'optimize': 'RMSE',        
#     },
#     'nitrogen_oxides': {
#         'target': 'target_nitrogen_oxides',
#         'seed': 2021,
#         'n_select': 2,
#         'fold': 2,
#         'fold_strategy': 'timeseries',
#         'tuning': False,
#         'normalize': True,
#         'optimize': 'RMSE',        
#     },
# } 

cb_params = [
                {'learning_rate': 0.010169009412219588,
                 'l2_leaf_reg': 8.908337085912136,
                 'bagging_temperature': 8.384477224270551,
                 'random_strength': 1.950237493637981,
                 'depth': 6,
                 'grow_policy': 'Lossguide',
                 'leaf_estimation_method': 'Newton'},
                {'learning_rate': 0.166394867169309,
                 'l2_leaf_reg': 8.704675157564441,
                 'bagging_temperature': 3.340826164726799,
                 'random_strength': 1.538518016574368,
                 'depth': 2,
                 'grow_policy': 'Depthwise',
                 'leaf_estimation_method': 'Newton'},
                {'learning_rate': 0.028141156076957437,
                 'l2_leaf_reg': 3.116523267336638,
                 'bagging_temperature': 4.420661209459851,
                 'random_strength': 1.8011752694610028,
                 'depth': 6,
                 'grow_policy': 'Depthwise',
                 'leaf_estimation_method': 'Newton'},
            ]

In [20]:
def seed_everything(seed=2021):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything()

# Datasets
---

In [21]:
train = pd.read_csv("../input/tabular-playground-series-jul-2021/train.csv", index_col="date_time", parse_dates=True)
test = pd.read_csv("../input/tabular-playground-series-jul-2021/test.csv", index_col="date_time", parse_dates=True)
submission = pd.read_csv("../input/tabular-playground-series-jul-2021/sample_submission.csv", index_col="date_time", parse_dates=True)

target = [col for col in train.columns if 'target_' in col]
train[target] = np.log1p(train[target])

# sub18874 = pd.read_csv("../input/tps07-18874/submission.csv", index_col="date_time", parse_dates=True)
# sub18967 = pd.read_csv("../input/tps07-18967/lightautoml_with_pseudolabelling_kernel_version_6.csv", index_col="date_time", parse_dates=True)
# sub19166 = pd.read_csv("../input/tps07-19166/sub_NN_double_pseudo_1.csv", index_col="date_time", parse_dates=True)

# pseudo_label = submission.copy()

# pseudo_label[CFG['carbon_monoxide']['target']] = sub19166[CFG['carbon_monoxide']['target']]
# pseudo_label[CFG['benzene']['target']] = sub19166[CFG['benzene']['target']]
# pseudo_label[CFG['nitrogen_oxides']['target']] = sub18874[CFG['nitrogen_oxides']['target']]
# test_carbon_monoxide = pd.concat([test, pseudo_label], axis=1)
# test_carbon_monoxide[target] = np.log1p(test_carbon_monoxide[target])

# pseudo_label[CFG['carbon_monoxide']['target']] = sub19166[CFG['carbon_monoxide']['target']]
# pseudo_label[CFG['benzene']['target']] = sub18967[CFG['benzene']['target']]
# pseudo_label[CFG['nitrogen_oxides']['target']] = sub18874[CFG['nitrogen_oxides']['target']]
# test_benzene = pd.concat([test, pseudo_label], axis=1)
# test_benzene[target] = np.log1p(test_benzene[target])

# pseudo_label[CFG['carbon_monoxide']['target']] = sub18967[CFG['carbon_monoxide']['target']]
# pseudo_label[CFG['benzene']['target']] = sub19166[CFG['benzene']['target']]
# pseudo_label[CFG['nitrogen_oxides']['target']] = sub18874[CFG['nitrogen_oxides']['target']]
# test_nitrogen_oxides = pd.concat([test, pseudo_label], axis=1)
# test_nitrogen_oxides[target] = np.log1p(test_nitrogen_oxides[target])

# Feature engineering
---

In [22]:
def make_CO_features(df):
    """
    Adds new features to a given dataset
    """
#     df['year'] = df.index.year
    df['month'] = df.index.month
#     df['week'] = df.index.week
#     df['day'] = df.index.day
    df["day_of_week"] = df.index.dayofweek
    df['day_of_year'] = df.index.dayofyear
    df["quarter"] = df.index.quarter

    df['time'] = df.index.date - df.index.date.min()
    df['hour'] = df.index.hour
    df['time'] = df['time'].apply(lambda x : x.days)

    df['working_hours'] =  df['hour'].isin(np.arange(8, 21, 1)).astype('int')
    df['maximum_hours'] =  df['hour'].isin([8, 9, 17, 18, 19, 20]).astype('int')
    df['is_weekend'] = (df.index.dayofweek >= 5).astype('int')

    df['SMC'] = (df['absolute_humidity'] * 100) / df['relative_humidity']
    
    df['sensor_6'] = (df['sensor_2'] - df['sensor_5']) / df['sensor_5']
    df['sensor_7'] = (df['sensor_3'] - df['sensor_4']) / df['sensor_4']

    for periods in [3, 6]:
        df[f'dt-{periods}'] = df['deg_C'] - df['deg_C'].shift(periods=periods, fill_value=0)

    for periods in [3, 6]:
        df[f'abshum-{periods}'] = df['absolute_humidity'] - df['absolute_humidity'].shift(periods=periods, fill_value=0)
#         df[f'relhum-{periods}'] = df['relative_humidity'] - df['relative_humidity'].shift(periods=periods, fill_value=0)
        
    for periods in [1]:
        df[f's1-{periods}'] = df['sensor_1'] - df['sensor_1'].shift(periods=periods, fill_value=0)
        df[f's2-{periods}'] = df['sensor_2'] - df['sensor_2'].shift(periods=periods, fill_value=0)
        df[f's3-{periods}'] = df['sensor_3'] - df['sensor_3'].shift(periods=periods, fill_value=0)
        df[f's4-{periods}'] = df['sensor_4'] - df['sensor_4'].shift(periods=periods, fill_value=0)
        df[f's5-{periods}'] = df['sensor_5'] - df['sensor_5'].shift(periods=periods, fill_value=0)
        df[f's6-{periods}'] = df['sensor_6'] - df['sensor_6'].shift(periods=periods, fill_value=0)
        df[f's7-{periods}'] = df['sensor_7'] - df['sensor_7'].shift(periods=periods, fill_value=0)
            
    return df

In [23]:
def make_benzene_features(df):
    """
    Adds new features to a given dataset
    """
#     df['year'] = df.index.year
    df['month'] = df.index.month
#     df['week'] = df.index.week
#     df['day'] = df.index.day
    df['dayofweek'] = df.index.dayofweek
    df['time'] = df.index.date - df.index.date.min()
    df['hour'] = df.index.hour
    df['time'] = df['time'].apply(lambda x : x.days)

    df['working_hours'] =  df['hour'].isin(np.arange(8, 21, 1)).astype('int')
    df['maximum_hours'] =  df['hour'].isin([8, 9, 17, 18, 19, 20]).astype('int')
    df['is_weekend'] = (df.index.dayofweek >= 5).astype('int')

    df['SMC'] = (df['absolute_humidity'] * 100) / df['relative_humidity']
    
    df['sensor_6'] = (df['sensor_2'] - df['sensor_5']) / df['sensor_5']
    df['sensor_7'] = (df['sensor_3'] - df['sensor_4']) / df['sensor_4']

    for periods in [3, 6]:
        df[f'dt-{periods}'] = df['deg_C'] - df['deg_C'].shift(periods=periods, fill_value=0)

#     for periods in [3, 6]:
#         df[f'abshum-{periods}'] = df['absolute_humidity'] - df['absolute_humidity'].shift(periods=periods, fill_value=0)
    df[f'relhum-{periods}'] = df['relative_humidity'] - df['relative_humidity'].shift(periods=6, fill_value=0)
        
    for periods in [1]:
        df[f's1-{periods}'] = df['sensor_1'] - df['sensor_1'].shift(periods=periods, fill_value=0)
        df[f's2-{periods}'] = df['sensor_2'] - df['sensor_2'].shift(periods=periods, fill_value=0)
        df[f's3-{periods}'] = df['sensor_3'] - df['sensor_3'].shift(periods=periods, fill_value=0)
        df[f's4-{periods}'] = df['sensor_4'] - df['sensor_4'].shift(periods=periods, fill_value=0)
        df[f's5-{periods}'] = df['sensor_5'] - df['sensor_5'].shift(periods=periods, fill_value=0)
        df[f's6-{periods}'] = df['sensor_6'] - df['sensor_6'].shift(periods=periods, fill_value=0)
        df[f's7-{periods}'] = df['sensor_7'] - df['sensor_7'].shift(periods=periods, fill_value=0)
            
    return df

In [24]:
def make_NOx_features(df):
    """
    Adds new features to a given dataset
    """
#     df['year'] = df.index.year
    df['month'] = df.index.month
#     df['week'] = df.index.week
#     df['day'] = df.index.day
    df['dayofweek'] = df.index.dayofweek
    df['time'] = df.index.date - df.index.date.min()
    df['hour'] = df.index.hour
    df['time'] = df['time'].apply(lambda x : x.days)

    df['working_hours'] =  df['hour'].isin(np.arange(8, 21, 1)).astype('int')
    df['maximum_hours'] =  df['hour'].isin([8, 9, 17, 18, 19, 20]).astype('int')
    df['is_weekend'] = (df.index.dayofweek >= 5).astype('int')

    df['SMC'] = (df['absolute_humidity'] * 100) / df['relative_humidity']
    
    df['sensor_6'] = (df['sensor_2'] - df['sensor_5']) / df['sensor_5']
    df['sensor_7'] = (df['sensor_3'] - df['sensor_4']) / df['sensor_4']

    for periods in [3, 6]:
        df[f'dt-{periods}'] = df['deg_C'] - df['deg_C'].shift(periods=periods, fill_value=0)

    for periods in [3, 6]:
        df[f'abshum-{periods}'] = df['absolute_humidity'] - df['absolute_humidity'].shift(periods=periods, fill_value=0)
#         df[f'relhum-{periods}'] = df['relative_humidity'] - df['relative_humidity'].shift(periods=periods, fill_value=0)
        
    for periods in [1]:
        df[f's1-{periods}'] = df['sensor_1'] - df['sensor_1'].shift(periods=periods, fill_value=0)
        df[f's2-{periods}'] = df['sensor_2'] - df['sensor_2'].shift(periods=periods, fill_value=0)
        df[f's3-{periods}'] = df['sensor_3'] - df['sensor_3'].shift(periods=periods, fill_value=0)
        df[f's4-{periods}'] = df['sensor_4'] - df['sensor_4'].shift(periods=periods, fill_value=0)
        df[f's5-{periods}'] = df['sensor_5'] - df['sensor_5'].shift(periods=periods, fill_value=0)
        df[f's6-{periods}'] = df['sensor_6'] - df['sensor_6'].shift(periods=periods, fill_value=0)
        df[f's7-{periods}'] = df['sensor_7'] - df['sensor_7'].shift(periods=periods, fill_value=0)
            
    return df

# PyCaret
---

In [25]:
# def pycaret_model(train, test, config):
#     print('Setup Your Data....')
#     setup(
#         data=train,
#         target=config['target'],
#         numeric_imputation='mean',
#         session_id=config['seed'],
#         normalize = config['normalize'],
#         silent= True,
#         fold_strategy=config['fold_strategy'],
#     )

#     print(f"Comparing Models....")
#     best = compare_models(sort=config['optimize'], n_select=config['n_select'], fold=config['fold'], exclude=['xgboost'])

#     if config['tuning']:
#         print(f"Tuning Models....")
#         best_tuned = [tune_model(model) for model in best]

#         print(f"Blending Models....")
#         blended = blend_models(estimator_list=best+best_tuned, fold=config['fold'], optimize=config['optimize'])
#     else:
#         print(f"Blending Models....")
#         blended = blend_models(estimator_list=best, fold=config['fold'], optimize=config['optimize'])
        
#     pred_holdout = predict_model(blended)

#     print(f"Finallizing Models....")
#     final_model = finalize_model(blended)

#     print('Done...!!!')
#     pred = predict_model(final_model, test)
#     re = pred['Label']

#     return re, final_model

In [39]:
def train_cat(train, test, target, model_num):
    name = 'catboost_leave_one_group_out'
    seed_list=[2001, 2002, 2003]
    oof = np.zeros((len(train)))
    test_preds_list = []
    score_list = []

    groups = train['month']
    logo = LeaveOneGroupOut()

    for train_index, val_index in logo.split(train, target, groups):

        X_train, X_val = train.iloc[train_index], train.iloc[val_index]   
        y_train, y_val = target.iloc[train_index], target.iloc[val_index]

        val_preds_list = []

        for seed in seed_list:
            rand_state = seed
            model = CatBoostRegressor(random_state=rand_state,
                                     thread_count=4,
                                     verbose=False,
                                     loss_function='RMSE',
                                     eval_metric='RMSE',
                                     od_type="Iter",
                                     early_stopping_rounds=500,
                                     use_best_model=True,
                                     iterations=10000,
                                     task_type = "GPU",
                                     **cb_params[model_num])

            model.fit(X_train, y_train,
                      eval_set=(X_val, y_val),
                      verbose=False,
                      cat_features=["working_hours", "is_weekend", "maximum_hours"])

            val_preds_list.append(model.predict(X_val))
            test_preds_list.append(model.predict(test))

        oof[val_index] = np.mean(val_preds_list,axis=0)
        score = np.sqrt(mean_squared_log_error(y_val, oof[val_index]))
        print(f"log_loss: {score}")
        score_list.append(score)
        # print(f"fold: {fold}, class0 tr %: {y_train.value_counts()[0]/len(y_train)}, class0 val %: {y_val.value_counts()[0]/len(y_val)} ")

    cv_RMSLE = np.mean(score_list)
    print(f"{name} ,RMSLE: {cv_RMSLE}")

    preds= np.mean(test_preds_list,axis=0)


    # file_name_oof = name + "_oof.txt"
    # file_name_test = name + "_test.csv"
    # with open(file_name_oof, "wb") as fp:
    #       pickle.dump(oof, fp)

    # files.download(file_name_oof)

    submission.target_carbon_monoxide = preds
    # df_submission.to_csv(file_name_test,index=None)
    # files.download(file_name_test) 
    return preds, oof

## for Carbon Monoxide

In [27]:
train = train.drop(train.tail(1).index)
target_co = train.target_carbon_monoxide
target_be = train.target_benzene
target_no = train.target_nitrogen_oxides
train = train.drop(['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'], 1)
all_df = pd.concat([train, test], 0)

In [40]:
# 'target_carbon_monoxide'
train_co = make_CO_features(all_df)

X_train_co = train_co[:len(train)].copy()
X_test_co = train_co[len(train):].copy()

preds_CO, oof_preds_CO = train_cat(X_train_co, X_test_co, target_co, 0)


log_loss: 0.03611243506939732
log_loss: 0.04846722740532666
log_loss: 0.07345188937153367
log_loss: 0.0613780509804151
log_loss: 0.061193821181190475
log_loss: 0.05871287107616374
log_loss: 0.046158636266766195
log_loss: 0.03973710665060978
log_loss: 0.09266706035917502
log_loss: 0.12497526294028141
catboost_leave_one_group_out ,RMSLE: 0.06428543613008594


In [41]:
preds_CO

array([0.86442579, 0.98910836, 0.91228835, ..., 1.21844717, 1.08199576,
       1.18718217])

In [None]:
plot_model(model_CO, plot='error')

## for Benzene

In [None]:
# 'target_benzene'
X = train[train.index.month>8].copy()
all_df = make_benzene_features(pd.concat([X, test_benzene]))

X_train = all_df.copy()
X_test = all_df[len(X):].copy()

pred_benzene, model_benzene = pycaret_model(X_train, X_test, CFG['benzene'])
submission[CFG['benzene']['target']] = np.expm1(pred_benzene)

In [None]:
plot_model(model_benzene, plot='error')

## for Nitrogen Oxides

In [None]:
# 'target_nitrogen_oxides'
X = train[train.index.month>8].copy()
all_df = make_NOx_features(pd.concat([X, test_nitrogen_oxides]))

X_train = all_df.copy()
X_test = all_df[len(X):].copy()

pred_NOx, model_NOx = pycaret_model(X_train, X_test, CFG['nitrogen_oxides'])
submission[CFG['nitrogen_oxides']['target']] = np.expm1(pred_NOx)

In [None]:
plot_model(model_NOx, plot='error')

# Submission
---

In [None]:
submission.to_csv('submission.csv')
submission