# Introduction
Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it is really appreciated and motivates me to offer more content to the Kaggle community! 😊

👉 EDA and FE are done in this [notebook](https://www.kaggle.com/hasanbasriakcay/tps-mar22-eda-fe).

In [None]:
import pandas as pd
import numpy as np
import warnings

# Suppress every warning for the rest of the session.
warnings.simplefilter(action='ignore')

# Input locations: featured train/test frames and the sample submission.
train_path = '../input/tpsmar22-deterministicholidaytime-features/train_featured_v2.pkl'
test_path = '../input/tpsmar22-deterministicholidaytime-features/test_featured_v2.pkl'
submission_path = '../input/tabular-playground-series-mar-2022/sample_submission.csv'

# NOTE(review): unpickling can execute arbitrary code; only read pickle files
# that come from a source you trust.
train = pd.read_pickle(train_path)
test = pd.read_pickle(test_path)
submission = pd.read_csv(submission_path)

# Preview the first rows of the training frame and of the submission template.
display(train.head())
display(submission.head())

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's minimum and maximum (bounds inclusive).
    float64 columns are cast to float32 when their minimum and maximum lie
    within the float32 range; this can reduce precision.  float16 and float32
    columns are left unchanged so that no column is ever widened.  Columns
    with any other dtype are not touched.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: When true, print the final memory usage and the percentage
            saved.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_limits = np.finfo(np.float32)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Try the narrowest type first.  Inclusive bounds matter: a column
            # spanning exactly -128..127 does fit in int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'float64':
            # NaN min/max (empty or all-NaN column) and infinities fail these
            # comparisons, so such columns simply stay float64.
            if float32_limits.min <= c_min and c_max <= float32_limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard against a zero starting size to avoid a division by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

In [None]:
# Downcast the numeric columns of both frames (train first, then test).
train, test = (reduce_mem_usage(frame) for frame in (train, test))

In [None]:
# Remove the `index` and `time` columns from both frames, and the `congestion`
# column from the test frame.  The `columns=` keyword is used because passing
# `axis` positionally to DataFrame.drop was deprecated in pandas 1.3 and
# removed in pandas 2.0.
train.drop(columns=['index', 'time'], inplace=True)
test.drop(columns=['index', 'time', 'congestion'], inplace=True)

In [None]:
# Store the x/y columns as strings in both frames.
for frame in (train, test):
    frame[['x', 'y']] = frame[['x', 'y']].astype(str)

In [None]:
# Show the relative frequency of each direction, train first and then test.
for frame in (train, test):
    display(frame['direction'].value_counts(normalize=True))

In [None]:
# Feature columns used for modelling; the order here is the column order of
# the frames built from this list.
selected_cols = [
    'direction', 'y', 'x',
    'hour', 'hour_sin', 'hour_cos',
    'weekend', 'weekday',
    # sin/cos terms labelled freq=A-DEC, orders 3 down to 1
    'sin(3,freq=A-DEC)', 'cos(3,freq=A-DEC)',
    'sin(2,freq=A-DEC)', 'cos(2,freq=A-DEC)',
    'sin(1,freq=A-DEC)', 'cos(1,freq=A-DEC)',
    'dayofweek_cos', 'trend', 'minute', 'dayofweek_sin',
]

In [None]:
# Optional filter, currently disabled:
#train = train.loc[train['quarter'] == 'Q3', :]

# Keep only the model inputs plus the target.  `.copy()` yields an independent
# frame, so the temporary `weights` column below is not written onto a slice.
train = train[selected_cols + ['congestion']].copy()

# Weight every row by the relative frequency of its own direction.  Mapping by
# label keeps each frequency attached to the right direction; pairing the
# frequency-ordered output of value_counts() with a hard-coded list of
# directions only works while both orders happen to coincide.
direction_freq = train['direction'].value_counts(normalize=True)
train['weights'] = train['direction'].map(direction_freq.to_dict()).astype('float64')

# Reproducible weighted subsample of 100,000 rows; the helper column is removed
# afterwards.  `columns=` replaces the positional `axis` argument, which was
# removed from DataFrame.drop in pandas 2.0.
train = train.sample(n=100000, weights='weights', random_state=42)
train.drop(columns=['weights'], inplace=True)
test = test[selected_cols]

import gc
gc.collect()

# Modeling

In [None]:
%%capture
# Install PyCaret with its `full` extra; no version is pinned here.
# NOTE(review): later cells pass `silent=True` to setup() and read a `Label`
# column from predict_model() output - confirm the installed release still
# supports both, or pin a known-good version.
!pip install pycaret[full]

In [None]:
from pycaret.regression import *

# Split the selected features into numeric and non-numeric columns.  The
# non-numeric list is derived by filtering `selected_cols` in order; a set
# difference would return the names in an order that varies with string hash
# randomization from one session to the next.
numeric_cols = train[selected_cols].select_dtypes(include=np.number).columns.tolist()
object_cols = [col for col in selected_cols if col not in numeric_cols]
print('numeric_cols: ', numeric_cols)
print('object_cols: ', object_cols)

In [None]:
# PyCaret experiment configuration.  Options left disabled in this notebook:
# train_size=0.99, create_clusters=True, ignore_low_variance=True,
# remove_multicollinearity=True.
setup_options = dict(
    data=train,
    target='congestion',
    normalize=True,
    normalize_method='robust',
    numeric_features=numeric_cols,
    categorical_features=object_cols,
    session_id=42,
    use_gpu=False,
    silent=True,
    fold=9,
    n_jobs=-1,
)
clf = setup(**setup_options)

In [None]:
# How many of the best-ranked models to keep.
N = 2
# Candidate estimators by PyCaret model ID; 'rf' is deliberately left out of
# this run.
include = [
    'huber', 'knn', 'dt', 'ada',
    'lightgbm', 'catboost', 'et', 'dummy',
]
# Rank the candidates by MAE and keep the best N.
top = compare_models(include=include, sort='MAE', n_select=N)

# Stacking

In [None]:
# Build a stacked model from the selected estimators, optimising MAE.
stack = stack_models(estimator_list=top, optimize='MAE')
# Called without `data`; the trailing semicolon suppresses the cell's echo.
predict_model(estimator=stack);

In [None]:
# Draw PyCaret's 'error' plot for the stacked model.
plot_model(estimator=stack, plot='error')

# Blending

In [None]:
# Build a blended model from the selected estimators, optimising MAE.
blend = blend_models(estimator_list=top, optimize='MAE')
# Called without `data`; the trailing semicolon suppresses the cell's echo.
predict_model(estimator=blend);

In [None]:
# Draw PyCaret's 'error' plot for the blended model.
plot_model(estimator=blend, plot='error')

# Ensembling

In [None]:
# Wrap the first selected model in a bagging ensemble.
ensemble = ensemble_model(estimator=top[0], method='Bagging')
# Called without `data`; the trailing semicolon suppresses the cell's echo.
predict_model(estimator=ensemble);

In [None]:
# Draw PyCaret's 'error' plot for the bagged model.
plot_model(estimator=ensemble, plot='error')

# Predictions

In [None]:
import gc
gc.collect()
# Score the test frame with each fitted model, in the order stack, blend,
# ensemble.
unseen_predictions_stack, unseen_predictions_blend, unseen_predictions_ensemble = (
    predict_model(model, data=test) for model in (stack, blend, ensemble)
)
# Preview the stacked model's predictions.
unseen_predictions_stack.head()

In [None]:
# Write one submission file per model.  Every prediction frame is checked
# against the sample submission first, because zip() would silently truncate
# to the shorter input if the row counts differed.
predictions_by_name = {
    'stack': unseen_predictions_stack,
    'blend': unseen_predictions_blend,
    'ensemble': unseen_predictions_ensemble,
}
for name, predictions in predictions_by_name.items():
    if len(predictions) != len(submission):
        raise ValueError(
            f'{name} predictions have {len(predictions)} rows, '
            f'but the sample submission has {len(submission)}'
        )
    # Pair each row_id with the prediction at the same position.
    sub = pd.DataFrame(list(zip(submission.row_id, predictions.Label)), columns=['row_id', 'congestion'])
    sub.to_csv(f'submission_{name}.csv', index=False)
# Preview the last file written (the ensemble submission).
sub.head()

# Prediction Distribution

In [None]:
def plot_preds_dist(df, preds, target, ax=None, title=''):
    """Overlay KDE curves of a training-target subset and of predictions.

    The training curve uses only the rows of ``df`` where ``weekday`` equals
    ``'WD0'`` and ``hour`` is one of ``'h12'`` ... ``'h23'``.  The function
    relies on the module-level names ``plt`` and ``sns``.

    Args:
        df: Training frame with ``weekday`` and ``hour`` columns and the
            column named by ``target``.
        preds: Predicted values, drawn as the curve labelled "test".
        target: Name of the target column in ``df``.
        ax: Axes to draw on; a new 16x8 figure is created when it is None.
        title: Title for the axes.
    """
    # NOTE(review): the filter assumes `weekday` and `hour` hold string labels
    # such as 'WD0' and 'h12' - confirm against the feature pipeline.
    hour_list = [f'h{i}' for i in range(12, 24)]
    mask = (df.weekday == 'WD0') & df.hour.isin(hour_list)
    train_target = df.loc[mask, target]

    if ax is None:
        _, ax = plt.subplots(figsize=(16, 8))

    sns.kdeplot(x=train_target, ax=ax, label='train')
    sns.kdeplot(x=preds, label='test', ax=ax)
    ax.legend()
    ax.set_title(title)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
    
# One panel per model, stacked vertically.
fig, axes = plt.subplots(3, 1, figsize=(16, 24))
plt.subplots_adjust(hspace=0.5)

panels = (
    (unseen_predictions_stack, "Stack"),
    (unseen_predictions_blend, "Blend"),
    (unseen_predictions_ensemble, "Ensemble"),
)
for panel_ax, (panel_preds, panel_title) in zip(axes, panels):
    plot_preds_dist(train, panel_preds.Label, "congestion", ax=panel_ax, title=panel_title)
plt.show();