# Introduction
Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it is really appreciated and motivates me to offer more content to the Kaggle community! 😊

👉 The exploratory data analysis (EDA) and feature engineering (FE) are done in this [notebook](https://www.kaggle.com/hasanbasriakcay/tps-mar22-eda-fe).

In [None]:
import pandas as pd
import numpy as np
import warnings

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Load the pre-featured train/test frames and the sample submission.
# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files from a trusted source.
train = pd.read_pickle('../input/tpsmar22-deterministicholidaytime-features/train_featured_v2.pkl')
test = pd.read_pickle('../input/tpsmar22-deterministicholidaytime-features/test_featured_v2.pkl')
submission = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')

# `display` is not imported in this file.
# NOTE(review): presumably the IPython/Jupyter builtin - confirm before
# running this as a plain script.
display(train.head())
display(submission.head())

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Integer columns are cast to the narrowest of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive, so a
    column topping out at exactly 127 still fits int8). Float columns are
    cast to float32 when their values fit, otherwise float64; note that
    float64 -> float32 reduces precision. Columns of any other dtype are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # First (narrowest) type that can represent both extremes wins.
            # For an empty column min/max are NaN, every comparison is
            # False, and the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

In [None]:
# Downcast the numeric columns of both frames; reduce_mem_usage modifies
# the frame it is given and returns that same frame.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# Trend

In [None]:
def create_trend(df_train, df_test, time_col, target):
    """Fit a linear trend per ``direction`` on train and extrapolate it to test.

    For every distinct value of ``df_train['direction']`` a constant plus
    order-1 time trend is built with ``DeterministicProcess`` over that
    direction's training rows and fitted to ``target`` by least squares.
    The test rows of the same direction get the *continuation* of that
    trend (``out_of_sample``), so the trend counter carries on after the
    last training row instead of restarting at 1.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames with a ``direction`` column and the ``time_col`` column;
        ``df_train`` must also contain ``target``.
    time_col : str
        Column used as the index of the generated trend features.
    target : str
        Column in ``df_train`` to regress on the trend.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``(trend_train_dict, trend_test_dict, trend_model_dict)``, each
        keyed by direction. The first two map to a ``pandas.Series`` of
        fitted / extrapolated trend values in the row order of the
        corresponding frame subset; the third maps to the fitted
        ``LinearRegression``.
    """
    from statsmodels.tsa.deterministic import DeterministicProcess
    from sklearn.linear_model import LinearRegression

    trend_model_dict = {}
    trend_train_dict = {}
    trend_test_dict = {}

    for direction in df_train['direction'].unique():
        train_rows = df_train.loc[df_train['direction'] == direction, :]

        dp = DeterministicProcess(
            index=train_rows[time_col],
            constant=True,   # intercept column
            order=1,         # linear trend column
            drop=True,       # drop perfectly collinear terms
        )
        X = dp.in_sample()
        y = train_rows[target]

        # The intercept is already supplied by the 'const' column.
        model = LinearRegression(fit_intercept=False)
        model.fit(X, y)

        trend_model_dict[direction] = model
        trend_train_dict[direction] = pd.Series(model.predict(X), index=X.index)

        test_rows = df_test.loc[df_test['direction'] == direction, :]
        if len(test_rows) == 0:
            # Nothing to forecast for this direction.
            trend_test_dict[direction] = pd.Series(dtype=float)
            continue

        # Continue the same deterministic process past the training rows;
        # forecast_index only labels the rows, the trend values follow on
        # from the in-sample counter.
        X_test = dp.out_of_sample(
            steps=len(test_rows),
            forecast_index=pd.Index(test_rows[time_col]),
        )
        trend_test_dict[direction] = pd.Series(model.predict(X_test), index=X_test.index)

    return trend_train_dict, trend_test_dict, trend_model_dict

In [None]:
def plot_trend(trend_train_dict, trend_test_dict):
    """Plot the train and test trend lines of the first direction only.

    Both arguments map a direction key to a series of trend values. The
    loop stops after the first key of ``trend_train_dict``, so exactly one
    figure is produced (none when the dict is empty).
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    for direction in trend_train_dict:
        fitted = trend_train_dict[direction]
        forecast = trend_test_dict[direction]

        fig, ax = plt.subplots(figsize=(16, 8))
        # Train first, then test, on the same axes.
        for series in (fitted, forecast):
            sns.lineplot(x=series.index, y=series.values, ax=ax)
        ax.set_title(direction)

        # Only the first direction is drawn.
        break
        

In [None]:
# Fit the per-direction trend of 'congestion' over 'time' on train and get
# the matching trend values for test; the third return value is discarded.
trend_train_dict, trend_test_dict, _ = create_trend(train, test, 'time', 'congestion')

In [None]:
# Per-direction trend plot, currently disabled.
#plot_trend(trend_train_dict, trend_test_dict)

# Mean congestion over all rows that share a timestamp (one row per
# distinct 'time' value).
train_group = train.groupby('time', as_index=False).agg({'congestion':'mean'})
# Centered rolling mean over 182 * 24 consecutive rows of train_group.
# NOTE(review): that span is 182 days only if there is exactly one row per
# hour; the 'minute' grouping key used elsewhere in this file suggests a
# finer sampling - confirm the window and the plot title.
moving_average = train_group['congestion'].rolling(
    window=182 * 24,       # 182-day 24-hour window
    center=True,           # puts the average at the center of the window
    min_periods=91 * 24,   # choose about half the window size
).mean()  
# NOTE(review): bare expression in the middle of the cell; presumably left
# over from inspecting the series - it has no effect here.
moving_average

# Grey dots: per-timestamp mean; thick line: the rolling mean on the same axes.
ax = train_group['congestion'].plot(style=".", color="0.5", figsize=(16, 8))
moving_average.plot(
    ax=ax, linewidth=5, title="Congestion - 182-Day Moving Average", legend=False,
);

In [None]:
from scipy.stats import linregress

# Least-squares line through the per-timestamp mean congestion, using the
# row position (0, 1, 2, ...) as the regressor.
n_points = len(train_group)
slope, intercept, r, p, se = linregress(np.arange(0, n_points), train_group['congestion'])

# Two end points are enough to draw the fitted straight line.
fx = np.array([0, n_points - 1])
fy = slope * fx + intercept

# Grey dots: the data; red line: the fit, annotated with its slope.
ax = train_group['congestion'].plot(style=".", color="0.5", figsize=(16, 8))
ax.plot(fx, fy, '-', linewidth=5, color='red')
ax.text(0, 64, 'slope: ' + str(slope), fontsize=18, color="red")
ax.set_title('Linregress Trend');

# Lag Features

In [None]:
def create_lag_features(df, lags=1, target=''):
    """Add ``lag_1`` .. ``lag_<lags>`` columns of ``target`` to ``df`` in place.

    Lags are computed separately for every (x, y, direction) combination:
    ``lag_k`` holds the value of ``target`` found ``k`` rows earlier among
    the rows of the same combination, in the existing row order of ``df``,
    and NaN where no such row exists.
    NOTE(review): this is a lag in time only if the rows of each
    combination are already sorted by time - confirm against the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``x``, ``y``, ``direction`` and ``target`` columns. It
        is modified in place.
    lags : int, default 1
        Number of lag columns to create; ``0`` adds nothing.
    target : str
        Name of the column to lag.

    Returns
    -------
    None
    """
    # One grouped shift per lag replaces a boolean mask per
    # (direction, x, y) combination, and operates on the frame that was
    # passed in rather than on a module-level one.
    grouped_target = df.groupby(['x', 'y', 'direction'], sort=False)[target]
    for lag in range(1, lags + 1):
        df[f'lag_{lag}'] = grouped_target.shift(lag)

def plot_lag_features(df, lags=1, target='', ncols=1):
    """Draw a grid of regression scatter plots of ``target`` against its lags.

    Panels are filled row by row with ``lag_1``, ``lag_2``, ... Each panel
    plots ``df[target]`` on x and the lag column on y with a red fitted
    line, and shows their correlation (``Series.corr``) in the upper-left
    corner.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target`` and the ``lag_<k>`` columns.
    lags : int, default 1
        Number of lag columns available.
    target : str
        Name of the target column.
    ncols : int, default 1
        Panels per row. The row count is ``int(lags / ncols)``, so lags
        beyond ``nrows * ncols`` are not plotted.
    """
    from matplotlib.offsetbox import AnchoredText
    import matplotlib.pyplot as plt
    import seaborn as sns

    nrows = int(lags / ncols)
    fig, axes = plt.subplots(
        nrows,
        ncols,
        sharex=True,
        sharey=True,
        squeeze=False,  # always a 2-D array of axes, even for one row/column
        figsize=(ncols * 2, nrows * 2 + 0.5),
    )
    fig.tight_layout(w_pad=0.1, h_pad=0.1)
    plt.subplots_adjust(hspace=0.2)

    for panel in range(nrows * ncols):
        # Row-major position of this panel in the grid.
        row, col = divmod(panel, ncols)
        ax = axes[row][col]
        lag_col = f'lag_{panel + 1}'

        # Tiny, nearly transparent markers keep dense scatters readable.
        scatter_kws = {'alpha': 0.1, 's': 0.1}
        line_kws = {'color': 'red'}
        corr = df[target].corr(df[lag_col])
        sns.regplot(
            x=df[target].values,
            y=df[lag_col].values,
            scatter_kws=scatter_kws,
            line_kws=line_kws,
            ax=ax,
        )

        at = AnchoredText(
            f"{corr:.2f}",
            prop=dict(size="large"),
            frameon=True,
            loc="upper left",
        )
        at.patch.set_boxstyle("square, pad=0.0")
        ax.add_artist(at)
        ax.set_title(lag_col)

        # Axis labels are only set on the first column of rows 0 and 1 (y)
        # and on every panel of row 1 (x).
        if col == 0 and row in (0, 1):
            ax.set_ylabel(target)
        if row == 1:
            ax.set_xlabel(target)

In [None]:
# Add lag_1 .. lag_12 of 'congestion' to train, then plot each lag against
# the target in a 2 x 6 grid (12 lags, 6 columns).
create_lag_features(train, lags=12, target='congestion')
plot_lag_features(train, lags=12, target='congestion', ncols=6)

# Rule Based Model

In [None]:
def sub_trend(df, trend_dict, target):
    """Subtract each direction's trend from ``df[target]`` in place.

    ``trend_dict`` maps a direction to a series of trend values. Its values
    are applied positionally to the rows of ``df`` with that direction, so
    it needs one entry per distinct direction and as many values as there
    are matching rows. Returns ``None``.
    """
    for name in df['direction'].unique():
        in_direction = df['direction'].eq(name)
        detrended = df.loc[in_direction, target] - trend_dict[name].values
        df.loc[in_direction, target] = detrended
        
def add_trend(df, trend_dict, target):
    """Add each direction's trend to ``df[target]`` in place.

    ``trend_dict`` maps a direction to a series of trend values. Its values
    are applied positionally to the rows of ``df`` with that direction, so
    it needs one entry per distinct direction and as many values as there
    are matching rows. Returns ``None``.
    """
    for name in df['direction'].unique():
        in_direction = df['direction'].eq(name)
        retrended = df.loc[in_direction, target] + trend_dict[name].values
        df.loc[in_direction, target] = retrended

In [None]:
# Remove the fitted per-direction trend from the training target in place.
# The trend is added back with add_trend before the final histogram.
sub_trend(train, trend_train_dict, 'congestion')

In [None]:
# Median of the (detrended, see the sub_trend call above) congestion for
# every (x, y, direction, weekend, hour, minute) combination; the result is
# indexed by those six keys and has a single 'congestion' column.
medians = train.groupby(['x', 'y', 'direction', 'weekend', 'hour', 'minute']).agg({'congestion':'median'})
medians

In [None]:
# Materialise the per-direction test trend as a 'trend' column: start from
# zeros in a temporary 'congestion' column, let add_trend fill in the
# values, copy the result and drop the temporary column again.
# Float zeros avoid an int -> float upcast when the trend is added.
test['congestion'] = 0.0
add_trend(test, trend_test_dict, 'congestion')
test['trend'] = test['congestion']
# `axis` is keyword-only in current pandas; name the columns explicitly.
test.drop(columns=['congestion'], inplace=True)

In [None]:
# Look up the detrended median for every test row. A left merge keeps one
# output row per test row, in test order, with the test index preserved.
key_cols = ['x', 'y', 'direction', 'weekend', 'hour', 'minute']
merged = test.merge(medians, how='left', left_on=key_cols, right_index=True)

# Fail loudly instead of writing NaN predictions when a test key
# combination never occurs in train.
n_missing = int(merged['congestion'].isna().sum())
if n_missing:
    raise ValueError(f'{n_missing} test rows have no matching median in train')

# Add the trend inside the merged frame so every median is paired with the
# trend of its own row, then round to whole congestion values.
merged['congestion'] = (merged['congestion'] + merged['trend']).round()

# The former index becomes the first column; both columns then take the
# names used by the sample submission.
sub = merged[['congestion']].reset_index()
sub.columns = submission.columns
sub.to_csv('submission.csv', index=False)
sub

# Preds Distribution

In [None]:
import matplotlib.pyplot as plt

# Compare the distribution of the test predictions with the training
# congestion observed on Monday afternoons.
# Put the trend back into the training target first so both histograms are
# on the same scale.
add_trend(train, trend_train_dict, 'congestion')

plt.figure(figsize=(16,3))

# 101 unit-wide bins, one centred on each integer from 0 to 100.
bin_edges = np.linspace(-0.5, 100.5, 102)
# Training rows on weekday 0 (Monday) at hour 12 or later.
monday_afternoon = ((train.time.dt.weekday == 0) & (train.time.dt.hour >= 12)).values

plt.hist(
    train.congestion[monday_afternoon],
    bins=bin_edges,
    density=True,
    label='Train',
    color='b',
)
# Narrower red bars so the blue training bars stay visible behind them.
plt.hist(
    sub['congestion'],
    bin_edges,
    density=True,
    rwidth=0.5,
    label='Test predictions',
    color='r',
)

plt.xlabel('Congestion')
plt.ylabel('Frequency')
plt.title('Congestion on Monday afternoons')
plt.legend()
plt.show()

References:

[1] [notebook](https://www.kaggle.com/ambrosm/tpsmar22-without-machine-learning/notebook?scriptVersionId=89093653)