# TPS-MARCH 2022

Hey guys 👋, welcome to my EDA notebook 🕵️📚 for the TPS-March 2022 competition organized by Kaggle.

## Introduction to the competition

### 1. Objective of the Competition
In this competition, you'll forecast twelve-hours of traffic flow in a major U.S. metropolitan area.

### 2. File Structure of the Competition
Like any other Kaggle competition, this one has a `train.csv` and a `test.csv` file. Alongside those there is a `sample_submission.csv`, which shows what the submission file should look like.

### 3. Evaluation metric used
Submissions are evaluated on the mean absolute error (MAE) between predicted and actual congestion values for each time period in the test set.

### Feature Description
* `row_id`: a unique identifier for this instance
* `time` : the 20-minute period in which each measurement was taken
* `x` : the east-west midpoint coordinate of the roadway
* `y` : the north-south midpoint coordinate of the roadway
* `direction`: the direction of travel of the roadway. For example, `EB` indicates east-bound travel and `SW` indicates south-west-bound travel.
* `congestion` : congestion levels for the roadway during each hour; the target. (NOTE: The congestion measurements have been normalized to the range 0 to 100.)


# Import the Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings

from IPython.display import display
from colorama import Fore, Style, Back

# Apply matplotlib's 'fivethirtyeight' style sheet to the charts drawn in this notebook.
plt.style.use('fivethirtyeight')

# Suppress every warning so library notices do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Load the Datasets

In [None]:
# Directory that holds both competition CSV files.
DATA_DIR = '../input/tabular-playground-series-mar-2022'


def _read_traffic_csv(filename):
    """Read one competition CSV, using its parsed `time` column as the index."""
    # `infer_datetime_format` is deliberately not passed: the keyword is
    # deprecated since pandas 2.0, and `parse_dates=True` alone already parses
    # the index column into datetimes.
    return pd.read_csv(f'{DATA_DIR}/{filename}', index_col='time', parse_dates=True)


df_train = _read_traffic_csv('train.csv')
df_test = _read_traffic_csv('test.csv')


# Show the shape and the first five rows of each dataset.
print(Fore.YELLOW, f'Shape of Training Data : {df_train.shape}', Style.RESET_ALL)
display(df_train.head(5))

print('\n\n')

print(Fore.BLUE, f'Shape of Test Data: {df_test.shape}', Style.RESET_ALL)
display(df_test.head(5))

# Dataset Information

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appends a stray "None" to the output.
print(Fore.YELLOW, '<---Training data information--->', Style.RESET_ALL)
df_train.info()

print('\n\n')

print(Fore.BLUE, '<---Test data information--->', Style.RESET_ALL)
df_test.info()

# Number of unique values in each attribute of the dataset

In [None]:
def calc_n_unqiue_per_attr(data, is_train=True):
    """Display a per-column summary of the unique values in `data`.

    For every column except `row_id`, the summary holds the column name
    (`attribute`), its number of distinct values (`n_unique`) and the array
    of those values (`uniques`).

    Args:
        data: DataFrame to summarise.
        is_train: Selects the coloured banner printed above the table --
            the training-data banner when true, the test-data banner otherwise.

    Returns:
        None. The banner is printed and the summary table is displayed.
    """
    summary_rows = [
        {
            'attribute': name,
            'n_unique': data[name].nunique(),
            'uniques': data[name].unique(),
        }
        for name in data.columns
        if name != 'row_id'
    ]
    summary = pd.DataFrame.from_records(summary_rows)

    # Only the banner differs between the two datasets; the table is shown the same way.
    if is_train:
        colour, banner = Fore.YELLOW, '<---Information of Training data--->'
    else:
        colour, banner = Fore.BLUE, '<--- Information of Test data --->'

    print(colour, banner, Style.RESET_ALL)
    display(summary)
        

In [None]:
# Summarise the training set first, then the test set.
for frame, train_flag in ((df_train, True), (df_test, False)):
    calc_n_unqiue_per_attr(frame, is_train=train_flag)

# Charts

## Distribution of $x$ and $y$ in training and test set

In [None]:
# 2x2 grid: one row per coordinate (x, y), one column per dataset (train, test).
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 5))
# NOTE(review): `top` and `right` are above 1, which places the axes beyond the
# nominal figure area -- presumably relying on the notebook renderer to enlarge
# the image accordingly; confirm if the figure is ever saved to a file.
plt.subplots_adjust(top=2.4, right=1.5, hspace=.4, wspace=.4)

# (axes, column, dataset, title, bar colour) for each panel, in drawing order.
panels = [
    (ax[0, 0], 'x', df_train, 'Distribution of $x$ in training data', 'royalblue'),
    (ax[0, 1], 'x', df_test, 'Distribution of $x$ in test data', 'indianred'),
    (ax[1, 0], 'y', df_train, 'Distribution of $y$ in training data', 'indigo'),
    (ax[1, 1], 'y', df_test, 'Distribution of $y$ in test data', 'crimson'),
]
for axis, column, frame, heading, colour in panels:
    axis.set_title(heading)
    sns.countplot(x=column, data=frame, ax=axis, color=colour)

plt.show()

## Distribution of direction in train and test set (throughout)

In [None]:
# Two stacked panels: row counts per travel direction, train on top and test below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 5))
plt.subplots_adjust(top=1.4, hspace=.5)

panels = [
    (ax[0], df_train, 'Distribution of "direction" in training data', 'royalblue'),
    (ax[1], df_test, 'Distribution of "direction" in test data', 'indianred'),
]
for axis, frame, heading, colour in panels:
    axis.set_title(heading)
    sns.countplot(x='direction', data=frame, ax=axis, color=colour)

plt.show()

## Distribution of "direction" based on $(x, y)$ coordinates of train and test set

In [None]:
def _direction_counts_by_location(frame):
    """Count rows per (x, y, direction) combination in `frame`.

    Returns a DataFrame with the columns `direction`, `counts` and `(x, y)`,
    where `(x, y)` holds each location as a tuple so it can serve as a single
    categorical axis.
    """
    counts = (
        frame.groupby(['x', 'y'])['direction']
        .value_counts()
        # Name the result explicitly instead of renaming a column afterwards:
        # the default name of a value_counts() result changed in pandas 2.0
        # ('direction' before, 'count' after), so a column rename keyed on
        # 'direction' silently does nothing on newer versions and the
        # 'counts' column needed by the plot would be missing.
        .rename('counts')
        .reset_index()
    )
    counts['(x, y)'] = list(zip(counts['x'], counts['y']))
    return counts.drop(columns=['x', 'y'])


fig, ax = plt.subplots(2, 1, figsize=(15, 5))
plt.subplots_adjust(top=1.4, hspace=.8)

train_grp1 = _direction_counts_by_location(df_train)
test_grp1 = _direction_counts_by_location(df_test)

panels = [
    (ax[0], train_grp1, 'Distribution of $direction$ in train set based on $(x, y)$ coordinates'),
    (ax[1], test_grp1, 'Distribution of $direction$ in test set based on $(x, y)$ coordinates'),
]
for axis, grouped, heading in panels:
    # The title is padded upwards to leave room for the legend anchored above the axes.
    axis.set_title(heading, pad=50)
    sns.barplot(x='(x, y)', y='counts', hue='direction', data=grouped, palette='muted', ax=axis)
    axis.legend(ncol=8, loc='upper right', bbox_to_anchor=(1, 1.2))

plt.show()

## Introducing Time-Based Features

In [None]:
# Derive the same calendar features from the datetime index of both datasets.
for frame in (df_train, df_test):
    stamps = frame.index
    frame['hour'] = stamps.hour
    frame['minute'] = stamps.minute
    frame['dayname'] = stamps.day_name()

## Distribution of $hour$ in train and test set (throughout)

In [None]:
# Row counts per hour of day, train on top and test below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 5))
plt.subplots_adjust(hspace=.4, top=1.4)

panels = [
    (ax[0], df_train, 'Distribution of $hour$ in train set', 'royalblue'),
    (ax[1], df_test, 'Distribution of $hour$ in test set', 'indianred'),
]
for axis, frame, heading, colour in panels:
    axis.set_title(heading)
    sns.countplot(x='hour', data=frame, ax=axis, color=colour)

plt.show()

## Distribution of $min$ in train and test set (throughout)

In [None]:
# Row counts per minute-of-hour value, train on top and test below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 5))
plt.subplots_adjust(hspace=.4, top=1.4)

panels = [
    (ax[0], df_train, 'Distribution of $min$ in train set', 'royalblue'),
    (ax[1], df_test, 'Distribution of $min$ in test set', 'indianred'),
]
for axis, frame, heading, colour in panels:
    axis.set_title(heading)
    sns.countplot(x='minute', data=frame, ax=axis, color=colour)

plt.show()

## Distribution of $dayname$ in train and test set (throughout)

In [None]:
# Row counts per weekday name, train on top and test below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 5))
plt.subplots_adjust(hspace=.4, top=1.4)

panels = [
    (ax[0], df_train, 'Distribution of $dayname$ in train set', 'royalblue'),
    (ax[1], df_test, 'Distribution of $dayname$ in test set', 'indianred'),
]
for axis, frame, heading, colour in panels:
    axis.set_title(heading)
    sns.countplot(x='dayname', data=frame, ax=axis, color=colour)

plt.show()

## Distribution of $congestion$ 

In [None]:
# Histogram of the target with a KDE overlay, drawn on a density scale.
# sns.distplot is deprecated since seaborn 0.11; histplot with kde=True and
# stat='density' is the replacement named in seaborn's migration guidance.
plt.figure(figsize=(15, 5))
sns.histplot(df_train['congestion'], kde=True, stat='density')
plt.show()

## Pattern of $congestion$ based on $dayname$

In [None]:
# Calendar order for the weekday axis.
day_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

# Mean congestion per weekday, re-indexed from Monday through Sunday.
daily_mean = df_train.groupby('dayname')['congestion'].mean()

plt.title('Pattern of $congestion$ based on $dayname$')
daily_mean.loc[day_order].plot(figsize=(15, 5))
plt.ylabel('Average Congestion')
plt.show()

In [None]:
# Mean congestion for each hour of the day across the whole training set.
hourly_mean = df_train.groupby('hour')['congestion'].mean()
hourly_mean.plot(title='Pattern of $congestion$ based on $hour$', figsize=(15, 5))
plt.ylabel('Average congestion')
plt.show()

## Pattern of $congestion$ based on $hour$ and $dayname$

In [None]:
# One line colour per weekday, in the same order as `day_order`.
weekday_colours = [
    "#bd1c1c", "#deb626", "#85de26", "#269ede",
    "#7026de", "#c226de", "#42f5d4",
]

# Rows are hours, columns are weekday names; each cell is the mean congestion.
hour_by_day = df_train.groupby(['hour', 'dayname'])['congestion'].mean().unstack()

hour_by_day[day_order].plot(
    figsize=(15, 5),
    color=weekday_colours,
    title="Pattern of $congestion$ based on $hour$ and $dayname$",
)

plt.ylabel("Average Congestion")
plt.show()