In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,5) # set default size of figures

# Load Data

In [None]:
# Read both competition files the same way: the 'time' column is parsed as
# datetimes and becomes the index of each frame.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='time', parse_dates=['time'])
    for csv_path in (
        '../input/tabular-playground-series-mar-2022/train.csv',
        '../input/tabular-playground-series-mar-2022/test.csv',
    )
)

# Explore

In [None]:
# Preview the first rows of the training and test frames, one after the other.
display(train_df.head(), test_df.head())

## Roads

x - the east-west midpoint coordinate of the roadway

y - the north-south midpoint coordinate of the roadway

direction - the direction of travel of the roadway. EB indicates "eastbound" travel, for example, while SW indicates a "southwest" direction of travel.

Let's plot the "map" using the x and y coordinates.

In [None]:
# The "map" only needs to show which (x, y) positions exist, so de-duplicate
# first instead of drawing one overlapping marker per training row.
road_positions = train_df[['x', 'y']].drop_duplicates()
ax = sns.scatterplot(data=road_positions, x='x', y='y')
ax.set(title='Roadway midpoints', xlabel='x (east-west)', ylabel='y (north-south)');

The x coordinate has only 3 distinct values and the y coordinate has 4, which means there are only 12 roadway locations.

There are 8 possible directions: 'EB', 'NB', 'SB', 'WB', 'NE', 'SW', 'NW', 'SE', but a location doesn't have to have a road in every direction. Let's count how many combinations of coordinates and direction are present in the dataset:

In [None]:
# Group by coordinates and direction; the number of groups is the number of
# distinct (x, y, direction) combinations present in the training data.
road_groups = train_df.groupby(['x', 'y', 'direction'])
len(road_groups)

There are 65 different roads in the dataset. 
A road is defined by: coordinates and direction.

## Time

In [None]:
# First and last timestamp covered by the training data.
train_times = train_df.index
display(train_times.min(), train_times.max())

The training data contains samples from 1st April 1991 to 30th September 1991, i.e. 6 months (half a year).

In [None]:
# First and last timestamp covered by the test data.
test_times = test_df.index
display(test_times.min(), test_times.max())

Our model will be evaluated on its predictions for half of one day (the afternoon of 30th September 1991).

## Congestion (target)

congestion - congestion levels for the roadway during each hour; the target. The congestion measurements have been normalized to the range 0 to 100.

Let's draw congestion over time for a few randomly selected roadways.

In [None]:
# Pull out three example roadways; each filter fixes x, y and direction.
roadway1, roadway2, roadway3 = (
    train_df.query(road_filter)
    for road_filter in (
        "x==0 & y==0 & direction=='NB'",
        "x==1 & y==3 & direction=='EB'",
        "x==1 & y==3 & direction=='NE'",
    )
)

# One congestion column per roadway, all aligned on roadway1's timestamps.
congestion_df = pd.DataFrame(
    {
        'roadway1': roadway1.congestion,
        'roadway2': roadway2.congestion,
        'roadway3': roadway3.congestion,
    },
    index=roadway1.index,
)
congestion_df.plot();

In [None]:
# Render the combined table (one congestion column per sample roadway) as the cell output.
congestion_df

The plot is hard to read at this scale. Let's zoom in and display only one (randomly selected) day.

In [None]:
# Slice the rows between the first and last timestamp of 1991-05-29 and plot them.
day_start, day_end = '1991-05-29 00:00:00', '1991-05-29 23:40:00'
congestion_df.loc[day_start:day_end].plot();

It may also be interesting to see the congestion for one roadway location in all of its directions.

In [None]:
# The location x=0, y=0 has three directions: EB, NB and SB.
# Each subset is copied so columns can be added to it later without touching train_df.
roadway00_EB, roadway00_NB, roadway00_SB = (
    train_df.query(road_filter).copy()
    for road_filter in (
        "x==0 & y==0 & direction=='EB'",
        "x==0 & y==0 & direction=='NB'",
        "x==0 & y==0 & direction=='SB'",
    )
)

# One congestion column per direction, aligned on the EB timestamps.
congestion_df = pd.DataFrame(
    {
        'EB': roadway00_EB.congestion,
        'NB': roadway00_NB.congestion,
        'SB': roadway00_SB.congestion,
    },
    index=roadway00_EB.index,
)

# Zoom in on the single day 1991-05-29.
day_start, day_end = '1991-05-29 00:00:00', '1991-05-29 23:40:00'
congestion_df.loc[day_start:day_end].plot();

Now let's aggregate congestion for one road in one direction over all measured days.
Note: following graphs are inspired by Kaggle Time Series tutorial: https://www.kaggle.com/code/ryanholbrook/seasonality/tutorial

In [None]:
from learntools.time_series.utils import seasonal_plot

# Derive the two seasonal axes from the datetime index:
# the day within the year, and the minute within the day.
eb_times = roadway00_EB.index
roadway00_EB["dayofyear"] = eb_times.dayofyear
roadway00_EB["mins_of_day"] = eb_times.hour * 60 + eb_times.minute

# Plot congestion against minute-of-day, using dayofyear as the period.
seasonal_plot(
    roadway00_EB,
    y="congestion",
    period="dayofyear",
    freq="mins_of_day",
);

The graph for all days is a bit messy. Let's plot only Mondays as it is our target day for prediction.

In [None]:
# Tag every row with its weekday, then keep only the rows where dayofweek is 0
# (the Mondays) for the seasonal plot.
roadway00_EB['dayofweek'] = roadway00_EB.index.dayofweek
roadway00_EB_mondays = roadway00_EB[roadway00_EB.dayofweek == 0]

seasonal_plot(
    roadway00_EB_mondays,
    y="congestion",
    period="dayofyear",
    freq="mins_of_day",
);


# Model

## Mean Value

To start, I will just check that I am able to generate a meaningful submission file. Let's use the mean value as the prediction for now.

In [None]:
# Baseline: predict the overall training mean of congestion for every test row.
mean = train_df['congestion'].mean()

# Start the submission frame from the test row ids and attach the constant prediction.
submission = test_df[['row_id']].assign(congestion_mean=mean)

In [None]:
# Render the submission frame (row_id plus the prediction columns added so far) as the cell output.
submission

Note: the public score of the mean value is 14.077

## XGB Regressor

### Add Some Features

In [None]:
def engineer_features(df):
    """Add calendar features derived from the datetime index, in place.

    Three columns are added to ``df``:

    * ``dayofweek``   - ``df.index.dayofweek``
    * ``week``        - ISO week number from ``df.index.isocalendar()``, cast to int
    * ``mins_of_day`` - minutes since midnight (``hour * 60 + minute``)

    The ``row_id`` column is then removed if it is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``dayofweek``, ``isocalendar()``, ``hour``
        and ``minute`` (i.e. a ``DatetimeIndex``). It is modified in place.

    Returns
    -------
    None
        The result is the mutated ``df`` itself.
    """
    timestamps = df.index
    df['dayofweek'] = timestamps.dayofweek
    df['week'] = timestamps.isocalendar().week.astype(int)
    df["mins_of_day"] = timestamps.hour * 60 + timestamps.minute

    # errors='ignore' keeps the function re-runnable: calling it again on a
    # frame whose row_id was already dropped must not raise KeyError.
    df.drop(columns='row_id', inplace=True, errors='ignore')
    
# Run the same in-place feature engineering on both splits.
for split_df in (train_df, test_df):
    engineer_features(split_df)

In [None]:
# Features are every training column except the target; get_dummies expands
# the non-numeric columns into indicator columns.
X_train = pd.get_dummies(train_df.drop(columns='congestion'))
y_train = train_df['congestion']

# The test split is encoded on its own, so its indicator columns are not
# guaranteed to match the training ones. Reindex onto the training layout:
# columns missing from the test split are added as all-zero, unknown ones are
# dropped, and the order is made identical to what the model is fitted on.
X_test = pd.get_dummies(test_df).reindex(columns=X_train.columns, fill_value=0)

### Create Model

#### Whole dataset

In [None]:
from xgboost import XGBRegressor

# Regressor created with the library's default hyperparameters. The same
# object is fitted in the "Only Mondays" section further down.
xgb_model = XGBRegressor()

# Whole-dataset experiment, left disabled. Un-commenting these lines fits on
# all training rows and stores the predictions in submission['congestion_xgb'].
# xgb_model.fit(X_train, y_train)
# y_pred = xgb_model.predict(X_test)
# submission['congestion_xgb'] = y_pred

Note: public score of this model is 5.436

#### Only Mondays

The target day is Monday -> let's try what happens if we train the model only on Mondays. Will the score be better or worse?

In [None]:
# Build the Monday filter (dayofweek == 0) once and apply it to both the
# features and the target so the two stay row-aligned.
monday_rows = X_train.dayofweek == 0
X_train_mondays = X_train[monday_rows]
y_train_mondays = y_train[monday_rows]

# Fit on Mondays only, then predict the test rows and store the result.
xgb_model.fit(X_train_mondays, y_train_mondays)
y_pred_mondays = xgb_model.predict(X_test)
submission['congestion_xgb_mondays'] = y_pred_mondays

Note: the score of this model is 5.283, which is slightly better than the previous one (using the whole dataset).
I will select these two models for Final Score to see if this comparison is valid also for the final score...

*Few days later (after the competition deadline)*

The final score of the models is:
* whole dataset: 5.436
* only mondays: 5.283

We see the same pattern: the model trained on only a fraction of the original data performs better than the one trained on the whole training dataset.
This brings me to the conclusion and main lesson learned from this competition: **Less is sometimes more. Even in Data Science.**

## Save into submission file

In [None]:
# Map each exported column to the header it gets in the file: the Monday-model
# predictions are written under the name 'congestion'.
export_headers = {'row_id': 'row_id', 'congestion_xgb_mondays': 'congestion'}
submission.to_csv(
    'submission.csv',
    columns=list(export_headers.keys()),
    header=list(export_headers.values()),
    index=False,
)