# Load Data

In [None]:
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import datetime

# All three input files are read from the same directory; keep that directory
# in one constant so the location only has to be changed in a single place.
DATA_DIR = "../input/tabular-playground-series-mar-2022"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Parse the timestamp column once and derive the day-of-week from it.
timestamps = pd.to_datetime(train["time"])
train["time"] = timestamps
train["weekday"] = timestamps.dt.dayofweek

# Roadway key: x and y rendered as text and concatenated, then the direction
# appended (no separator is inserted between the parts).
xy_label = train["x"].astype(str) + train["y"].astype(str)
train["x+y"] = xy_label
train["x+y+direction"] = xy_label + train["direction"].astype(str)

*65 Roadways List*

In [None]:
# Distinct roadway keys found in `train`, as a plain Python list.
roadway_keys = train["x+y+direction"]
roadways_list = roadway_keys.unique().tolist()

# Calculate Trends for Roadways

In [None]:
# For every roadway: scatter its congestion over time, fit a straight line to
# it, overlay that line, and record half of the fitted change between the first
# and last observation ("trend difference from midpoint").
#
# Outputs:
#   trend_difference_dict - roadway key -> half of the fitted change
#   trend_difference_list - the same values, in `roadways_list` order
#   trend_difference_df   - two-column frame ('roadway', 'trend')
trend_difference_dict = {}
trend_difference_list = []
for roadway in roadways_list:
    # Sort by time so the first/last rows really are the endpoints of the
    # period; mergesort is stable, so already-ordered data keeps its order.
    roadway_df = (
        train[train["x+y+direction"] == roadway]
        .sort_values("time", kind="mergesort")
        .set_index("time", drop=False)
    )

    fig, ax = plt.subplots(1, 1)
    ax.set_title(f"Roadway - {roadway}")
    ax.scatter(roadway_df.index, roadway_df["congestion"], s=0.1)
    # Same fixed x-range on every figure.
    ax.set_xlim([datetime.date(1991, 4, 1), datetime.date(1991, 9, 30)])

    # linregress needs numeric x values. toordinal() yields whole-day numbers,
    # so the fitted slope is the change in congestion per day.
    day_ordinals = roadway_df.index.map(datetime.date.toordinal)
    fit = stats.linregress(day_ordinals, roadway_df["congestion"])

    # Fitted congestion at the first and the last observation.
    y_start = fit.intercept + fit.slope * day_ordinals[0]
    y_end = fit.intercept + fit.slope * day_ordinals[-1]

    # Overlay the regression line on the same axes as the scatter.
    ax.plot(
        roadway_df["time"].iloc[[0, -1]].to_numpy(),
        [y_start, y_end],
        c='r',
        label='daily trend={:.3f}'.format(fit.slope),
    )
    ax.legend(loc='lower right')
    ax.set_xlabel('time')
    ax.set_ylabel('congestion')

    # Render this figure now and release it: one figure is created per roadway,
    # and leaving them all open triggers matplotlib's "too many open figures"
    # warning and keeps every figure in memory.
    plt.show()
    plt.close(fig)

    trend_difference_from_midpoint = (y_end - y_start) / 2
    trend_difference_dict[roadway] = trend_difference_from_midpoint
    trend_difference_list.append(trend_difference_from_midpoint)

trend_difference_df = pd.DataFrame.from_dict({'roadway': roadways_list, 'trend': trend_difference_list})

In [None]:
# Preview the first five roadway trend rows.
trend_difference_df.head(5)

# Total Change from Midpoint

In [None]:
# Order the roadways from the lowest to the highest trend value (in place),
# then draw one bar per roadway.
trend_difference_df.sort_values(by='trend', inplace=True)
ax = trend_difference_df.plot(
    kind='bar',
    x='roadway',
    y='trend',
    legend=False,
    figsize=(20, 3),
    title='Trend Change from Midpoint for Roadways',
)
ax.set_ylabel("Δ Congestion")

*It seems that some of the roadways, such as **21EB, 12NB and 11NB**, have quite large trend values.*

In [None]:
# Write the roadway/trend table to CSV. index=True also writes the frame's
# row labels as the first column of the file.
trend_columns = ["roadway", "trend"]
trend_export = trend_difference_df[trend_columns]
trend_export.to_csv("trend_difference_df.csv", sep=",", index=True)

# Show the first ten rows of the table.
trend_difference_df.head(10)