In [145]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
import plotly.figure_factory as ff

from sklearn.preprocessing import minmax_scale
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

# from functools import partial
# from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input mount and echo the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [146]:
# Read the training CSV and keep only the rows without any missing value.
# df_test = pd.read_csv('/kaggle/input/bigquery-geotab-intersection-congestion/test.csv')
train = pd.read_csv(
    '/kaggle/input/bigquery-geotab-intersection-congestion/train.csv'
).dropna()
# Both names refer to the same NaN-free frame.
df_train = train

In [147]:
# Preview the first five rows to see which columns the frame carries.
df_train.head(n=5)

In [148]:
# Summary statistics, with the default quartiles spelled out explicitly.
df_train.describe(percentiles=[0.25, 0.5, 0.75])

In [149]:
# Count the distinct non-null values in every column.
df_train.nunique(axis=0, dropna=True)

In [150]:
# List the distinct city names present in the data.
df_train.loc[:, "City"].unique()

In [151]:
# Bar chart of rows per city, followed by each bar's share of all rows.
total = len(df_train)
plt.figure()
g = sns.countplot(data=df_train, x="City")
g.set(title="City Count Distribution", ylabel="Count", xlabel="City")
plt.show()
# One percentage per bar, printed in the order the bars were drawn.
for p in g.patches:
    height = p.get_height()
    print(f'{height/total*100:1.2f}%')

* The most common value is Philadelphia, which has 45.29% of the total entries.

In [152]:
# Rows per hour of the day, with one bar colour per city.
plt.figure()
g = sns.countplot(data=df_train, x="Hour", hue='City', dodge=True)
g.set_title("Hour Count Distribution", fontsize=20)
g.set_xlabel("Hours", fontsize=17)
g.set_ylabel("Count", fontsize=17)
# Collect the bar heights before the figure is rendered.
sizes = [patch.get_height() for patch in g.patches]
plt.show()


* Cities have different hours patterns.
* Philadelphia is the most common city at every hour. The lowest point is at 5 a.m.
* Atlanta has fewer entries throughout the day, but it has more rides from 5 p.m. (17:00) to 4 a.m.

In [153]:
# Rows per month, with one bar colour per city.
plt.figure()
g = sns.countplot(data=df_train, x="Month", hue='City', dodge=True)
g.set(title="Month Count Distribution", ylabel="Count", xlabel="Month")
plt.show()

* The data covers only 6 months.
* Counts for Boston and Philadelphia increased throughout the period, while the other cities seem unchanged.


In [154]:
# Rows per entry heading, split by the exit heading taken.
plt.figure()
g = sns.countplot(data=df_train, x="EntryHeading", hue='ExitHeading', dodge=True)
g.set(
    title="Entry heading by Exit Heading",
    ylabel="Count",
    xlabel="Entry Heading Region",
)
plt.show()

* Generally, the entry and exit headings are the same.

In [155]:
# Rows per entry heading, with one bar colour per city.
plt.figure()
g1 = sns.countplot(data=df_train, x="EntryHeading", hue='City')
g1.set(
    title="Entry Heading By Cities",
    ylabel="Count",
    xlabel="Entry Heading Region",
)
plt.show()

* Cities have different patterns for entry heading.

In [156]:
# predict values
# Each measure comes in three variants, distinguished by the _p20 / _p50 / _p80 suffix.
_P_SUFFIXES = ('p20', 'p50', 'p80')
t_stopped = [f'TotalTimeStopped_{suffix}' for suffix in _P_SUFFIXES]
d_first_stopped = [f'DistanceToFirstStop_{suffix}' for suffix in _P_SUFFIXES]

In [157]:
# Pairwise correlation between the target columns, drawn as an annotated heatmap.
plt.figure()
plt.title('Correlation of Features for Train Set')
target_corr = df_train[t_stopped + d_first_stopped].astype(float).corr()
sns.heatmap(target_corr, vmax=1.0, annot=True)
plt.show()

* Distance to First Stop p20 and Total Time Stopped p20 have a high correlation.

In [158]:
# Rescale every target column to [0, 1] and store it beside the original as "<name>_n".
target_cols = t_stopped + d_first_stopped
for col in target_cols:
    df_train[f"{col}_n"] = minmax_scale(df_train[col], feature_range=(0, 1))

# Names of the scaled columns, in the same order as target_cols.
cols = [f"{name}_n" for name in target_cols]

# df_train[cols].isna().sum()

In [159]:
# Project the scaled target columns onto their first two principal components
# and attach them to df_train as columns pc0 and pc1.
pca = PCA(n_components=2, random_state=5)
pc_values = pca.fit_transform(df_train[cols])

prefix = 'pc'
pc = pd.DataFrame(
    pc_values,
    # Reuse df_train's own row labels. A fresh 0..n-1 index does not line up with
    # a frame whose index has gaps (df_train was produced by dropna()), so
    # concat(axis=1) would pair components with the wrong rows, pad the rest with
    # NaN, and the dropna() below would then silently discard those rows.
    index=df_train.index,
    columns=[str(prefix) + str(i) for i in range(pc_values.shape[1])],
)

# Drop pc columns left over from an earlier run of this cell, so re-running it
# does not create duplicate pc0/pc1 columns.
df_train = pd.concat(
    [df_train.drop(columns=pc.columns, errors='ignore'), pc],
    axis=1,
)
# Kept from the original cell; with aligned indexes it only removes rows that
# already contained NaN.
df_train = df_train.dropna()

In [160]:
# Scatter of pc0 against pc1, one panel per city, coloured by the Weekend column.
# Only a random subset of rows is drawn.
# - A fixed seed makes the figure identical on every run.
# - The sample size is capped at the frame length, because sample() raises when
#   asked for more rows than exist (without replacement).
# - No plt.figure() call: FacetGrid creates its own figure, so an explicit one
#   would only leave an empty figure in the output.
sample_size = min(50000, len(df_train))
g = sns.FacetGrid(df_train.sample(n=sample_size, random_state=5), col="City", col_wrap=2, height=5, aspect=1.5, hue='Weekend')

g.map(sns.scatterplot, "pc0", "pc1", alpha=.5 ).add_legend();

plt.show()

In [161]:
# Disabled elbow scan: fits KMeans for k = 1..7 and records each model's inertia in `ssd`.
# NOTE(review): `df_train_na` is only defined by the commented-out line below;
# restore it before re-enabling this cell.
# #df_train_na = df_train[cols].dropna()
# ssd = []

# for k in range(1,8):
#     km = KMeans(n_clusters=k, random_state=4)
#     km = km.fit(df_train_na)
#     inertia= km.inertia_
#     ssd.append(inertia)
# print(ssd)

In [162]:
# Disabled elbow plot: draws `ssd` (sum of squared distances) against k = 1..7.
# It needs the `ssd` list built by the elbow scan, which is also commented out.
# x = range(1,8)
# plt.plot(x, ssd, 'bx-')
# plt.xlabel('k')
# plt.ylabel('Sum of squared distances')
# plt.title('Elbow Method For Optimal k')
# plt.show()

* Based on the Elbow Method, the best number of clusters is 4.

In [164]:
# Fit a 4-cluster KMeans on the scaled target columns, then label every row with its cluster id.
scaled_targets = df_train[cols]
km = KMeans(n_clusters=4, random_state=4).fit(scaled_targets)
df_train['clusters'] = km.predict(scaled_targets)

In [165]:
# Number of rows assigned to each cluster.
plt.figure()
g = sns.countplot(data=df_train, x="clusters")
g.set(
    title="Cluster Count Distribution",
    ylabel="Count",
    xlabel="Target Cluster Distributions",
)
plt.show()

* The most common cluster has 73% of all data.

In [166]:
# Number of rows per cluster, with one bar colour per city.
plt.figure()
g = sns.countplot(x="clusters", data=df_train, hue='City')
# Title typo fixed: "Cites" -> "Cities".
g.set_title("Cities Cluster Distribution")
g.set_ylabel("Count")
g.set_xlabel("Target Cluster Distributions")
plt.show()

* Philadelphia is the most common.
* Boston is the second most common.

In [167]:
# Spread of the pc0 column within each cluster, split by city.
plt.figure()
g = sns.boxplot(data=df_train, x="clusters", y='pc0', hue='City')
g.set(
    title="Distribution of PCA",
    ylabel="PCA 0 Values",
    xlabel="Target Cluster Distributions",
)
plt.show()

In [168]:
# pc0 against pc1 for every row, coloured by assigned cluster.
plt.figure()
ax = sns.scatterplot(data=df_train, x='pc0', y='pc1', hue='clusters', palette='Set1')
ax.set_title("PCA 0 and PCA 1 by Clusters", fontsize=22)
ax.set_xlabel("Target PCA 0 values", fontsize=18)
ax.set_ylabel("Target PCA 1 values", fontsize=18)
plt.show()