# UBER Pickups project 

In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "iframe_connected"

In [2]:
#import data 
data = pd.read_csv('uber-raw-data-apr14.csv')

In [3]:
data.shape

(564516, 4)

In [4]:
data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


In [5]:
data.describe(include='all')

Unnamed: 0,Date/Time,Lat,Lon,Base
count,564516,564516.0,564516.0,564516
unique,41999,,,5
top,4/7/2014 20:21:00,,,B02682
freq,97,,,227808
mean,,40.740005,-73.976817,
std,,0.036083,0.050426,
min,,40.0729,-74.7733,
25%,,40.7225,-73.9977,
50%,,40.7425,-73.9848,
75%,,40.7607,-73.97,


In [6]:
# Transform the Date/Time variable into datetime format and create hour and day-of-week features

In [7]:
def create_datefeatures(df):
    """
    Creates time series features from the 'Date/Time' string column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Date/Time' column formatted as "%m/%d/%Y %H:%M:%S".
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame sharing the index of `df`, with the columns
        'date' (parsed timestamp), 'hour' and 'dayofweek' (Monday=0 ... Sunday=6).
    """
    # Work from the argument, not from a notebook-level global, so the function
    # can be reused on any frame and does not depend on hidden kernel state.
    features = pd.DataFrame(index=df.index)
    features['date'] = pd.to_datetime(df['Date/Time'], format="%m/%d/%Y %H:%M:%S")
    features['hour'] = features['date'].dt.hour
    features['dayofweek'] = features['date'].dt.dayofweek
    return features

X = create_datefeatures(data)
display(X)

Unnamed: 0,date,hour,dayofweek
0,2014-04-01 00:11:00,0,1
1,2014-04-01 00:17:00,0,1
2,2014-04-01 00:21:00,0,1
3,2014-04-01 00:28:00,0,1
4,2014-04-01 00:33:00,0,1
...,...,...,...
564511,2014-04-30 23:22:00,23,2
564512,2014-04-30 23:26:00,23,2
564513,2014-04-30 23:31:00,23,2
564514,2014-04-30 23:32:00,23,2


In [8]:
# Put the engineered date features side by side with the raw data, then
# discard the original 'Date/Time' column, which is still in string format.
df = pd.concat([X, data], axis=1).drop(columns=['Date/Time'])
df.head()

Unnamed: 0,date,hour,dayofweek,Lat,Lon,Base
0,2014-04-01 00:11:00,0,1,40.769,-73.9549,B02512
1,2014-04-01 00:17:00,0,1,40.7267,-74.0345,B02512
2,2014-04-01 00:21:00,0,1,40.7316,-73.9873,B02512
3,2014-04-01 00:28:00,0,1,40.7588,-73.9776,B02512
4,2014-04-01 00:33:00,0,1,40.7594,-73.9722,B02512


In [9]:
df.describe(include='all')





Unnamed: 0,date,hour,dayofweek,Lat,Lon,Base
count,564516,564516.0,564516.0,564516.0,564516.0,564516
unique,41999,,,,,5
top,2014-04-07 20:21:00,,,,,B02682
freq,97,,,,,227808
first,2014-04-01 00:00:00,,,,,
last,2014-04-30 23:59:00,,,,,
mean,,14.465043,2.86698,40.740005,-73.976817,
std,,5.873925,1.82081,0.036083,0.050426,
min,,0.0,0.0,40.0729,-74.7733,
25%,,10.0,1.0,40.7225,-73.9977,


In [10]:
df.info() # no missing values, so no imputer is needed; the numeric features will have to be standardised

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564516 entries, 0 to 564515
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   date       564516 non-null  datetime64[ns]
 1   hour       564516 non-null  int64         
 2   dayofweek  564516 non-null  int64         
 3   Lat        564516 non-null  float64       
 4   Lon        564516 non-null  float64       
 5   Base       564516 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 25.8+ MB


In [11]:
df['Base'].value_counts() # the Base feature has 5 categories; it seems useless for the purpose of this project, so we drop it for now

B02682    227808
B02598    183263
B02617    108001
B02512     35536
B02764      9908
Name: Base, dtype: int64

In [12]:
df = df.drop(['Base','date'], axis=1)

In [13]:
df.tail()

Unnamed: 0,hour,dayofweek,Lat,Lon
564511,23,2,40.764,-73.9744
564512,23,2,40.7629,-73.9672
564513,23,2,40.7443,-73.9889
564514,23,2,40.6756,-73.9405
564515,23,2,40.688,-73.9608


In [14]:
df_sample = df.sample(n=10000, random_state=0)

In [15]:
## Preprocessing of the features
# Pipeline for the quantitative variables: a single standardisation step.
numeric_features = list(range(4))  # positions of the quantitative columns in df_sample
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# The numeric pipeline is wrapped in a ColumnTransformer so that it is only
# applied to the selected columns.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

# Fit the preprocessing on the sample and transform it in one go.
print("Preprocessing sur le train set...")
print(df_sample.head())
X = preprocessor.fit_transform(df_sample)
print('...Terminé.')
print(X[:5])
print()

Preprocessing sur le train set...
        hour  dayofweek      Lat      Lon
63031      5          6  40.7800 -73.9486
100482    11          6  40.7495 -73.9917
239000    23          5  40.7475 -74.0089
216098    20          2  40.7555 -73.9917
160769    13          3  40.6450 -73.7819
...Terminé.
[[-1.58887328  1.73024028  1.09465112  0.56298699]
 [-0.57446143  1.73024028  0.24914854 -0.30514713]
 [ 1.45436227  1.18200191  0.19370575 -0.65159508]
 [ 0.94715634 -0.46271318  0.41547692 -0.30514713]
 [-0.23632415  0.08552518 -2.64773734  3.9207122 ]]



### KMeans method 

In [17]:
# Elbow method: record the within-cluster sum of squares (inertia) for
# 1 to 19 clusters in order to find the optimal number of clusters.
wcss = []
for n_clusters in range(1, 20):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    wcss.append(kmeans.inertia_)

print(wcss)

[39999.99999999997, 32535.998608692007, 27288.98717764106, 22453.848242072894, 19671.08413091895, 17589.517671633912, 16026.683894363445, 14940.9294518182, 13639.16319923677, 13073.603577046157, 11796.263798419595, 11083.768280855717, 10523.497943903483, 10043.261418500551, 9594.263630291121, 9184.360005130502, 8832.573860567463, 8621.073904102324, 8325.524780274554]


In [18]:
fig = px.line(x = range(1,20), y = wcss)
fig.show()

In [19]:
# Silhouette score for 2 to 19 clusters, as a second criterion for choosing
# the optimal number of clusters.
s_score = []
for n_clusters in range(2, 20):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    s_score.append(silhouette_score(X, kmeans.predict(X)))

print(s_score)

[0.23053494285400894, 0.267313933074676, 0.259006538080643, 0.2614646803032073, 0.26796640094570345, 0.2716017204805086, 0.2738394817304445, 0.2577314095414832, 0.2420677135796497, 0.24115901025900813, 0.24345815643836763, 0.24093061837516042, 0.2413845664919689, 0.2458350179861366, 0.24445522108584447, 0.24837578837408714, 0.23885268473395582, 0.2411682746238321]


In [20]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,20), y = s_score)
fig.show()

In [21]:
# We keep K=8 clusters (the highest silhouette score among the values tried above) and fit KMeans with it in order to visualise the clusters
kmeans = KMeans(n_clusters= 8, random_state = 0)
kmeans.fit(X)

df_sample.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
df_sample.head()

Unnamed: 0,hour,dayofweek,Lat,Lon,Cluster_KMeans
63031,5,6,40.78,-73.9486,0
100482,11,6,40.7495,-73.9917,0
239000,23,5,40.7475,-74.0089,3
216098,20,2,40.7555,-73.9917,1
160769,13,3,40.645,-73.7819,2


In [22]:
# Map of the sampled pickups (OpenStreetMap tiles), coloured by KMeans cluster
fig = px.scatter_mapbox(df_sample, lat="Lat", lon="Lon", color='Cluster_KMeans', 
                        mapbox_style="open-street-map", zoom = 8)
fig.show();
# --> the clusters are not very distinct, especially in downtown Manhattan. However, there is a clearly defined cluster of pickups in upper Brooklyn (yellow cluster)

In [33]:
# Map of the sampled pickups coloured by KMeans cluster, keeping only Saturdays.
# Series.dt.dayofweek encodes Monday=0, ..., Sunday=6, so Saturday is 5.
# (The filter previously selected hour == 8, which did not match the stated intent.)

fig = px.scatter_mapbox(df_sample[df_sample['dayofweek'] == 5], lat="Lat", lon="Lon", color='Cluster_KMeans',
                        mapbox_style="open-street-map", zoom = 8)
fig.show()

# ---> hot zones of pickups seem to be concentrated in downtown Manhattan on Saturdays

In [44]:
test = df_sample.sort_values('hour', ascending = True)
fig = px.scatter_mapbox(test, lat="Lat", lon="Lon", mapbox_style="open-street-map",
                       animation_frame = 'hour', color ='Cluster_KMeans', zoom = 8, width=500, height=500)
fig.show()

In [46]:
test = df_sample.sort_values('dayofweek', ascending = True)
fig = px.scatter_mapbox(test, lat="Lat", lon="Lon", mapbox_style="open-street-map",
                       animation_frame = 'dayofweek', color ='Cluster_KMeans', zoom = 8, width=500, height=500)
fig.show()

### DBSCAN method 

In [47]:
print(X)

[[-1.58887328  1.73024028  1.09465112  0.56298699]
 [-0.57446143  1.73024028  0.24914854 -0.30514713]
 [ 1.45436227  1.18200191  0.19370575 -0.65159508]
 ...
 [-0.57446143 -0.46271318  0.01074454 -0.62541006]
 [ 0.10181314  0.08552518  0.90891777  0.28502293]
 [-0.06725551  0.08552518  0.75645009  2.21264323]]


In [48]:
db = DBSCAN(eps=0.3, min_samples =100)
db.fit(X)

DBSCAN(eps=0.3, min_samples=100)

In [49]:
clusters = db.labels_
np.unique(clusters)

array([-1,  0,  1])

In [50]:
pd.Series(clusters).value_counts()

-1    9240
 0     422
 1     338
dtype: int64

In [51]:
#many outliers , we should enlarge eps to get more clusters.

db = DBSCAN(eps=0.5, min_samples =100)
db.fit(X)
#np.unique(db.labels_)
pd.Series(db.labels_).value_counts()

-1    4160
 0    1493
 5     980
 2     979
 1     885
 3     691
 4     507
 6     154
 7     151
dtype: int64

In [59]:
#let's try with a larger min_samples number  ---> too many observations are considered outliers now

db = DBSCAN(eps=0.5, min_samples =200)
db.fit(X)
np.unique(db.labels_)
pd.Series(db.labels_).value_counts()

-1    6844
 0     978
 1     713
 3     636
 2     629
 4     200
dtype: int64

In [60]:
# Re-fit DBSCAN with eps=0.5 and min_samples=100 so that `db` holds this configuration again
# (the previous cell overwrote it with min_samples=200) before its labels are stored in df_sample.
db = DBSCAN(eps=0.5, min_samples =100)
db.fit(X)
np.unique(db.labels_)  # value is discarded: only the last expression of a cell is displayed
pd.Series(db.labels_).value_counts()

-1    4160
 0    1493
 5     980
 2     979
 1     885
 3     691
 4     507
 6     154
 7     151
dtype: int64

In [61]:
df_sample.loc[:,'Cluster_DBSCAN'] = db.labels_

In [64]:
fig = px.scatter_mapbox(df_sample[df_sample['Cluster_DBSCAN'] != -1], lat="Lat", lon="Lon", color='Cluster_DBSCAN', 
                        mapbox_style="open-street-map", zoom = 8)
fig.show()

In [65]:
fig = px.scatter_mapbox(df_sample[(df_sample['hour'] == 20) & (df_sample['Cluster_DBSCAN'] != -1)], lat="Lat", lon="Lon", color='Cluster_DBSCAN', 
                        mapbox_style="open-street-map", zoom = 8)
fig.show()

In [68]:
test = df_sample.sort_values('hour', ascending = True)
fig = px.scatter_mapbox(test[test['Cluster_DBSCAN'] !=-1], lat="Lat", lon="Lon", mapbox_style="open-street-map",
                       animation_frame = 'hour', color ='Cluster_DBSCAN', zoom = 10, width=800, height=800)
fig.show()

In [69]:
test = df_sample.sort_values('dayofweek', ascending = True)
fig = px.scatter_mapbox(test[test['Cluster_DBSCAN'] !=-1], lat="Lat", lon="Lon", mapbox_style="open-street-map",
                       animation_frame = 'dayofweek', color ='Cluster_DBSCAN', zoom = 10, width=800, height=800)
fig.show()

In [72]:
fig = px.scatter_3d(df_sample[df_sample['Cluster_DBSCAN'] !=-1], x="Lat", y="Lon", z='hour',
                       color ='Cluster_DBSCAN', width=800, height=800)
fig.show()