<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/5/58/Uber_logo_2018.svg/1024px-Uber_logo_2018.svg.png" alt="UBER LOGO" width="15%" />

# UBER Pickups

In [1]:
# Library imports
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, MiniBatchKMeans, DBSCAN
from sklearn.metrics import  silhouette_score
from sklearn.decomposition import PCA

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import matplotlib.pyplot as plt

import warnings
# Silence FutureWarning messages so they do not clutter the cell outputs
warnings.simplefilter(action='ignore', category=FutureWarning)

A: Téléchargement et lecture du CSV

In [2]:
# Load the Uber pickup records from the CSV file and preview the first rows
data = pd.read_csv(filepath_or_buffer='uber-raw-data-may14.csv')
data.head(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Base
0,5/1/2014 0:02:00,40.7521,-73.9914,B02512
1,5/1/2014 0:06:00,40.6965,-73.9715,B02512
2,5/1/2014 0:15:00,40.7464,-73.9838,B02512
3,5/1/2014 0:17:00,40.7463,-74.0011,B02512
4,5/1/2014 0:17:00,40.7594,-73.9734,B02512


In [3]:
# The dataset is too large to work with comfortably, so keep a reproducible 5% random sample
data_1=data.sample(frac=0.05,random_state=42)

B: nettoyage des données

In [4]:
# Parse the raw 'Date/Time' strings (month/day/year hour:minute:second) into datetime values
parsed_timestamps = pd.to_datetime(data_1['Date/Time'], format='%m/%d/%Y %H:%M:%S')
data_1['Date/Time'] = parsed_timestamps
data_1.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
537913,2014-05-16 15:23:00,40.7809,-73.9791,B02682
216112,2014-05-21 19:38:00,40.7682,-73.9823,B02598
225908,2014-05-22 19:23:00,40.7351,-73.988,B02598
22010,2014-05-18 12:43:00,40.7686,-73.9635,B02512
215785,2014-05-21 19:05:00,40.7223,-73.9917,B02598


In [5]:
# Derive three calendar columns from the pickup timestamp to make the frame easier to explore
pickup_time = data_1['Date/Time'].dt

data_1['Day of mounth'] = pickup_time.day      # day of the month
data_1['Day of week'] = pickup_time.weekday    # pandas convention: 0 = Monday ... 6 = Sunday
data_1['Hour'] = pickup_time.hour              # hour of the day

print(data_1.head())
data_1.shape

                 Date/Time      Lat      Lon    Base  Day of mounth  \
537913 2014-05-16 15:23:00  40.7809 -73.9791  B02682             16   
216112 2014-05-21 19:38:00  40.7682 -73.9823  B02598             21   
225908 2014-05-22 19:23:00  40.7351 -73.9880  B02598             22   
22010  2014-05-18 12:43:00  40.7686 -73.9635  B02512             18   
215785 2014-05-21 19:05:00  40.7223 -73.9917  B02598             21   

        Day of week  Hour  
537913            4    15  
216112            2    19  
225908            3    19  
22010             6    12  
215785            2    19  


(32622, 7)

In [6]:
# The 'Base' column carries no useful information for this study, so it is removed
useless_columns = ['Base']
data_1 = data_1.drop(columns=useless_columns)

In [7]:
# Preview the first rows of the working frame
data_1.head(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Day of mounth,Day of week,Hour
537913,2014-05-16 15:23:00,40.7809,-73.9791,16,4,15
216112,2014-05-21 19:38:00,40.7682,-73.9823,21,2,19
225908,2014-05-22 19:23:00,40.7351,-73.988,22,3,19
22010,2014-05-18 12:43:00,40.7686,-73.9635,18,6,12
215785,2014-05-21 19:05:00,40.7223,-73.9917,21,2,19


In [8]:
# Quick overview: size, preview, summary statistics and share of missing values
row_count = data_1.shape[0]
print("Number of rows : {}".format(row_count))
print()

print("Display of dataset: ")
display(data_1.head())
print()

print("Basics statistics: ")
data_desc = data_1.describe(include='all')
display(data_desc)
print()

# Per-column percentage of null entries
print("Percentage of missing values: ")
missing_counts = data_1.isnull().sum()
display(100*missing_counts/row_count)

Number of rows : 32622

Display of dataset: 


Unnamed: 0,Date/Time,Lat,Lon,Day of mounth,Day of week,Hour
537913,2014-05-16 15:23:00,40.7809,-73.9791,16,4,15
216112,2014-05-21 19:38:00,40.7682,-73.9823,21,2,19
225908,2014-05-22 19:23:00,40.7351,-73.988,22,3,19
22010,2014-05-18 12:43:00,40.7686,-73.9635,18,6,12
215785,2014-05-21 19:05:00,40.7223,-73.9917,21,2,19



Basics statistics: 


Unnamed: 0,Date/Time,Lat,Lon,Day of mounth,Day of week,Hour
count,32622,32622.0,32622.0,32622.0,32622.0,32622.0
mean,2014-05-16 11:56:24.200846080,40.74049,-73.974729,15.872019,3.100178,14.519067
min,2014-05-01 00:00:00,40.2622,-74.6607,1.0,0.0,0.0
25%,2014-05-08 20:34:00,40.7225,-73.997,8.0,2.0,10.0
50%,2014-05-16 13:25:30,40.7437,-73.9838,16.0,3.0,16.0
75%,2014-05-23 14:33:45,40.7617,-73.9678,23.0,4.0,19.0
max,2014-05-31 23:58:00,41.1635,-73.0133,31.0,6.0,23.0
std,,0.037943,0.054663,8.864623,1.757558,5.821001



Percentage of missing values: 


Date/Time        0.0
Lat              0.0
Lon              0.0
Day of mounth    0.0
Day of week      0.0
Hour             0.0
dtype: float64

Pas de valeurs manquantes ("missing values") : on va donc pouvoir effectuer notre étude avec des données nettoyées.

## E.D.A UBER sous forme de Scatter mapbox

In [9]:
# Order the rows by day of the month so the animation frames appear in day order
data_1 = data_1.sort_values('Day of mounth')

# Animated map of the pickup locations, one animation frame per day of the month
fig = px.scatter_mapbox(
    data_1,
    lat="Lat",
    lon="Lon",
    animation_frame="Day of mounth",
    mapbox_style="carto-positron"
)

# Title typo fixed: "Moi" -> "mois" (month)
fig.update_layout(
    title="prise en charge des clients au cours du mois"
)

fig.show()

Préprocessing 

In [10]:
# Show the frame as it stands right before preprocessing
print("Preprocessing X...")
print(data_1.head(), end="\n\n")

Preprocessing X...
                 Date/Time      Lat      Lon  Day of mounth  Day of week  Hour
44231  2014-05-01 21:14:00  40.7288 -73.9806              1            3    21
313    2014-05-01 09:20:00  40.7084 -74.0095              1            3     9
44413  2014-05-01 21:30:00  40.7065 -74.0090              1            3    21
297818 2014-05-01 07:13:00  40.7284 -73.9890              1            3     7
597    2014-05-01 14:58:00  40.7529 -73.9793              1            3    14



In [11]:
# Numeric columns fed to the clustering models
numeric_features = ['Lat', 'Lon', 'Hour','Day of mounth','Day of week']

# Standardise every numeric column
scaling_steps = [('scaler', StandardScaler())]
numeric_transformer = Pipeline(steps=scaling_steps)

print('Done...')

# Column-wise preprocessing: the scaler is applied to the numeric features listed above
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])

# Run the preprocessing once and look at the first transformed rows
print("Preprocessing X...")
print(data_1.head(), end="\n\n")

X = preprocessor.fit_transform(data_1)

print("...Done!")
print(X[:5], end="\n\n")

Done...
Preprocessing X...
                 Date/Time      Lat      Lon  Day of mounth  Day of week  Hour
44231  2014-05-01 21:14:00  40.7288 -73.9806              1            3    21
313    2014-05-01 09:20:00  40.7084 -74.0095              1            3     9
44413  2014-05-01 21:30:00  40.7065 -74.0090              1            3    21
297818 2014-05-01 07:13:00  40.7284 -73.9890              1            3     7
597    2014-05-01 14:58:00  40.7529 -73.9793              1            3    14

...Done!
[[-0.30811105 -0.1074066   1.11338798 -1.67770769 -0.05699915]
 [-0.8457732  -0.63610474 -0.94814476 -1.67770769 -0.05699915]
 [-0.89584957 -0.62695772  1.11338798 -1.67770769 -0.05699915]
 [-0.31865344 -0.26107665 -1.29173355 -1.67770769 -0.05699915]
 [ 0.32706826 -0.08362433 -0.08917278 -1.67770769 -0.05699915]]



Kmeans

In [12]:
# Build the feature matrix used by K-means with the `preprocessor` column transformer
print("Preprocessing X_kmeans...")
print(data_1.head(), end="\n\n")
X_kmeans = preprocessor.fit_transform(data_1)
print("...Done!")
print(X_kmeans[:5], end="\n\n")

Preprocessing X_kmeans...
                 Date/Time      Lat      Lon  Day of mounth  Day of week  Hour
44231  2014-05-01 21:14:00  40.7288 -73.9806              1            3    21
313    2014-05-01 09:20:00  40.7084 -74.0095              1            3     9
44413  2014-05-01 21:30:00  40.7065 -74.0090              1            3    21
297818 2014-05-01 07:13:00  40.7284 -73.9890              1            3     7
597    2014-05-01 14:58:00  40.7529 -73.9793              1            3    14

...Done!
[[-0.30811105 -0.1074066   1.11338798 -1.67770769 -0.05699915]
 [-0.8457732  -0.63610474 -0.94814476 -1.67770769 -0.05699915]
 [-0.89584957 -0.62695772  1.11338798 -1.67770769 -0.05699915]
 [-0.31865344 -0.26107665 -1.29173355 -1.67770769 -0.05699915]
 [ 0.32706826 -0.08362433 -0.08917278 -1.67770769 -0.05699915]]



Methode ELBOW

In [14]:
# Elbow method: fit K-means for K = 2..10 and record the within-cluster sum of squares (inertia)
wcss = []
k = []
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X_kmeans)
    inertia = kmeans.inertia_
    wcss.append(inertia)
    k.append(n_clusters)
    print("WCSS for K={} --> {}".format(n_clusters, inertia))

print(wcss)

WCSS for K=2 --> 139109.98205273392
WCSS for K=3 --> 121565.3866497293
WCSS for K=4 --> 104375.15233769407
WCSS for K=5 --> 91609.31586616869
WCSS for K=6 --> 84775.88110050304
WCSS for K=7 --> 79220.77465733522
WCSS for K=8 --> 74184.08164837115
WCSS for K=9 --> 70396.42684910167
WCSS for K=10 --> 66754.83318518971
[139109.98205273392, 121565.3866497293, 104375.15233769407, 91609.31586616869, 84775.88110050304, 79220.77465733522, 74184.08164837115, 70396.42684910167, 66754.83318518971]


In [15]:
# Plot the inertia against the number of clusters to locate the "elbow"
cluster_counts = range(2, 11)
fig = px.line(x=cluster_counts, y=wcss)
fig.update_layout(
    title_text="Elbow representation",
    xaxis_title='Clusters',
    yaxis_title='Inertie',
)

fig.show()

#### Le graphique de la méthode elbow montre une courbe qui commence à se plier à un certain point, suggérant que le nombre optimal de grappes est probablement autour de ce point. Dans notre cas, le coude semble se former à 5. Cependant, il est important de noter que le choix du nombre final de grappes peut également dépendre d’autres facteurs et d’une analyse plus approfondie.

Silhouette 

In [16]:
# Compute the mean silhouette score for each candidate number of clusters
sil = []
k = []

# Start at 2: the silhouette score cannot be computed with fewer than 2 labels
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init='auto')
    kmeans.fit(X)
    cluster_labels = kmeans.predict(X)
    score = silhouette_score(X, cluster_labels)
    sil.append(score)
    k.append(n_clusters)
    print("Silhouette score for K={} is {}".format(n_clusters, score))

Silhouette score for K=2 is 0.1685597324773094
Silhouette score for K=3 is 0.18414729838190735
Silhouette score for K=4 is 0.1949705410771308
Silhouette score for K=5 is 0.2073304725230338
Silhouette score for K=6 is 0.20844512033283552
Silhouette score for K=7 is 0.20617891796410162
Silhouette score for K=8 is 0.18930187862062645
Silhouette score for K=9 is 0.18571272649019216
Silhouette score for K=10 is 0.18273024019479664


In [17]:
# Bar chart of the silhouette score obtained for each number of clusters
fig = px.bar(x=k, y=sil)
fig.update_layout(
    title_text="Méthode de Silhouette",
    xaxis_title='Nombre de Clusters',
    yaxis_title='Score de Silhouette',
)
fig.show()

#### En examinant le graphique de la méthode de la silhouette, nous pouvons observer que le score de silhouette atteint son maximum lorsque le nombre de grappes est de 6 (≈ 0,208), de très peu devant 5 (≈ 0,207) et 7 (≈ 0,206). Cependant, il est recommandé de considérer d’autres critères et de vérifier la stabilité des clusters pour prendre une décision finale sur le nombre optimal de clusters.

In [18]:
# Silhouette scores for K = 2..10 again, this time with n_init=2
s_score = []
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, n_init=2, random_state=0).fit(X)
    cluster_labels = kmeans.predict(X)
    s_score.append(silhouette_score(X, cluster_labels))

print(s_score)

[0.1664996075064228, 0.17787906498857067, 0.1949705410771308, 0.20724962677908706, 0.2110019738249626, 0.20617891796410162, 0.18930187862062645, 0.18571272649019216, 0.18577492320567285]


#### Compte tenu des deux approches (elbow et silhouette), le nombre optimal de clusters semble être 6.

In [19]:

# Final K-means model with the retained number of clusters
n_clusters_final = 6
kmeans = KMeans(n_clusters=n_clusters_final, n_init=10, random_state=0)
kmeans.fit(X_kmeans)

In [20]:
# Store the K-means cluster predicted for each pickup in a 'Cluster_KMeans' column
kmeans_labels = kmeans.predict(X_kmeans)
data_1.loc[:, 'Cluster_KMeans'] = kmeans_labels
data_1.head()

Unnamed: 0,Date/Time,Lat,Lon,Day of mounth,Day of week,Hour,Cluster_KMeans
44231,2014-05-01 21:14:00,40.7288,-73.9806,1,3,21,2
313,2014-05-01 09:20:00,40.7084,-74.0095,1,3,9,0
44413,2014-05-01 21:30:00,40.7065,-74.009,1,3,21,2
297818,2014-05-01 07:13:00,40.7284,-73.989,1,3,7,0
597,2014-05-01 14:58:00,40.7529,-73.9793,1,3,14,2


In [21]:
# Order the rows by hour so the animation frames follow the hours of the day.
# K-means assigns every point to a cluster and never produces a -1 "noise"
# label, so (unlike with DBSCAN) there is nothing to filter out here.
data_1_sorted = data_1.sort_values('Hour')

# Animated map of the pickups coloured by K-means cluster, one frame per hour
fig = px.scatter_mapbox(
    data_1_sorted,
    lat="Lat",
    lon="Lon",
    color='Cluster_KMeans',
    animation_frame="Hour",
    mapbox_style="carto-positron"
)

# Title aligned with the other cluster maps ("of the day", not "of the days")
fig.update_layout(
    title="Spatial distribution of clusters over hours of the day")

fig.show()

#### K-means a identifié six clusters distincts.

- Groupe 0 : correspond à la zone géographique de Manhattan autour et au nord de Central Park. Ce secteur a une grappe distincte, ce qui suggère des caractéristiques uniques. Explorer les raisons de cette différence pourrait fournir des renseignements intéressants.

- Clusters 1, 2 et 3 : correspondent à la même zone géographique, mais avec un créneau horaire de 9h à 23h. Cette zone est dense et très active, reflétant la forte activité économique de Manhattan.

- Groupe 4 : Correspond à la zone géographique de Manhattan au sud de Central Park, mais n’est présente qu’entre minuit et 8h du matin, ce qui suggère une activité importante la nuit, associée à la vie nocturne du quartier.

- Cluster 5 : Correspond à la zone JFK. Ce cluster a une stabilité constante indépendamment du temps. La faible activité peut être attribuée à la nature moins dense de la zone, ce qui la rend utile pour identifier les zones moins actives.

En conclusion, bien que K-means ait identifié six clusters dans l’ensemble de données, il n’a pas permis de distinguer clairement des clusters exploitables. Nous explorerons une autre approche avec DBSCAN pour une meilleure compréhension.

DBscan

In [22]:
# Remove the K-means labels from the working frame
useless_cols = ['Cluster_KMeans']

print("Dropping useless columns...")
data_1 = data_1.drop(columns=useless_cols)

print("...Done.")

Dropping useless columns...
...Done.


In [23]:
# Build the feature matrix used by DBSCAN with the `preprocessor` column transformer
print("Preprocessing X_dbscan...")
print(data_1.head(), end="\n\n")
X_dbscan = preprocessor.fit_transform(data_1)
print("...Done!")
print(X_dbscan[:5], end="\n\n")

Preprocessing X_dbscan...
                 Date/Time      Lat      Lon  Day of mounth  Day of week  Hour
44231  2014-05-01 21:14:00  40.7288 -73.9806              1            3    21
313    2014-05-01 09:20:00  40.7084 -74.0095              1            3     9
44413  2014-05-01 21:30:00  40.7065 -74.0090              1            3    21
297818 2014-05-01 07:13:00  40.7284 -73.9890              1            3     7
597    2014-05-01 14:58:00  40.7529 -73.9793              1            3    14

...Done!
[[-0.30811105 -0.1074066   1.11338798 -1.67770769 -0.05699915]
 [-0.8457732  -0.63610474 -0.94814476 -1.67770769 -0.05699915]
 [-0.89584957 -0.62695772  1.11338798 -1.67770769 -0.05699915]
 [-0.31865344 -0.26107665 -1.29173355 -1.67770769 -0.05699915]
 [ 0.32706826 -0.08362433 -0.08917278 -1.67770769 -0.05699915]]



In [24]:
# Fit DBSCAN on the scaled features using the Manhattan distance
db = DBSCAN(metric='manhattan', eps=0.72, min_samples=15).fit(X_dbscan)

# Label of every point (scikit-learn marks noise with -1) and the size of each group
labels = db.labels_
np.unique(labels, return_counts=True)

(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
       dtype=int64),
 array([4520, 3409,   31,   36,   32,   12, 6345,   22,   16,   86,   14,
          18,   31,   25, 6610,   44,   23,   21,   35,   52,   11,   42,
        5941,   70,   46,   39,   25, 4881,   66,   26,   21,   42,   30],
       dtype=int64))

### On peut voir qu'avec ϵ=0.72 on obtient une segmentation qui semble représenter les différents schémas spatiotemporels présents dans les données.
Une valeur de min_samples=15 a été retenue, ce qui semble produire des clusters significatifs tout en évitant une segmentation excessive. Ainsi, nous observons 32 clusters (étiquettes 0 à 31), auxquels s'ajoutent 4520 points classés comme bruit (étiquette -1).

Il est important de noter que le choix des paramètres dans DBSCAN est souvent un compromis entre la sensibilité au bruit, la taille du cluster et la capacité à détecter des modèles significatifs!

In [25]:
# Attach the DBSCAN labels to the frame in a "Cluster_DBSCAN" column
dbscan_column = "Cluster_DBSCAN"
data_1.loc[:, dbscan_column] = labels
data_1.head()

Unnamed: 0,Date/Time,Lat,Lon,Day of mounth,Day of week,Hour,Cluster_DBSCAN
44231,2014-05-01 21:14:00,40.7288,-73.9806,1,3,21,0
313,2014-05-01 09:20:00,40.7084,-74.0095,1,3,9,0
44413,2014-05-01 21:30:00,40.7065,-74.009,1,3,21,0
297818,2014-05-01 07:13:00,40.7284,-73.989,1,3,7,0
597,2014-05-01 14:58:00,40.7529,-73.9793,1,3,14,0


In [26]:
# Discard the noise points (label -1) and order the remaining rows by hour for the animation.
# NOTE(review): the variable name mentions "apr14", but the notebook loads 'uber-raw-data-may14.csv'.
is_clustered = data_1['Cluster_DBSCAN'] != -1
df_apr14_sample_sorted = data_1.loc[is_clustered].sort_values(by='Hour')

# Animated map of the clustered pickups coloured by DBSCAN cluster, one frame per hour
fig = px.scatter_mapbox(
    df_apr14_sample_sorted,
    lat="Lat",
    lon="Lon",
    color='Cluster_DBSCAN',
    animation_frame="Hour",
    mapbox_style="carto-positron",
)

fig.update_layout(title="Spatial distribution of clusters over hours of the day")

fig.show()

## PCA 

In [27]:
# Create pipeline for numeric features
# Temporal columns to be compressed by PCA
temporal_columns = ["Hour", 'Day of mounth', 'Day of week']
df_pca = data_1.loc[:, temporal_columns]

# Standardise the temporal columns
scaler = StandardScaler()
X_train_pca = scaler.fit_transform(df_pca)

# Project the three temporal columns onto two principal components
pca = PCA(n_components=2)
PC_train_opti = pca.fit_transform(X_train_pca)
print("...Done!")

...Done!


In [28]:
# Combine the coordinates with the two principal components in one frame
df_pca_opti = data_1.loc[:, ["Lat", "Lon"]].assign(
    pca1=PC_train_opti[:, 0],
    pca2=PC_train_opti[:, 1],
)

df_pca_opti

Unnamed: 0,Lat,Lon,pca1,pca2
44231,40.7288,-73.9806,0.269634,-0.928994
313,40.7084,-74.0095,1.793452,-0.563826
44413,40.7065,-74.0090,0.269634,-0.928994
297818,40.7284,-73.9890,2.047422,-0.502965
597,40.7529,-73.9793,1.158528,-0.715979
...,...,...,...,...
415750,40.7269,-73.9787,0.151672,0.207907
637558,40.7331,-74.0075,-0.737222,-0.005108
420006,40.6765,-73.9632,-2.388025,-0.400706
293717,40.7252,-74.0012,-1.626116,-0.218122


In [29]:
# Columns for the second DBSCAN run: coordinates plus the two principal components
numeric_features = ['Lat', 'Lon', 'pca1', 'pca2']

# Rebuild the scaling pipeline and the column transformer for these columns
numeric_transformer = Pipeline([('scaler', StandardScaler())])
preprocessor = ColumnTransformer([('num', numeric_transformer, numeric_features)])

# Scale the coordinates + PCA frame
print("Preprocessing X_pca...")
X_pca = preprocessor.fit_transform(df_pca_opti)
print("...Done!")

Preprocessing X_pca...
...Done!


In [30]:
# Fit DBSCAN (with its default metric) on the scaled coordinates + PCA features
db = DBSCAN(min_samples=6, eps=0.55).fit(X_pca)

# Label of every point and the size of each group
labels = db.labels_
np.unique(labels, return_counts=True)

(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64),
 array([  939, 30678,   221,   728,     6,     8,     6,     9,     8,
            7,     6,     6], dtype=int64))

In [31]:
# Write the labels of the latest DBSCAN fit into the 'Cluster_DBSCAN' column
pca_dbscan_labels = db.labels_
data_1.loc[:, 'Cluster_DBSCAN'] = pca_dbscan_labels
data_1.head(20)

Unnamed: 0,Date/Time,Lat,Lon,Day of mounth,Day of week,Hour,Cluster_DBSCAN
44231,2014-05-01 21:14:00,40.7288,-73.9806,1,3,21,0
313,2014-05-01 09:20:00,40.7084,-74.0095,1,3,9,0
44413,2014-05-01 21:30:00,40.7065,-74.009,1,3,21,0
297818,2014-05-01 07:13:00,40.7284,-73.989,1,3,7,0
597,2014-05-01 14:58:00,40.7529,-73.9793,1,3,14,0
420486,2014-05-01 05:35:00,40.7206,-73.9842,1,3,5,0
428509,2014-05-01 23:35:00,40.7278,-73.9911,1,3,23,0
300064,2014-05-01 18:02:00,40.7377,-73.9924,1,3,18,0
426173,2014-05-01 19:25:00,40.7474,-73.9809,1,3,19,0
424385,2014-05-01 16:38:00,40.7238,-73.979,1,3,16,0


In [32]:
# Drop the noise points (label -1) and order the rows by hour for the animation.
# NOTE(review): this reassigns data_1, so the noise rows are no longer available to later cells.
clustered_mask = data_1['Cluster_DBSCAN'] != -1
data_1 = data_1.loc[clustered_mask].sort_values(by='Hour')

# Animated map coloured by DBSCAN cluster, one frame per hour, with the temporal columns on hover
hover_columns = ["Hour", "Day of week", 'Day of mounth']
fig = px.scatter_mapbox(
    data_1,
    lat="Lat",
    lon="Lon",
    color='Cluster_DBSCAN',
    hover_data=hover_columns,
    animation_frame="Hour",
    mapbox_style="carto-positron",
)
fig.update_layout(title="Spatial distribution of clusters over hours of the day")

fig.show()

#### Suite à l’application de DBSCAN sur les composants principaux (PCA), nous avons exploré les modes de déplacement des taxis Uber sous un nouvel angle. Les résultats obtenus confirment la richesse de l’approche DBSCAN en termes de segmentation, même lorsque les dimensions des données sont réduites par PCA