In [1]:
!pip install plotly==4.9.0
!pip install jupyterlab "ipywidgets>=7.5"
!jupyter labextension install jupyterlab-plotly@4.9.0
!jupyter labextension install @jupyter-widgets/jupyterlab-manager plotlywidget@4.9.0

Building jupyterlab assets (build:prod:minimize)
Building jupyterlab assets (build:prod:minimize)


L'équipe de données d'Uber aimerait travailler sur un projet dans lequel son application recommanderait des zones chaudes dans les grandes villes à n'importe quel moment de la journée.

In [2]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
from bokeh.io import export_png
pio.renderers.default = "iframe_connected"

In [3]:
data_jun14 = pd.read_csv("https://raw.githubusercontent.com/fivethirtyeight/uber-tlc-foil-response/master/uber-trip-data/uber-raw-data-jun14.csv")
data_jun14.head(2)

Unnamed: 0,Date/Time,Lat,Lon,Base
0,6/1/2014 0:00:00,40.7293,-73.992,B02512
1,6/1/2014 0:01:00,40.7131,-74.0097,B02512


In [4]:
data_jun14=data_jun14.drop(columns="Base",axis=1)
data_jun14.head()

Unnamed: 0,Date/Time,Lat,Lon
0,6/1/2014 0:00:00,40.7293,-73.992
1,6/1/2014 0:01:00,40.7131,-74.0097
2,6/1/2014 0:04:00,40.3461,-74.661
3,6/1/2014 0:04:00,40.7555,-73.9833
4,6/1/2014 0:07:00,40.688,-74.1831


In [5]:
data_jun14.shape

(663844, 3)

In [6]:
import datetime as dt  
data_jun14['Date/Time'] = pd.to_datetime(data_jun14['Date/Time'], format = '%m/%d/%Y %H:%M:%S')

In [7]:
data_jun14['hour'] = data_jun14['Date/Time'].dt.hour
data_jun14['dayofmonth'] = data_jun14['Date/Time'].dt.day
data_jun14['day_name'] = data_jun14['Date/Time'].dt.day_name()
data_jun14=data_jun14.drop(columns="Date/Time",axis=1)

In [8]:
data_jun14.tail()

Unnamed: 0,Lat,Lon,hour,dayofmonth,day_name
663839,40.7332,-73.9872,22,30,Monday
663840,40.7905,-73.9796,23,30,Monday
663841,40.764,-73.9887,23,30,Monday
663842,40.7262,-73.9944,23,30,Monday
663843,40.7404,-73.9848,23,30,Monday


In [9]:
data_jun14.shape

(663844, 5)

In [10]:
data_jun14.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 663844 entries, 0 to 663843
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Lat         663844 non-null  float64
 1   Lon         663844 non-null  float64
 2   hour        663844 non-null  int64  
 3   dayofmonth  663844 non-null  int64  
 4   day_name    663844 non-null  object 
dtypes: float64(2), int64(2), object(1)
memory usage: 25.3+ MB


In [11]:
# Nous filtrons les données sur le nom du jour et sur l'heure
data_jun14_filt= data_jun14.loc[(data_jun14['day_name'] == 'Friday') & (data_jun14['dayofmonth'] == 6) & (data_jun14['hour'] == 22), :]
data_jun14_filt.shape

(1684, 5)

In [12]:
# Import Standard Scaler
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

# Initialize StandardScaler
# StandardScaler will substract mean and divide by standard deviation to each observation


numeric_features = [0,1,2,3] 
numeric_transformer =  StandardScaler()


categorical_features = [4] 
categorical_transformer =OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
print("Performing preprocessings on train set...")
X= preprocessor.fit_transform(data_jun14_filt)  
print('...Done.')

# Preprocessings sur le dataset
print(data_jun14_filt.head())
X = preprocessor.fit_transform(data_jun14_filt) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Performing preprocessings on train set...
...Done.
          Lat      Lon  hour  dayofmonth day_name
6914  40.7725 -73.9773    22           6   Friday
6915  40.7405 -73.9840    22           6   Friday
6916  40.7940 -73.9211    22           6   Friday
6917  40.8000 -73.9255    22           6   Friday
6918  40.7325 -74.0005    22           6   Friday
...Terminé.
[[ 0.97796877  0.03521747  0.          0.        ]
 [ 0.00607972 -0.10875575  0.          0.        ]
 [ 1.63095672  1.24287347  0.          0.        ]
 [ 1.81318592  1.14832389  0.          0.        ]
 [-0.23689255 -0.46331668  0.          0.        ]]



In [13]:
# Let's create a loop that will collect the Within-sum-of-square (wcss) for each value K 
# Let's use .inertia_ parameter to get the within sum of square value for each value K 
from sklearn.cluster import KMeans
wcss =  []# une liste qui va définir les coûts de nos modèles pr une valeur de k allant de 1 à 10 
k = []
for i in range (1,11): 
    kmeans = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    k.append(i)
    print("WCSS for K={} --> {}".format(i, wcss[-1]))

WCSS for K=1 --> 3367.999999999999
WCSS for K=2 --> 2337.349264875635
WCSS for K=3 --> 1631.1313206965851
WCSS for K=4 --> 1269.0333037153941
WCSS for K=5 --> 987.3297004376841
WCSS for K=6 --> 775.6012990960456
WCSS for K=7 --> 671.4943890089855
WCSS for K=8 --> 618.9567684151434
WCSS for K=9 --> 519.2478843406886
WCSS for K=10 --> 473.81091203344465


In [14]:
# Let's visualize using plotly
import plotly.express as px

# Create DataFrame
wcss_frame = pd.DataFrame(wcss)
k_frame = pd.Series(k)

# Create figure
fig= px.line(
    wcss_frame,
    x=k_frame,
    y=wcss_frame.iloc[:,-1]
)

# Create title and axis labels
fig.update_layout(
    yaxis_title="Inertia",
    xaxis_title="# Clusters",
    title="Inertia per cluster"
)

In [15]:
# Import silhouette score
from sklearn.metrics import silhouette_score

# Computer mean silhouette score
sil = []
k = []

## on doit commencer par i=2 car le score de silhouette regarde pr un pt donnée à quel pt il est proche de son centroide et éloigné 
# du cluster le plus proche
for i in range (2,11):
    kmeans = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans.fit(X)
    sil.append(silhouette_score(X, kmeans.predict(X))) # il attend la valeur du feature et à quel cluster j'ai assigné ce feature 
    k.append(i)
    print("Silhouette score for K={} is {}".format(i, sil[-1]))

Silhouette score for K=2 is 0.5551708582772285
Silhouette score for K=3 is 0.5150984029474179
Silhouette score for K=4 is 0.3732113127408283
Silhouette score for K=5 is 0.39027907804089035
Silhouette score for K=6 is 0.45648856553530454
Silhouette score for K=7 is 0.45745460846745056
Silhouette score for K=8 is 0.483716322348675
Silhouette score for K=9 is 0.483914772732267
Silhouette score for K=10 is 0.39747652822324725


In [16]:
# Create a data frame 
cluster_scores=pd.DataFrame(sil)
k_frame = pd.Series(k)

# Create figure
fig = px.bar(data_frame=cluster_scores,  
             x=k, 
             y=cluster_scores.iloc[:, -1]
            )

# Add title and axis labels
fig.update_layout(
    yaxis_title="Silhouette Score",
    xaxis_title="# Clusters",
    title="Silhouette Score per cluster"
)

In [17]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 6)
kmeans.fit(X)

KMeans(n_clusters=6)

In [18]:
data_jun14_filt.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
data_jun14_filt.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lat,Lon,hour,dayofmonth,day_name,Cluster_KMeans
6914,40.7725,-73.9773,22,6,Friday,0
6915,40.7405,-73.984,22,6,Friday,1
6916,40.794,-73.9211,22,6,Friday,4
6917,40.8,-73.9255,22,6,Friday,4
6918,40.7325,-74.0005,22,6,Friday,1


In [19]:
fig = px.scatter_mapbox(
        data_jun14_filt, 
        lat="Lat", 
        lon="Lon",
        color="Cluster_KMeans",
        mapbox_style="carto-positron"
)
fig.show()

In [20]:
from sklearn.cluster import DBSCAN

db = DBSCAN(eps=0.08, min_samples=32, metric="manhattan", algorithm="brute")

db.fit(X)
#on utilise la métric manhattan qd on a des vbles catégoriques 

DBSCAN(algorithm='brute', eps=0.08, metric='manhattan', min_samples=32)

In [21]:
import numpy as np 
label=db.labels_
np.unique(label)

array([-1,  0,  1,  2,  3,  4])

In [22]:
data_jun14_filt['cluster_dbscan']=label.tolist()
data_jun14_filt



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lat,Lon,hour,dayofmonth,day_name,Cluster_KMeans,cluster_dbscan
6914,40.7725,-73.9773,22,6,Friday,0,-1
6915,40.7405,-73.9840,22,6,Friday,1,-1
6916,40.7940,-73.9211,22,6,Friday,4,-1
6917,40.8000,-73.9255,22,6,Friday,4,-1
6918,40.7325,-74.0005,22,6,Friday,1,0
...,...,...,...,...,...,...,...
656577,40.7091,-74.0125,22,6,Friday,1,-1
656578,40.8046,-73.9372,22,6,Friday,4,3
656579,40.7638,-73.9815,22,6,Friday,0,-1
656580,40.7345,-74.0014,22,6,Friday,1,0


In [23]:
data_jun14_filt['cluster_dbscan'].value_counts()

-1    1265
 0     170
 2      88
 1      60
 4      55
 3      46
Name: cluster_dbscan, dtype: int64

In [24]:
data_clean=data_jun14_filt.loc[data_jun14_filt['cluster_dbscan']!=-1]
data_clean

Unnamed: 0,Lat,Lon,hour,dayofmonth,day_name,Cluster_KMeans,cluster_dbscan
6918,40.7325,-74.0005,22,6,Friday,1,0
6921,40.7330,-74.0069,22,6,Friday,1,0
6925,40.7386,-74.0081,22,6,Friday,1,4
6928,40.7301,-73.9917,22,6,Friday,1,1
6931,40.7411,-74.0078,22,6,Friday,1,4
...,...,...,...,...,...,...,...
656572,40.7402,-74.0080,22,6,Friday,1,4
656573,40.7438,-73.9716,22,6,Friday,0,2
656575,40.8030,-73.9335,22,6,Friday,4,3
656578,40.8046,-73.9372,22,6,Friday,4,3


In [25]:
fig = px.scatter_mapbox(data_clean, lat="Lat", lon="Lon", color="cluster_dbscan", mapbox_style="carto-positron", zoom=10)
fig.show()

Si Uber connaît la tendance des clusters et s'il reçoit beaucoup de demandes de covoiturage, il peut alors stratégiquement placer ses chauffeurs au bon endroit où la probabilité d'obtenir une demande de covoiturage est énorme. Cela aidera Uber à servir le client plus rapidement car les véhicules sont placés plus près de l'emplacement et cela aidera également à développer leur activité.

Uber peut savoir à quel moment de la journée il y a  le plus de demandes de course. Par exemple, dans notre exemple j'ai choisi au  hasard la date du Vnedredi 6 Juin 2014 à 22 H comme c'est un Vendredi je me suis dit qu'il doit avoir certainement beaucoup de personnes qui sortent et donc beaucoup plus de trafic. On peut remarquer que dans notre exemple on a beaucoup plus de demandes dans le secteur du cluster 0 à comparer au cluster N° 3 donc on peut proposer à uber de placer beaucoup plus de voiture à proximité du clutser 0 ou à rediriger certaines voitures qui se trouvent près du cluster 3 en direction du cluster 0 qui se trouve entre West Village et Hudson Square. 

Uber peut utiliser ces clusters pour une tarification optimale en analysant quel cluster traite les demandes maximales, les heures de pointe, etc. Supposons que s'ils n'ont pas trop de véhicules à envoyer à un endroit particulier (plus de demande), ils peuvent alors faire une tarification optimale car la demande est élevée et l'offre est moindre.

