In [1]:
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import  silhouette_score

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "vscode"

In [2]:
data = pd.read_csv("uber-trip-data/uber-raw-data-sep14.csv")
data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,9/1/2014 0:01:00,40.2201,-74.0021,B02512
1,9/1/2014 0:01:00,40.75,-74.0027,B02512
2,9/1/2014 0:03:00,40.7559,-73.9864,B02512
3,9/1/2014 0:06:00,40.745,-73.9889,B02512
4,9/1/2014 0:11:00,40.8145,-73.9444,B02512


In [3]:
data_sample = data.sample(30000)

In [4]:
# Basic stats
print("Number of rows : {}".format(data_sample.shape[0]))
print("Number of columns : {}".format(data_sample.shape[1]))
print()


print("Percentage of missing values: ")
display(100*data_sample.isnull().sum()/data_sample.shape[0])

Number of rows : 30000
Number of columns : 4

Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64

In [5]:
data_sample['Date/Time']= pd.to_datetime(data_sample['Date/Time'])
data_sample.dtypes

Date/Time    datetime64[ns]
Lat                 float64
Lon                 float64
Base                 object
dtype: object

In [6]:
# split Date column into 2 parts (day and day of week) and drop Date column
data_sample['DayOfWeek'] =data_sample['Date/Time'].dt.dayofweek
data_sample['Day'] =data_sample['Date/Time'].dt.day
data_sample['Hour'] =data_sample['Date/Time'].dt.hour
data_sample.drop(['Date/Time', 'Base'], axis=1, inplace = True)

In [7]:
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour
265005,40.7727,-73.9626,0,29,16
1001563,40.7374,-74.0065,5,27,15
352153,40.7249,-74.0101,5,6,21
894292,40.7761,-73.956,3,11,8
382126,40.7191,-74.0069,1,9,11


In [8]:
print("Number of rows : {}".format(data_sample.shape[0]))
print("Number of columns : {}".format(data_sample.shape[1]))
print()

Number of rows : 30000
Number of columns : 5



In [9]:
## map without clustering
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="DayOfWeek",
        mapbox_style="carto-positron"
)

fig.show()

In [10]:
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour
265005,40.7727,-73.9626,0,29,16
1001563,40.7374,-74.0065,5,27,15
352153,40.7249,-74.0101,5,6,21
894292,40.7761,-73.956,3,11,8
382126,40.7191,-74.0069,1,9,11


In [11]:
data_sample.dtypes

Lat          float64
Lon          float64
DayOfWeek      int32
Day            int32
Hour           int32
dtype: object

In [12]:
data_sample_0 = data_sample.loc[data_sample['DayOfWeek'] == 0]
data_sample_1 = data_sample.loc[data_sample['DayOfWeek'] == 1]
data_sample_2 = data_sample.loc[data_sample['DayOfWeek'] == 2]
data_sample_3 = data_sample.loc[data_sample['DayOfWeek'] == 3]
data_sample_4 = data_sample.loc[data_sample['DayOfWeek'] == 4]
data_sample_5 = data_sample.loc[data_sample['DayOfWeek'] == 5]
data_sample_6 = data_sample.loc[data_sample['DayOfWeek'] == 6]

In [13]:
data_sample_0.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour
265005,40.7727,-73.9626,0,29,16
631901,40.7636,-73.9656,0,29,5
838717,40.732,-73.984,0,29,4
213945,40.6426,-74.0181,0,22,19
373115,40.7262,-74.0014,0,8,16


In [14]:
numeric_features = ["Lat", "Lon", "DayOfWeek", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample_0.head())
X = preprocessor.fit_transform(data_sample_0) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  DayOfWeek  Day  Hour
265005  40.7727 -73.9626          0   29    16
631901  40.7636 -73.9656          0   29     5
838717  40.7320 -73.9840          0   29     4
213945  40.6426 -74.0181          0   22    19
373115  40.7262 -74.0014          0    8    16
...Terminé.
[[ 0.80654317  0.08236512  0.          0.4088617 ]
 [ 0.58667689  0.03496893  0.         -1.49399291]
 [-0.17681481 -0.25572767  0.         -1.66697969]
 [-2.3368198  -0.79446432  0.          0.92782205]
 [-0.31694936 -0.53062555  0.          0.4088617 ]]



### DBSCAN algorithm

In [15]:
#travailler l'epsylon
db = DBSCAN(eps=0.3, min_samples=10, metric="manhattan")

db.fit(X)

### Find out how many clusters DBSCAN created

In [16]:
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
      dtype=int64)

In [17]:
data_sample_0["cluster_0"] = db.labels_
data_sample_0.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour,cluster_0
265005,40.7727,-73.9626,0,29,16,0
631901,40.7636,-73.9656,0,29,5,-1
838717,40.732,-73.984,0,29,4,0
213945,40.6426,-74.0181,0,22,19,-1
373115,40.7262,-74.0014,0,8,16,0


In [18]:
data_sample_0['cluster_0'].value_counts()

cluster_0
 0     2601
-1      913
 2      100
 1       76
 4       49
 5       48
 3       29
 13      11
 6       10
 10      10
 14      10
 12      10
 9        9
 7        9
 11       7
 8        3
Name: count, dtype: int64

In [19]:
fig = px.scatter_mapbox(
        data_sample_0[data_sample_0.cluster_0 != -1], 
        lat="Lat", 
        lon="Lon",
        color="cluster_0",
        mapbox_style="carto-positron"
)

fig.show()

In [20]:
px.scatter_mapbox(
    data_sample_0.loc[data_sample_0.cluster_0 != -1, :],
    lat="Lat",
    lon="Lon",
    animation_frame="Hour",
    mapbox_style="carto-positron"
)

In [21]:
numeric_features = ["Lat", "Lon", "DayOfWeek", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample_1.head())
X = preprocessor.fit_transform(data_sample_1) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  DayOfWeek  Day  Hour
382126  40.7191 -74.0069          1    9    11
473931  40.7061 -74.0180          1   16    10
273628  40.7704 -73.8658          1   30    19
386704  40.7466 -74.0073          1    9    17
378677  40.7884 -73.9782          1    9     4
...Terminé.
[[-0.55196433 -0.60257531  0.         -0.56098915]
 [-0.8807026  -0.80471821  0.         -0.74370288]
 [ 0.74528749  1.96700689  0.          0.90072072]
 [ 0.14344356 -0.60985974  0.          0.53529325]
 [ 1.20046356 -0.07991756  0.         -1.83998528]]



In [22]:
db.fit(X)

In [23]:
data_sample_1["cluster_1"] = db.labels_
fig = px.scatter_mapbox(
        data_sample_1[data_sample_1.cluster_1 != -1], 
        lat="Lat", 
        lon="Lon",
        color="cluster_1",
        mapbox_style="carto-positron"
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [24]:
fig = px.scatter_mapbox(
        data_sample_1[data_sample_1.cluster_1 != -1], 
        lat="Lat", 
        lon="Lon",
        animation_frame="Hour",
        mapbox_style="carto-positron"
)

fig.show()

### KMEAN algorithm

In [25]:
numeric_features = ["Lat", "Lon", "DayOfWeek", "Day", "Hour"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_sample.head())
X = preprocessor.fit_transform(data_sample) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
             Lat      Lon  DayOfWeek  Day  Hour
265005   40.7727 -73.9626          0   29    16
1001563  40.7374 -74.0065          5   27    15
352153   40.7249 -74.0101          5    6    21
894292   40.7761 -73.9560          3   11     8
382126   40.7191 -74.0069          1    9    11
...Terminé.
[[ 0.82119809  0.15911605 -1.5323931   1.59389733  0.30404685]
 [-0.04516501 -0.59978183  1.05014681  1.35711719  0.13563716]
 [-0.3519508  -0.66201491  1.05014681 -1.12907424  1.14609529]
 [ 0.90464382  0.27321003  0.01713085 -0.5371239  -1.04323066]
 [-0.49429941 -0.60669661 -1.01588512 -0.77390404 -0.53800159]]



In [26]:
wcss =  []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
print(wcss)























[127093.53333762591, 111525.79275684498, 97269.35919425442, 85700.94549614735, 78649.42174601875, 72640.7811557154, 67960.69579055505, 64363.10400116149, 60553.239414221796]


In [27]:
fig = px.line(x = range(2,11), y = wcss)
fig.show()

In [28]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
s_score = []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    s_score.append(silhouette_score(X, kmeans.predict(X)))

print(s_score)





















[0.1743342276465836, 0.188840454643199, 0.18690827816409047, 0.2014395533119747, 0.2051926028851459, 0.19649532866772335, 0.200567431829384, 0.20253393619278245, 0.1995819103356034]


In [29]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,11), y = s_score)
fig.show()

In [34]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 6)
kmeans.fit(X)





In [35]:
data_sample.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
data_sample.head()

Unnamed: 0,Lat,Lon,DayOfWeek,Day,Hour,Cluster_KMeans
265005,40.7727,-73.9626,0,29,16,0
1001563,40.7374,-74.0065,5,27,15,4
352153,40.7249,-74.0101,5,6,21,4
894292,40.7761,-73.956,3,11,8,3
382126,40.7191,-74.0069,1,9,11,2


In [36]:
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="Cluster_KMeans",
        mapbox_style="carto-positron"
)

fig.show()

In [37]:
fig = px.scatter_mapbox(
        data_sample, 
        lat="Lat", 
        lon="Lon",
        color="DayOfWeek",
        mapbox_style="carto-positron"
)

fig.show()