# PROJET ANALYSE DE DONNEES
## Etude des stations de location de vélos dans Paris

In [None]:
library(ggplot2)
library(reshape2)
library(gridExtra)
library(tidyverse)
library(GGally)
library(plotly)
library(corrplot)
library(reshape2)
library(FactoMineR) 
library(factoextra)
library(glmnet)
library(ggfortify)
library(pROC)
library(ROCR)
library(RColorBrewer)
library(viridis)
library(leaflet)
library(mapview)

Dans ce notebook, nous allons étudier un jeu de données contenant des informations sur les taux de disponibilité des vélos dans des stations Vélib parisiennes. Nous avons accès à ce taux pour chaque heure de chaque jour de la semaine, sur la période du 2 septembre au 7 septembre 2014.

Le notebook Python comportera l'ensemble des commentaires et des analyses réalisées à l'issue de notre étude. Ce notebook R comportera donc seulement les codes et les outputs.

## 1. Présentation des données

In [None]:
# Load the dataset (the file is expected to provide the `velib` object)
# and print a summary of its components
load("data/velib.RData")
summary(velib)

In [None]:
# Prepare the data: one row per station, one column per time step
loading <- as.matrix(velib$data)
colnames(loading) <- seq_len(ncol(loading))
rownames(loading) <- velib$names

# Station coordinates, plus the `bonus` flag attached to each station
stations <- seq_len(nrow(loading))
coord <- velib$position[stations, ]
coord$bonus <- velib$bonus[stations]

# Keep exactly 7 days of data (the first 13 hours are dropped)
dates <- 14:181
loading <- loading[stations, dates]
colnames(loading) <- seq_along(dates)
head(loading)
head(coord)

In [None]:
# Count missing values in the loading matrix and in the coordinates
sum(is.na(loading))
sum(is.na(coord))

# Station names that occur more than once
station_counts <- table(velib$names)
station_name <- station_counts[station_counts > 1]
print(station_name)

# Index of the first duplicated name / position (0 when there is none)
anyDuplicated(velib$names)
anyDuplicated(velib$position)

## 2. Analyse descriptive des données

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)

# Position of the first hour of each of the 7 days (day separators)
timeTick <- 1 + 24 * (0:6)

# Draw 16 stations at random
stations <- sample.int(nrow(loading), 16)

# One panel per station. Rows are selected by index rather than by station
# name: names are not guaranteed to be unique, and filtering on the name
# would merge the series of several stations into one panel.
p <- lapply(stations, function(s) {
  dfi <- data.frame(
    Var2 = seq_len(ncol(loading)),
    value = as.numeric(loading[s, ])
  )
  ggplot(dfi, aes(x = Var2, y = value)) +
    geom_line(col = "darkorchid") +
    geom_vline(xintercept = timeTick, col = "orange", linetype = "dashed") +
    labs(title = velib$names[s], x = "Temps en heures", y = "Chargement")
})
do.call(grid.arrange, p)

In [None]:
options(repr.plot.width = 18, repr.plot.height = 6)

# Mean loading of each station over the whole week
n_stations <- seq_len(nrow(loading))
loading_mean <- data.frame(
  stations = n_stations,
  mean = rowMeans(loading[n_stations, ])
)

# Per-station mean (blue) against the overall mean (orange line)
ggplot(loading_mean, aes(x = stations, y = mean)) +
  geom_line(color = "cornflowerblue", linewidth = 1) +
  geom_hline(yintercept = mean(loading), color = "darkorange", linewidth = 2) +
  labs(x = "Stations", y = "Average loading")

In [None]:
# Overall mean loading rate (all stations, all hours)
print("--- Taux de chargement moyen ---")
print(mean(loading))
# --- #
print("")

In [None]:
# Mean loading over stations, reshaped column-wise into 24 rows x 7 columns
mean_per_hour_per_day <- matrix(colMeans(loading), nrow = 24)

# Mean over the 7 columns for each of the 24 rows
mean_per_hour <- data.frame(
  Weekly = rowMeans(mean_per_hour_per_day),
  time_range = 1:24
)

# Long format: one row per (time_range, day)
mean_per_hour_per_day <- as.data.frame(mean_per_hour_per_day)
names(mean_per_hour_per_day) <- c(
  "Lundi", "Mardi", "Mercredi", "Jeudi", "Vendredi", "Samedi", "Dimanche"
)
mean_per_hour_per_day$time_range <- 1:24
mean_per_hour_per_day <- melt(
  mean_per_hour_per_day,
  id.vars = "time_range",
  variable.name = "Jours"
)

# --- #
options(repr.plot.width = 15, repr.plot.height = 10)

# One thin line per day, plus the weekly average as a thick line
ggplot() +
  geom_line(
    data = mean_per_hour_per_day,
    aes(x = time_range, y = value, color = Jours)
  ) +
  geom_line(
    data = mean_per_hour,
    aes(x = time_range, y = Weekly),
    linewidth = 1.5
  ) +
  labs(
    title = "Toutes les stations",
    x = "Heure de la journée",
    y = "Chargement moyen"
  )

In [None]:
# Mean loading over stations for each of the 168 time steps. The vector is
# ordered hour 1..24 of day 1, then hour 1..24 of day 2, and so on, so it
# must be filled row by row to get a 7 x 24 matrix (rows = days,
# columns = hours). Without `byrow = TRUE` the matrix is filled column-wise
# and the rows/columns no longer correspond to days/hours.
mean_per_hour_per_day <- colMeans(loading)
mean_per_hour_per_day <- matrix(mean_per_hour_per_day, nrow = 7, byrow = TRUE)
# Mean over the 24 hours of each day (name kept from the previous cell)
mean_per_hour <- rowMeans(mean_per_hour_per_day)
# --- #

mean_per_hour_per_day <- as.data.frame(mean_per_hour_per_day)
colnames(mean_per_hour_per_day) <- paste0(0:23, "h")
mean_per_hour_per_day$time_range <- 1:7
mean_per_hour_per_day <- melt(
  mean_per_hour_per_day,
  id.vars = "time_range",
  variable.name = "Heures"
)

mean_per_hour <- as.data.frame(mean_per_hour)
colnames(mean_per_hour) <- "Weekly"
mean_per_hour$time_range <- 1:7

# --- #
options(repr.plot.width = 15, repr.plot.height = 10)

# One thin line per hour of the day, plus the daily average as a thick line
ggplot() +
  geom_line(
    data = mean_per_hour_per_day,
    aes(x = time_range, y = value, color = Heures)
  ) +
  geom_line(
    data = mean_per_hour,
    aes(x = time_range, y = Weekly),
    linewidth = 1.5
  ) +
  labs(
    title = "Toutes les stations",
    x = "Jour",
    y = "Chargement moyen"
  )

In [None]:
options(repr.plot.width = 15, repr.plot.height = 6)
time_range <- 1:24

# Pick one station at random; the number of stations is taken from the data
# instead of being hard-coded
i <- sample.int(nrow(loading), 1)
df <- loading[i, ]

# 24 x 7 matrix filled column-wise, and its row means
mean_per_hour_per_day <- matrix(df, nrow = 24)
mean_per_hour <- rowMeans(mean_per_hour_per_day)

mean_per_hour_per_day <- as.data.frame(mean_per_hour_per_day)
colnames(mean_per_hour_per_day) <- c(
  "Lundi", "Mardi", "Mercredi", "Jeudi", "Vendredi", "Samedi", "Dimanche"
)
mean_per_hour_per_day$time_range <- 1:24
mean_per_hour_per_day <- melt(
  mean_per_hour_per_day,
  id.vars = "time_range",
  variable.name = "Jours"
)

mean_per_hour <- as.data.frame(mean_per_hour)
colnames(mean_per_hour) <- "Weekly"
mean_per_hour$time_range <- 1:24

# One thin line per day, plus the weekly average as a thick line
ggplot() +
  geom_line(
    data = mean_per_hour_per_day,
    aes(x = time_range, y = value, color = Jours)
  ) +
  geom_line(
    data = mean_per_hour,
    aes(x = time_range, y = Weekly),
    linewidth = 1.5
  ) +
  labs(
    title = velib$names[i],
    x = "Heure de la journée",
    y = "Chargement moyen"
  )

In [None]:
# Station with the lowest mean loading: its mean and its coordinates
print("--- Taux de chargement moyen de la station la moins remplie ---")
station_means <- rowMeans(loading)
i <- which.min(station_means)
print(station_means[i])
print(coord[i, ])

In [None]:
# Station with the highest mean loading: its mean and its coordinates
print("--- Taux de chargement moyen de la station la plus remplie ---")
station_means <- rowMeans(loading)
i <- which.max(station_means)
print(station_means[i])
print(coord[i, ])

In [None]:
# Install ggmap only when it is missing, instead of re-installing it on
# every run of the notebook
if (!requireNamespace("ggmap", quietly = TRUE)) {
  install.packages("ggmap")
}
library(ggmap)

# The Stadia Maps API key is a secret and must not be committed with the
# notebook: it is read from the STADIA_MAPS_API_KEY environment variable.
# NOTE(review): the key that was previously hard-coded here should be
# considered leaked and be revoked.
stadia_key <- Sys.getenv("STADIA_MAPS_API_KEY")
if (!nzchar(stadia_key)) {
  stop(
    "Set the STADIA_MAPS_API_KEY environment variable before running this cell.",
    call. = FALSE
  )
}
register_stadiamaps(stadia_key, write = FALSE)
library(viridis)

In [None]:
# Mean loading of each station for each day (24 consecutive columns per day)
lundi    <- rowMeans(loading[, 1:24])
mardi    <- rowMeans(loading[, 25:48])
mercredi <- rowMeans(loading[, 49:72])
jeudi    <- rowMeans(loading[, 73:96])
vendredi <- rowMeans(loading[, 97:120])
samedi   <- rowMeans(loading[, 121:144])
dimanche <- rowMeans(loading[, 145:168])

options(repr.plot.width = 20, repr.plot.height = 15)

days <- list("Lundi", "Mardi", "Mercredi", "Jeudi", "Vendredi", "Samedi", "Dimanche")

# Coordinates followed by one column of daily means per day
df <- coord
df$lundi <- lundi
df$mardi <- mardi
df$mercredi <- mercredi
df$jeudi <- jeudi
df$vendredi <- vendredi
df$samedi <- samedi
df$dimanche <- dimanche

# One map per day; the daily columns start right after the 3 columns of coord
p <- list()
for (i in seq_along(days)) {
  load_per_day <- df[, 3 + i]

  dfi <- coord
  dfi$loading <- load_per_day
  p[[i]] <- qmplot(data = dfi, longitude, latitude, color = loading) +
    geom_point(size = 2) +
    scale_color_viridis(name = "Chargement", option = "C", direction = -1) +
    labs(title = paste("Chargement moyen le", days[i]))
}

do.call(grid.arrange, c(p, ncol = 3))

In [None]:
options(repr.plot.width = 20, repr.plot.height = 10)

# Positions (1-based) within a day of the hours to display
hour <- c(1, 5, 9, 13, 17, 21, 24)

# One map per selected hour, averaged over the 7 days
p <- list()
for (i in seq_along(hour)) {
  h <- hour[i]
  # Columns holding this hour on each of the 7 days
  hours <- seq(h, 168, 24)
  load_per_hour <- rowMeans(loading[, hours])

  df <- coord
  df$loading <- load_per_hour
  p[[i]] <- qmplot(data = df, longitude, latitude, color = loading) +
    geom_point(size = 2) +
    scale_color_viridis(name = "Chargement", option = "C", direction = -1) +
    labs(title = paste("Chargement moyen à", (h - 1), "h"))
}

do.call(grid.arrange, c(p, ncol = 3))

## 3. Etude sur le jeu de données complet 

### 3.1. ACP

In [None]:
# Boxplot of the loading values, one box per column of `loading`
boxplot(loading, 
        main = "Boxplot",
        xlab = "Colonne",                         
        ylab = "Valeurs") 

In [None]:
# Turn the loading matrix into a data frame and append the `bonus` flag as a
# factor; column 169 is passed to the PCA as a supplementary qualitative
# variable
loading <- as.data.frame(loading)
loading$Hill <- factor(coord$bonus)
pca <- PCA(
  loading,
  scale.unit = TRUE,
  graph = FALSE,
  quali.sup = 169
)

In [None]:
# Cumulative values of the second column of pca$eig
plot(cumsum(pca$eig[,2]), type = "l")
# First component at which that cumulative value reaches 80
n_components <- which(cumsum(pca$eig[,2]) >= 80)[1]
cat("On garde", n_components, "composants pour l'ACP\n")

In [None]:
# Same PCA, keeping 7 components
pca <- PCA(
  loading,
  scale.unit = TRUE,
  graph = FALSE,
  quali.sup = 169,
  ncp = 7
)
print(pca$eig[1:7, 2])
cat("--- PCA ---\n")
cat("Dimension initiale :", dim(loading), "\n")
cat("Dimension après projection :", c(nrow(loading), 7), "\n\n")

# First and second columns of pca$eig for the first three components
cat("--- Variance expliquée ---\n")
for (k in 1:3) {
  cat(
    "Composante", k, ":", round(pca$eig[k, 1], 2),
    "i.e.", round(pca$eig[k, 2], 2), "% de la variance totale\n"
  )
}

In [None]:
# Scree plot restricted to the first 7 components
g1 <- fviz_eig(pca, addlabels = TRUE, ylim = c(0, 40), xlim = c(1, 7)) +
  labs(
    title = "Proportion de Variance Expliquée par Chaque Composante Principale",
    x = "Composantes Principales",
    y = "Pourcentage de Variance Expliquée"
  )
print(g1)

In [None]:
# Final PCA with 5 components; the station coordinates on these components
# are kept as the reduced dataset
pca <- PCA(
  loading,
  scale.unit = TRUE,
  graph = FALSE,
  quali.sup = 169,
  ncp = 5
)
loading_pca <- pca$ind$coord

In [None]:
# Scree plot restricted to the first 5 components
g1 <- fviz_eig(pca, addlabels = TRUE, ylim = c(0, 40), xlim = c(1, 5)) +
  labs(
    title = "Proportion de Variance Expliquée par Chaque Composante Principale",
    x = "Composantes Principales",
    y = "Pourcentage de Variance Expliquée"
  )
print(g1)

In [None]:
# Distribution of the station coordinates on each principal component
box <- ggplot(melt(pca$ind$coord), aes(x = Var2, y = value)) +
  geom_boxplot() +
  labs(x = "", y = "", title = "Boxplot des 5 composantes principales")
print(box)

In [None]:
# Variable plot on axes 1 and 2
g1 <- fviz_pca_var(pca, axes = c(1, 2)) +
  labs(title = "Graphe des variables", x = "PC1", y = "PC2")

print(g1)

In [None]:
# Variable plot on axes 1 and 3. The PCA object of this notebook is `pca`;
# `acp` is not defined anywhere and made this cell fail.
g2 <- fviz_pca_var(pca, axes = c(1, 3)) +
  ggtitle("Graphe des variables") +
  xlab("PC1") +
  ylab("PC3")

print(g2)

#### Clustering

#### 3.1.1. Méthode de clustering avec k-means

In [None]:
library(cluster)
library(RColorBrewer)
library(viridis)

In [None]:
#' Relabel one partition so that its labels agree with a reference partition
#'
#' For every class of `classif2` (a column of the contingency table), the
#' class of `classif1` (a row) with the largest overlap is looked up, and
#' `classif2` is relabelled accordingly. The matching is done column by
#' column, so two classes of `classif2` may be mapped to the same class of
#' `classif1`.
#'
#' @param classif1 Reference labels, expected to be the integers 1..K.
#' @param classif2 Labels to realign, expected to be the integers 1..K.
#' @return A list with `table` (the contingency table with its columns moved
#'   to the matched positions) and `clusters` (`classif2` relabelled in the
#'   numbering of `classif1`, names preserved). Values of `classif2` outside
#'   1..K, including `NA`, are left unchanged.
matchClasses <- function(classif1, classif2) {
  cm <- table(classif1, classif2)
  K <- ncol(cm)
  rows <- seq_len(K)

  # a[j]: size of the largest overlap for column j; b[j]: row achieving it
  # (first one in case of ties, 0 if the column is empty)
  a <- integer(K)
  b <- integer(K)
  for (j in rows) {
    overlaps <- cm[rows, j]
    if (max(overlaps) > 0) {
      b[j] <- which.max(overlaps)
      a[j] <- overlaps[b[j]]
    }
  }

  print("")
  print(paste("Classes size:", toString(a)))
  print(paste("Class (in the classif1 numbering):", toString(b)))
  print("")

  # Move every column of the contingency table to its matched position
  reordered <- cm
  for (j in rows) {
    reordered[, b[j]] <- cm[, j]
  }

  # Vectorised relabelling; `clusters[...] <-` keeps the names of classif2
  clusters <- classif2
  idx <- match(classif2, rows)
  known <- !is.na(idx)
  clusters[known] <- b[idx[known]]

  list(table = reordered, clusters = clusters)
}


In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)

# Choice of the number of clusters on the full data (column 169 excluded):
# within sum of squares, then average silhouette
fviz_nbclust(loading[, -169], FUNcluster = stats::kmeans, method = "wss") +
  labs(
    title = "Score WSS",
    x = "Nombre de clusters",
    y = "Somme des carrés"
  )

fviz_nbclust(loading[, -169], FUNcluster = stats::kmeans, method = "silhouette") +
  labs(
    title = "Score Silhouette",
    x = "Nombre de clusters",
    y = "Score silhouette moyen"
  )

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)

# Silhouette plots for 2 to 5 clusters on the full data. Column 169 (`Hill`)
# is a factor and must be excluded from both k-means and the distance
# matrix; the distances do not depend on the number of clusters, so they
# are computed once.
dist_full <- dist(loading[, -169])
for (centers in 2:5) {
  km_fit <- stats::kmeans(loading[, -169], centers = centers)
  sil <- silhouette(km_fit$cluster, dist_full)
  visualizer <- fviz_silhouette(sil, ylim = c(-0.2, 0.6), main = paste("centers =", centers))
  print(visualizer)
}

In [None]:
# k-means with 4 clusters on the full data. Column 169 (`Hill`) is a factor
# and is excluded from the clustering.
K <- 4
kmeans <- stats::kmeans(loading[, -169], centers = K)
# Alias: several cells below refer to the fitted model as `reskmeans`
reskmeans <- kmeans
clusters <- kmeans$cluster

In [None]:
# Number of stations assigned to each k-means cluster
cluster_counts <- table(clusters)
barplot(cluster_counts, 
        main = "Nombre d'individus par cluster", 
        xlab = "Cluster", 
        ylab = "Nombre d'individus")

In [None]:
# Within-cluster sum of squares, one point per cluster
plot(kmeans$withinss)

In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)
# Stations on the first PCA plane, coloured by k-means cluster. `reskmeans`
# is never defined in this notebook; the cluster labels are in `clusters`.
fviz_pca(pca, axes = c(1, 2), geom = c("point"), col.ind = as.factor(clusters))

In [None]:
# Map of the stations coloured by k-means cluster (full data). `reskmeans`
# is never defined in this notebook; the cluster labels are in `clusters`.
cluster_palette <- colorFactor(c("#E41A1C","#4DAF4A","#377EB8"), domain = unique(clusters))

leaflet(velib$position) %>%
  addTiles(urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
           attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.') %>%
  addCircleMarkers(radius = 3,
                   color = ~cluster_palette(clusters),
                   stroke = FALSE,
                   fillOpacity = 0.9) %>%
  addLegend(position = "bottomright",
            pal = cluster_palette,
            values = ~clusters,
            title = "Cluster",
            opacity = 1)

In [None]:
# Euclidean distance of every station to the centroid of its own cluster.
# Assumes `kmeans` was fitted on the numeric columns `loading[, -169]`.
X <- as.matrix(loading[, -169])
own_center <- kmeans$centers[kmeans$cluster, , drop = FALSE]
distances <- sqrt(rowSums((X - own_center)^2))

# Variance of these distances within each cluster. The number of clusters
# is the number of rows of the centroid matrix.
n_clusters <- nrow(kmeans$centers)
variances <- numeric(n_clusters)
for (cluster in seq_len(n_clusters)) {
  cluster_distances <- distances[kmeans$cluster == cluster]
  variances[cluster] <- var(cluster_distances)
}

# Display
barplot(variances, col = 'skyblue', xlab = 'Cluster', ylab = 'Variance intra-classe',
        main = 'Variance intra-classe par cluster',
        names.arg = paste('Cluster', seq_len(n_clusters)))

In [None]:
# Mean loading profile of each k-means cluster (column 169 excluded).
# `reskmeans` is never defined in this notebook; the labels are in `clusters`.
mean_loadings <- aggregate(loading[, -169], by = list(cluster = clusters), FUN = mean)
mean_loadings
mean_loadings_melted <- melt(mean_loadings, id.vars = "cluster")

ggplot(mean_loadings_melted, aes(x = variable, y = value, color = factor(cluster), group = cluster)) +
  geom_line() +
  labs(title = "Chargement moyen des stations par cluster", x = "Temps en heures", y = "Chargement") +
  scale_color_discrete(name = "Cluster")

In [None]:
# Cross-tabulation of the `bonus` flag against the k-means clusters.
# `reskmeans` is never defined in this notebook; the labels are in `clusters`.
tbl2 <- table(coord$bonus, clusters)
print(tbl2)
mosaicplot(tbl2, color = c(1:4), main = "Stations en altitude en fonction du clusters")

Etude sur loading_pca

In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)

# Choice of the number of clusters on the PCA coordinates:
# within sum of squares, then average silhouette
fviz_nbclust(loading_pca, FUNcluster = stats::kmeans, method = "wss") +
  labs(
    title = "Score WSS",
    x = "Nombre de clusters",
    y = "Somme des carrés"
  )

fviz_nbclust(loading_pca, FUNcluster = stats::kmeans, method = "silhouette") +
  labs(
    title = "Score Silhouette",
    x = "Nombre de clusters",
    y = "Score silhouette moyen"
  )

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)

# Silhouette plots for 2 to 5 clusters on the PCA coordinates
for (centers in 2:5) {
  kmeans_pca <- kmeans(loading_pca, centers = centers)
  sil <- silhouette(kmeans_pca$cluster, dist(loading_pca))
  visualizer <- fviz_silhouette(
    sil,
    ylim = c(-0.2, 0.6),
    main = paste("centers =", centers)
  )
  print(visualizer)
}

In [None]:
# k-means with 4 clusters on the PCA coordinates
kmeans_pca = kmeans(loading_pca,centers=4)
clusters_pca <- kmeans_pca$cluster

In [None]:
# Number of stations in each cluster of the k-means fitted on the PCA coordinates
cluster_counts_pca <- table(clusters_pca)
barplot(cluster_counts_pca, 
        main = "Nombre d'individus par cluster", 
        xlab = "Cluster", 
        ylab = "Nombre d'individus")

In [None]:
# Cluster plot for the k-means fitted on the PCA coordinates.
# NOTE(review): `kmeans_pca` was fitted on `loading_pca`, but the plot is given
# `loading[,-169]` as data — confirm this is intended.
fviz_cluster(kmeans_pca, data=loading[,-169], ellipse.type="norm", labelsize=8, geom=c("point"))

In [None]:
# Cross-tabulation of the `bonus` flag against the clusters obtained on the
# PCA coordinates, shown as a mosaic plot
tbl2 = table(coord$bonus,clusters_pca)
print(tbl2)
mosaicplot(tbl2,color=c(1:4), main = "Stations en altitude en fonction du clusters")

In [None]:
# Mean loading profile of each cluster obtained on the PCA coordinates
# (column 169 excluded from the averages)
mean_loadings2 <- aggregate(
  loading[, -169],
  by = list(cluster = clusters_pca),
  FUN = mean
)
mean_loadings2
mean_loadings_melted2 <- melt(mean_loadings2, id.vars = "cluster")

ggplot(
  mean_loadings_melted2,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  scale_color_discrete(name = "Cluster") +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  )

In [None]:
# Map of the clusters obtained on the full data
cluster_palette <- colorFactor(c("#E41A1C","#4DAF4A","#377EB8"), domain = unique(clusters))

map_full <- leaflet(velib$position)
map_full <- addTiles(
  map_full,
  urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
  attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.'
)
map_full <- addCircleMarkers(
  map_full,
  radius = 3,
  color = ~cluster_palette(clusters),
  stroke = FALSE,
  fillOpacity = 0.9
)
map_full <- addLegend(
  map_full,
  position = "bottomright",
  pal = cluster_palette,
  values = ~clusters,
  title = "Cluster",
  opacity = 1
)
map_full

# Map of the clusters obtained on the PCA coordinates
cluster_palette <- colorFactor(c("#E41A1C","#4DAF4A","#377EB8"), domain = unique(clusters_pca))

map_pca <- leaflet(velib$position)
map_pca <- addTiles(
  map_pca,
  urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
  attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.'
)
map_pca <- addCircleMarkers(
  map_pca,
  radius = 3,
  color = ~cluster_palette(clusters_pca),
  stroke = FALSE,
  fillOpacity = 0.9
)
map_pca <- addLegend(
  map_pca,
  position = "bottomright",
  pal = cluster_palette,
  values = ~clusters_pca,
  title = "Cluster",
  opacity = 1
)
map_pca

# Express the PCA-based labels in the numbering of the full-data labels
result <- matchClasses(clusters, clusters_pca)
clusters_pca_sorted <- result$clusters

# Stations whose label differs between the two clusterings
points_diff <- clusters != clusters_pca_sorted

# Number of differing stations
num_diff_points <- sum(points_diff)

# Agreement rate, in percent
pourcentage_reussite <- (1 - num_diff_points / length(clusters)) * 100

# Report
cat("Nombre de points différents :", num_diff_points, "sur", length(clusters), "\n")
cat("Pourcentage de réussite :", sprintf("%.2f", pourcentage_reussite), "%\n")

In [None]:
library(caret)

# Contingency table of the two k-means partitions, in long format
conf_matrix <- table(clusters, clusters_pca)
conf_matrix_df <- as.data.frame(as.table(conf_matrix))

# Heat map of the table with the counts written in each tile
ggplot(
  data = conf_matrix_df,
  aes(x = clusters_pca, y = clusters, fill = Freq)
) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "black") +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(
    x = "Sur les données réduites (PCA)",
    y = "Sur les données complètes",
    title = "Matrice de confusion"
  ) +
  theme_minimal()

In [None]:
# Raw contingency table between the two partitions
print(conf_matrix)

#### 3.1.2. CAH : Agglomerative Clustering

In [None]:
options(repr.plot.width = 12, repr.plot.height = 6)

# Choice of the number of clusters for hierarchical clustering (hcut) on the
# PCA coordinates: WSS on the left, silhouette on the right
wss_plot <- fviz_nbclust(loading_pca, FUNcluster = hcut, method = "wss") +
  ggtitle("Score WSS")
sil_plot <- fviz_nbclust(loading_pca, FUNcluster = hcut, method = "silhouette") +
  ggtitle("Score Silhouette")
grid.arrange(wss_plot, sil_plot, ncol = 2)

In [None]:
# Euclidean distances between stations in the PCA space
distances <- dist(loading_pca, method = "euclidean")

# Hierarchical clustering with four linkage methods. Ward's criterion on
# (unsquared) Euclidean distances is "ward.D2"; "ward.D" would require
# squared distances. "ward.D2" is also the default method of `hcut`, which
# was used above to choose the number of clusters.
hclustaverage <- hclust(distances, method = "average")
hclustward <- hclust(distances, method = "ward.D2")
hclustsingle <- hclust(distances, method = "single")
hclustcomplete <- hclust(distances, method = "complete")

# Cut every tree into 4 clusters
reshclust_average <- cutree(hclustaverage, 4)
reshclust_ward <- cutree(hclustward, 4)
reshclust_complete <- cutree(hclustcomplete, 4)
reshclust_single <- cutree(hclustsingle, 4)

# Dendrograms with the 4 clusters outlined
fviz_dend(hclustaverage, k = 4, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Average")
fviz_dend(hclustward, k = 4, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Ward")
fviz_dend(hclustsingle, k = 4, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Single")
fviz_dend(hclustcomplete, k = 4, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Complete")

In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)

# Stations on PCA axes 1 and 2, coloured by the clusters of the Ward tree
fviz_pca(pca, axes=c(1,2), geom = c("point"), col.ind=as.factor(reshclust_ward))

In [None]:
# Map of the stations coloured by the clusters of the Ward tree
cluster_palette <- colorFactor(c("#E41A1C","#4DAF4A","#377EB8"), domain = unique(reshclust_ward))

map_ward <- leaflet(velib$position)
map_ward <- addTiles(
  map_ward,
  urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
  attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.'
)
map_ward <- addCircleMarkers(
  map_ward,
  radius = 3,
  color = ~cluster_palette(reshclust_ward),
  stroke = FALSE,
  fillOpacity = 0.9
)
map_ward <- addLegend(
  map_ward,
  position = "bottomright",
  pal = cluster_palette,
  values = ~reshclust_ward,
  title = "Cluster",
  opacity = 1
)
map_ward

In [None]:
# Mean loading profile of each cluster of the Ward tree
# (column 169 excluded from the averages)
mean_loadings_cah <- aggregate(
  loading[, -169],
  by = list(cluster = reshclust_ward),
  FUN = mean
)
mean_loadings_cah
mean_loadings_melted_cah <- melt(mean_loadings_cah, id.vars = "cluster")

ggplot(
  mean_loadings_melted_cah,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  scale_color_discrete(name = "Cluster") +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  )

In [None]:
# Cross-tabulation of the `bonus` flag against the clusters of the Ward tree,
# shown as a mosaic plot
tbl2 = table(coord$bonus,reshclust_ward)
print(tbl2)
mosaicplot(tbl2,color=c(1:4), main = "Stations en altitude en fonction du clusters")

#### 3.1.3. Gaussian Mixture Models

In [None]:
library(mclust)

In [None]:
# BIC of the Gaussian mixture models with 1 to 10 components on the PCA coordinates
resBICall = mclustBIC(loading_pca, G=1:10)
summary(resBICall)

# Fit over the same range of components; `resBICall` is overwritten with the
# Mclust result
resBICall = Mclust(loading_pca, G=1:10)
summary(resBICall)

# BIC plot of the fitted result
fviz_mclust(resBICall, what="BIC")

In [None]:
options(repr.plot.width = 10, repr.plot.height = 6)

# Gaussian mixture with 4 components and the "VVE" model on the PCA coordinates
gmm <- Mclust(loading_pca, G = 4, modelNames = "VVE")
clusters_gmm_loading <- gmm$classification
fviz_cluster(gmm, data = loading_pca, ellipse.type = "norm", geom = "point")
# --- #

# Largest value of each row of gmm$z, grouped by assigned cluster
aux <- data.frame(
  label = paste0("Cluster", clusters_gmm_loading),
  proba = apply(gmm$z, 1, max)
)

ggplot(aux, aes(x = label, y = proba)) +
  geom_boxplot(colour = 1:4, fill = 1:4, alpha = .4)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 10)

# Stations coloured by GMM cluster on the PCA planes (1,2), (1,3) and (2,3)
plot_12 <- fviz_pca(pca, axes = c(1, 2), geom = c("point"), col.ind = as.factor(clusters_gmm_loading))
plot_13 <- fviz_pca(pca, axes = c(1, 3), geom = c("point"), col.ind = as.factor(clusters_gmm_loading))
plot_23 <- fviz_pca(pca, axes = c(2, 3), geom = c("point"), col.ind = as.factor(clusters_gmm_loading))
grid.arrange(plot_12, plot_13, plot_23, ncol = 3)

In [None]:
# Map of the stations coloured by GMM cluster
cluster_palette <- colorFactor(c("#E41A1C","#4DAF4A","#377EB8"), domain = unique(clusters_gmm_loading))

map_gmm <- leaflet(velib$position)
map_gmm <- addTiles(
  map_gmm,
  urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
  attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.'
)
map_gmm <- addCircleMarkers(
  map_gmm,
  radius = 3,
  color = ~cluster_palette(clusters_gmm_loading),
  stroke = FALSE,
  fillOpacity = 0.9
)
map_gmm <- addLegend(
  map_gmm,
  position = "bottomright",
  pal = cluster_palette,
  values = ~clusters_gmm_loading,
  title = "Cluster",
  opacity = 1
)
map_gmm
### same as for the first map

In [None]:
# Mean loading profile of each GMM cluster
# (column 169 excluded from the averages)
mean_loadings_gmm <- aggregate(
  loading[, -169],
  by = list(cluster = clusters_gmm_loading),
  FUN = mean
)
mean_loadings_gmm
mean_loadings_melted_gmm <- melt(mean_loadings_gmm, id.vars = "cluster")

ggplot(
  mean_loadings_melted_gmm,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  scale_color_discrete(name = "Cluster") +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  )

In [None]:
# Cross-tabulation of the `bonus` flag against the GMM clusters, shown as a
# mosaic plot (the variable is named `tbl2_cah` although it holds GMM labels)
tbl2_cah = table(coord$bonus,clusters_gmm_loading)
print(tbl2_cah)
mosaicplot(tbl2_cah,color=c(1:4), main = "Stations en altitude en fonction du clusters")

In [None]:
# Contingency table between the GMM clusters and the k-means clusters, both
# obtained on the PCA coordinates
conf_matrix <- table(clusters_gmm_loading, clusters_pca)
conf_matrix_df <- as.data.frame(as.table(conf_matrix))

# The axis labels name the two methods being compared: the previous labels
# ("données réduites" vs "données complètes") were copied from the
# full-data/PCA comparison and did not describe this table.
ggplot(data = conf_matrix_df, aes(x = clusters_pca, y = clusters_gmm_loading, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "black") +
  scale_fill_gradient(low = "white", high = "blue") +
  xlab("K-means (données réduites, PCA)") +
  ylab("GMM (données réduites, PCA)") +
  ggtitle("Matrice de confusion") +
  theme_minimal()

Les résultats du GMM sont trop éloignés de ceux des autres méthodes : le package R renvoie des résultats différents de ceux obtenus en Python.

### 3.2. CA : comparaison entre Kmeans et Colline

Voir notebook python

## 4. Etude sur le jeu de données par jour

### 4.1. ACP

In [None]:
#Création du jeu de données data_jours 

In [None]:
# Mean loading of each station for each day (24 consecutive columns per day)
lundi    <- rowMeans(loading[, 1:24])
mardi    <- rowMeans(loading[, 25:48])
mercredi <- rowMeans(loading[, 49:72])
jeudi    <- rowMeans(loading[, 73:96])
vendredi <- rowMeans(loading[, 97:120])
samedi   <- rowMeans(loading[, 121:144])
dimanche <- rowMeans(loading[, 145:168])

# Daily dataset: 7 numeric columns, then the station name and the `bonus`
# flag, both stored as factors
data_jours <- data.frame(lundi, mardi, mercredi, jeudi, vendredi, samedi, dimanche)
names(data_jours) <- c(
  "Lundi", "Mardi", "Mercredi", "Jeudi", "Vendredi", "Samedi", "Dimanche"
)
data_jours$Station <- as.factor(velib$names)
data_jours$Hill <- as.factor(coord$bonus)
head(data_jours)

In [None]:
# PCA on the daily dataset; columns 8 and 9 (Station, Hill) are passed as
# supplementary qualitative variables
pca_2 <- PCA(data_jours, scale.unit = TRUE,
           graph = FALSE, quali.sup = c(8,9))

In [None]:
# Cumulative values of the second column of pca_2$eig
plot(cumsum(pca_2$eig[,2]), type = "l")
# First component at which that cumulative value reaches 85
n_components <- which(cumsum(pca_2$eig[,2]) >= 85)[1]
cat("On garde", n_components, "composants pour l'ACP\n")

In [None]:
# Same PCA, keeping 3 components; the station coordinates on these
# components are kept as the reduced daily dataset
pca_2 <- PCA(
  data_jours,
  scale.unit = TRUE,
  graph = FALSE,
  quali.sup = c(8, 9),
  ncp = 3
)
data_jours_pca <- pca_2$ind$coord
print(pca_2$eig[1:3, 2])
cat("--- PCA ---\n")
cat("Dimension initiale :", dim(data_jours), "\n")
cat("Dimension après projection :", c(nrow(data_jours), 3), "\n\n")

# First and second columns of pca_2$eig for the three components
cat("--- Variance expliquée ---\n")
for (k in 1:3) {
  cat(
    "Composante", k, ":", round(pca_2$eig[k, 1], 2),
    "i.e.", round(pca_2$eig[k, 2], 2), "% de la variance totale\n"
  )
}

In [None]:
# Scree plot restricted to the first 3 components
g2 <- fviz_eig(pca_2, addlabels = TRUE, ylim = c(0, 40), xlim = c(1, 3)) +
  labs(
    title = "Proportion de Variance Expliquée par Chaque Composante Principale",
    x = "Composantes Principales",
    y = "Pourcentage de Variance Expliquée"
  )
print(g2)

In [None]:
# Distribution of the station coordinates on each principal component
box <- ggplot(melt(pca_2$ind$coord), aes(x = Var2, y = value)) +
  geom_boxplot() +
  labs(x = "", y = "", title = "Boxplot des 3 composantes principales")
print(box)

In [None]:
# Variable plot of the daily PCA on axes 1 and 2
g1 <- fviz_pca_var(pca_2, axes = c(1, 2)) +
  labs(title = "Graphe des variables", x = "PC1", y = "PC2")

print(g1)

#### Clustering

#### 4.1.1. Méthode de clustering avec k-means

In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)
# Choice of the number of clusters on the daily data (columns 8 and 9
# excluded): within sum of squares, then silhouette
fviz_nbclust(data_jours[,-c(8,9)], FUNcluster=stats::kmeans, method="wss") +
    ggtitle("Within sum of square (WSS) according to the number of clusters")

fviz_nbclust(data_jours[,-c(8,9)], FUNcluster=stats::kmeans, method="silhouette") +
    ggtitle("Silhouette score according to the number of clusters")

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)

# Silhouette plots for 2 to 5 clusters on the daily data
# (columns 8 and 9 excluded)
for (centers in 2:5) {
  kmeans_jours <- kmeans(data_jours[, -c(8, 9)], centers = centers)
  sil <- silhouette(kmeans_jours$cluster, dist(data_jours[, -c(8, 9)]))
  visualizer <- fviz_silhouette(
    sil,
    ylim = c(-0.2, 0.6),
    main = paste("centers =", centers)
  )
  print(visualizer)
}

In [None]:
# k-means with 3 clusters on the daily data. This section studies
# `data_jours`, so the model is fitted on its 7 numeric columns (columns 8
# and 9 are factors) and not on the hourly `loading` table.
K <- 3
kmeans_jours <- kmeans(data_jours[, -c(8, 9)], centers = K)
clusters_jours <- kmeans_jours$cluster

In [None]:
# Number of stations in each cluster of `clusters_jours`
cluster_counts_jours <- table(clusters_jours)
barplot(cluster_counts_jours, 
        main = "Nombre d'individus par cluster", 
        xlab = "Cluster", 
        ylab = "Nombre d'individus")

In [None]:
# Mean hourly loading profile of each cluster of `clusters_jours`
# (column 169 excluded from the averages)
mean_loadings3 <- aggregate(
  loading[, -169],
  by = list(cluster = clusters_jours),
  FUN = mean
)
mean_loadings3
mean_loadings_melted3 <- melt(mean_loadings3, id.vars = "cluster")

ggplot(
  mean_loadings_melted3,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  scale_color_discrete(name = "Cluster") +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  )

In [None]:
# Cross-tabulation of the `bonus` flag against `clusters_jours`, shown as a
# mosaic plot
tbl3 = table(coord$bonus,clusters_jours)
print(tbl3)
mosaicplot(tbl3,color=c(1:3), main = "Stations en altitude en fonction du clusters")

Etude sur data_jours_pca

In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)
# Choice of the number of clusters on the daily PCA coordinates:
# within sum of squares, then silhouette
fviz_nbclust(data_jours_pca, FUNcluster=stats::kmeans, method="wss") +
    ggtitle("Within sum of square (WSS) according to the number of clusters")

fviz_nbclust(data_jours_pca, FUNcluster=stats::kmeans, method="silhouette") +
    ggtitle("Silhouette score according to the number of clusters")

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)

# Silhouette plots for 2 to 5 clusters on the daily PCA coordinates
for (centers in 2:5) {
  kmeans_PCA_jours <- kmeans(data_jours_pca, centers = centers)
  sil <- silhouette(kmeans_PCA_jours$cluster, dist(data_jours_pca))
  visualizer <- fviz_silhouette(
    sil,
    ylim = c(-0.2, 0.6),
    main = paste("centers =", centers)
  )
  print(visualizer)
}

In [None]:
# k-means with 3 clusters on the daily PCA coordinates
K=3
kmeans_PCA_jours = kmeans(data_jours_pca,centers=K)
clusters_jours_pca <- kmeans_PCA_jours$cluster

In [None]:
# Express the PCA-based labels in the numbering of `clusters_jours`
result <- matchClasses(clusters_jours, clusters_jours_pca)
clusters_jours_pca_sorted <- result$clusters

In [None]:
# Number of stations per realigned PCA-based cluster
# (this overwrites `cluster_counts_jours` computed earlier)
cluster_counts_jours <- table(clusters_jours_pca_sorted)
barplot(cluster_counts_jours, 
        main = "Nombre d'individus par cluster", 
        xlab = "Cluster", 
        ylab = "Nombre d'individus")

In [None]:
# Cluster plot for the k-means fitted on the daily PCA coordinates.
# NOTE(review): `kmeans_PCA_jours` was fitted on `data_jours_pca`, but the plot
# is given `data_jours[,-c(8,9)]` as data — confirm this is intended.
fviz_cluster(kmeans_PCA_jours, data=data_jours[,-c(8,9)], ellipse.type="norm", labelsize=8, geom=c("point"))

In [None]:
# Cross-tabulation of the `bonus` flag against the realigned PCA-based
# clusters, shown as a mosaic plot
tbl4 = table(coord$bonus,clusters_jours_pca_sorted)
print(tbl4)
mosaicplot(tbl4,color=c(1:3), main = "Stations en altitude en fonction du clusters")

In [None]:
# Mean hourly loading profile of each realigned PCA-based cluster
# (column 169 excluded from the averages)
mean_loadings_4 <- aggregate(
  loading[, -169],
  by = list(cluster = clusters_jours_pca_sorted),
  FUN = mean
)
mean_loadings_4
mean_loadings_melted_4 <- melt(mean_loadings_4, id.vars = "cluster")

ggplot(
  mean_loadings_melted_4,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  scale_color_discrete(name = "Cluster") +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  )

In [None]:
# Express the PCA-based labels in the numbering of `clusters_jours`
result <- matchClasses(clusters_jours, clusters_jours_pca)
clusters_jours_pca_sorted <- result$clusters

# Map of the clusters in `clusters_jours`
cluster_palette <- colorFactor(c("#E41A1C","#4DAF4A","#377EB8"), domain = unique(clusters_jours))

leaflet(velib$position) %>%
  addTiles(urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
           attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.') %>%
  addCircleMarkers(radius = 3,
                   color = ~cluster_palette(clusters_jours),
                   stroke = FALSE,
                   fillOpacity = 0.9) %>%
  addLegend(position = "bottomright",
            pal = cluster_palette,
            values = ~clusters_jours,
            title = "Cluster",
            opacity = 1)

# Map of the realigned clusters obtained on the daily PCA coordinates
cluster_palette <- colorFactor(c("#E41A1C","#4DAF4A","#377EB8"), domain = unique(clusters_jours_pca_sorted))

leaflet(velib$position) %>%
  addTiles(urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
           attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.') %>%
  addCircleMarkers(radius = 3,
                   color = ~cluster_palette(clusters_jours_pca_sorted),
                   stroke = FALSE,
                   fillOpacity = 0.9) %>%
  addLegend(position = "bottomright",
            pal = cluster_palette,
            values = ~clusters_jours_pca_sorted,
            title = "Cluster",
            opacity = 1)


# Stations whose label differs between the two clusterings
points_diff <- clusters_jours != clusters_jours_pca_sorted

# Number of differing stations
num_diff_points <- sum(points_diff)

# Agreement rate, in percent. The total is the length of the vectors being
# compared here (`clusters_jours`), not of `clusters`, which belongs to the
# hourly study of section 3.
n_points <- length(clusters_jours)
pourcentage_reussite <- (1 - num_diff_points / n_points) * 100

# Report
cat("Nombre de points différents :", num_diff_points, "sur", n_points, "\n")
cat("Pourcentage de réussite :", sprintf("%.2f", pourcentage_reussite), "%\n")

In [None]:
library(caret)
# Cross-tabulate both label sets, then turn the table into long format
# (one row per cell, count in `Freq`) for the heatmap.
conf_matrix <- table(clusters_jours, clusters_jours_pca_sorted)
conf_matrix_df <- as.data.frame(conf_matrix)

ggplot(
  conf_matrix_df,
  aes(x = clusters_jours_pca_sorted, y = clusters_jours, fill = Freq)
) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "black") +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(
    title = "Matrice de confusion",
    x = "Sur les données réduites (PCA)",
    y = "Sur les données complètes"
  ) +
  theme_minimal()

In [None]:
# Print the raw counts of the contingency table stored in conf_matrix.
print(conf_matrix)

#### 4.1.2. CAH : Agglomerative Clustering

In [None]:
options(repr.plot.width = 12, repr.plot.height = 6)
# WSS and silhouette curves for hierarchical clustering (hcut) on the PCA
# scores, shown side by side.
grid.arrange(
  fviz_nbclust(data_jours_pca, FUNcluster = hcut, method = "wss") +
    ggtitle("Score WSS"),
  fviz_nbclust(data_jours_pca, FUNcluster = hcut, method = "silhouette") +
    ggtitle("Score Silhouette"),
  ncol = 2
)

In [None]:
# Hierarchical clustering of the stations on the PCA scores with four
# linkage rules, each tree being cut into 3 groups.
d2 <- dist(data_jours_pca, method = "euclidean")

hclustaverage2 <- hclust(d2, method = "average")
# "ward.D2" squares the dissimilarities before applying Ward's update, which
# is the variant to use with plain (unsquared) Euclidean distances; "ward.D"
# only matches Ward's criterion when it is fed squared distances.
hclustward2 <- hclust(d2, method = "ward.D2")
hclustsingle2 <- hclust(d2, method = "single")
hclustcomplete2 <- hclust(d2, method = "complete")

reshclust_average2 <- cutree(hclustaverage2, 3)
reshclust_ward2 <- cutree(hclustward2, 3)
reshclust_single2 <- cutree(hclustsingle2, 3)
reshclust_complete2 <- cutree(hclustcomplete2, 3)

# One dendrogram per linkage, with the 3 groups boxed.
fviz_dend(hclustaverage2, k = 3, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Average")
fviz_dend(hclustward2, k = 3, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Ward")
fviz_dend(hclustsingle2, k = 3, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Single")
fviz_dend(hclustcomplete2, k = 3, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Complete")

In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)

# Individuals of pca_2 on axes 1-2, coloured by the Ward cluster labels.
fviz_pca(
  pca_2,
  axes = c(1, 2),
  geom = "point",
  col.ind = as.factor(reshclust_ward2)
)

In [None]:
# Three fixed colours mapped onto the Ward cluster labels.
cluster_palette <- colorFactor(
  c("#E41A1C", "#4DAF4A", "#377EB8"),
  domain = unique(reshclust_ward2)
)

# Station map: one filled circle per station, coloured by its cluster.
leaflet(velib$position) %>%
  addTiles(
    urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
    attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.'
  ) %>%
  addCircleMarkers(
    radius = 3,
    color = ~cluster_palette(reshclust_ward2),
    stroke = FALSE,
    fillOpacity = 0.9
  ) %>%
  addLegend(
    position = "bottomright",
    pal = cluster_palette,
    values = ~reshclust_ward2,
    title = "Cluster",
    opacity = 1
  )

In [None]:
# Mean of every `loading` column per Ward cluster, displayed and then plotted
# as one line per cluster.
mean_loadings_5 <- aggregate(
  loading[, -169],
  by = list(cluster = reshclust_ward2),
  FUN = mean
)
mean_loadings_5
mean_loadings_melted_5 <- melt(mean_loadings_5, id.vars = "cluster")

ggplot(
  mean_loadings_melted_5,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  ) +
  scale_color_discrete(name = "Cluster")

In [None]:
# Bonus flag against the Ward cluster labels: printed table and mosaic plot.
tbl5 <- table(coord$bonus, reshclust_ward2)
print(tbl5)
mosaicplot(
  tbl5,
  color = 1:3,
  main = "Stations en altitude en fonction du clusters"
)

#### 4.1.3. Gaussian Mixture Models

In [None]:
# BIC of Gaussian mixture models with 1 to 10 components on the PCA scores.
resBICall2 <- mclustBIC(data_jours_pca, G = 1:10)
summary(resBICall2)

# The same name is reused for the model fitted by Mclust over the same range.
resBICall2 <- Mclust(data_jours_pca, G = 1:10)
summary(resBICall2)

fviz_mclust(resBICall2, what = "BIC")

In [None]:
options(repr.plot.width = 10, repr.plot.height = 6)
# 3-component mixture with the "VEE" covariance model on the PCA scores.
gmm_jours <- Mclust(data_jours_pca, G = 3, modelNames = "VEE")
fviz_cluster(
  gmm_jours,
  data = data_jours_pca,
  ellipse.type = "norm",
  geom = "point"
)

# For each observation: its assigned component and the largest membership
# probability (row-wise maximum of the z matrix).
aux <- data.frame(
  label = paste0("Cluster", gmm_jours$classification),
  proba = apply(gmm_jours$z, 1, max)
)

ggplot(aux, aes(x = label, y = proba)) +
  geom_boxplot(colour = 1:3, fill = 1:3, alpha = .4)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 10)

# Individuals of the day-level PCA (pca_2, the object used for the other
# cluster plots of this section) coloured by the component assigned by
# gmm_jours, the mixture model fitted in this section.
fviz_pca(
  pca_2,
  axes = c(1, 2),
  geom = "point",
  col.ind = as.factor(gmm_jours$classification)
)

In [None]:
# Three fixed colours, one per component of gmm_jours (fitted with G = 3 in
# this section).
cluster_palette <- colorFactor(
  c("#E41A1C", "#4DAF4A", "#377EB8"),
  domain = unique(gmm_jours$classification)
)

# Station map: one filled circle per station, coloured by its GMM component.
leaflet(velib$position) %>%
  addTiles(
    urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
    attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.'
  ) %>%
  addCircleMarkers(
    radius = 3,
    color = ~cluster_palette(gmm_jours$classification),
    stroke = FALSE,
    fillOpacity = 0.9
  ) %>%
  addLegend(
    position = "bottomright",
    pal = cluster_palette,
    values = ~gmm_jours$classification,
    title = "Cluster",
    opacity = 1
  )

In [None]:
# Mean of every `loading` column per component of gmm_jours (the mixture
# model fitted in this section), displayed and plotted as one line per
# component.
mean_loadings_2gmm <- aggregate(
  loading[, -169],
  by = list(cluster = gmm_jours$classification),
  FUN = mean
)
mean_loadings_2gmm
mean_loadings_melted_2gmm <- melt(mean_loadings_2gmm, id.vars = "cluster")

ggplot(
  mean_loadings_melted_2gmm,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  ) +
  scale_color_discrete(name = "Cluster")

In [None]:
# Bonus flag against the components of gmm_jours (the mixture model fitted in
# this section): printed table and mosaic plot.
tbl2_ch2 <- table(coord$bonus, gmm_jours$classification)
print(tbl2_ch2)
mosaicplot(
  tbl2_ch2,
  color = 1:3,
  main = "Stations en altitude en fonction du clusters"
)

### CA entre CAH et Kmeans 

In [None]:
# Contingency table between a k-means partition and the Ward partition.
# NOTE(review): reskmeans2 is not assigned in this section; confirm it holds
# the day-level k-means result.
table_contingence <- table(reskmeans2$cluster, reshclust_ward2)
table_contingence

In [None]:
# Correspondence analysis of the contingency table, shown as a biplot.
res.ca <- CA(table_contingence, graph = FALSE)
fviz_ca_biplot(res.ca, repel = TRUE)

In [None]:
# Contingency table between a k-means partition and a GMM classification.
# NOTE(review): reskmeans2 and resBIC2 are not assigned in this section;
# confirm they hold the day-level k-means and GMM results.
table_contingence2 <- table(reskmeans2$cluster, resBIC2$classification)
table_contingence2

In [None]:
# Correspondence analysis of the second contingency table, shown as a biplot.
res.ca2 <- CA(table_contingence2, graph = FALSE)
fviz_ca_biplot(res.ca2, repel = TRUE)

## 5. Etude sur le jeu de données par heures

### 5.1. ACP

In [None]:
# Build the data_heures dataset

In [None]:
# Hour-of-day averages: for a starting column h of `loading`, average that
# column and the ones 24, 48, ..., 144 positions after it, station by station.
moyenne_horaire <- function(h) rowMeans(loading[, h + 24 * (0:6)])

minuit_am <- moyenne_horaire(1)
une_am <- moyenne_horaire(2)
deux_am <- moyenne_horaire(3)
trois_am <- moyenne_horaire(4)
quatre_am <- moyenne_horaire(5)
cinq_am <- moyenne_horaire(6)
six_am <- moyenne_horaire(7)
sept_am <- moyenne_horaire(8)
huit_am <- moyenne_horaire(9)
neuf_am <- moyenne_horaire(10)
dix_am <- moyenne_horaire(11)
onze_am <- moyenne_horaire(12)
minuit_pm <- moyenne_horaire(13)
une_pm <- moyenne_horaire(14)
deux_pm <- moyenne_horaire(15)
trois_pm <- moyenne_horaire(16)
quatre_pm <- moyenne_horaire(17)
cinq_pm <- moyenne_horaire(18)
six_pm <- moyenne_horaire(19)
sept_pm <- moyenne_horaire(20)
huit_pm <- moyenne_horaire(21)
neuf_pm <- moyenne_horaire(22)
dix_pm <- moyenne_horaire(23)
onze_pm <- moyenne_horaire(24)

# The column names are overwritten on the next statement, so the vectors are
# passed without explicit names.
data_heures <- data.frame(
  minuit_am, une_am, deux_am, trois_am, quatre_am, cinq_am, six_am, sept_am,
  huit_am, neuf_am, dix_am, onze_am, minuit_pm, une_pm, deux_pm, trois_pm,
  quatre_pm, cinq_pm, six_pm, sept_pm, huit_pm, neuf_pm, dix_pm, onze_pm
)

colnames(data_heures) <- paste0(0:23, "h")
# Columns 25 and 26: station name and bonus flag, both stored as factors.
data_heures$Station <- as.factor(velib$names)
data_heures$Hill <- as.factor(coord$bonus)
head(data_heures)

In [None]:
# First PCA on the hourly averages (scaled variables); columns 25 and 26
# (Station, Hill) are supplementary qualitative variables.
pca_3 <- PCA(
  data_heures,
  scale.unit = TRUE,
  graph = FALSE,
  quali.sup = c(25, 26)
)


In [None]:
# Cumulative percentage of explained variance, and the first component index
# at which it reaches 90 %.
plot(cumsum(pca_3$eig[, 2]), type = "l")
n_components <- which(cumsum(pca_3$eig[, 2]) >= 90)[1]
cat("On garde", n_components, "composants pour l'ACP\n")

In [None]:
# Refit the PCA keeping 3 components and store the station scores.
pca_3 <- PCA(
  data_heures,
  scale.unit = TRUE,
  graph = FALSE,
  quali.sup = c(25, 26),
  ncp = 3
)
data_heures_pca <- pca_3$ind$coord
print(pca_3$eig[1:3, 2])
cat('--- PCA ---\n')
cat('Dimension initiale :', dim(data_heures), '\n')
# Report the size of the projected hourly data itself instead of borrowing
# the row count of the day-level table.
cat('Dimension après projection :', dim(data_heures_pca), '\n\n')

# Eigenvalue (column 1) and percentage of variance (column 2) of each kept
# component.
cat('--- Variance expliquée ---\n')
cat('Composante 1 :', round(pca_3$eig[1, 1], 2), 'i.e.', round(pca_3$eig[1, 2], 2), '% de la variance totale\n')
cat('Composante 2 :', round(pca_3$eig[2, 1], 2), 'i.e.', round(pca_3$eig[2, 2], 2), '% de la variance totale\n')
cat('Composante 3 :', round(pca_3$eig[3, 1], 2), 'i.e.', round(pca_3$eig[3, 2], 2), '% de la variance totale\n')

In [None]:
# Scree plot restricted to the first three components.
g3 <- fviz_eig(pca_3, addlabels = TRUE, ylim = c(0, 60), xlim = c(1, 3)) +
  labs(
    title = "Proportion de Variance Expliquée par Chaque Composante Principale",
    x = "Composantes Principales",
    y = "Pourcentage de Variance Expliquée"
  )
print(g3)

In [None]:
# Distribution of the station scores on each kept component.
box <- ggplot(melt(pca_3$ind$coord), aes(x = Var2, y = value)) +
  geom_boxplot() +
  labs(
    title = "Boxplot des 3 composantes principales",
    x = "",
    y = ""
  )
print(box)

In [None]:
# Variable plot on components 1 and 2.
g4 <- fviz_pca_var(pca_3, axes = c(1, 2)) +
  labs(
    title = "Graphe des variables",
    x = "PC1",
    y = "PC2"
  )

print(g4)

In [None]:
# Variable plot on components 1 and 3; the vertical axis is therefore
# labelled PC3.
g5 <- fviz_pca_var(pca_3, axes = c(1, 3)) +
  ggtitle("Graphe des variables") +
  xlab("PC1") +
  ylab("PC3")

print(g5)

#### 5.1.1. Méthode de clustering avec k-means

In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)
# Elbow (WSS) curve for k-means on the hourly averages (columns 25 and 26
# are excluded).
fviz_nbclust(
  data_heures[, -c(25, 26)],
  FUNcluster = stats::kmeans,
  method = "wss"
) +
  ggtitle("Within sum of square (WSS) according to the number of clusters")

# Silhouette curve on the same data.
fviz_nbclust(
  data_heures[, -c(25, 26)],
  FUNcluster = stats::kmeans,
  method = "silhouette"
) +
  ggtitle("Silhouette score according to the number of clusters")

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)
# Silhouette plot of a k-means fit on the hourly averages for 2 to 5 centers.
for (centers in 2:5) {
  kmeans_heures <- kmeans(data_heures[, -c(25, 26)], centers = centers)
  sil <- silhouette(
    kmeans_heures$cluster,
    dist(data_heures[, -c(25, 26)])
  )
  visualizer <- fviz_silhouette(
    sil,
    ylim = c(-0.2, 0.6),
    main = paste("centers =", centers)
  )
  print(visualizer)
}

In [None]:
# Final k-means on the hourly averages, i.e. the same table (data_heures
# without columns 25 and 26) on which the number of clusters was selected,
# rather than on the full `loading` matrix.
K <- 4
kmeans_heures <- kmeans(data_heures[, -c(25, 26)], centers = K)
clusters_heures <- kmeans_heures$cluster

In [None]:
# Number of stations in each cluster, as a bar chart.
cluster_counts_heures <- table(clusters_heures)
barplot(
  cluster_counts_heures,
  main = "Nombre d'individus par cluster",
  xlab = "Cluster",
  ylab = "Nombre d'individus"
)

In [None]:
# Mean of every `loading` column per cluster, displayed and plotted as one
# line per cluster.
mean_loadings5 <- aggregate(
  loading[, -c(169)],
  by = list(cluster = clusters_heures),
  FUN = mean
)
mean_loadings5
mean_loadings_melted5 <- melt(mean_loadings5, id.vars = "cluster")

ggplot(
  mean_loadings_melted5,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  ) +
  scale_color_discrete(name = "Cluster")

In [None]:
# Bonus flag against the hourly k-means labels: printed table and mosaic plot.
tbl5 <- table(coord$bonus, clusters_heures)
print(tbl5)
mosaicplot(
  tbl5,
  color = 1:4,
  main = "Stations en altitude en fonction du clusters"
)

Etude sur data_heures_pca

In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)
# Elbow (WSS) curve for k-means on the PCA scores of the hourly data.
fviz_nbclust(data_heures_pca, FUNcluster = stats::kmeans, method = "wss") +
  ggtitle("Within sum of square (WSS) according to the number of clusters")

# Silhouette curve on the same scores.
fviz_nbclust(
  data_heures_pca,
  FUNcluster = stats::kmeans,
  method = "silhouette"
) +
  ggtitle("Silhouette score according to the number of clusters")

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)
# Silhouette plot of a k-means fit on the PCA scores for 2 to 5 centers.
for (centers in 2:5) {
  kmeans_PCA_heures <- kmeans(data_heures_pca, centers = centers)
  sil <- silhouette(kmeans_PCA_heures$cluster, dist(data_heures_pca))
  visualizer <- fviz_silhouette(
    sil,
    ylim = c(-0.2, 0.6),
    main = paste("centers =", centers)
  )
  print(visualizer)
}

In [None]:
# Final k-means with 4 centers on the PCA scores of the hourly data.
K <- 4
kmeans_PCA_heures <- kmeans(data_heures_pca, centers = K)
clusters_heures_pca <- kmeans_PCA_heures$cluster

In [None]:
# Number of stations in each PCA-based cluster, as a bar chart
# (the name cluster_counts_heures is reused).
cluster_counts_heures <- table(clusters_heures_pca)
barplot(
  cluster_counts_heures,
  main = "Nombre d'individus par cluster",
  xlab = "Cluster",
  ylab = "Nombre d'individus"
)

In [None]:
# Draw the partition stored in kmeans_PCA_heures over the hourly averages
# (columns 25 and 26 are left out), with normal-theory ellipses.
fviz_cluster(
  kmeans_PCA_heures,
  data = data_heures[, -c(25, 26)],
  ellipse.type = "norm",
  labelsize = 8,
  geom = "point"
)

In [None]:
# Bonus flag against the PCA-based k-means labels: printed table and mosaic
# plot.
tbl5_2 <- table(coord$bonus, clusters_heures_pca)
print(tbl5_2)
mosaicplot(
  tbl5_2,
  color = 1:4,
  main = "Stations en altitude en fonction du clusters"
)

In [None]:
# Mean of every `loading` column per PCA-based cluster, displayed and plotted
# as one line per cluster.
mean_loadings6 <- aggregate(
  loading[, -c(169)],
  by = list(cluster = clusters_heures_pca),
  FUN = mean
)
mean_loadings6
mean_loadings_melted6 <- melt(mean_loadings6, id.vars = "cluster")

ggplot(
  mean_loadings_melted6,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  ) +
  scale_color_discrete(name = "Cluster")

In [None]:
# Pass both label vectors to matchClasses and keep the `clusters` element of
# its result.
# NOTE(review): matchClasses is defined outside this cell; presumably it
# relabels the second partition to agree with the first — confirm.
result <- matchClasses(clusters_heures, clusters_heures_pca)
res_pca_sorted <- result$clusters

In [None]:
# Map 1: stations coloured by clusters_heures.
# Three fixed colours; colorFactor spreads them over the cluster labels.
cluster_palette <- colorFactor(
  c("#E41A1C", "#4DAF4A", "#377EB8"),
  domain = unique(clusters_heures)
)

leaflet(velib$position) %>%
  addTiles(
    urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
    attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.'
  ) %>%
  addCircleMarkers(
    radius = 3,
    color = ~cluster_palette(clusters_heures),
    stroke = FALSE,
    fillOpacity = 0.9
  ) %>%
  addLegend(
    position = "bottomright",
    pal = cluster_palette,
    values = ~clusters_heures,
    title = "Cluster",
    opacity = 1
  )

# Map 2: same stations coloured by the relabelled PCA-based clusters.
cluster_palette <- colorFactor(
  c("#E41A1C", "#4DAF4A", "#377EB8"),
  domain = unique(res_pca_sorted)
)

leaflet(velib$position) %>%
  addTiles(
    urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
    attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.'
  ) %>%
  addCircleMarkers(
    radius = 3,
    color = ~cluster_palette(res_pca_sorted),
    stroke = FALSE,
    fillOpacity = 0.9
  ) %>%
  addLegend(
    position = "bottomright",
    pal = cluster_palette,
    values = ~res_pca_sorted,
    title = "Cluster",
    opacity = 1
  )


# TRUE for every station whose two cluster labels disagree.
points_diff <- clusters_heures != res_pca_sorted

# Number of stations labelled differently by the two clusterings.
num_diff_points <- sum(points_diff)

# Agreement rate, computed over the vector that is actually compared
# (clusters_heures) rather than an unrelated `clusters` object.
pourcentage_reussite <- (1 - num_diff_points / length(clusters_heures)) * 100

# Report the result.
cat("Nombre de points différents :", num_diff_points, "sur", length(clusters_heures), "\n")
cat("Pourcentage de réussite :", sprintf("%.2f", pourcentage_reussite), "%\n")

In [None]:
# Cross-tabulate both label sets, then turn the table into long format
# (one row per cell, count in `Freq`) for the heatmap.
conf_matrix <- table(clusters_heures, res_pca_sorted)
conf_matrix_df <- as.data.frame(conf_matrix)

ggplot(
  conf_matrix_df,
  aes(x = res_pca_sorted, y = clusters_heures, fill = Freq)
) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "black") +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(
    title = "Matrice de confusion",
    x = "Sur les données réduites (PCA)",
    y = "Sur les données complètes"
  ) +
  theme_minimal()

In [None]:
# Print the raw counts of the contingency table stored in conf_matrix.
print(conf_matrix)

#### 5.1.2. CAH : Agglomerative Clustering

In [None]:
options(repr.plot.width = 12, repr.plot.height = 6)
# WSS and silhouette curves for hierarchical clustering (hcut) on the PCA
# scores of the hourly data, shown side by side.
grid.arrange(
  fviz_nbclust(data_heures_pca, FUNcluster = hcut, method = "wss") +
    ggtitle("Score WSS"),
  fviz_nbclust(data_heures_pca, FUNcluster = hcut, method = "silhouette") +
    ggtitle("Score Silhouette"),
  ncol = 2
)

In [None]:
# Hierarchical clustering of the stations on the PCA scores of the hourly
# data with four linkage rules, each tree being cut into 4 groups.
d2 <- dist(data_heures_pca, method = "euclidean")

hclustaverage2 <- hclust(d2, method = "average")
# "ward.D2" squares the dissimilarities before applying Ward's update, which
# is the variant to use with plain (unsquared) Euclidean distances; "ward.D"
# only matches Ward's criterion when it is fed squared distances.
hclustward2 <- hclust(d2, method = "ward.D2")
hclustsingle2 <- hclust(d2, method = "single")
hclustcomplete2 <- hclust(d2, method = "complete")

reshclust_average2 <- cutree(hclustaverage2, 4)
reshclust_ward2 <- cutree(hclustward2, 4)
reshclust_single2 <- cutree(hclustsingle2, 4)
reshclust_complete2 <- cutree(hclustcomplete2, 4)


options(repr.plot.width = 10, repr.plot.height = 10)

# One dendrogram per linkage, with the 4 groups boxed.
fviz_dend(hclustaverage2, k = 4, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Average")
fviz_dend(hclustward2, k = 4, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Ward")
fviz_dend(hclustsingle2, k = 4, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Single")
fviz_dend(hclustcomplete2, k = 4, show_labels = FALSE, rect = TRUE, main = "Dendrogram avec linkage Complete")

In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)

# The Ward labels come from the hourly PCA scores, so they are displayed on
# the hourly PCA (pca_3), not on the day-level one.
fviz_pca(
  pca_3,
  axes = c(1, 2),
  geom = "point",
  col.ind = as.factor(reshclust_ward2)
)

In [None]:
# Three fixed colours; colorFactor spreads them over the Ward cluster labels.
cluster_palette <- colorFactor(
  c("#E41A1C", "#4DAF4A", "#377EB8"),
  domain = unique(reshclust_ward2)
)

# Station map: one filled circle per station, coloured by its cluster.
leaflet(velib$position) %>%
  addTiles(
    urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
    attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.'
  ) %>%
  addCircleMarkers(
    radius = 3,
    color = ~cluster_palette(reshclust_ward2),
    stroke = FALSE,
    fillOpacity = 0.9
  ) %>%
  addLegend(
    position = "bottomright",
    pal = cluster_palette,
    values = ~reshclust_ward2,
    title = "Cluster",
    opacity = 1
  )

In [None]:
# Mean of every `loading` column per Ward cluster, displayed and plotted as
# one line per cluster.
mean_loadings_7 <- aggregate(
  loading[, -169],
  by = list(cluster = reshclust_ward2),
  FUN = mean
)
mean_loadings_7
mean_loadings_melted_7 <- melt(mean_loadings_7, id.vars = "cluster")

ggplot(
  mean_loadings_melted_7,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  ) +
  scale_color_discrete(name = "Cluster")

In [None]:
# Bonus flag against the Ward cluster labels: printed table and mosaic plot.
tbl7 <- table(coord$bonus, reshclust_ward2)
print(tbl7)
mosaicplot(
  tbl7,
  color = 1:4,
  main = "Stations en altitude en fonction du clusters"
)

#### 5.1.3. Gaussian Mixture Models

In [None]:
# BIC of Gaussian mixture models with 1 to 10 components on the PCA scores of
# the hourly data.
resBICall2 <- mclustBIC(data_heures_pca, G = 1:10)
summary(resBICall2)

# The same name is reused for the model fitted by Mclust over the same range.
resBICall2 <- Mclust(data_heures_pca, G = 1:10)
summary(resBICall2)

fviz_mclust(resBICall2, what = "BIC")

In [None]:
options(repr.plot.width = 10, repr.plot.height = 6)
# 4-component mixture with the "VEE" covariance model on the PCA scores of
# the hourly data.
gmm <- Mclust(data_heures_pca, G = 4, modelNames = "VEE")
clusters_gmm_heures <- gmm$classification
# Plot the components over the data the model was fitted on
# (data_heures_pca) instead of `df1`, which is not defined in this section.
fviz_cluster(
  gmm,
  data = data_heures_pca,
  ellipse.type = "norm",
  geom = "point"
)

# For each observation: its assigned component and the largest membership
# probability (row-wise maximum of the z matrix).
aux <- data.frame(
  label = paste0("Cluster", clusters_gmm_heures),
  proba = apply(gmm$z, 1, max)
)

ggplot(aux, aes(x = label, y = proba)) +
  geom_boxplot(colour = 1:4, fill = 1:4, alpha = .4)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 10)

# The GMM labels come from the hourly PCA scores, so they are displayed on
# the hourly PCA (pca_3), not on the day-level one.
fviz_pca(
  pca_3,
  axes = c(1, 2),
  geom = "point",
  col.ind = as.factor(clusters_gmm_heures)
)

In [None]:
# Mean of every `loading` column per GMM component, displayed and plotted as
# one line per component.
mean_loadings_gmm5_1 <- aggregate(
  loading[, -169],
  by = list(cluster = clusters_gmm_heures),
  FUN = mean
)
mean_loadings_gmm5_1
mean_loadings_melted_gmm5_1 <- melt(mean_loadings_gmm5_1, id.vars = "cluster")

ggplot(
  mean_loadings_melted_gmm5_1,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  ) +
  scale_color_discrete(name = "Cluster")

In [None]:
# Bonus flag against the GMM components: printed table and mosaic plot.
tbl2_gmm1 <- table(coord$bonus, clusters_gmm_heures)
print(tbl2_gmm1)
mosaicplot(
  tbl2_gmm1,
  color = 1:4,
  main = "Stations en altitude en fonction du clusters"
)

Résultats GMM très différents en R.

## 6. MCA sur data_heures

### 6.1. MCA

In [None]:
# Hour-of-day averages: for a starting column h of `loading`, average that
# column and the ones 24, 48, ..., 144 positions after it, station by station.
moyenne_horaire <- function(h) rowMeans(loading[, h + 24 * (0:6)])

minuit_am <- moyenne_horaire(1)
une_am <- moyenne_horaire(2)
deux_am <- moyenne_horaire(3)
trois_am <- moyenne_horaire(4)
quatre_am <- moyenne_horaire(5)
cinq_am <- moyenne_horaire(6)
six_am <- moyenne_horaire(7)
sept_am <- moyenne_horaire(8)
huit_am <- moyenne_horaire(9)
neuf_am <- moyenne_horaire(10)
dix_am <- moyenne_horaire(11)
onze_am <- moyenne_horaire(12)
minuit_pm <- moyenne_horaire(13)
une_pm <- moyenne_horaire(14)
deux_pm <- moyenne_horaire(15)
trois_pm <- moyenne_horaire(16)
quatre_pm <- moyenne_horaire(17)
cinq_pm <- moyenne_horaire(18)
six_pm <- moyenne_horaire(19)
sept_pm <- moyenne_horaire(20)
huit_pm <- moyenne_horaire(21)
neuf_pm <- moyenne_horaire(22)
dix_pm <- moyenne_horaire(23)
onze_pm <- moyenne_horaire(24)

# The column names are overwritten on the next statement, so the vectors are
# passed without explicit names.
loading_quali_heures <- data.frame(
  minuit_am, une_am, deux_am, trois_am, quatre_am, cinq_am, six_am, sept_am,
  huit_am, neuf_am, dix_am, onze_am, minuit_pm, une_pm, deux_pm, trois_pm,
  quatre_pm, cinq_pm, six_pm, sept_pm, huit_pm, neuf_pm, dix_pm, onze_pm
)

colnames(loading_quali_heures) <- paste0(0:23, "h")
# Recode numeric values into three symbols, element by element:
#   "-" for values in [0, 0.2], "=" for values in (0.2, 0.6], "+" otherwise.
# Missing values stay missing.
remplacer_valeurs <- function(valeur) {
  est_bas <- valeur >= 0 & valeur <= 0.2
  est_moyen <- valeur > 0.2 & valeur <= 0.6
  ifelse(est_bas, "-", ifelse(est_moyen, "=", "+"))
}
# Recode every hourly average with remplacer_valeurs, then append the station
# name and the bonus flag; the final lapply turns every column into a factor.
loading_quali_heures <- as.data.frame(
  sapply(loading_quali_heures, remplacer_valeurs)
)
loading_quali_heures$Station <- velib$names
loading_quali_heures$Hill <- as.factor(coord$bonus)
loading_quali_heures[] <- lapply(loading_quali_heures, as.factor)
head(loading_quali_heures)

In [None]:
# MCA on the 24 recoded hourly columns plus column 26 (Hill); column 25
# (Station) is left out. The individual coordinates are kept.
mca3 <- MCA(loading_quali_heures[c(1:24, 26)], graph = FALSE)
loading_heures_mca <- mca3$ind$coord
head(loading_heures_mca)
#fviz_screeplot(res.mca1, addlabels=TRUE)
#fviz_screeplot(res.mca1, addlabels=TRUE, ncp=7)
#fviz_mca_biplot(res.mca1)
#fviz_mca_biplot(res.mca1, axes=c(1,3))

In [None]:
# Print the first 20 rows of the MCA eigenvalue table.
print(mca3$eig[1:20,])

In [None]:
# Line plot of the first 20 eigenvalues (column 1 of the eigenvalue table).
plot(mca3$eig[1:20,1], type = "l",xlab = "Dimensions", ylab = "Valeurs propres", main = "MCA")


In [None]:
# Refit the MCA keeping 7 dimensions and store the individual coordinates.
mca3 <- MCA(loading_quali_heures[c(1:24, 26)], graph = FALSE, ncp = 7)
loading_heures_mca <- mca3$ind$coord

In [None]:
# Category plot coloured by contribution, on dimensions 1-2.
fviz_mca_var(
  mca3,
  col.var = "contrib",
  gradient.cols = c("blue", "yellow", "red"),
  axes = c(1, 2),
  repel = TRUE
)
# Same plot on dimensions 1-3.
fviz_mca_var(
  mca3,
  col.var = "contrib",
  gradient.cols = c("blue", "yellow", "red"),
  axes = c(1, 3),
  repel = TRUE
)

In [None]:
# Top 20 category contributions to each of the first three dimensions.
fviz_contrib(mca3, choice = "var", axes = 1, top = 20)

fviz_contrib(mca3, choice = "var", axes = 2, top = 20)

fviz_contrib(mca3, choice = "var", axes = 3, top = 20)

### 6.2. Méthode de clustering K-means

In [None]:
options(repr.plot.width = 9, repr.plot.height = 6)
# Elbow (WSS) curve for k-means on the MCA coordinates.
fviz_nbclust(
  loading_heures_mca,
  FUNcluster = stats::kmeans,
  method = "wss"
) +
  ggtitle("Within sum of square (WSS) according to the number of clusters")

# Silhouette curve on the same coordinates.
fviz_nbclust(
  loading_heures_mca,
  FUNcluster = stats::kmeans,
  method = "silhouette"
) +
  ggtitle("Silhouette score according to the number of clusters")

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)
# Silhouette plot of a k-means fit on the MCA coordinates for 2 to 5 centers.
for (centers in 2:5) {
  kmeans_mca3 <- kmeans(loading_heures_mca, centers = centers)
  sil <- silhouette(kmeans_mca3$cluster, dist(loading_heures_mca))
  visualizer <- fviz_silhouette(
    sil,
    ylim = c(-0.2, 0.6),
    main = paste("centers =", centers)
  )
  print(visualizer)
}

In [None]:
# k-means with 4 centers on the MCA coordinates, followed by the mean of
# every `loading` column per cluster (table and line plot).
kmeans_mca3 <- kmeans(loading_heures_mca, centers = 4)
clusters_mca3 <- kmeans_mca3$cluster
mean_loadings_mca3 <- aggregate(
  loading[, -c(169)],
  by = list(cluster = clusters_mca3),
  FUN = mean
)
mean_loadings_mca3
mean_loadings_melted_mca3 <- melt(mean_loadings_mca3, id.vars = "cluster")

ggplot(
  mean_loadings_melted_mca3,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  ) +
  scale_color_discrete(name = "Cluster")

In [None]:
# Three fixed colours; colorFactor spreads them over the cluster labels.
cluster_palette <- colorFactor(
  c("#E41A1C", "#4DAF4A", "#377EB8"),
  domain = unique(clusters_mca3)
)

# Station map: one filled circle per station, coloured by its cluster.
leaflet(velib$position) %>%
  addTiles(
    urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
    attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.'
  ) %>%
  addCircleMarkers(
    radius = 3,
    color = ~cluster_palette(clusters_mca3),
    stroke = FALSE,
    fillOpacity = 0.9
  ) %>%
  addLegend(
    position = "bottomright",
    pal = cluster_palette,
    values = ~clusters_mca3,
    title = "Cluster",
    opacity = 1
  )

### 6.3. GMM

In [None]:
# BIC of Gaussian mixture models with 1 to 10 components on the MCA
# coordinates.
resBICall <- mclustBIC(loading_heures_mca, G = 1:10)
summary(resBICall)

# The same name is reused for the model fitted by Mclust over the same range.
resBICall <- Mclust(loading_heures_mca, G = 1:10)
summary(resBICall)


fviz_mclust(resBICall, what = "BIC")

In [None]:
options(repr.plot.width = 10, repr.plot.height = 6)
# 4-component mixture with the "EEV" covariance model on the MCA coordinates.
gmm <- Mclust(loading_heures_mca, G = 4, modelNames = "EEV")
clusters_gmm_heures_mca <- gmm$classification
fviz_cluster(
  gmm,
  data = loading_heures_mca,
  ellipse.type = "norm",
  geom = "point"
)

# For each observation: its assigned component and the largest membership
# probability (row-wise maximum of the z matrix).
aux <- data.frame(
  label = paste0("Cluster", clusters_gmm_heures_mca),
  proba = apply(gmm$z, 1, max)
)

ggplot(aux, aes(x = label, y = proba)) +
  geom_boxplot(colour = 1:4, fill = 1:4, alpha = .4)

In [None]:
# Mean of every `loading` column per GMM component (the names
# mean_loadings_mca3 / mean_loadings_melted_mca3 are reused), displayed and
# plotted as one line per component.
mean_loadings_mca3 <- aggregate(
  loading[, -c(169)],
  by = list(cluster = clusters_gmm_heures_mca),
  FUN = mean
)
mean_loadings_mca3
mean_loadings_melted_mca3 <- melt(mean_loadings_mca3, id.vars = "cluster")

ggplot(
  mean_loadings_melted_mca3,
  aes(x = variable, y = value, color = factor(cluster), group = cluster)
) +
  geom_line() +
  labs(
    title = "Chargement moyen des stations par cluster",
    x = "Temps en heures",
    y = "Chargement"
  ) +
  scale_color_discrete(name = "Cluster")

In [None]:
# Three fixed colours; colorFactor spreads them over the GMM components.
cluster_palette <- colorFactor(
  c("#E41A1C", "#4DAF4A", "#377EB8"),
  domain = unique(clusters_gmm_heures_mca)
)

# Station map: one filled circle per station, coloured by its component.
leaflet(velib$position) %>%
  addTiles(
    urlTemplate = "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
    attribution = 'Map tiles by Carto, under CC BY 3.0. Data by OpenStreetMap, under ODbL.'
  ) %>%
  addCircleMarkers(
    radius = 3,
    color = ~cluster_palette(clusters_gmm_heures_mca),
    stroke = FALSE,
    fillOpacity = 0.9
  ) %>%
  addLegend(
    position = "bottomright",
    pal = cluster_palette,
    values = ~clusters_gmm_heures_mca,
    title = "Cluster",
    opacity = 1
  )

In [None]:
#' Predict the cluster of a location from its nearest station
#'
#' Computes the Euclidean distance, in raw coordinate units, between the
#' query point and every station, and returns the cluster label of the
#' closest one.
#'
#' @param coord1 First coordinate of the query point; compared with column 1
#'   of `stations`.
#' @param coord2 Second coordinate of the query point; compared with column 2
#'   of `stations`.
#' @param stations Table with one row per station whose first two columns
#'   hold the coordinates. Defaults to the global `coord`.
#' @param clusters Cluster label of each station, in the row order of
#'   `stations`. Defaults to `reskmeans$cluster`.
#' @return The element of `clusters` belonging to the nearest station (the
#'   first one in case of ties).
def_pred_station <- function(coord1, coord2,
                             stations = coord,
                             clusters = reskmeans$cluster) {
  stopifnot(
    nrow(stations) > 0,
    length(clusters) == nrow(stations)
  )
  # NOTE(review): distances are computed on raw coordinates; presumably
  # longitude/latitude in degrees, so this is only an approximation of the
  # ground distance — confirm.
  distances <- sqrt(
    (coord1 - stations[, 1])^2 + (coord2 - stations[, 2])^2
  )
  # Index with the nearest station, not with the last row visited.
  clusters[which.min(distances)]
}

In [None]:
# Example query: cluster predicted for the point (2.35, 48.45).
def_pred_station(2.35,48.45)