Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
56 lines (39 sloc)
1.44 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(factoextra) | |
| library(cluster) | |
# Data preparation ----
# Take the USArrests data set, drop any rows containing missing values, and
# standardise every column to mean 0 and standard deviation 1. The result of
# scale() is a numeric matrix.
df <- scale(na.omit(USArrests))

# Linkage methods to compare. Each element is named after itself so that
# results computed over `m` are labelled with the method name.
m <- c("average", "single", "complete", "ward")
names(m) <- m
# Agglomerative coefficient of an agnes() clustering.
#
# Arguments:
#   x    - linkage method name, forwarded to agnes() as `method`.
#   data - data to cluster. Defaults to the script-level `df`, so the
#          one-argument call ac(x) behaves exactly as before; pass another
#          data set explicitly to reuse the function elsewhere.
#
# Returns the `ac` component of the fitted agnes object.
ac <- function(x, data = df) {
  agnes(data, method = x)$ac
}
# Compare linkage methods ----
# Agglomerative coefficient for every linkage method in `m`. vapply() pins
# the result to one numeric value per method instead of relying on sapply()'s
# input-dependent simplification; names are taken from `m`.
vapply(m, ac, numeric(1))

# Hierarchical clustering using Ward's minimum variance linkage.
clust <- agnes(df, method = "ward")

# Dendrogram of the Ward clustering.
pltree(clust, cex = 0.6, hang = -1, main = "Dendrogram")

# Choose the number of clusters ----
# clusGap() compares the clustering against B simulated reference data sets,
# so the RNG seed is fixed to make the gap statistic repeatable between runs.
set.seed(123)

# Gap statistic for 1 to 10 clusters, using hcut() as the clustering function.
# NOTE(review): `nstart` is a kmeans() option; hcut() presumably ignores it --
# confirm before removing.
gap_stat <- clusGap(df, FUN = hcut, nstart = 25, K.max = 10, B = 50)

# Plot number of clusters against the gap statistic.
fviz_gap_stat(gap_stat)
# Final clustering ----
# Euclidean distances between the standardised observations in `df`.
d <- dist(df, method = "euclidean")

# Hierarchical clustering using Ward's method.
final_clust <- hclust(d, method = "ward.D2")

# Cut the dendrogram into 4 clusters.
groups <- cutree(final_clust, k = 4)

# Number of members in each cluster.
table(groups)

# Attach the cluster labels to the original (unscaled) data. Rows are picked
# by name from `groups`, so the labels stay aligned with the observations that
# were actually clustered even if some rows were dropped for missing values.
final_data <- cbind(USArrests[names(groups), , drop = FALSE], cluster = groups)

# Display the first six rows of the labelled data.
head(final_data)

# Mean of each variable within each cluster. The `cluster` column is used only
# as the grouping key, not averaged, so it appears once in the output.
aggregate(
  final_data[setdiff(names(final_data), "cluster")],
  by = list(cluster = final_data$cluster),
  FUN = mean
)