# In [None]:  (notebook cell marker, commented out so the file parses as R)
# libraries
library("readxl")
options(warn = 0)
library(dplyr)
library(ggplot2)
library(caTools)
library(caret)
library(GGally)
library(janitor)
library(corrplot)
library(tidyverse)
library(hrbrthemes)
library(viridis)
library(NbClust)
library(factoextra)
library(gridExtra)

# Load the white-wine data from the Excel workbook.
df <- read_excel("data/Whitewine_v2.xlsx")

# Add a row-index column; it is the key used later to join the normalised
# rows back to the original quality values.
# seq_len() stays correct for a zero-row sheet, where 1:nrow(df) is c(1, 0).
df$index <- seq_len(nrow(df))
head(df)

# Recode the quality scores 5-8 onto a 1-4 scale
# (1 is the worst, 4 is the best).
df["quality"][df["quality"] == 5] <- 1
df["quality"][df["quality"] == 6] <- 2
df["quality"][df["quality"] == 7] <- 3
df["quality"][df["quality"] == 8] <- 4

head(df)

# Box plots of the first 11 columns (every column before quality).
boxplot(df[1:11])

# Normalise the column names with janitor.
df <- janitor::clean_names(df)

# EDA: per-column summary statistics.
summary(df)

# Total number of missing values in the whole data frame.
sum(is.na(df))

# Correlation heat map of the first 12 columns (the 11 predictors plus
# quality; the index column is excluded).
corrplot(cor(df[1:12]))

#' Standardise a numeric vector to z-scores.
#'
#' Each value has the mean subtracted and is divided by the standard
#' deviation of `x`.
#'
#' @param x A numeric vector.
#' @param na.rm Should missing values be ignored when computing the mean and
#'   standard deviation? Defaults to `FALSE`, in which case any `NA` in `x`
#'   makes every element of the result `NA`.
#' @return A numeric vector the same length as `x`. Missing inputs stay
#'   missing in the output.
z_score <- function(x, na.rm = FALSE) {
  (x - mean(x, na.rm = na.rm)) / sd(x, na.rm = na.rm)
}

# Standardise the 11 predictor columns and re-attach a row index so the
# normalised rows can be matched back to the original data.
dfNorm <- as.data.frame(lapply(df[1:11], z_score))
dfNorm$index <- seq_len(nrow(dfNorm))
head(dfNorm)

summary(dfNorm[1:11])

# Keep only the rows whose z-scores all lie within 3 standard deviations of
# the mean. abs() is needed so that low-side outliers (z < -3) are dropped
# too; comparing against 3 alone only removes the high side.
no_outliers <- dfNorm[rowSums(abs(dfNorm[1:11]) > 3) == 0, ]

head(no_outliers)

# Join the normalised predictors with the original quality column,
# matching rows on the shared index column.
df_final <- merge(no_outliers, df[12:13], by = "index")

head(df_final)

# Columns 2 to 12 of df_final are the 11 normalised predictors
# (column 1 is the index, column 13 is quality).
summary(df_final[2:12])

#' Draw a histogram of `x` with a fitted normal curve on top.
#'
#' The histogram is drawn in red with `breaks = 10`; a normal density with
#' the sample mean and standard deviation of `x` is overlaid in blue.
#' Called for its plotting side effect.
#'
#' @param x A numeric vector of values to plot.
#' @param title Main title of the plot.
#' @param x_label Label for the x axis.
#' @return `NULL`, invisibly (the value of the final `lines()` call).
histogram <- function(x, title, x_label) {
  # Add a Normal Curve (Thanks to Peter Dalgaard)
  h <- hist(x, breaks = 10, col = "red", xlab = x_label, main = title)
  # Spell out length.out rather than relying on partial matching of `length`.
  xfit <- seq(min(x), max(x), length.out = 40)
  yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
  # Rescale the density to the count axis: bin width times sample size.
  yfit <- yfit * diff(h$mids[1:2]) * length(x)
  lines(xfit, yfit, col = "blue", lwd = 2)
}

# histogram(df$fixed_acidity,"fixed_acidity values before preprocessing","fixed_acidity value")
# histogram(df_final$fixed_acidity,"fixed_acidity values after preprocessing","fixed_acidity value")
# Distribution of quality before and after preprocessing. The x-axis label
# names the variable actually plotted (it used to say fixed_acidity).
histogram(
  df$quality,
  "quality values before preprocessing",
  "quality value"
)
histogram(
  df_final$quality,
  "quality values after preprocessing",
  "quality value"
)

head(df_final)

# Inspect the column types of the cleaned data set.
str(df_final)

# Predictor columns only (columns 2 to 12 of df_final).
df_nb <- df_final[2:12]

# Fix the RNG state before the NbClust runs.
set.seed(26)

# Euclidean distance, 2 to 10 clusters.
no_of_clusters_eud <- NbClust(
  df_nb,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  method = "kmeans",
  index = "all"
)

# Manhattan distance, 2 to 15 clusters.
no_of_clusters_man <- NbClust(
  df_nb,
  distance = "manhattan",
  min.nc = 2,
  max.nc = 15,
  method = "kmeans",
  index = "all"
)

# Maximum distance, 2 to 15 clusters.
no_of_clusters_maximum <- NbClust(
  df_nb,
  distance = "maximum",
  min.nc = 2,
  max.nc = 15,
  method = "kmeans",
  index = "all"
)

# Within-sum-of-squares (elbow) plot with a reference line at k = 4.
fviz_nbclust(df_nb, kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 1)


# Manual elbow method: total within-cluster sum of squares for k = 1..10.
k <- 1:10
set.seed(25)
# vapply() guarantees one numeric value per k (sapply() does not), and the
# lambda parameter no longer shadows the outer `k`.
WSS <- vapply(
  k,
  function(n_centers) {
    kmeans(df_final[2:12], centers = n_centers)$tot.withinss
  },
  numeric(1)
)

# Line plot of the within sum of squares against the number of clusters.
plot(k, WSS, type = "b", xlab = "Number of k", ylab = "Within sum of squares")

# Silhouette plot for choosing the number of k-means clusters
# (subtitle spelling corrected).
fviz_nbclust(df_final[2:12], kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette Method")


# K-means clustering ----
head(df_final)

# Predictor columns (2 to 12) and the quality labels to compare against.
x <- df_final[2:12]
y <- df_final$quality

# Fit with two clusters, then show the fit and its structure.
kc_2 <- kmeans(x, centers = 2)
kc_2
str(kc_2)

# Cross-tabulate quality against the assigned cluster.
table(y, kc_2$cluster)

# Confusion matrix of cluster ids against quality, both on levels 1-4.
confusionMatrix(
  data = factor(kc_2$cluster, levels = 1:4),
  reference = factor(y, levels = 1:4)
)

fviz_cluster(kc_2, data = x)


# Fit with three clusters, then show the fit and its structure.
kc_3 <- kmeans(x, centers = 3)
kc_3
str(kc_3)

# Cross-tabulate quality against the assigned cluster.
table(y, kc_3$cluster)

# Confusion matrix of cluster ids against quality, both on levels 1-4.
confusionMatrix(
  data = factor(kc_3$cluster, levels = 1:4),
  reference = factor(y, levels = 1:4)
)

fviz_cluster(kc_3, data = x)


# Fit with four clusters, then show the fit and its structure.
kc_4 <- kmeans(x, centers = 4)
kc_4
str(kc_4)

# Cross-tabulate quality against the assigned cluster.
table(y, kc_4$cluster)

# Confusion matrix of cluster ids against quality, both on levels 1-4.
confusionMatrix(
  data = factor(kc_4$cluster, levels = 1:4),
  reference = factor(y, levels = 1:4)
)

fviz_cluster(kc_4, data = x)


# Cluster plots for k = 2, 3 and 4, laid out on a two-row grid.
p1 <- fviz_cluster(kc_2, geom = "point", data = x) +
  ggtitle("k = 2")
p2 <- fviz_cluster(kc_3, geom = "point", data = x) +
  ggtitle("k = 3")
p3 <- fviz_cluster(kc_4, geom = "point", data = x) +
  ggtitle("k = 4")
grid.arrange(grobs = list(p1, p2, p3), nrow = 2)


head(df_final)

# Principal component analysis on the predictor columns (2 to 12).
principle_components <- prcomp(df_final[, 2:12])
principle_components
summary(principle_components)

# Component variances as a bar chart and as a line chart.
plot(principle_components)
plot(principle_components, type = "l")

# Biplots with the default scaling and with scale = 0.
biplot(principle_components)
biplot(principle_components, scale = 0)

str(principle_components)

# Append the component scores (the `x` element) to the cleaned data.
df_final_with_pcs <- cbind(df_final, principle_components[["x"]])

head(df_final_with_pcs)

library(ggplot2)

# Scatter plot of the first two principal components, coloured by quality.
# aes() takes only x and y positionally, so exactly two components are
# mapped. quality is mapped as a factor so that stat_ellipse() draws one
# ellipse per quality class rather than one for all points.
ggplot(
  df_final_with_pcs,
  aes(PC1, PC2, col = factor(quality), fill = factor(quality))
) +
  stat_ellipse(geom = "polygon", col = "black", alpha = 0.5) +
  geom_point(shape = 21, col = "black") +
  labs(col = "quality", fill = "quality")

# Cluster on the leading components. prcomp() orders components by
# decreasing variance, so PC1-PC3 carry the most information; positions
# 22:24 selected PC9-PC11, the three weakest components.
# NOTE(review): three components kept to match the original count; confirm
# the number against the cumulative proportion in summary(principle_components).
x_2 <- df_final_with_pcs[c("PC1", "PC2", "PC3")]
y_2 <- df_final_with_pcs$quality
kcpca <- kmeans(x_2, 4)
kcpca
fviz_cluster(kcpca, data = x_2)

# Cross-tabulate quality against the PCA-based cluster assignment.
table(y_2, kcpca$cluster)