In [None]:
library(skmeans)
library(ramify)

In [None]:
#' Spherical k-means based on cosine dissimilarity.
#'
#' Thin wrapper around skmeans() that always uses the "pclust" method with
#' the control settings nruns = 1000 and maxchains = 100.
#'
#' @param data Input handed unchanged to skmeans() (a numeric matrix at the
#'   call sites in this file).
#' @param k Number of clusters requested.
#' @return Whatever skmeans() returns for these settings.
clusterMeans <- function(data, k) {
  fit_control <- list(nruns = 1000, maxchains = 100)
  skmeans(data, k, method = "pclust", control = fit_control)
}

In [None]:
#' Objective values of spherical k-means for k = 1, ..., mk (elbow curve).
#'
#' @param data Input handed unchanged to skmeans().
#' @param mk Largest number of clusters to try; 0 yields an empty list.
#' @param nruns Number of skmeans runs per value of k (default 1000).
#' @return A list of length mk whose i-th element is the `value` component of
#'   the skmeans() fit with i clusters.
elbow_lst <- function(data, mk, nruns = 1000) {
  # seq_len() rather than 1:mk, so mk = 0 does not iterate over c(1, 0).
  lapply(seq_len(mk), function(k) {
    skmeans(data, k, method = "pclust", control = list(nruns = nruns))$value
  })
}

In [None]:
## Importing packages
library(data.table) # Import big files (CSV or Text) 
library(readxl) # Import Excel files

# Seed R's random number generator before the clustering runs below
set.seed(3)

Step 0: input data

In [None]:
# Food data: read the CSV and drop its first column.
data <- read.csv("../input/d/jingyicheng/final-final-data-angular/food_angular.csv")
# No row index is needed to keep all rows; drop = FALSE keeps a data frame
# even if only one column remains.
df <- data[, -1, drop = FALSE]
# Converting into numeric matrix
df2 <- data.matrix(df)

# Stock data: read the CSV and keep columns 2 .. (ncol - 1).
data_stock <- read.csv("../input/d/jingyicheng/final-final-data-angular/stock_angular.csv")
# NOTE(review): this used to be written `3:ncol(data_stock)-1`, which R parses
# as `(3:ncol(data_stock)) - 1`, i.e. columns 2 .. (ncol - 1). That selection
# is kept here; confirm that dropping the first AND last column is intended.
df_stock <- data_stock[, -c(1, ncol(data_stock)), drop = FALSE]
df_stock2 <- data.matrix(df_stock)

Step 1: generate elbow plots

In [None]:
# Elbow values for the stock data, k = 1..30, saved to CSV.
stock_lst <- elbow_lst(df_stock2, mk = 30)
write.csv(stock_lst, file = "stock_elbow.csv")

In [None]:
# Elbow values for the food data, k = 1..30, saved to CSV.
food_lst <- elbow_lst(df2, mk = 30)
write.csv(food_lst, file = "food_elbow.csv")

In [None]:
# Spherical k-means on the stock data with k = 5, timed.
start.time <- Sys.time()
stock_m1 <- clusterMeans(data = df_stock2, k = 5)
end.time <- Sys.time()
time.taken <- round(difftime(end.time, start.time), digits = 2)
time.taken

In [None]:
# Spherical k-means on the stock data with k = 10, timed.
start.time <- Sys.time()
stock_m2 <- clusterMeans(data = df_stock2, k = 10)
end.time <- Sys.time()
time.taken <- round(difftime(end.time, start.time), digits = 2)
time.taken

In [None]:
# Cluster assignments from the two stock fits (k = 5 and k = 10).
matrix1 <- stock_m1[["cluster"]]
matrix2 <- stock_m2[["cluster"]]

In [None]:
# Save the stock cluster labels; each CSV column is named after the
# variable handed to data.frame() (vector1 / vector2).

# k = 5
vector1 <- c(matrix1)
df_cluster1 <- data.frame(vector1)
write.csv(df_cluster1, file = "k5stock_cluster.csv")

# k = 10
vector2 <- c(matrix2)
df_cluster2 <- data.frame(vector2)
write.csv(df_cluster2, file = "k10stock_cluster.csv")

In [None]:
# Prototypes (cluster centres) of the two stock fits, saved to CSV.
proto1 <- stock_m1[["prototypes"]]
proto2 <- stock_m2[["prototypes"]]

write.csv(proto1, file = "k5stock.csv")
write.csv(proto2, file = "k10stock.csv")

In [None]:
# Spherical k-means on the food data with k = 15, timed.
start.time <- Sys.time()
food_m1 <- clusterMeans(data = df2, k = 15)
end.time <- Sys.time()
time.taken <- round(difftime(end.time, start.time), digits = 2)
time.taken

In [None]:
# Spherical k-means on the food data with k = 20, timed.
start.time <- Sys.time()
food_m2 <- clusterMeans(data = df2, k = 20)
end.time <- Sys.time()
time.taken <- round(difftime(end.time, start.time), digits = 2)
time.taken

In [None]:
# Prototypes of the food fits computed above (k = 15 and k = 20); the
# existing fits are reused instead of calling clusterMeans() again.
matrixf1 <- food_m1[["prototypes"]]
matrixf2 <- food_m2[["prototypes"]]

write.csv(matrixf1, file = "k15food.csv")
write.csv(matrixf2, file = "k20food.csv")

Run KPC

In [None]:
#######################
#' Single iteration of the principal-component clustering update.
#'
#' Assigns every row of `data` to the centroid with the largest inner product,
#' then replaces each centroid by the entrywise absolute value of the leading
#' eigenvector of its cluster's second-moment matrix.
#'
#' @param data n x d numeric matrix of observations.
#' @param centroids k x d numeric matrix with the current proposals.
#' @return An unnamed list: [[1]] the updated k x d centroids, [[2]] the mean
#'   over rows of the largest inner product with the *input* centroids (the
#'   criterion value), [[3]] the cluster index assigned to each row.
clusterPC_iter <- function(data, centroids) {
  k <- nrow(centroids)
  n <- nrow(data)
  # data is n x d and centroids is k x d, so transpose centroids: M is n x k.
  M <- data %*% t(centroids)
  # Current value of the criterion (per the original author's note, maximising
  # it corresponds to minimising the expected dissimilarity).
  v <- mean(apply(M, 1, max))
  # Row-wise argmax in base R: index of the best-matching centroid for each
  # observation, first index on ties.
  gr <- apply(M, 1, which.max)
  for (i in seq_len(k)) {
    # drop = FALSE keeps a one-row cluster as a 1 x d matrix.
    members <- data[gr == i, , drop = FALSE]
    if (nrow(members) == 0L) {
      # Empty cluster: keep the previous centroid instead of replacing it
      # with an arbitrary eigenvector of the all-zero matrix.
      next
    }
    # d x d second-moment matrix of the cluster; the 1/n factor rescales the
    # eigenvalues only and leaves the eigenvectors unchanged.
    Sig <- crossprod(members) / n
    res <- eigen(Sig, symmetric = TRUE)
    # The leading eigenvector becomes the new centroid; abs() removes its sign
    # ambiguity (original note: its entries are necessarily positive).
    centroids[i, ] <- abs(res$vectors[, 1])
  }
  list(centroids, v, gr)
}



In [None]:
#' One run of principal-component clustering from a single initialisation.
#'
#' @param data n x d numeric matrix of observations.
#' @param k Number of clusters.
#' @param tol Iteration stops as soon as one step improves the criterion by
#'   less than `tol`.
#' @param startFromMeans If TRUE, start from the prototypes of
#'   clusterMeans(data, k); otherwise start from k rows of `data` picked at
#'   random.
#' @return An unnamed list: [[1]] the final centroids, [[2]] the last
#'   criterion value, [[3]] the cluster indices from the last iteration.
clusterPCOnce <- function(data, k, tol, startFromMeans = FALSE) {
  if (startFromMeans) {
    # clusterMeans() returns the whole skmeans fit; the iteration needs only
    # its k x d prototype matrix.
    centroids <- as.matrix(clusterMeans(data, k)$prototypes)
  } else {
    # Pick the initial centres at random; drop = FALSE keeps a matrix even
    # when k == 1.
    centroids <- data[sample.int(nrow(data), k), , drop = FALSE]
  }
  val <- 0
  repeat {
    res <- clusterPC_iter(data, centroids)
    centroids <- res[[1]]
    cluster <- res[[3]]
    gain <- res[[2]] - val
    val <- res[[2]]
    if (gain < tol) {
      break
    }
  }
  list(centroids, val, cluster)
}

In [None]:
#' Principal-component clustering: run clusterPCOnce() nrep times and keep
#' the run with the largest criterion value.
#'
#' @param data n x d numeric matrix of observations.
#' @param k Number of clusters.
#' @param tol Convergence tolerance forwarded to clusterPCOnce().
#' @param nrep Number of restarts.
#' @param startFromMeans If TRUE, only the first restart is initialised from
#'   clusterMeans(); all other restarts use random initial centres.
#' @return A list with elements `centroids`, `maxval` and `cluster` (in that
#'   order, so positional access via [[1]], [[2]], [[3]] also works). With
#'   nrep = 0 the elements are NULL, -Inf and NULL.
clusterPC <- function(data, k, tol = 10^(-5), nrep = 1000,
                      startFromMeans = FALSE) {
  # Start below every attainable value so the first run is always accepted.
  maxval <- -Inf
  centroids <- NULL
  cluster <- NULL
  for (i in seq_len(nrep)) {
    res <- clusterPCOnce(data, k, tol, startFromMeans && (i == 1))
    if (res[[2]] > maxval) {
      centroids <- res[[1]]
      maxval <- res[[2]]
      cluster <- res[[3]]
    }
  }
  # Named elements make `$centroids`, `$maxval` and `$cluster` usable.
  list(centroids = centroids, maxval = maxval, cluster = cluster)
}

In [None]:
#' Best KPC criterion value for k = 1, ..., mk (elbow curve).
#'
#' @param data Numeric matrix handed unchanged to clusterPC().
#' @param mk Largest number of clusters to try; 0 yields an empty list.
#' @param tol Convergence tolerance forwarded to clusterPC().
#' @param nrep Number of restarts per value of k (default 100).
#' @return A list of length mk whose i-th element is the second element (the
#'   best criterion value) of clusterPC() run with i clusters.
elbow_lst_pc <- function(data, mk, tol = 10^(-5), nrep = 100) {
  # seq_len() rather than 1:mk, so mk = 0 does not iterate over c(1, 0).
  lapply(seq_len(mk), function(k) {
    clusterPC(data, k, tol = tol, nrep = nrep, startFromMeans = FALSE)[[2]]
  })
}

In [None]:
# KPC elbow values for the stock data, k = 1..30.
stock_lst_pc <- elbow_lst_pc(df_stock2, mk = 30)
stock_lst_pc

In [None]:
# KPC elbow values for the food data, k = 1..30.
food_lst_pc <- elbow_lst_pc(df2, mk = 30)
food_lst_pc

In [None]:
# KPC on the stock data with k = 5, timed.
start.time <- Sys.time()
# codes to measure time
# The centroids are the first element of the list returned by clusterPC().
stock_matrix_kpc <- clusterPC(df_stock2, 5)[[1]]
end.time <- Sys.time()
time.taken <- round(end.time - start.time, 2)
time.taken

In [None]:
# Cluster labels are the third element of the list returned by clusterPC().
temp <- clusterPC(df_stock2, 5)
temp[[3]]

In [None]:
# Save the k = 5 KPC result for the stock data.
write.csv(stock_matrix_kpc, file = "k5stock_kpc.csv")

In [None]:
# KPC on the stock data with k = 10, timed.
start.time <- Sys.time()
# codes to measure time
# The centroids are the first element of the list returned by clusterPC().
stock_matrix_kpc2 <- clusterPC(df_stock2, 10)[[1]]
end.time <- Sys.time()
time.taken <- round(end.time - start.time, 2)
time.taken

In [None]:
# Save the k = 10 KPC result for the stock data.
write.csv(stock_matrix_kpc2, file = "k10stock_kpc.csv")

In [None]:
# KPC on the food data with k = 15, timed.
start.time <- Sys.time()
# codes to measure time
# The centroids are the first element of the list returned by clusterPC().
food_matrix_kpc <- clusterPC(df2, 15)[[1]]
end.time <- Sys.time()
time.taken <- round(end.time - start.time, 2)
time.taken

In [None]:
# KPC on the food data with k = 20, timed.
start.time <- Sys.time()
# codes to measure time
# The centroids are the first element of the list returned by clusterPC().
food_matrix_kpc2 <- clusterPC(df2, 20)[[1]]
end.time <- Sys.time()
time.taken <- round(end.time - start.time, 2)
time.taken

In [None]:
# Save the KPC results for the food data (k = 15 and k = 20).
write.csv(food_matrix_kpc, file = "k15food_kpc.csv")
write.csv(food_matrix_kpc2, file = "k20food_kpc.csv")