# kmeans

In [None]:
library(jsonlite)
library(lm.beta)
library(quantmod)
library(anomalize)
library(tibbletime)
library(dplyr)
library(ggplot2)
library(cluster)
library(ppclust)

# Read the raw file; each line is parsed below as one JSON document.
whole <- readLines("Data.jsonl", encoding = "UTF-8")

# Draw 1000 distinct line numbers at random; only the first 500 are used here.
index <- sample(seq_along(whole), 1000, replace = FALSE)
index_3 <- index[1:500]
# index_1 = index[501:1000]  ## one year of data

# Three years of data: one row per sampled line, with the keyword and the
# first element of the record's `data` field.
json_df_3 <- data.frame()
for (i in index_3) {
    # Parse each line once instead of once per extracted field.
    parsed <- jsonlite::fromJSON(txt = whole[i])
    temp <- data.frame(keyword = parsed$keyword, data = t(parsed$data[1]))
    json_df_3 <- rbind(json_df_3, temp)
}

# Drop series whose values sum to zero (the original note describes these as
# all-zero datasets). Guard the removal: with no match, `-which(...)` is
# `-integer(0)`, which would select zero rows and silently empty the table.
series_sums <- apply(data.frame(json_df_3$data), 2, sum)
zero_rows <- which(series_sums == 0)
if (length(zero_rows) > 0) {
    json_df_3 <- json_df_3[-zero_rows, ]
}

In [None]:
# Build the feature table X: one row of summary features per series in
# json_df_3. Rows are collected in a list and bound once at the end instead of
# growing X with rbind() on every iteration.
feature_rows <- list()

for (i in seq_len(nrow(json_df_3))) {

    dataset <- scale(na.omit(json_df_3[i, 2][[1]]))  ## standardise

    # Skip the series when a single value accounts for more than half of its
    # points (the original note describes this as "half or more are zero").
    if (any(table(dataset) > length(dataset) / 2)) next

    # slope: absolute regression slope against the time index, or 0 when the
    # value in column 5 of the lm.beta summary (presumably the p-value —
    # confirm) is not below 0.05.
    temp <- data.frame(x = seq_along(dataset), y = dataset)
    model <- lm(temp[, 2] ~ temp[, 1], data = temp)  ## linear regression model
    beta_model <- lm.beta(model)
    coef_table <- summary(beta_model)$coef
    if (coef_table[2, 5] < 0.05) {
        slope <- abs(coef_table[2, 1])
    } else {
        slope <- 0
    }

    # three_ttest: two-character code; each character is "1" when the t-test
    # between consecutive 12-point groups has p < 0.05, otherwise "0".
    group1 <- dataset[1:12]
    group2 <- dataset[13:24]
    group3 <- dataset[25:36]
    reject_12 <- t.test(group1, group2)$p.value < 0.05
    reject_23 <- t.test(group2, group3)$p.value < 0.05
    three_ttest <- paste0(as.integer(reject_12), as.integer(reject_23))

    # MSE: second entry of the ANOVA "Mean Sq" column (spelled out in full
    # rather than relying on `$Mean` partial matching).
    mse <- anova(beta_model)[["Mean Sq"]][2]

    # peakvalley: number of local extrema
    peakvalley <- length(findPeaks(dataset)) + length(findValleys(dataset))

    # ttest: H0) mean of the first differences = 0 / H1) mean != 0.
    # The t-test is only run when Shapiro-Wilk does not reject normality;
    # otherwise the feature is set to "H1".
    step_changes <- diff(dataset)
    if (shapiro.test(step_changes)$p.value > 0.05) {
        ttest <- if (t.test(step_changes, mu = 0)$p.value < 0.05) "H1" else "H0"
    } else {
        ttest <- "H1"
    }

    # anomaly: number of points flagged by anomalize on the remainder.
    # as_tibble() replaces the deprecated as_data_frame().
    anomalized <- as_tibble(data.frame(time = as.Date(seq_along(dataset), origin = "2020-01-01"), value = dataset)) %>%
        time_decompose(value) %>%  ## decompose into season, trend, remainder
        anomalize(remainder, alpha = 0.05, max_anoms = 0.05)  ## flag anomalies (lower/upper bounds)
    anomaly <- sum(anomalized$anomaly == "Yes")

    # variation: larger of (max - median) and (median - min), divided by sd,
    # computed on the series shifted up by |min|.
    temp <- dataset + abs(min(dataset))
    variation <- max(max(temp) - median(temp), median(temp) - min(temp)) / sd(temp)

    # outlier: number of boxplot outliers. plot = FALSE returns the statistics
    # without drawing a figure on every iteration.
    outlier <- length(boxplot(dataset, plot = FALSE)$out)

    # mm: median / largest absolute value
    mm <- median(dataset) / max(abs(dataset))

    feature_rows[[length(feature_rows) + 1L]] <-
        data.frame(slope, three_ttest, mse, peakvalley, ttest, anomaly, variation, outlier, mm)
}

# Keep X an (empty) data frame when every series was skipped.
X <- if (length(feature_rows) > 0) do.call(rbind, feature_rows) else data.frame()

In [None]:
# set.seed(42)

# Cluster the feature table X into 3 groups and plot mse against variation,
# coloured by cluster.
# Text columns are converted to factors first: since R 4.0 data.frame() keeps
# strings as character, while daisy() documents factor columns as its nominal
# variable type.
cluster_input <- X
is_text <- vapply(cluster_input, is.character, logical(1))
if (any(is_text)) {
    cluster_input[is_text] <- lapply(cluster_input[is_text], factor)
}

# NOTE(review): kmeans() is given the daisy() dissimilarity object itself —
# confirm that clustering on it (rather than on the raw features) is intended.
result <- kmeans(daisy(cluster_input), 3, nstart = 50)$cluster

# ggplot() + geom_point() replaces the deprecated qplot() shortcut.
ggplot(X, aes(x = mse, y = variation, colour = as.factor(result))) +
    geom_point()
# t(which(result==1))
# t(which(result==2))
# t(which(result==3))
# plot(json_df_3[12,2][[1]],type='l')
# plot(json_df_3[14,2][[1]],type='l')
# plot(json_df_3[16,2][[1]],type='l')
# plot(json_df_3[21,2][[1]],type='l')
# plot(json_df_3[27,2][[1]],type='l')


# Other than kmeans (pam, fcm, hclust) : not good

# trend & anomaly

In [None]:
library(jsonlite)
library(lm.beta)
library(anomalize)
library(tibbletime)
library(dplyr)
library(ggplot2)
library(gridExtra)

# Read the raw file; each line is parsed below as one JSON document.
whole <- readLines("Data.jsonl", encoding = "UTF-8")

# One row per line: the keyword and the first element of the `data` field.
json_df <- data.frame()
for (i in seq_along(whole)) {  # seq_along() iterates zero times on an empty file
    # Parse each line once instead of once per extracted field.
    parsed <- jsonlite::fromJSON(txt = whole[i])
    temp <- data.frame(keyword = parsed$keyword, data = t(parsed$data[1]))
    json_df <- rbind(json_df, temp)
}

# Drop series whose values sum to zero (the original note describes these as
# all-zero datasets). Guard the removal: with no match, `-which(...)` is
# `-integer(0)`, which would select zero rows and silently empty the table.
series_sums <- apply(data.frame(json_df$data), 2, sum)
zero_rows <- which(series_sums == 0)
if (length(zero_rows) > 0) {
    json_df <- json_df[-zero_rows, ]
}

In [None]:
# Report for a single series: scatter plot with an optional regression line
# and highlighted anomalies, plus a two-row summary table underneath.
k <- 1  ## choose one dataset
data_title <- json_df[k, 1]
dataset <- na.omit(json_df[k, 2][[1]])

# linear regression of the values on their time index
temp <- data.frame(x = seq_along(dataset), y = dataset)
model <- lm(temp[, 2] ~ temp[, 1], data = temp)  ## linear regression model
coef_table <- summary(lm.beta(model))$coef
# Column 5 of the lm.beta summary is used as the slope's significance value
# (presumably the p-value — confirm).
isRegression <- coef_table[2, 5] < 0.05
b <- coef_table[2, 1]  ## slope
a <- coef_table[1, 1]  ## intercept


# anomaly detection on the remainder of the decomposed series.
# as_tibble() replaces the deprecated as_data_frame().
anomalized <- as_tibble(data.frame(time = as.Date(seq_along(dataset), origin = "2020-01-01"), value = dataset)) %>%
    time_decompose(value) %>%  ## decompose into season, trend, remainder
    anomalize(remainder, alpha = 0.05, max_anoms = 0.05)  ## flag anomalies (lower/upper bounds)
anomaly_idx <- which(anomalized$anomaly == "Yes")
isAnomaly <- length(anomaly_idx) > 0  ## whether any anomaly exists


# plot
df <- data.frame(x = seq_along(dataset), y = dataset)
result <- ggplot(df, aes(x = x, y = y)) + geom_point() + xlab('time') + ylab('observed') +
    ggtitle(paste("데이터명 : ", data_title)) + theme(plot.title = element_text(size = 20, hjust = 0.5))
if (isRegression) {
    # Constants are passed as parameters so that a single line is drawn,
    # rather than one per row of the inherited data.
    result <- result + geom_abline(intercept = a, slope = b, color = 'darkblue', size = 1)
}
if (isAnomaly) {
    # Give the highlight layers their own data subset; mapping vectors whose
    # length differs from the inherited plot data is rejected by ggplot2.
    anomaly_points <- df[anomaly_idx, ]
    result <- result +
        geom_point(data = anomaly_points, color = "red", size = 2) +
        geom_point(data = anomaly_points, shape = 1, color = "red", size = 4.5)
}


if (isRegression) {
    comment_regression <- data.frame(comment = "데이터는 트렌드가 존재합니다.", result = paste("y = ", a, " + ", b, "x"))
} else {
    comment_regression <- data.frame(comment = "데이터는 트렌드가 존재하지 않습니다.", result = "-")
}

if (isAnomaly) {
    # Collapse all anomaly positions into one cell so the table keeps a single
    # anomaly row instead of one duplicated row per index.
    comment_anomaly <- data.frame(comment = "데이터는 이상치가 존재합니다.",
                                  result = paste("index num : ", paste(anomaly_idx, collapse = ", ")))
} else {
    comment_anomaly <- data.frame(comment = "데이터는 이상치가 존재하지 않습니다.", result = "-")
}
comment <- rbind(comment_regression, comment_anomaly)


grid.arrange(result, tableGrob(comment, rows = NULL), heights = c(3, 1))

# persistent homology

In [None]:
library(jsonlite)
library(TDA)

whole <- readLines("Data.jsonl", encoding = "UTF-8")

# Convert one JSON line into a single-row data frame holding the keyword and
# the first element of the record's `data` field.
line_to_row <- function(json_line) {
    parsed <- jsonlite::fromJSON(txt = json_line)
    data.frame(keyword = parsed$keyword, data = t(parsed$data[1]))
}

# Stack the first 200 lines in file order, starting from an empty data frame.
json_df <- Reduce(rbind, lapply(whole[1:200], line_to_row), data.frame())

# Standardised copy of the series stored in the given row of json_df.
scaled_series <- function(row) {
    scale(json_df[row, 2][[1]])
}

# set1
one <- scaled_series(18)
two <- scaled_series(164)
three <- scaled_series(103)
four <- scaled_series(23)

# set2
# one <- scaled_series(3)
# two <- scaled_series(154)
# three <- scaled_series(9)
# four <- scaled_series(19)

In [None]:
# Draw the KDE persistence diagrams of the four selected series in a 2x2 grid.
# The previous graphics layout is saved so it can be restored afterwards.
old_par <- par(mfrow = c(2, 2))
by <- .01
h <- .3

# Plot one KDE persistence diagram with its bootstrap band.
#   dataset: the series to analyse (passed as X to gridDiag/bootstrapBand)
#   label:   suffix for the plot title ("KDE Diagram - <label>")
#   by, h:   grid step and kde bandwidth argument
# Returns the gridDiag() result invisibly.
plot_kde_diagram <- function(dataset, label, by, h) {
    # Grid limits: rounded data range widened by 1 on each side.
    lower <- round(min(dataset)) - 1
    upper <- round(max(dataset)) + 1
    xlim <- matrix(c(lower, upper), nrow = 1)
    xseq <- seq(from = lower, to = upper, by = by)

    diagram <- gridDiag(X = dataset, FUN = kde, lim = xlim, by = by, h = h)
    band <- bootstrapBand(X = dataset, FUN = kde, Grid = xseq, B = 100, parallel = FALSE, alpha = 0.1, h = h)
    plot(diagram[["diagram"]], band = 2 * band[["width"]], main = paste("KDE Diagram -", label))
    invisible(diagram)
}

plot_kde_diagram(one, "one", by = by, h = h)
plot_kde_diagram(two, "two", by = by, h = h)
plot_kde_diagram(three, "three", by = by, h = h)
plot_kde_diagram(four, "four", by = by, h = h)

par(old_par)

# tsclust

In [None]:
library(dtwclust)
library(jsonlite)

# Read the raw file; each line is parsed below as one JSON document.
whole <- readLines("Data.jsonl", encoding = "UTF-8")

# One row per kept line: the keyword and the z-scored first element of the
# record's `data` field, wrapped with tslist().
json_df <- data.frame()
for (i in seq_along(whole)) {  # seq_along() iterates zero times on an empty file
    # Parse each line once instead of once per extracted field.
    parsed <- jsonlite::fromJSON(txt = whole[i])
    temp <- data.frame(keyword = parsed$keyword, data = t(tslist(zscore(parsed$data[1][[1]]))))

    # Skip the series when a single value accounts for more than half of its
    # points (the original note describes this as "half or more are zero").
    values <- temp$data[[1]]
    if (any(table(values) > length(values) / 2)) next

    json_df <- rbind(json_df, temp)
}

In [None]:
# Replace every series in json_df by its smoothing-spline fit, then cluster
# the smoothed series.
smooth_df <- data.frame()
for (i in seq_len(nrow(json_df))) {  # seq_len() iterates zero times on an empty table
    smoothed <- predict(smooth.spline(json_df[i, 2][[1]], lambda = 0.0001))$y
    temp <- data.frame(keyword = json_df[i, 1], data = t(tslist(smoothed)))
    smooth_df <- rbind(smooth_df, temp)
}

# Cluster at most the first 1000 series; indexing 1:1000 directly would pad
# the input with missing entries whenever fewer series survived filtering.
n_series <- min(1000L, nrow(smooth_df))
fit3 <- tsclust(smooth_df[seq_len(n_series), 2], k = 20L, distance = "euclidean")
plot(fit3)

# FDA (functional data analysis)

In [None]:
library(jsonlite)
library(dtwclust)
library(fda)
library(funFEM)

# Read the raw file; each line is parsed below as one JSON document.
whole <- readLines("Data.jsonl", encoding = "UTF-8")

# One row per kept line: the keyword and the z-scored first element of the
# record's `data` field, wrapped with tslist().
json_df <- data.frame()
for (i in seq_along(whole)) {  # seq_along() iterates zero times on an empty file
    # Parse each line once instead of once per extracted field.
    parsed <- jsonlite::fromJSON(txt = whole[i])
    temp <- data.frame(keyword = parsed$keyword, data = t(tslist(zscore(parsed$data[1][[1]]))))

    # Skip the series when a single value accounts for more than half of its
    # points (the original note describes this as "half or more are zero").
    values <- temp$data[[1]]
    if (any(table(values) > length(values) / 2)) next

    json_df <- rbind(json_df, temp)
}

In [None]:
# Smooth the series on a B-spline basis and cluster the resulting functional
# data with funFEM.

# Observation matrix: one column per series, built once and reused below.
# At most the first 1000 series are used; the number of time points is taken
# from the data instead of being hard-coded (the original assumed 36).
n_series <- min(1000L, nrow(json_df))
series_df <- data.frame(json_df[seq_len(n_series), 2])
obs <- vapply(series_df, as.numeric, numeric(nrow(series_df)))
time_points <- seq_len(nrow(obs))

basis <- create.bspline.basis(c(1, nrow(obs)))
fdobj <- smooth.basis(time_points, obs, basis)
res <- funFEM(fdobj$fd, model = "all")

plotfit.fd(obs, time_points, fdobj$fd)  ## per-series plot of the fitted curve over the observations

# NOTE(review): this stores the cluster means in `coefs` on the smooth.basis
# result, not on fdobj$fd — confirm that the plot below actually uses them.
fdobj$coefs <- t(res$prms$my)
plot(fdobj, col = as.factor(res$cls))  ## plot of all fitted curves, coloured by cluster

# Cluster result: series indices grouped by assigned cluster. funFEM is called
# without K, so every cluster label present in res$cls is listed instead of
# only labels 1 to 3.
split(seq_along(res$cls), res$cls)
# plot(json_df[6,2][[1]], type="l")
# plot(json_df[37,2][[1]], type="l")
# plot(json_df[191,2][[1]], type="l")
# plot(json_df[525,2][[1]], type="l")
# plot(json_df[944,2][[1]], type="l")