In [None]:
%config Completer.use_jedi = False

In [None]:
import transformers
from transformers import AutoModel, AutoTokenizer
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from sklearn.metrics import davies_bouldin_score, normalized_mutual_info_score, silhouette_score
from sklearn.decomposition import PCA
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, BisectingKMeans
from sklearn.decomposition import PCA
from scipy.spatial.distance import euclidean
from sentence_transformers import SentenceTransformer

In [None]:
sets = 'KIR'
fin_bert_path = '../../tools/bert-base-finnish-uncased-v1/'
input_text_path = '../../data/weak_data/{}/full_text.txt'.format(sets)

In [None]:
# Read the corpus: every line of the input file becomes one whitespace-stripped sentence.
sent_tensor_list, length_list = [], []  # only referenced by the disabled tokenizer experiment
with open(input_text_path, 'r') as text_file:
    sent_list = [raw_line.strip() for raw_line in text_file]

In [None]:
# Load the sentence-embedding model from the local tools directory and encode every sentence.
model = SentenceTransformer('../../tools/sbert-uncased-finnish-paraphrase')
# NOTE(review): despite its name, `tfidf` holds the output of `model.encode`, not TF-IDF
# features (the TF-IDF path in a later cell is commented out). Presumably a dense NumPy
# array with one row per sentence — confirm against the sentence-transformers docs.
tfidf = model.encode(sent_list)

In [None]:
tfidf.shape

In [None]:
# tfidf_vect = TfidfVectorizer()
# tfidf = tfidf_vect.fit_transform(sent_list)

In [None]:
kmeans = KMeans(n_clusters=2).fit(tfidf)

In [None]:
len(kmeans.labels_)

In [None]:
# `tfidf` is the dense output of `model.encode`; dense arrays have no `.toarray()`
# (a scipy.sparse method), so the embeddings are passed as-is.
davies_bouldin_score(tfidf, kmeans.labels_)

In [None]:
# `tfidf` is the dense output of `model.encode`; dense arrays have no `.toarray()`
# (a scipy.sparse method), so the embeddings are passed as-is.
silhouette_score(tfidf, kmeans.labels_)

In [None]:
num_cluster = np.linspace(2, 10, 9)

In [None]:
# Sweep the number of clusters over `num_cluster` on the raw sentence embeddings and
# record both cluster-quality scores. `tfidf` is already a dense array (output of
# `model.encode`), so it is passed directly; `.toarray()` does not exist on it.
d_score_list = []  # Davies-Bouldin index per cluster count
s_score_list = []  # silhouette score per cluster count
for i in num_cluster:
    kmeans = KMeans(n_clusters=int(i)).fit(tfidf)
    d_score = davies_bouldin_score(tfidf, kmeans.labels_)
    s_score = silhouette_score(tfidf, kmeans.labels_)
    d_score_list.append(d_score)
    s_score_list.append(s_score)

In [None]:
plt.plot(num_cluster, d_score_list, label='davies_bouldin_score')
plt.plot(num_cluster, s_score_list, label='silhouette_score')
plt.legend()
plt.show()

#### PCA

In [None]:
# calculate the length
# Count the lines of `full_sample.txt` for every entry of `med_spec_list`.
med_spec_list = ['KIR', 'LAH', 'OPER', 'RTG', 'SAD']
file_length_list = []
for med in med_spec_list:
    file_path = '../../data/processed_data/{}/full_sample.txt'.format(med)
    with open(file_path, 'r') as sample_file:
        sent_len = sum(1 for _ in sample_file)
    file_length_list.append(sent_len)

In [None]:
# Print each data set's share of the pooled line count, scaled to a budget of 1000
# and truncated to an integer.
total_length = sum(file_length_list)  # loop-invariant, so computed once
for med_name, med_length in zip(med_spec_list, file_length_list):
    print('The num of {} is: {}; '.format(med_name, int(med_length / total_length * 1000)))

##### PCA--> 200 --> better (Davies-Bouldin index --> lower score is better)

In [None]:
# Project the sentence embeddings onto 200 principal components.
# `tfidf` is already a dense array (output of `model.encode`); it has no `.toarray()`.
pca = PCA(n_components=200)
reduced_fea = pca.fit_transform(tfidf)

In [None]:
# Sweep the cluster counts in `num_cluster` on the PCA-reduced features, then plot
# the Davies-Bouldin and silhouette curves together.
d_score_list, s_score_list = [], []
for k in (int(value) for value in num_cluster):
    kmeans = KMeans(n_clusters=k).fit(reduced_fea)
    cluster_labels = kmeans.labels_
    d_score_list.append(davies_bouldin_score(reduced_fea, cluster_labels))
    s_score_list.append(silhouette_score(reduced_fea, cluster_labels))
for scores, legend_name in ((d_score_list, 'davies_bouldin_score'),
                            (s_score_list, 'silhouette_score')):
    plt.plot(num_cluster, scores, label=legend_name)
plt.legend()
plt.show()

##### PCA--> 20

In [None]:
# Project the sentence embeddings onto 20 principal components.
# `tfidf` is already a dense array (output of `model.encode`); it has no `.toarray()`.
pca = PCA(n_components=20)
reduced_fea = pca.fit_transform(tfidf)

In [None]:
# Sweep the cluster counts in `num_cluster` on the PCA-reduced features, then plot
# the Davies-Bouldin and silhouette curves together.
d_score_list, s_score_list = [], []
for k in (int(value) for value in num_cluster):
    kmeans = KMeans(n_clusters=k).fit(reduced_fea)
    cluster_labels = kmeans.labels_
    d_score_list.append(davies_bouldin_score(reduced_fea, cluster_labels))
    s_score_list.append(silhouette_score(reduced_fea, cluster_labels))
for scores, legend_name in ((d_score_list, 'davies_bouldin_score'),
                            (s_score_list, 'silhouette_score')):
    plt.plot(num_cluster, scores, label=legend_name)
plt.legend()
plt.show()

##### PCA--> 10

In [None]:
def max_min_scaling(inputs):
    """Rescale a 1-D sequence of numbers linearly onto the range [0, 1].

    Parameters
    ----------
    inputs : array-like of numbers
        Values to rescale; plain Python lists are accepted as well as NumPy arrays.

    Returns
    -------
    numpy.ndarray
        ``(inputs - min) / (max - min)`` as floats. When every value is equal the
        span is zero and an all-zero array is returned instead of NaNs.

    Raises
    ------
    ValueError
        If ``inputs`` is empty.
    """
    # Convert first: subtracting a scalar from a plain list of Python floats raises TypeError.
    values = np.asarray(inputs, dtype=float)
    low, high = values.min(), values.max()
    span = high - low
    if span == 0:
        # Constant input: avoid 0/0 and report "no spread" as zeros.
        return np.zeros_like(values)
    return (values - low) / span

In [None]:
from sklearn.decomposition import PCA

# Project the sentence embeddings onto 10 principal components.
# `tfidf` is already a dense array (output of `model.encode`); it has no `.toarray()`.
pca = PCA(n_components=10)
reduced_fea = pca.fit_transform(tfidf)

In [None]:
# Sweep the cluster counts in `num_cluster` on the PCA-reduced features, then plot
# the Davies-Bouldin and silhouette curves together.
d_score_list, s_score_list = [], []
for k in (int(value) for value in num_cluster):
    kmeans = KMeans(n_clusters=k).fit(reduced_fea)
    cluster_labels = kmeans.labels_
    d_score_list.append(davies_bouldin_score(reduced_fea, cluster_labels))
    s_score_list.append(silhouette_score(reduced_fea, cluster_labels))
for scores, legend_name in ((d_score_list, 'davies_bouldin_score'),
                            (s_score_list, 'silhouette_score')):
    plt.plot(num_cluster, scores, label=legend_name)
plt.legend()
plt.show()

In [None]:
d = max_min_scaling(d_score_list)[-1]
s = max_min_scaling(s_score_list)[-1]
s/d

In [None]:
# NOTE: `ds_score_list` is defined in a later cell (section "run for all documents");
# that cell has to be executed before this one.
scaled_s = max_min_scaling(s_score_list)
scaled_d = max_min_scaling(d_score_list)
ds_score_list(scaled_s, scaled_d)

##### PCA--> 5

In [None]:
# Project the sentence embeddings onto 5 principal components.
# `tfidf` is already a dense array (output of `model.encode`); it has no `.toarray()`.
pca = PCA(n_components=5)
reduced_fea = pca.fit_transform(tfidf)

# Sweep the cluster counts in `num_cluster` and record both cluster-quality scores.
d_score_list = []
s_score_list = []
for i in num_cluster:
    kmeans = KMeans(n_clusters=int(i)).fit(reduced_fea)
    d_score = davies_bouldin_score(reduced_fea, kmeans.labels_)
    s_score = silhouette_score(reduced_fea, kmeans.labels_)
    d_score_list.append(d_score)
    s_score_list.append(s_score)
plt.plot(num_cluster, d_score_list, label='davies_bouldin_score')
plt.plot(num_cluster, s_score_list, label='silhouette_score')
plt.legend()
plt.show()

In [None]:
d = max_min_scaling(d_score_list)[-1]
s = max_min_scaling(s_score_list)[-1]
s/d

##### PCA--> 2

In [None]:
# Project the sentence embeddings onto 2 principal components.
# `tfidf` is already a dense array (output of `model.encode`); it has no `.toarray()`.
pca = PCA(n_components=2)
reduced_fea = pca.fit_transform(tfidf)

# Sweep the cluster counts in `num_cluster` and record both cluster-quality scores.
d_score_list = []
s_score_list = []
for i in num_cluster:
    kmeans = KMeans(n_clusters=int(i)).fit(reduced_fea)
    d_score = davies_bouldin_score(reduced_fea, kmeans.labels_)
    s_score = silhouette_score(reduced_fea, kmeans.labels_)
    d_score_list.append(d_score)
    s_score_list.append(s_score)
plt.plot(num_cluster, d_score_list, label='davies_bouldin_score')
plt.plot(num_cluster, s_score_list, label='silhouette_score')
plt.legend()
plt.show()

In [None]:
d = max_min_scaling(d_score_list)[-1]
s = max_min_scaling(s_score_list)[-1]
s/d

In [None]:
# NOTE: `ds_score_list` is defined in a later cell (section "run for all documents");
# that cell has to be executed before this one.
scaled_s = max_min_scaling(s_score_list)
scaled_d = max_min_scaling(d_score_list)
ds_score_list(scaled_s, scaled_d)

##### BEST: PCA --> 10; K-means --> 10 

In [None]:
pca = PCA(n_components=10)
reduced_fea = pca.fit_transform(tfidf)

kmeans = KMeans(n_clusters=10).fit(reduced_fea)

In [None]:
len(kmeans.labels_)

In [None]:
# each cluster --> choose 21 samples
int(214/10)

In [None]:
# Draw `num_samples` sentences from each of the `num_clusters` k-means clusters and
# collect them, with their row indices, into `df`.
# NOTE: `cluster_sampling` is defined in a later cell of this notebook and must be
# executed before this cell.
sample_str = ''
sample_list = []
annotate_sent_list = []
num_samples = 21
num_clusters = 10
for i in range(num_clusters):
    cluster_idx = np.where(kmeans.labels_ == i)[0].tolist()
    # Keep the notebook-level `label` in sync with the cluster being sampled.
    label = i
    # `scipy.spatial.distance.euclidean(u, v)` returns one scalar distance, which cannot be
    # iterated as a sample; the sampler taking (cluster_idx, num_samples) is `cluster_sampling`.
    sample = cluster_sampling(cluster_idx, num_samples)
    annotate_sent = [sent_list[j] for j in sample]
    sample_list += list(sample)
    annotate_sent_list += annotate_sent
df = pd.DataFrame({'index': sample_list, 'sent': annotate_sent_list})

In [None]:
df

In [None]:
len(sent_list)

In [None]:
len(list(set(sent_list)))

In [None]:
# Distance between the first member of cluster `label` and that cluster's centroid.
label = 0
# Select the members by `label` (not a literal) so the cell stays correct for any label;
# the first member sits at position 0 of the index list.
cluster_idx = np.where(kmeans.labels_ == label)[0].tolist()
first_tensor = reduced_fea[cluster_idx[0]]
cluster_center = kmeans.cluster_centers_[label]
euclidean(first_tensor, cluster_center)

In [None]:
# Euclidean distance from every member of cluster `label` to that cluster's centroid.
label = 0
cluster_idx = np.where(kmeans.labels_ == label)[0].tolist()
cluster_center = kmeans.cluster_centers_[label]
incluster_dist_list = [
    euclidean(reduced_fea[member], cluster_center) for member in cluster_idx
]

In [None]:
x = np.linspace(1, len(incluster_dist_list), len(incluster_dist_list))
plt.plot(x, incluster_dist_list)
plt.show()

In [None]:
# normalize the distance
normalizd_dist = np.array(incluster_dist_list)

In [None]:
normalizd_dist = normalizd_dist/normalizd_dist.sum()

In [None]:
inverse_norm_dist = 1/normalizd_dist

In [None]:
P = inverse_norm_dist/inverse_norm_dist.sum()

In [None]:
sum(inverse_norm_dist/inverse_norm_dist.sum())

In [None]:
selected_idx = np.random.choice(cluster_idx, 21, p=inverse_norm_dist/inverse_norm_dist.sum()) 

In [None]:
len(list(P))

In [None]:
len(cluster_idx)

In [None]:
diff_idx = list(set(cluster_idx) - set(selected_idx))

In [None]:
len(cluster_idx)

In [None]:
len(diff_idx)

In [None]:
# Cluster members that were not sampled. `np.diff` computes successive differences and takes
# an integer order as its second argument, so it cannot express a set difference.
np.setdiff1d(cluster_idx, selected_idx)

In [None]:
np.random.choice(cluster_idx, 21, p=normalizd_dist/normalizd_dist.sum()) 

In [None]:
def cluster_sampling(cluster_idx, num_samples, cluster_label=None):
    """Sample member indices of one cluster, weighted by distance to its centroid.

    Reads the notebook-level ``kmeans`` (fitted model) and ``reduced_fea`` (feature rows).

    Parameters
    ----------
    cluster_idx : sequence of int
        Row indices into ``reduced_fea`` of the points belonging to one cluster.
    num_samples : int
        Number of indices to draw.
    cluster_label : int, optional
        Label of the cluster the indices belong to. When omitted it is looked up in
        ``kmeans.labels_`` through the first index.

    Returns
    -------
    numpy.ndarray
        ``num_samples`` entries of ``cluster_idx`` drawn by ``np.random.choice`` with its
        default replacement setting, so an index may appear more than once.

    Raises
    ------
    ValueError
        If ``cluster_idx`` is empty.

    Notes
    -----
    The weights are the min-max scaled distances, so points farther from the centroid are
    more likely and the closest point has probability zero.
    NOTE(review): the exploratory cells also tried inverse-distance weights — confirm which
    direction is intended.
    """
    if len(cluster_idx) == 0:
        raise ValueError('cluster_idx must contain at least one index')
    if cluster_label is None:
        # Tie the centroid to the cluster actually passed in, not to a module-level variable.
        cluster_label = kmeans.labels_[cluster_idx[0]]
    cluster_center = kmeans.cluster_centers_[cluster_label]
    incluster_dist = np.array(
        [euclidean(reduced_fea[idx], cluster_center) for idx in cluster_idx]
    )
    weights = max_min_scaling(incluster_dist)
    total = weights.sum()
    if np.isfinite(total) and total > 0:
        probabilities = weights / total
    else:
        # Single-member or equidistant cluster: no usable spread, fall back to uniform sampling.
        probabilities = None
    return np.random.choice(cluster_idx, num_samples, p=probabilities)

In [None]:
len(cluster_idx)

In [None]:
len(cluster_idx)

In [None]:
reduced_fea.shape

In [None]:
cluster_center = kmeans.cluster_centers_[1]
euclidean(first_tensor, cluster_center)

In [None]:
# Write the selected sentences to the annotation text file, one per line.
with open('../../data/processed_data/KIR/annotation.txt', 'w') as out_file:
    out_file.write('\n'.join(annotate_sent_list))

In [None]:
df.to_csv('../../data/processed_data/KIR/annotation.csv', index=False)

In [None]:
# Uniformly draw 21 member indices from cluster 0 and pair them with their sentences.
cluster_idx = np.where(kmeans.labels_ == 0)[0].tolist()
sample = np.random.choice(cluster_idx, 21)
annotate_sent = [sent_list[pos] for pos in sample]
df = pd.DataFrame({'index': sample, 'sent': annotate_sent})

In [None]:
df

### run for all documents

In [None]:
# model = SentenceTransformer('../../tools/sbert-uncased-finnish-paraphrase')
# tfidf = model.encode(sent_list)

In [None]:
def max_min_scaling(inputs):
    """Rescale a 1-D sequence of numbers linearly onto the range [0, 1].

    Parameters
    ----------
    inputs : array-like of numbers
        Values to rescale; plain Python lists are accepted as well as NumPy arrays.

    Returns
    -------
    numpy.ndarray
        ``(inputs - min) / (max - min)`` as floats. When every value is equal the
        span is zero and an all-zero array is returned instead of NaNs.

    Raises
    ------
    ValueError
        If ``inputs`` is empty.
    """
    # Convert first: subtracting a scalar from a plain list of Python floats raises TypeError.
    values = np.asarray(inputs, dtype=float)
    low, high = values.min(), values.max()
    span = high - low
    if span == 0:
        # Constant input: avoid 0/0 and report "no spread" as zeros.
        return np.zeros_like(values)
    return (values - low) / span

In [None]:
def select_pca_kmean(sets):
    """Embed every line of a data set's ``full_text.txt`` with the local sentence model.

    Despite its name, this function performs neither PCA nor k-means; it only reads the
    sentences and encodes them.

    Parameters
    ----------
    sets : str
        Name of the data-set sub-directory under ``../../data/weak_data/`` (e.g. ``'KIR'``).

    Returns
    -------
    The value returned by ``SentenceTransformer.encode`` for the stripped lines
    (presumably a 2-D NumPy array with one row per sentence — confirm against the
    sentence-transformers documentation).
    """
    input_text_path = '../../data/weak_data/{}/full_text.txt'.format(sets)
    model = SentenceTransformer('../../tools/sbert-uncased-finnish-paraphrase')
    # The `with` block closes the file on exit; no explicit close is needed.
    with open(input_text_path, 'r') as f:
        sent_list = [line.strip() for line in f]
    return model.encode(sent_list)

In [None]:
def d_s_score_plot(tfidf, n_cluster):
    """Plot Davies-Bouldin and silhouette scores of agglomerative clusterings after PCA.

    Parameters
    ----------
    tfidf : array-like
        Feature matrix handed to ``PCA.fit_transform``.
    n_cluster : int
        Number of PCA components to keep (despite the name, not a cluster count).

    The cluster count is swept over ``np.linspace(2, 10, 9)``. The figure is shown and
    nothing is returned.
    """
    cluster_counts = np.linspace(2, 10, 9)
    reduced_fea = PCA(n_components=n_cluster).fit_transform(tfidf)
    d_score_list, s_score_list = [], []
    for count in cluster_counts:
        # Alternatives mentioned by the author: SpectralClustering, BisectingKMeans.
        labels = AgglomerativeClustering(n_clusters=int(count)).fit(reduced_fea).labels_
        d_score_list.append(davies_bouldin_score(reduced_fea, labels))
        s_score_list.append(silhouette_score(reduced_fea, labels))
    for scores, legend_name in ((d_score_list, 'davies_bouldin_score'),
                                (s_score_list, 'silhouette_score')):
        plt.plot(cluster_counts, scores, label=legend_name)
    plt.legend()
    plt.show()

In [None]:
def ds_score_list(s_score_list, d_score_list):
    """Return the element-wise ratios ``s / d``, skipping positions where ``d`` is zero.

    Parameters
    ----------
    s_score_list, d_score_list : sequence of numbers
        Numerators and denominators; ``d_score_list`` is indexed at every position
        of ``s_score_list``.

    Returns
    -------
    list
        One ratio per position whose denominator is non-zero, in input order.
    """
    return [
        s / d_score_list[pos]
        for pos, s in enumerate(s_score_list)
        if d_score_list[pos] != 0
    ]

In [None]:
def d_s_score(tfidf, n_cluster):
    """Ratio of scaled silhouette to scaled Davies-Bouldin scores per cluster count.

    NOTE: a later cell of this notebook defines another function with the same name;
    once that cell runs, it replaces this definition.

    Parameters
    ----------
    tfidf : array-like
        Feature matrix handed to ``PCA.fit_transform``.
    n_cluster : int
        Number of PCA components to keep (despite the name, not a cluster count).

    Returns
    -------
    list
        ``ds_score_list`` applied to the min-max scaled silhouette and Davies-Bouldin
        scores of agglomerative clusterings with 2..10 clusters.
    """
    reduced_fea = PCA(n_components=n_cluster).fit_transform(tfidf)
    d_score_list, s_score_list = [], []
    for count in np.linspace(2, 10, 9):
        labels = AgglomerativeClustering(n_clusters=int(count)).fit(reduced_fea).labels_
        d_score_list.append(davies_bouldin_score(reduced_fea, labels))
        s_score_list.append(silhouette_score(reduced_fea, labels))
    scaled_s = max_min_scaling(s_score_list)
    scaled_d = max_min_scaling(d_score_list)
    return ds_score_list(scaled_s, scaled_d)

In [None]:
def d_s_score(tfidf, n_cluster):
    """Davies-Bouldin index of agglomerative clusterings with 2..10 clusters after PCA.

    NOTE: this definition replaces the ``d_s_score`` defined in the preceding cell.

    Parameters
    ----------
    tfidf : array-like
        Feature matrix handed to ``PCA.fit_transform``.
    n_cluster : int
        Number of PCA components to keep (despite the name, not a cluster count).

    Returns
    -------
    list
        Nine Davies-Bouldin scores, one per cluster count in ``np.linspace(2, 10, 9)``.
    """
    num_cluster = np.linspace(2, 10, 9)
    reduced_fea = PCA(n_components=n_cluster).fit_transform(tfidf)
    d_score_list = []
    for i in num_cluster:
        clustering = AgglomerativeClustering(n_clusters=int(i)).fit(reduced_fea)
        d_score_list.append(davies_bouldin_score(reduced_fea, clustering.labels_))
    return d_score_list

#### KIR

In [None]:
sets = 'KIR'
# Embed the corpus once: the embeddings do not depend on the PCA size, so reloading the
# model and re-encoding for every setting is redundant.
sent_vec = select_pca_kmean(sets)
# `n_cluster` is the PCA dimensionality passed to `d_s_score`; each printed list holds one
# score per cluster count.
for n_cluster in (200, 100, 50, 20, 10, 5, 2):
    score_list = d_s_score(sent_vec, n_cluster)
    print('Score of {} in {} clusters is: {}'.format(sets, n_cluster, score_list))

#### PCA-->2; Kmeans-->10

In [None]:
int(227/10)

#### LAH

In [None]:
sets = 'LAH'
# Embed the corpus once: the embeddings do not depend on the PCA size, so reloading the
# model and re-encoding for every setting is redundant.
sent_vec = select_pca_kmean(sets)
# `n_cluster` is the PCA dimensionality passed to `d_s_score`; each printed list holds one
# score per cluster count.
for n_cluster in (200, 100, 50, 20, 10, 5, 2):
    score_list = d_s_score(sent_vec, n_cluster)
    print('Score of {} in {} clusters is: {}'.format(sets, n_cluster, score_list))

#### PCA-->10; Kmeans-->10

In [None]:
int(203/10)

#### OPER

In [None]:
sets = 'OPER'
# Embed the corpus once: the embeddings do not depend on the PCA size, so reloading the
# model and re-encoding for every setting is redundant.
sent_vec = select_pca_kmean(sets)
# `n_cluster` is the PCA dimensionality passed to `d_s_score`; each printed list holds one
# score per cluster count.
for n_cluster in (200, 100, 50, 20, 10, 5, 2):
    score_list = d_s_score(sent_vec, n_cluster)
    print('Score of {} in {} clusters is: {}'.format(sets, n_cluster, score_list))

#### PCA-->5; Kmeans-->9

In [None]:
int(203/10)

#### RTG

In [None]:
sets = 'RTG'
# Embed the corpus once: the embeddings do not depend on the PCA size, so reloading the
# model and re-encoding for every setting is redundant.
sent_vec = select_pca_kmean(sets)
# `n_cluster` is the PCA dimensionality passed to `d_s_score`; each printed list holds one
# score per cluster count.
for n_cluster in (200, 100, 50, 20, 10, 5, 2):
    score_list = d_s_score(sent_vec, n_cluster)
    print('Score of {} in {} clusters is: {}'.format(sets, n_cluster, score_list))

#### PCA-->5; Kmeans-->9

In [None]:
int(163/10)

#### SAD

In [None]:
sets = 'SAD'
# Embed the corpus once: the embeddings do not depend on the PCA size, so reloading the
# model and re-encoding for every setting is redundant.
sent_vec = select_pca_kmean(sets)
# `n_cluster` is the PCA dimensionality passed to `d_s_score`; each printed list holds one
# score per cluster count.
for n_cluster in (200, 100, 50, 20, 10, 5, 2):
    score_list = d_s_score(sent_vec, n_cluster)
    print('Score of {} in {} clusters is: {}'.format(sets, n_cluster, score_list))

#### PCA-->5; Kmeans-->10

In [None]:
int(204/10)