In [None]:
# IPython magic: turn off the Jedi backend for the IPython tab completer.
%config Completer.use_jedi = False

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import davies_bouldin_score, normalized_mutual_info_score, silhouette_score
from sklearn.decomposition import PCA
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.spatial.distance import euclidean
from sentence_transformers import SentenceTransformer

In [None]:
def cluster_sampling(kmeans, label, reduced_fea, cluster_idx, num_samples):
    """Draw member indices of one cluster, favouring points near its centroid.

    Each member is weighted by the inverse of its Euclidean distance to the
    cluster centre, and the weights are normalised into a probability vector.

    Args:
        kmeans: Fitted clustering object exposing ``cluster_centers_``.
        label: Index of the cluster whose centre is used.
        reduced_fea: 2-D array-like of feature vectors, indexable by the
            entries of ``cluster_idx``.
        cluster_idx: Sequence of integer row indices belonging to the cluster.
        num_samples: Number of indices to draw.

    Returns:
        tuple: ``(sampled_idx, probabilities)`` where ``sampled_idx`` is an
        array of ``num_samples`` entries taken from ``cluster_idx`` and
        ``probabilities`` is the sampling distribution aligned with
        ``cluster_idx``.

    Raises:
        ValueError: Propagated from ``np.random.choice`` when ``cluster_idx``
            is empty and ``num_samples`` is non-zero.
    """
    # Floor for distances: a point sitting exactly on the centroid would
    # otherwise yield 1/0 = inf and turn every probability into NaN.
    min_distance = 1e-12

    members = np.asarray(cluster_idx, dtype=int)
    center = np.asarray(kmeans.cluster_centers_[label], dtype=float)
    points = np.asarray(reduced_fea, dtype=float)[members]

    # Vectorised Euclidean distance of every member to the cluster centre.
    distances = np.linalg.norm(points - center, axis=1)
    weights = 1.0 / np.maximum(distances, min_distance)
    probabilities = weights / weights.sum()

    # Draw distinct indices whenever the cluster is large enough, so the same
    # item is not selected twice; fall back to sampling with replacement only
    # when more samples are requested than the cluster contains.
    with_replacement = num_samples > len(members)
    sampled_idx = np.random.choice(
        members, num_samples, replace=with_replacement, p=probabilities
    )

    return sampled_idx, probabilities

In [None]:
def max_min_scaling(inputs):
    """Linearly rescale values so the minimum maps to 0 and the maximum to 1.

    Args:
        inputs: Numeric array-like. Lists and tuples are converted to a float
            NumPy array; other objects are used as-is and must support
            element-wise subtraction and division.

    Returns:
        The rescaled values. When every value is identical (zero range) the
        result is all zeros instead of NaN.

    Raises:
        ValueError: If ``inputs`` is empty (raised by ``np.min``).
    """
    if isinstance(inputs, (list, tuple)):
        # Plain sequences do not support element-wise arithmetic.
        inputs = np.asarray(inputs, dtype=float)

    lowest = np.min(inputs)
    span = np.max(inputs) - lowest
    if span == 0:
        # Constant input: divide by 1 so the result is 0 rather than 0/0.
        span = 1
    return (inputs - lowest) / span

In [None]:
def in_cluster_sample(sent_list, sets, kmeans, kmean_cluster, sample_per_cluster, reduced_fea):
    """Sample sentences from every cluster and write the annotation files.

    For each cluster label in ``range(kmean_cluster)`` the member indices are
    collected from ``kmeans.labels_``, a sample is drawn with
    ``cluster_sampling`` and the remaining members are recorded as
    un-annotated. Four files are written under
    ``../../data/weak_data/<sets>/``:

    * ``annotation.csv``      - sampled index and sentence per row.
    * ``annotation_prob.csv`` - per cluster, the ';'-joined member indices and
      the ';'-joined sampling probabilities.
    * ``annotation.txt``      - sampled sentences, one per line.
    * ``un_annotation.txt``   - non-sampled sentences, one per line.

    Args:
        sent_list: Sequence of sentences, indexable by the cluster indices.
        sets: Name of the data set; used as the output sub-directory name.
        kmeans: Fitted clustering object exposing ``labels_`` (and whatever
            ``cluster_sampling`` reads from it).
        kmean_cluster: Number of cluster labels to iterate over.
        sample_per_cluster: Number of samples requested per cluster.
        reduced_fea: Feature vectors forwarded to ``cluster_sampling``.

    Returns:
        None. The results are only written to disk.
    """
    output_dir = '../../data/weak_data/{}'.format(sets)

    sample_list = []
    prob_list = []
    annotate_sent_list = []
    un_annotate_sent_list = []
    cluster_idx_list = []

    for label in range(kmean_cluster):
        cluster_idx = np.where(kmeans.labels_ == label)[0].tolist()
        sample, p = cluster_sampling(kmeans, label, reduced_fea, cluster_idx, sample_per_cluster)

        # Keep the non-sampled members in their original cluster order so the
        # un-annotated output does not depend on set iteration order.
        sampled = set(sample)
        remaining_idx = [idx for idx in cluster_idx if idx not in sampled]

        sample_list += list(sample)
        annotate_sent_list += [sent_list[idx] for idx in sample]
        un_annotate_sent_list += [sent_list[idx] for idx in remaining_idx]
        prob_list.append(';'.join([str(item) for item in list(p)]))
        cluster_idx_list.append(';'.join([str(item) for item in cluster_idx]))

    annotation_df = pd.DataFrame({'index': sample_list, 'sent': annotate_sent_list})
    annotation_df.to_csv('{}/annotation.csv'.format(output_dir), index=False)

    prob_df = pd.DataFrame({'index': cluster_idx_list, 'prob': prob_list})
    prob_df.to_csv('{}/annotation_prob.csv'.format(output_dir), index=False)

    # Explicit UTF-8 so the text files match the encoding pandas uses for the
    # CSV files by default instead of depending on the platform locale.
    with open('{}/annotation.txt'.format(output_dir), 'w', encoding='utf-8') as f:
        f.write('\n'.join(annotate_sent_list))

    with open('{}/un_annotation.txt'.format(output_dir), 'w', encoding='utf-8') as f:
        f.write('\n'.join(un_annotate_sent_list))

In [None]:
def gen_sent_list(sets, pca_dim, kmean_cluster, sample_per_cluster, random_state=None):
    """Embed, reduce and cluster a set's sentences, then sample them for annotation.

    Reads ``../../data/weak_data/<sets>/full_text.txt`` line by line, removes
    duplicate and blank lines, encodes the sentences with the
    SentenceTransformer model stored at
    ``../../tools/sbert-uncased-finnish-paraphrase``, projects the embeddings
    with PCA, clusters them with KMeans and hands the result to
    ``in_cluster_sample``.

    Args:
        sets: Name of the data set (sub-directory of ``weak_data``).
        pca_dim: Number of PCA components to keep.
        kmean_cluster: Number of KMeans clusters.
        sample_per_cluster: Number of samples requested per cluster.
        random_state: Optional seed forwarded to ``PCA`` and ``KMeans``.
            It does not seed the sampling step, which is done by
            ``in_cluster_sample``.

    Returns:
        None.
    """
    input_text_path = '../../data/weak_data/{}/full_text.txt'.format(sets)
    model = SentenceTransformer('../../tools/sbert-uncased-finnish-paraphrase')

    # Explicit encoding so decoding does not depend on the platform locale.
    # NOTE(review): assumes the corpus file is UTF-8 - confirm.
    with open(input_text_path, 'r', encoding='utf-8') as f:
        stripped_lines = [line.strip() for line in f]

    # dict.fromkeys de-duplicates while keeping first-seen order. A plain set
    # of strings iterates in a different order on every interpreter run, which
    # would make the sentence indices written downstream irreproducible.
    # Blank lines are dropped so an empty string is never treated as a sentence.
    sent_list = [sent for sent in dict.fromkeys(stripped_lines) if sent]

    sent_vec = model.encode(sent_list)
    pca = PCA(n_components=pca_dim, random_state=random_state)
    reduced_fea = pca.fit_transform(sent_vec)
    kmeans = KMeans(n_clusters=kmean_cluster, random_state=random_state).fit(reduced_fea)

    in_cluster_sample(sent_list, sets, kmeans, kmean_cluster, sample_per_cluster, reduced_fea)

In [None]:
# 'KIR' set: 10 PCA components, 9 clusters, 25 samples per cluster.
sets, pca_dim, kmean_cluster, sample_per_cluster = 'KIR', 10, 9, 25
gen_sent_list(sets, pca_dim, kmean_cluster, sample_per_cluster)

In [None]:
# 'LAH' set: 10 PCA components, 10 clusters, 20 samples per cluster.
sets, pca_dim, kmean_cluster, sample_per_cluster = 'LAH', 10, 10, 20
gen_sent_list(sets, pca_dim, kmean_cluster, sample_per_cluster)

In [None]:
# 'OPER' set: 5 PCA components, 9 clusters, 22 samples per cluster.
sets, pca_dim, kmean_cluster, sample_per_cluster = 'OPER', 5, 9, 22
gen_sent_list(sets, pca_dim, kmean_cluster, sample_per_cluster)

In [None]:
# 'RTG' set: 5 PCA components, 9 clusters, 18 samples per cluster.
sets, pca_dim, kmean_cluster, sample_per_cluster = 'RTG', 5, 9, 18
gen_sent_list(sets, pca_dim, kmean_cluster, sample_per_cluster)

In [None]:
# 'SAD' set: 5 PCA components, 8 clusters, 25 samples per cluster.
sets, pca_dim, kmean_cluster, sample_per_cluster = 'SAD', 5, 8, 25
gen_sent_list(sets, pca_dim, kmean_cluster, sample_per_cluster)