# Import packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances  #k-means using euclidean_distances
import gensim
import scipy
import datetime
import pickle
import gap
import warnings
# warnings.filterwarnings('ignore')

# Load GloVe embedding vectors

In [None]:
# Dataset selector. Options: freebase, dbpedia, wisekb.
# The value is interpolated into the embedding path (./glove_<dataname>/) and
# the label directory (./data/<dataname>/); 'freebase' additionally switches
# the label-file separator from a space to a tab.
dataname = 'wisekb'

In [None]:
# Number of clusters (K) handed to Kmeans_alg.
# Values recorded per dataset:
#   dbpedia  = 15
#   wiseKB   = 27
#   freebase = 46
# NOTE(review): opt_k is set by hand and is not derived from `dataname`;
# keep the two in sync when switching datasets.
opt_k=27

In [None]:
# Load the pickled GloVe vectors and the matching entity names, then build the
# entity -> vector lookup table used by the rest of the notebook.
#
# NOTE(security): pickle.load can execute arbitrary code stored in the file.
# Only load embedding files that come from a trusted source.
# NOTE(review): encoding='latin1' is presumably needed because the files were
# pickled under Python 2 with NumPy arrays inside -- confirm.

# `with` closes each handle even when unpickling raises.
with open(f'./glove_{dataname}/person_embedding','rb') as f:
    vector = pickle.load(f, encoding='latin1')

with open(f'./glove_{dataname}/person_words','rb') as f:
    word = pickle.load(f, encoding='latin1')

# word[i] is the entity name of vector[i]; indexing `word` (rather than zipping)
# keeps the IndexError if there are fewer names than vectors.
glove_dict = {word[i]: embedding for i, embedding in enumerate(vector)}

print(len(glove_dict))

In [None]:
# Locations of the positive / negative entity lists used for training and testing.
label_dir = f'./data/{dataname}/'

train_pos_loc = label_dir + 'train_positive_20000.txt'
train_neg_loc = label_dir + 'train_negative_5000.txt'
test_pos_loc = label_dir + 'test_positive_5000.txt'
test_neg_loc = label_dir + 'test_negative_5000.txt'

# freebase label files are tab-separated; the other datasets use a single space.
sep = '\t' if dataname=='freebase' else ' '


def _read_first_field(path, delimiter):
    """Return the stripped first `delimiter`-separated field of every line in `path`."""
    with open(path) as handle:
        return [line.split(delimiter)[0].strip() for line in handle]


train_pos_embedding = _read_first_field(train_pos_loc, sep)
print('train_pos : ',len(train_pos_embedding))
print(train_pos_embedding[:5])

train_neg_embedding = _read_first_field(train_neg_loc, sep)
print('\ntrain_neg : ',len(train_neg_embedding))
print(train_neg_embedding[:5])

test_pos_embedding = _read_first_field(test_pos_loc, sep)
print('\ntest_pos : ',len(test_pos_embedding))
print(test_pos_embedding[:5])

test_neg_embedding = _read_first_field(test_neg_loc, sep)
print('\ntest_neg : ',len(test_neg_embedding))
print(test_neg_embedding[:5])

# run Gap algorithm 
- to get optimal K for K-means algorithm

In [None]:
'''
This function first generates B reference samples; each reference sample has the same size as the original dataset.
The values of each reference sample are drawn from a uniform distribution over the range of each feature of the original dataset.
A simplified formula is used to compute D for each cluster, and then Wk. K should be an increasing list of values; 1-10 is usually sufficient.
B is the number of reference samples used to run the gap statistic; 10 is recommended, and it should not be decreased to a smaller value.

parameters:
    X: np.array, the original data;
    refs: np.array or None, the reference (replicated) data to compare against, if it already exists; if there is no suitable reference data, pass None and the function
    will generate it automatically;
    B: int, the number of reference samples used to run the gap statistic; 10 is recommended, and it should not be decreased to a smaller value;
    K: list, the range of K values to test;
    N_init: int, the number of initial starting points for each K-means run under sklearn, used to get a stable clustering result each time;
    you may not need that many starting points, so it can be reduced to speed up the computation;
    n_jobs below is not an argument of this function, but it controls parallel computing and can speed up the computation; it can only be changed inside the script, not passed as an argument;
'''
# Debug helper (disabled): presumably builds a synthetic Gaussian dataset and
# scatter-plots its first two dimensions -- init_board_gauss is not defined here.
#X = init_board_gauss(200,4, clear = False)
#plt.scatter(X[:,0],X[:,1])

# Gap-statistic run (disabled): uncomment to stack the GloVe vectors of the
# training positives into a matrix and pass it to gap.gap_statistic.
# X = np.array([glove_dict[n.split()[0].strip()] for n in train_pos_embedding])
# gaps, sk, K = gap.gap_statistic(X)
# With the lines above commented out, this cell only prints an empty line.
print()

# For K-means algorithm 

In [None]:
def Kmeans_alg(vector_list, k):
    """Cluster the GloVe vectors of the given entities with K-means.

    Parameters:
        vector_list : iterable of entity strings; the first whitespace-separated
            token of each entry is looked up in the module-level `glove_dict`
        k : number of clusters (passed to KMeans as n_clusters)

    Returns:
        X : np.array holding the looked-up GloVe vector of every input entry
        k_centroid_points : centroid points of the k clusters
        k_labels : for each row of X, the label of the cluster it belongs to
    """
    embeddings = [glove_dict[entry.split()[0]] for entry in vector_list]
    X = np.array(embeddings)

    # random_state is fixed so that repeated runs use the same seed.
    model = KMeans(n_clusters=k, random_state=0, max_iter=1000)
    model.fit(X)

    return X, model.cluster_centers_, model.labels_


In [None]:
def Grant_to_cluster(vec_tuple_list, delta, max_distance_each_cluster, centroids, removed_clusters=None):
    """Assign each vector to its nearest usable cluster, or to 'x' if it is too far away.

    A vector is granted to the closest cluster that has not been removed when its
    Euclidean distance to that centroid is at most `radius * delta` (both sides
    rounded to 5 decimals); otherwise it is marked with 'x'.

    Parameters:
        vec_tuple_list : list of (entity, vector) tuples
        delta : scale factor applied to the cluster radius (1 uses the initial radius)
        max_distance_each_cluster : radius of each cluster, indexed by cluster number
        centroids : centroid point of each cluster, one row per cluster
        removed_clusters : optional collection of cluster numbers that must never be
            assigned; None (the default) means no cluster is removed

    Returns:
        in_cluster_dict : dict mapping entity -> cluster number, or 'x' when the
            vector is outside the scaled radius of its nearest usable cluster.
            Returns None (after printing a message) when every cluster is removed.
    """
    removed = set() if removed_clusters is None else set(removed_clusters)
    n_clusters = len(centroids)
    removed_columns = [c for c in range(n_clusters) if c in removed]

    if len(removed_columns) == n_clusters:
        print('There is no cluster left! Experiment Finished!')
        return None

    # shape: (number of vectors, number of clusters)
    distance_matrix = euclidean_distances([item[1] for item in vec_tuple_list], centroids)

    if removed_columns:
        # Mask removed clusters with +inf so argmin can never select them.
        # Overwriting with the row maximum (the previous approach) could tie with
        # the only usable cluster and loop forever on a removed index.
        distance_matrix[:, removed_columns] = np.inf

    # Nearest usable cluster for every entity.
    nearest_clusters = distance_matrix.argmin(axis=1)
    scale = np.float32(delta)

    in_cluster_dict = {}
    for row, (item, close_cluster) in enumerate(zip(vec_tuple_list, nearest_clusters)):
        radius_delta = np.round(max_distance_each_cluster[close_cluster] * scale, 5)
        min_distance = np.round(distance_matrix[row, close_cluster], 5)

        if radius_delta >= min_distance:
            in_cluster_dict[item[0]] = close_cluster
        else:
            in_cluster_dict[item[0]] = 'x'

    return in_cluster_dict

## metrics formulas

In [None]:
def countTPFP(true_cluster, false_cluster, centroids):
    """Count how many positive and negative entities landed in each cluster.

    Parameters:
        true_cluster : dict entity -> cluster number or 'x', for positive entities
        false_cluster : dict entity -> cluster number or 'x', for negative entities
        centroids : sequence with one entry per cluster (only its length is used)

    Returns:
        true_count, false_count : dicts with keys 0..len(centroids)-1 plus 'x',
            each mapping to the number of entities assigned to that key
    """
    from collections import Counter

    n_clusters = len(centroids)

    def _tally(assignments):
        # One pass over the assignments instead of one list scan per cluster.
        seen = Counter(assignments.values())
        tally = {cluster: seen[cluster] for cluster in range(n_clusters)}
        tally['x'] = seen['x']
        return tally

    return _tally(true_cluster), _tally(false_cluster)

def getPR(true_count, false_count, cluster_labels, initial_TP):
    """Compute precision, recall, F1 and the confusion counts of every cluster.

    Parameters:
        true_count : per-cluster count of positive entities (indexed by cluster number)
        false_count : per-cluster count of negative entities; its 'x' entry is the
            number of negatives assigned to no cluster
        cluster_labels : number of clusters to evaluate (clusters 0..cluster_labels-1)
        initial_TP : per-cluster positive count used as the recall reference

    Returns:
        Seven lists with one entry per cluster, in this order:
        precision, recall, F1, TP, FN, FP, TN.
        Any ratio whose denominator is zero is reported as 0.0.
    """
    def _ratio(numerator, denominator):
        return 0.0 if denominator == 0 else float(numerator) / denominator

    # Column order matches the returned tuple.
    precisions, recalls, f1_scores = [], [], []
    tps, fns, fps, tns = [], [], [], []

    for cluster in range(cluster_labels):
        tp = true_count[cluster]
        fn = initial_TP[cluster] - tp
        fp = false_count[cluster]
        # The same value for every cluster: negatives that were rejected everywhere.
        tn = false_count['x']

        precision = _ratio(tp, tp + fp)
        recall = _ratio(tp, tp + fn)
        f1_score = _ratio(2 * precision * recall, precision + recall)

        for column, value in (
            (precisions, precision), (recalls, recall), (f1_scores, f1_score),
            (tps, tp), (fns, fn), (fps, fp), (tns, tn),
        ):
            column.append(value)

    return precisions, recalls, f1_scores, tps, fns, fps, tns

In [None]:
def getPRlist(pos_vector, neg_vector, max_distance_each_cluster, centroids, removed_cluster):
    """Evaluate every cluster over a grid of delta values.

    The positive counts obtained with delta = 1 serve as the recall reference.
    The clusters are then re-evaluated for each delta in np.arange(0.6, 1.0, 0.01).

    Parameters:
        pos_vector : list of (entity, vector) tuples of positive entities
        neg_vector : list of (entity, vector) tuples of negative entities
        max_distance_each_cluster : radius of each cluster
        centroids : centroid point of each cluster
        removed_cluster : cluster numbers excluded from assignment

    Returns:
        Seven np.arrays (precision, recall, F1, TP, FN, FP, TN), each with one row
        per delta and one column per cluster.
    """
    def _counts_for(delta):
        assigned_pos = Grant_to_cluster(pos_vector, delta, max_distance_each_cluster, centroids, removed_cluster)
        assigned_neg = Grant_to_cluster(neg_vector, delta, max_distance_each_cluster, centroids, removed_cluster)
        return countTPFP(assigned_pos, assigned_neg, centroids)

    # Reference counts with the unscaled radius; the 'x' bucket is not a cluster.
    baseline_true, _ = _counts_for(1)
    baseline_true.pop('x', None)
    initial_TP = [count for _, count in sorted(baseline_true.items())]

    n_clusters = len(centroids)
    # One accumulator per value returned by getPR, in the same order.
    columns = [[] for _ in range(7)]

    for delta in np.arange(0.6,1.0,0.01):
        true_count, false_count = _counts_for(delta)
        per_delta = getPR(true_count, false_count, n_clusters, initial_TP)
        for column, values in zip(columns, per_delta):
            column.append(values)

    return tuple(np.array(column) for column in columns)

In [None]:
def getOptimalPRlist(pos_vector, neg_vector, max_distance_each_cluster, centroids, removed_cluster, optimaldeltas):
    """Evaluate each cluster at its own delta.

    For cluster number c the assignment is recomputed with optimaldeltas[c], and
    only the figures of cluster c are kept from that run. Recall is measured
    against the positive counts obtained with delta = 1.

    Parameters:
        pos_vector : list of (entity, vector) tuples of positive entities
        neg_vector : list of (entity, vector) tuples of negative entities
        max_distance_each_cluster : radius of each cluster
        centroids : centroid point of each cluster
        removed_cluster : cluster numbers excluded from assignment
        optimaldeltas : sequence whose c-th entry is the delta used for cluster c

    Returns:
        optimalP, optimalR, optimalTP, optimalFP : lists with one entry per delta
        (precision, recall, positive count and negative count of that cluster).
        Ratios with a zero denominator are reported as 0.0.
    """
    cluster_ids = range(len(centroids))

    def _counts_for(delta):
        assigned_pos = Grant_to_cluster(pos_vector, delta, max_distance_each_cluster, centroids, removed_cluster)
        assigned_neg = Grant_to_cluster(neg_vector, delta, max_distance_each_cluster, centroids, removed_cluster)
        return countTPFP(assigned_pos, assigned_neg, centroids)

    # Reference positive counts with the unscaled radius.
    baseline_true, _ = _counts_for(1)
    initial_TP = [baseline_true[c] for c in cluster_ids]

    optimalP, optimalR, optimalTP, optimalFP = [], [], [], []

    for cluster, delta in enumerate(optimaldeltas):
        pos_count, neg_count = _counts_for(delta)
        tp_per_cluster = [pos_count[c] for c in cluster_ids]
        fp_per_cluster = [neg_count[c] for c in cluster_ids]

        tp = tp_per_cluster[cluster]
        fp = fp_per_cluster[cluster]
        fn = initial_TP[cluster] - tp

        optimalTP.append(tp)
        optimalFP.append(fp)
        optimalP.append(float(0.0) if (tp + fp) == 0 else tp / (tp + fp))
        optimalR.append(float(0.0) if (tp + fn) == 0 else tp / (tp + fn))

    return optimalP, optimalR, optimalTP, optimalFP

# Pre-processing

In [None]:
############## run Kmeans algorithm ##############
vec_list, centroids, c_label = Kmeans_alg(train_pos_embedding, opt_k)

# cluster number -> list of the member vectors (as plain Python lists)
dicts_each_cluster = {
    cluster_id: vec_list[c_label == cluster_id].tolist()
    for cluster_id in range(opt_k)
}

# cluster number -> largest Euclidean distance between a member and the centroid,
# i.e. the initial radius of the cluster
max_distance_each_cluster = {}
each_cluster_distance = []

for cluster_id, members in dicts_each_cluster.items():
    centroid_row = centroids[cluster_id].reshape(1,-1)
    member_distances = euclidean_distances(members, centroid_row)
    each_cluster_distance.append(member_distances)
    max_distance_each_cluster[cluster_id] = member_distances.max().astype(np.float32)

# Pair every training entity with its embedding vector
train_pos_vector = [(entity, glove_dict[entity]) for entity in train_pos_embedding]
train_neg_vector = [(entity, glove_dict[entity]) for entity in train_neg_embedding]

## Restriction with conditions

In [None]:
# A cluster is removed when it holds fewer positives than 1% of the training
# positives, or when its precision is below precision_threshold.
remove_threshold = len(train_pos_embedding)*0.01
precision_threshold = 0.4

In [None]:
def _precision_rows(pos_count, neg_count, n_clusters):
    """Return one [TP, FP, precision] row per cluster; precision is 0.0 for an empty cluster."""
    rows = []
    for cluster in range(n_clusters):
        tp, fp = pos_count[cluster], neg_count[cluster]
        rows.append([tp, fp, 0.0 if (tp + fp) == 0 else tp / (tp + fp)])
    return rows


# 1st check: statistics before delta optimizing, with every cluster active
true_cluster = Grant_to_cluster(train_pos_vector, 1, max_distance_each_cluster, centroids)
false_cluster = Grant_to_cluster(train_neg_vector, 1, max_distance_each_cluster, centroids)
true_count, false_count = countTPFP(true_cluster, false_cluster, centroids)

result_list = _precision_rows(true_count, false_count, len(centroids))
first_result_df = pd.DataFrame(result_list, columns=['1st_pos', '1st_neg', '1st_precision'])

# Restriction on clusters: too few positives, or precision too low
remove_condition_list = [
    first_result_df['1st_pos'] < remove_threshold,
    first_result_df['1st_precision'] < precision_threshold,
]

remove_cluster = set()
for condition in remove_condition_list:
    remove_cluster |= set(condition[condition].index.values)

remove_cluster = sorted(remove_cluster)
print(remove_cluster)

# 2nd check: same statistics once the flagged clusters are excluded
true_cluster = Grant_to_cluster(train_pos_vector, 1, max_distance_each_cluster, centroids, remove_cluster)
false_cluster = Grant_to_cluster(train_neg_vector, 1, max_distance_each_cluster, centroids, remove_cluster)
true_count, false_count = countTPFP(true_cluster, false_cluster, centroids)

result_list = _precision_rows(true_count, false_count, len(centroids))
second_result_df = pd.DataFrame(result_list, columns=['2nd_pos', '2nd_neg', '2nd_precision'])

pre_result_df = pd.concat([first_result_df, second_result_df], axis = 1)
pre_result_df

# New Train Procedure

## Optimize delta for each cluster

In [None]:
P, R, F1, TP, FN, FP, TN = getPRlist(
    train_pos_vector, train_neg_vector, max_distance_each_cluster, centroids, remove_cluster
)

# For every cluster, the position in the delta grid with the highest F1.
best_rows = np.array(F1).argmax(0)
TP_list = TP[best_rows, list(range(TP.shape[1]))]

# Turn grid positions back into delta values (grid starts at 0.6, step 0.01,
# matching the grid used inside getPRlist).
optimal_deltas = (best_rows * 0.01)+0.6
# A cluster that captures no positive at its best delta gets delta 0.
optimal_deltas = np.array([d if tp != 0 else 0 for d, tp in zip(optimal_deltas, TP_list)])

print('optimaldeltas each clusters : \n', optimal_deltas)
print()

In [None]:
# Per-cluster precision, recall, TP and FP on the training data, each cluster
# evaluated at its own entry of optimal_deltas.
optimalP, optimalR, optimalTP, optimalFP = getOptimalPRlist(train_pos_vector, train_neg_vector, 
                                                            max_distance_each_cluster,
                                                            centroids, remove_cluster,
                                                            optimal_deltas)

In [None]:
# One row per cluster: [precision, recall, TP, FP, delta]
l1 = [list(row) for row in zip(optimalP, optimalR, optimalTP, optimalFP, optimal_deltas)]

opt_delta_df = pd.DataFrame(l1, columns=['precision', 'recall', 'TP','FP', 'delta'])
opt_delta_df

In [None]:
# Append the per-cluster optimal-delta figures to the 1st/2nd check table,
# side by side (column-wise concat).
tmp_parse_df = opt_delta_df[['delta','TP','FP', 'precision']]
train_result_df = pd.concat([pre_result_df, tmp_parse_df], axis = 1)
train_result_df

## Prediction Result

In [None]:
# Get test vectors from embedding model 
test_pos_vector = list(map(lambda x: (x,glove_dict[x]), test_pos_embedding))
test_neg_vector = list(map(lambda x: (x,glove_dict[x]), test_neg_embedding))

In [None]:
# Same per-cluster evaluation on the test data, reusing the radii, centroids,
# removed clusters and deltas obtained from training.
# NOTE: this rebinds optimalP/optimalR/optimalTP/optimalFP, replacing the
# training values computed earlier.
optimalP, optimalR, optimalTP, optimalFP = getOptimalPRlist(test_pos_vector, test_neg_vector, 
                                                            max_distance_each_cluster,
                                                            centroids, remove_cluster, optimal_deltas)

In [None]:
# One row per cluster: [precision, recall, TP, FP, delta]
l1 = [list(row) for row in zip(optimalP, optimalR, optimalTP, optimalFP, optimal_deltas)]

output_df = pd.DataFrame(l1, columns=['precision', 'recall', 'TP','FP', 'delta'])
output_df

## Test output Dataframe

In [None]:
# Test-set counts and precision, renamed so they do not clash with the
# training columns of the same name.
tmp_output_df = output_df[['TP','FP', 'precision']].rename(
    columns={'TP': 'Test_TP', 'FP': 'Test_FP', 'precision': 'Test_precision'}
)

train_test_df = pd.concat([train_result_df, tmp_output_df], axis = 1)
train_test_df

In [None]:
# Overall test metrics: sum the per-cluster counts, then derive precision,
# recall and F1. Recall is measured against the number of test positives.
new_TP = sum(train_test_df['Test_TP'].values)
new_FP = sum(train_test_df['Test_FP'].values)

# A zero denominator scores 0.0, the convention used by the per-cluster metrics,
# instead of producing nan/inf when nothing is predicted positive.
predicted_positive = new_TP + new_FP
actual_positive = len(test_pos_embedding)

new_precision = new_TP / predicted_positive if predicted_positive != 0 else 0.0
new_recall = new_TP / actual_positive if actual_positive != 0 else 0.0

if (new_precision + new_recall) == 0:
    new_f1 = 0.0
else:
    new_f1 = (2*new_precision*new_recall) / (new_precision+new_recall)

print('TP : ', new_TP)
print('FP : ', new_FP)
print()
print('precision : ', new_precision)
print('recall : ', new_recall)
print('F1 : ', new_f1)

## With one delta - previous method

In [None]:
def Get_Precision_Recall(True_dic,False_dic):
    """Compute overall precision and recall from two assignment dicts.

    An entity whose value is 'x' counts as rejected; any other value counts as
    accepted.

    Parameters:
        True_dic : dict entity -> assignment, for positive entities
        False_dic : dict entity -> assignment, for negative entities

    Returns:
        precision, recall, TP, FP, TN, FN
        (precision and recall are 0.0 when their denominator is zero)
    """
    positive_labels = list(True_dic.values())
    negative_labels = list(False_dic.values())

    FN = positive_labels.count('x')
    TP = len(positive_labels) - FN
    TN = negative_labels.count('x')
    FP = len(negative_labels) - TN

    precision = 0.0 if (TP + FP) == 0 else float(TP) / (TP + FP)
    recall = 0.0 if (TP + FN) == 0 else float(TP) / (TP + FN)

    return precision, recall, TP, FP, TN, FN

In [None]:
# Sweep one shared delta over the grid and record the overall metrics for each value.
deltas = np.arange(0.6,1.0,0.01)

result_list = []

for delta in deltas:
    true_cluster = Grant_to_cluster(train_pos_vector, delta, max_distance_each_cluster, centroids, remove_cluster)
    false_cluster = Grant_to_cluster(train_neg_vector, delta, max_distance_each_cluster, centroids, remove_cluster)

    precision, recall, TP, FP, TN, FN = Get_Precision_Recall(true_cluster,false_cluster)

    score_sum = precision + recall
    f1_measures = 0 if score_sum == 0 else float(2*precision*recall) / score_sum

    print("deltas : "+ str(round(delta,2)))

    result_list.append([delta, TP, FN, TN, FP, precision, recall, f1_measures])

In [None]:
# Tabulate the sweep (one row per delta) and show the five rows with the highest F1.
one_delta_df = pd.DataFrame(result_list, columns=['delta', 'TP', 'FN', 'TN', 'FP', 'precision', 'recall', 'F1'])
one_delta_df.sort_values(by=['F1'], ascending=False).head(5)

In [None]:
# Delta of the row with the highest training F1 (first row after the descending sort).
one_delta = one_delta_df.sort_values(by=['F1'], ascending=False).iloc[0]['delta']

In [None]:
# Evaluate the single best delta on the test data.
true_cluster = Grant_to_cluster(test_pos_vector, one_delta, max_distance_each_cluster, centroids, remove_cluster)
false_cluster = Grant_to_cluster(test_neg_vector, one_delta, max_distance_each_cluster, centroids, remove_cluster)

precision, recall, TP, FP, TN, FN = Get_Precision_Recall(true_cluster,false_cluster)

score_sum = precision + recall
f1_measures = 0 if score_sum == 0 else float(2*precision*recall) / score_sum

# Single-row table with the same column layout as the training sweep.
result_list = [[one_delta, TP, FN, TN, FP, precision, recall, f1_measures]]
one_result_df = pd.DataFrame(result_list, columns=['delta', 'TP', 'FN', 'TN', 'FP',  'precision', 'recall', 'F1'])
one_result_df