In [24]:
import numpy as np
import pandas as pd
from ast import literal_eval

# load data
datafile_path = "./data/embeddings_10k.csv"
datafile_path_v2 = "./data/embeddings_10k_v2.csv"

df_v1 = pd.read_csv(datafile_path)
df = pd.read_csv(datafile_path_v2)
print(df.head(10))
df['category_encoded'] = df_v1['category_encoded']
df["embedding_cat_headline"] = df.embedding_cat_headline.apply(literal_eval).apply(np.array)  # convert string to numpy array
cat_matrix = np.vstack(df.category_encoded.values)
matrix = np.vstack(df.embedding_cat_headline.values)

cat_matrix.shape



   Unnamed: 0                             embedding_cat_headline  \
0       50346  [-0.018390951678156853, 0.00830016192048788, 0...   
1       50356  [-0.011984625831246376, -0.007588221691548824,...   
2       57628  [-0.020466890186071396, 0.017243118956685066, ...   
3       63176  [-0.006752511486411095, -0.008794428780674934,...   
4       64463  [-0.023287193849682808, 0.0003693056642077863,...   
5       64476  [0.0009472903329879045, -0.008034930564463139,...   
6       66138  [-0.016023065894842148, -0.011005213484168053,...   
7       66139  [-0.012922842055559158, 0.010963845998048782, ...   
8       66144  [-0.007001790683716536, -0.02618696354329586, ...   
9       66879  [-0.018545327708125114, -0.014058117754757404,...   

                               embedding_description  
0  [0.0075088865123689175, 0.0004971068701706827,...  
1  [-0.012862027622759342, -0.008416718803346157,...  
2  [-0.01038492750376463, -0.011509961448609829, ...  
3  [-0.02313167229294777, -0.01

(9750, 1)

In [25]:
df["category_encoded"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41], dtype=int64)

In [26]:
n_clusters = 42
init_centroids = []
for i in range(n_clusters):
    init_centroids.append(df[df['category_encoded'] == i].sample(150).embedding_cat_headline.mean())
init_centroids = np.array(init_centroids)
init_centroids.shape

# for i in range(n_clusters):
#     print(df[df['category_encoded'] == i].count())

(42, 1536)

In [27]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=n_clusters, init=init_centroids, random_state=7)
kmeans.fit(matrix)
labels = kmeans.labels_
df["Cluster"] = labels

# print(df.groupby("Cluster").apply(lambda x: x)['category_encoded'])
df.groupby("Cluster").apply(lambda x: x)['category_encoded'].to_csv('./data/cluster_category.csv')

# cluster_cat_df = pd.read_csv('./data/cluster_category.csv')
# cluster_cat_count = []
# for i in range(n_clusters):
#     cluster_cat_count.append(cluster_cat_df.loc[cluster_cat_df['Cluster'] == i].category_encoded.value_counts())
    # print("cluster num:", i)
    # print("total count: ", cluster_cat_df.loc[cluster_cat_df['Cluster'] == i].count())
    # print(cluster_cat_df.loc[cluster_cat_df['Cluster'] == i].category_encoded.value_counts())
# print(cluster_cat_count)
# for cluster, cat in df.groupby("Cluster").apply(lambda x: x)['category_encoded']:
    

# print(df.groupby("Cluster").apply(lambda x: x.category_encoded.value_counts().idxmax()))
cluster_category_mapping = df.groupby("Cluster").apply(lambda x: x.category_encoded.value_counts().idxmax()).to_dict()
print(cluster_category_mapping)
cluster_centers = df.groupby("Cluster").embedding_cat_headline.mean()
# cluster_centers


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41}


In [28]:
valuess = []
for key, value in cluster_category_mapping.items():
    valuess.append(value)

print(set( valuess))
print(len(set( valuess)))


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41}
42


In [29]:
# choose several near classes if the distances are smaller than the threshold
def choose_near_classes(embed, num_near_classes, threshold, class_centers, category_encoded):
    chosen_classes = []
    for i, center in enumerate(class_centers):
        dis = np.linalg.norm(embed - center)
        if dis < threshold and i != category_encoded:
            chosen_classes.append([i, dis])
    chosen_classes = sorted(chosen_classes, key=lambda x: x[1])
    home_dis = np.linalg.norm(embed - class_centers[category_encoded][0])
    return (chosen_classes[:num_near_classes], home_dis)

# output the class feature value: 1.0 -> dis btw doc and home class center, value -> dis btw doc and other class center/ dis btw doc and home class center
def get_class_feature_value(near_classes, home_dis, num_classes, category_encoded):
    doc_topic_feature = np.zeros(num_classes)
    doc_topic_feature[category_encoded] = 1.0
    for i in near_classes:
        doc_topic_feature[i[0]] = (home_dis / (i[1]))**2
        if doc_topic_feature[i[0]] >= 1.0:
            doc_topic_feature[i[0]] = 1.0
    return doc_topic_feature

In [30]:
near_classes = choose_near_classes(matrix[10], 10, 0.6, cluster_centers.values, cat_matrix[10])
print(near_classes[0])
print(near_classes[1])
doc_topic_feature = get_class_feature_value(near_classes[0], near_classes[1], 42, cat_matrix[10])
print(doc_topic_feature)

all_doc_topic_feature = []
# print(len(all_doc_topic_feature[0]))
for i, x in enumerate(matrix):
    near_classes = choose_near_classes(x, 4, 0.6, cluster_centers.values, cat_matrix[i])
    doc_topic_feature = get_class_feature_value(near_classes[0], near_classes[1], n_clusters, cat_matrix[i])
    all_doc_topic_feature.append(doc_topic_feature)
# save to csv file && add header
df = pd.DataFrame(all_doc_topic_feature)
df.to_csv('./data/doc_vector_feature.csv', index=False)

# # test outcome
df = pd.read_csv('./data/doc_vector_feature.csv', index_col=False)
features = np.array(df.values)
print(features.shape)


[[7, 0.42279854584742965], [1, 0.43112961124744764], [36, 0.4645338126469711], [8, 0.504311500649085], [29, 0.5087826881343639], [26, 0.5152812705647009], [39, 0.5176882321168511], [17, 0.5260085193610494], [10, 0.5277658414546437], [23, 0.5279413029583664]]
0.38726014228927896
[1.         0.8068446  0.         0.         0.         0.
 0.         0.83895493 0.5896684  0.         0.53842231 0.
 0.         0.         0.         0.         0.         0.54202591
 0.         0.         0.         0.         0.         0.53806448
 0.         0.         0.56482887 0.         0.         0.57934992
 0.         0.         0.         0.         0.         0.
 0.69497781 0.         0.         0.5595888  0.         0.        ]
(9750, 42)
