In [None]:
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics.pairwise import cosine_distances
import seaborn as sns
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

This notebook is for Windows.

In [None]:
# Locations of the embedding matrix and the vocabulary table (both tab-separated).
embedding_path = "../token_embeddings/tensors.tsv"
vocab_path = "../global_set/result.tsv"

# The embedding file is read with header=None, so its first line is treated as data.
emb_df = pd.read_csv(embedding_path, sep="\t", header=None)

# The vocabulary table is indexed by its "ID" column.
voc_df = pd.read_csv(vocab_path, sep="\t")
voc_df = voc_df.set_index("ID")

# Mapping from ID to the value of the "TOKEN" column.
vocab = voc_df["TOKEN"].to_dict()

In [None]:
# Centre the embeddings by subtracting the per-column mean. The row count is
# read from the data rather than hard-coded (it used to be the literal 2040),
# so the centring stays correct if the number of embedding rows changes.
emb = emb_df.values - emb_df.sum(0).values / len(emb_df)

# Pairwise cosine distances between the centred embeddings.
emd_dist = cosine_distances(emb)

# Sparse 5-nearest-neighbour connectivity matrix under the cosine metric.
# NOTE(review): this is built from the raw `emb_df`, not the centred `emb`
# computed above -- confirm that using the uncentred vectors is intended.
A = kneighbors_graph(emb_df, n_neighbors=5, mode="connectivity", metric="cosine")

# Directed graph from the dense adjacency matrix.
# `nx.from_numpy_matrix` was removed in networkx 3.0; `nx.from_numpy_array`
# is its replacement and is also available in networkx 2.x.
G = nx.from_numpy_array(A.toarray(), create_using=nx.DiGraph)

# Attach the token string to each node as the "token" attribute.
# Assumes the vocabulary IDs coincide with embedding row positions -- TODO confirm.
nx.set_node_attributes(G, vocab, "token")

In [None]:
# In-degree of every node in the kNN graph, as (node_id, degree) pairs.
in_degree = G.in_degree()

# Nodes ordered from highest to lowest in-degree.
in_degree_sorted = sorted(in_degree, key=lambda pair: pair[1], reverse=True)

# Token -> in-degree lookup, used by the per-category summaries.
in_degree_dict = {vocab[node_id]: node_degree for node_id, node_degree in in_degree_sorted}

# In-degree values only, in the same descending order.
in_degree_vals = np.array([node_degree for _, node_degree in in_degree_sorted])

k = 20
print("Top %s nodes (based on in-degree)" %k)
# Slicing (instead of indexing up to k) keeps this safe when the graph has
# fewer than k nodes.
for i, (node_id, node_degree) in enumerate(in_degree_sorted[:k]):
    # Look the token up by node id, not by rank `i`: indexing with the rank
    # printed the token of node `i` next to another node's degree.
    print(i, "%s: %s" %(vocab[node_id], node_degree))

In [None]:
# Histogram of the in-degree values, with the median reported in the title.
# The median is computed once and reused; the stray bare string expression
# that used to sit in the middle of this cell had no effect and was removed.
median_in_degree = np.median(in_degree_vals)

fig, ax = plt.subplots(figsize=(17, 5))
ax.hist(in_degree_vals, bins=100)
ax.set_title("In-degree distribution (Median: %s)" % median_in_degree)
ax.set_xlabel("In-Degree")
ax.set_ylabel("Frequency")
sns.despine(fig=fig)  # drop the top and right spines
fig.tight_layout()
plt.show()

In [None]:
# Median in-degree of the tokens in each vocabulary category,
# leaving out the "OTHER" and "NON" categories.
voc_cats = voc_df["CATEGORY"].unique()
in_deg_cats = dict()
print("Median for in_degrees")
for cat in voc_cats:
    if cat not in ["OTHER", "NON"]:
        # Tokens whose CATEGORY equals the current one.
        in_category = voc_df["CATEGORY"] == cat
        cat_tokens = voc_df.loc[in_category, "TOKEN"].values
        # In-degree of each of those tokens, looked up by token string.
        cat_degrees = []
        for token in cat_tokens:
            cat_degrees.append(in_degree_dict[token])
        print("\t%s (n=%s): %s" %(cat,  len(cat_tokens), np.median(cat_degrees)))


In [None]:
# Count and percentage of tokens with in-degree 0 in each vocabulary category,
# leaving out the "OTHER" and "NON" categories.
voc_cats = voc_df["CATEGORY"].unique()
in_deg_cats = dict()
print("Number of nodes with 0 in-degrees")
for cat in voc_cats:
    if cat not in ["OTHER", "NON"]:
        # Tokens whose CATEGORY equals the current one.
        in_category = voc_df["CATEGORY"] == cat
        cat_tokens = voc_df.loc[in_category, "TOKEN"].values
        # One boolean per token: True when that token's in-degree is 0.
        cat_degrees = []
        for token in cat_tokens:
            cat_degrees.append(in_degree_dict[token] == 0)
        n_zero = np.sum(cat_degrees)
        n_tokens = len(cat_tokens)
        print("\t%s (n=%s): %s (%2.f %% )" %(cat,  n_tokens, n_zero,  100 * n_zero/n_tokens))