In [None]:
import ast
import time
from copy import copy

import dgl
import pandas as pd
import torch as th
from fastnode2vec import Node2Vec
from sklearn.preprocessing import normalize
from tqdm import tqdm

# Data loading
## original data
### N2V model

In [None]:
# Load the Node2Vec model stored under the name 'cpc_embedding'.
model = Node2Vec.load('cpc_embedding')

### vocab

In [None]:
# Keys of the model's vector store, in index order (presumably CPC codes -- confirm).
# A key's position in this list is reused below as its integer node id.
vocab = model.wv.index_to_key

### CPC_Tree

In [None]:
# Read the CPC tree spreadsheet and convert it to a nested dict: {row index: {column: value}}.
# NOTE(review): later cells look nodes up by CPC code (cpc_tree[v] for v in vocab), so the
# frame's index must hold the CPC codes; with read_excel's default integer index those
# lookups would raise KeyError -- confirm how the index is meant to be set.
cpc_df = pd.read_excel('../path/to/cpc_tree.xlsx') ## Including: CPC code, Level features, Frequency of occurrence, Child CPC, Parent CPC
cpc_tree=cpc_df.to_dict(orient='index')

In [None]:
# Parse each node's serialized child list and create zeroed embedding slots.
for c in cpc_tree.keys():
    # The 'children' column arrives as the text of a Python list (list_str -> list).
    # ast.literal_eval only accepts literals, so spreadsheet content can never execute
    # code the way eval() would. Already-parsed lists are left alone so that this cell
    # can be re-run; any other value (e.g. an empty spreadsheet cell) still fails loudly.
    if not isinstance(cpc_tree[c]['children'], list):
        cpc_tree[c]['children'] = ast.literal_eval(cpc_tree[c]['children'])
    # Three separate 512-element zero vectors per node: the node's own embedding,
    # the bottom-up aggregate and the top-down result.
    cpc_tree[c]['embedding'] = th.zeros(512)
    cpc_tree[c]['embedding_up'] = th.zeros(512)
    cpc_tree[c]['embedding_down'] = th.zeros(512)

### CPC_Co_Occurrence_Net

In [None]:
# NOTE(review): this binds only a path string, yet the cells below iterate cpc_coo as
# (start, end, weight) records -- the step that actually loads the network appears to be
# missing from this notebook; confirm.
cpc_coo = "../data/of/cpc_co_occurrence_net"

## Data processing
### Add original embedding to CPC_Tree

In [None]:
# Copy each vocabulary entry's vector from the model into its tree node.
# NOTE(review): this stores whatever model.wv[v] returns as-is, while the placeholder it
# replaces is a torch tensor -- confirm the mixed types are intended.
for v in tqdm(vocab):
    cpc_tree[v]['embedding'] = model.wv[v]

### cpc2id / id2cpc encoding

In [None]:
# Two-way lookup between a vocabulary key and its position in `vocab`.
cpc2id = {cpc:id_ for id_,cpc in enumerate(vocab)}
id2cpc = {id_:cpc for cpc,id_ in cpc2id.items()}

## Make Graph
### Graph Data

In [None]:
# Accumulators for the edge list (filled by the next cell): source ids, destination ids, weights.
start_list = []
end_list = []
weigh_list = []
# One model vector per node id, in id order, passed through sklearn's normalize (default settings).
embedding_list = normalize([list(model.wv[id2cpc[i]]) for i in range(len(vocab))])

In [None]:
# Translate every (start, end, weight) record into integer node ids plus its weight.
for s,e,w in tqdm(cpc_coo):
    start_list.append(cpc2id[s])
    end_list.append(cpc2id[e])
    weigh_list.append(w)

In [None]:
# The 'num' field of every node, cast to int and ordered by node id.
num_list = [int(cpc_tree[id2cpc[i]]['num']) for i in range(len(vocab))]

In [None]:
# Start node and weight of every record, taken by position.
# The records are tuples (the edge-list cell unpacks them as `s, e, w` and this cell reads
# c[2]), so the start node is read by index as well; eval() on a tuple raises TypeError.
cpc_start = [c[0] for c in cpc_coo]
cpc_num = [c[2] for c in cpc_coo]
# Per-start-node running total, initialised to zero and filled in by the next cell.
cpc_coo_num = {c:0 for c in cpc_start}

In [None]:
# Sum the edge weights per start node.
for c in range(len(cpc_start)):
    cpc_coo_num[cpc_start[c]] += cpc_num[c]

In [None]:
# Summed edge weight of every node, ordered by node id.
# NOTE(review): raises KeyError for any vocabulary entry that never appears as an edge start.
wight_degree_list = [cpc_coo_num[id2cpc[i]] for i in range(len(vocab))]

### Generate Graph

In [None]:
# Edge endpoints as tensors: u holds the start ids, v the end ids.
u,v = th.tensor(start_list), th.tensor(end_list)

In [None]:
# Build the DGL graph from the (start, end) id tensors.
g = dgl.graph((u,v))

### Add Weight

In [None]:
# Attach features to the graph:
#   edge 'w'   - edge weight
#   node 'e'   - normalized embedding
#   node 'n'   - the tree's 'num' value
#   node 'd'   - in-degree
#   node 'w_d' - summed edge weight
g.edata['w'] = th.tensor(weigh_list)
g.ndata['e'] = th.tensor(embedding_list)
g.ndata['n'] = th.tensor(num_list)
g.ndata['d'] = g.in_degrees()
g.ndata['w_d'] = th.tensor(wight_degree_list)

# Operation Function
## Reset Embedding

In [None]:
def reset_embedding():
    """Overwrite the three embedding slots of every tree node with zero vectors.

    Each of 'embedding', 'embedding_up' and 'embedding_down' receives its own
    freshly allocated 512-element zero tensor, so nothing is shared between
    slots or between nodes.
    """
    for node in cpc_tree.values():
        for slot in ('embedding', 'embedding_up', 'embedding_down'):
            node[slot] = th.zeros(512)

## Update Graph Node

In [None]:
def update_graph_node():
    """Load the tree's top-down embeddings into the graph's node feature 'e'.

    Collects 'embedding_down' of every vocabulary node in node-id order, runs
    the rows through sklearn's normalize and stores the result as a tensor.
    """
    rows = [
        list(cpc_tree[id2cpc[node_id]]['embedding_down'])
        for node_id in range(len(vocab))
    ]
    normalized_rows = normalize(rows)
    g.ndata['e'] = th.tensor(normalized_rows)

## UP

In [None]:
def parent_up(vocab_embedding):
    """Aggregate embeddings bottom-up through the CPC tree.

    First, row ``i`` of ``vocab_embedding`` is normalized and stored as the
    'embedding' of the node with id ``i``. Then levels are visited from 16
    down to 2 and every node at the current level gets its 'embedding_up':

    * a node without children takes its own 'embedding';
    * otherwise (when its 'num' is non-zero) each child's 'embedding_up' is
      added with weight ``child num / node num`` and the node's own
      'embedding' is added with the leftover share of 'num', if any.

    The result is normalized before being stored. The accumulation is done
    in place on whatever 'embedding_up' currently holds, so the slots are
    expected to be zeroed beforehand.
    """
    def _unit(vector):
        # Normalize a single vector through sklearn and wrap it back into a tensor.
        return th.tensor(normalize([vector.tolist()])[0])

    # Seed every vocabulary node with its normalized input row.
    for node_id, row in enumerate(vocab_embedding):
        cpc_tree[id2cpc[node_id]]['embedding'] = _unit(row.clone())

    # Deepest level first, so children are finished before their parent reads them.
    for level in range(16, 1, -1):
        for code in cpc_tree.keys():
            node = cpc_tree[code]
            if node['level'] != level:
                continue
            kids = node['children']
            if len(kids) == 0:
                node['embedding_up'] = node['embedding']
            else:
                total = node['num']
                leftover = node['num']
                if total != 0:
                    for kid in kids:
                        leftover -= cpc_tree[kid]['num']
                        node['embedding_up'] += cpc_tree[kid]['num']/total*cpc_tree[kid]['embedding_up']
                    # Whatever count is not covered by the children belongs to the node itself.
                    if leftover != 0:
                        node['embedding_up'] += leftover/total*node['embedding']
            node['embedding_up'] = _unit(node['embedding_up'])

## Down

In [None]:
def inherit_down(inh_k):
    """Propagate embeddings top-down through the CPC tree.

    Levels 2 through 16 are visited in order. On every pass each level-2 node
    has its 'embedding_down' set to its 'embedding_up'. Every node at the
    current level whose 'num' is positive then updates all of its children:

        child_down = child_up
                     + (level/16 * inh_k) * parent_down
                     - (inh_k * (1 - level/16) / n_peers) * sum(peer_up)

    where the peers are the node's other children. All children of one
    parent are computed first and written afterwards; each written vector is
    normalized.

    Parameters
    ----------
    inh_k : float
        Strength factor shared by the inherited and the peer term.
    """
    for level in range(2, 17):
        for code, node in cpc_tree.items():
            if node['level'] == 2:
                node['embedding_down'] = node['embedding_up']
            if not (node['level'] == level and node['num'] > 0):
                continue

            kids = node['children']
            inherit_weight = node['level']/16* inh_k
            inherited = inherit_weight * node['embedding_down']
            peer_weight = inh_k*(1-node['level']/16)

            # Stage the results so that every child is built from the same inputs.
            staged = {}
            for kid in kids:
                others = copy(kids)
                others.remove(kid)
                pushed_away = th.zeros(512)
                for other in others:
                    pushed_away -= peer_weight/len(others) * cpc_tree[other]['embedding_up']
                staged[kid] = inherited+pushed_away+cpc_tree[kid]['embedding_up']

            for kid in kids:
                cpc_tree[kid]['embedding_down'] = th.tensor(normalize([staged[kid].tolist()])[0])

## Graph Update

In [None]:
def message_func(edges):
    """Build the per-edge message for the graph update.

    The message carries the source node's embedding ('emb'), the edge weight
    ('wei') and, from the destination node, its 'w_d' ('num') and 'd' ('deg')
    features.
    """
    message = {}
    message['emb'] = edges.src['e']
    message['wei'] = edges.data['w']
    message['num'] = edges.dst['w_d']
    message['deg'] = edges.dst['d']
    return message

In [None]:
def reduce_func(nodes):
    """Combine the incoming messages of each node into one update vector.

    Every message contributes its source embedding scaled by
    ``(w*d - n) / (n*d)``, where ``w`` is the edge weight and ``n`` / ``d``
    are the 'num' / 'deg' values carried in the message.

    The mailbox is indexed as (node, message, ...), so the sum is taken over
    axis 1 in one vectorised step. The embedding width is read from the
    mailbox rather than fixed at 512, so any dimensionality works.

    Returns
    -------
    dict
        ``{'update': tensor}`` with one row per node, in torch's default
        floating dtype (the dtype a ``th.zeros`` accumulator would have).
    """
    emb = nodes.mailbox['emb']
    wei = nodes.mailbox['wei']
    num = nodes.mailbox['num']
    deg = nodes.mailbox['deg']

    # One scalar coefficient per (node, message).
    coef = (wei * deg - num) / (num * deg)
    # Broadcast the coefficient across the embedding axis, then sum over the messages.
    update = (coef.unsqueeze(-1) * emb).sum(dim=1)
    return {'update': update.to(th.get_default_dtype())}

In [None]:
def update_coo_net(neig_k):
    """Run one message-passing round and move each node embedding by the result.

    After the graph-wide update, node feature 'e' becomes
    ``e + neig_k * update``.

    Parameters
    ----------
    neig_k : float
        Step size applied to the aggregated neighbor update.
    """
    g.update_all(message_func, reduce_func)
    current = g.ndata['e']
    delta = g.ndata['update']
    g.ndata['e'] = current + neig_k*delta

## Define Loss Function
### Cosine Similarity

In [None]:
def cos_sim(x,y):
    """Cosine similarity of two tensors along dimension 0 (eps 1e-8)."""
    similarity = th.nn.functional.cosine_similarity(x, y, dim=0, eps=1e-8)
    return similarity

### Tree Loss

In [None]:
def tree_loss(cpc_tree_dict):
    """Compute the parent and peer losses of a CPC tree.

    Every node at levels 2..16 that has children contributes:

    * parent term - mean over its children of
      ``(1 - cos_sim(node_up, child_up)) * level / 16``;
    * peer term - mean over all ordered pairs of distinct children of
      ``cos_sim(child_up, peer_up) * (1 - level / 16)`` (absent when the node
      has a single child).

    Only the tree passed in is read: the level comes from ``cpc_tree_dict``
    and not from the module-level tree, so any tree dict can be scored.

    Parameters
    ----------
    cpc_tree_dict : dict
        Mapping node key -> dict with 'level', 'children' and 'embedding_up'.

    Returns
    -------
    tuple
        ``(parent_loss, peer_loss)``, each averaged over the contributing
        nodes; ``(0.0, 0.0)`` when no node has children.
    """
    loss_par = 0
    loss_pee = 0
    num = 0
    for level in range(2,17):
        for c, node in cpc_tree_dict.items():
            if node['level'] != level:
                continue
            children = node['children']
            if len(children) == 0:
                continue

            loss_parent = 0
            loss_peer = []
            for child in children:
                child_emb = cpc_tree_dict[child]['embedding_up']
                loss_parent += (1-cos_sim(node['embedding_up'], child_emb))*node['level']/16
                # Peers are the node's other children.
                out_child = copy(children)
                out_child.remove(child)
                for peer in out_child:
                    loss_peer.append(cos_sim(child_emb, cpc_tree_dict[peer]['embedding_up'])*(1-node['level']/16))

            loss_par += loss_parent/len(children)
            if len(loss_peer) != 0:
                loss_pee += sum(loss_peer)/len(loss_peer)
            num += 1

    # Without any parent node there is nothing to average; avoid dividing by zero.
    if num == 0:
        return 0.0, 0.0
    return loss_par/num, loss_pee/num

### Graph Loss

In [None]:
def graph_loss():
    """Compute the co-occurrence graph loss from the current node embeddings.

    For every (start, end, weight) record the cosine dissimilarity of the two
    node embeddings is scaled by ``(weight - d/n) / n``, where ``d`` and ``n``
    are the end node's 'd' and 'w_d' features. The sum over all records is
    divided by the number of nodes.

    NOTE(review): the ratio used here is d/n, whereas reduce_func's
    coefficient ``(w*d - n)/(n*d)`` equals ``(w - n/d)/n`` -- the two ratios
    are reciprocal; confirm which one is intended.
    """
    degree = g.ndata['d']
    node_emb = g.ndata['e']
    weighted_degree = g.ndata['w_d']
    total = 0
    for src_code, dst_code, weight in tqdm(cpc_coo):
        src = cpc2id[src_code]
        dst = cpc2id[dst_code]
        ratio = degree[dst]/weighted_degree[dst]
        dissimilarity = 1-cos_sim(node_emb[src], node_emb[dst])
        total += dissimilarity * (weight-ratio)/weighted_degree[dst]
    return total/len(weighted_degree)

# Training
## Function

In [None]:
def train(inh_k = 0.05, neig_k = 0.05):
    """Run one training iteration: tree top-down, graph update, tree bottom-up.

    Parameters
    ----------
    inh_k : float
        Factor handed to inherit_down.
    neig_k : float
        Factor handed to update_coo_net.
    """
    # Top-down pass, then hand the result to the graph.
    inherit_down(inh_k=inh_k)
    update_graph_node()

    # One round of neighbor updates on the co-occurrence graph.
    update_coo_net(neig_k=neig_k)
    refreshed = g.ndata['e']

    # Clear the tree and rebuild it bottom-up from the refreshed node embeddings.
    reset_embedding()
    parent_up(refreshed)

## Loss Function

In [None]:
def com_loss():
    """Evaluate all loss terms for the current state.

    Returns
    -------
    tuple of float
        ``(parent_loss, peer_loss, graph_loss, total)`` where ``total`` is
        the sum of the three terms.
    """
    parent_term, peer_term = tree_loss(cpc_tree)
    graph_term = graph_loss()
    total = parent_term+peer_term+graph_term
    return float(parent_term), float(peer_term), float(graph_term), float(total)

## Train

In [None]:
# Training results keyed by (inh_k, neig_k); filled in by train_embedding.
loss_dict = {}

In [None]:
def train_embedding(inh_k = 0.05, neig_k = 0.05, k = 10):
    """Train from the original embeddings and record the best state reached.

    The tree and graph are reset to the original normalized embeddings, the
    baseline loss is measured, and up to ``k`` training iterations follow.
    Training stops at the first iteration that does not lower the loss.

    The record in ``loss_dict[(inh_k, neig_k)]`` is rebuilt from scratch on
    every call, so re-running the same setting never leaves 'iter' or
    'embedding' from an earlier run next to a newer 'loss'. It starts out
    describing the baseline (iteration 0) and is replaced whenever an
    iteration improves on it, which keeps its fields describing one state.

    Parameters
    ----------
    inh_k : float
        Factor for the top-down tree pass.
    neig_k : float
        Factor for the graph neighbor update.
    k : int
        Maximum number of training iterations.
    """
    print("="*20+f"inh_k:{inh_k}, neig_k:{neig_k}"+"="*20)
    run_key = (inh_k, neig_k)

    # Baseline: original embeddings aggregated bottom-up, no training yet.
    reset_embedding()
    g.ndata['e'] = th.tensor(embedding_list)
    parent_up(g.ndata['e'])
    loss_par,loss_pee, loss_g, loss = com_loss()
    loss_dict[run_key] = {
        'iter': 0,
        'loss': loss,
        'loss_tg': (loss_par, loss_pee, loss_g),
        'embedding': {c_:cpc_tree[c_]['embedding_up'] for c_ in cpc_tree.keys()},
    }
    best = loss_dict[run_key]
    time_now = time.strftime("%m-%d %H:%M:%S", time.localtime())
    print(f"[{time_now}]iter{0}: loss:{loss}, loss_par:{loss_par}, loss_pee:{loss_pee}, loss_graph:{loss_g}")

    for i in range(k):
        train(inh_k=inh_k, neig_k=neig_k)
        loss_par,loss_pee, loss_g, loss = com_loss()
        time_now = time.strftime("%m-%d %H:%M:%S", time.localtime())
        print(f"[{time_now}]iter{i+1}: loss:{loss}, loss_par:{loss_par}, loss_pee:{loss_pee}, loss_graph:{loss_g}")
        # Stop as soon as the loss fails to decrease (this also stops on a NaN loss).
        if not loss < best['loss']:
            break
        best['iter'] = i+1
        best['loss'] = loss
        best['loss_tg'] = (loss_par, loss_pee, loss_g)
        best['embedding'] = {c_:cpc_tree[c_]['embedding_up'] for c_ in cpc_tree.keys()}

In [None]:
# Train with inh_k=0.8 and neig_k=0.1 for at most 30 iterations.
train_embedding(inh_k=0.8, neig_k=0.1, k=30)