In [1]:
import numpy as np
from datasets import Dataset, DatasetDict
from itertools import combinations
from tqdm import tqdm
from fastnode2vec import Graph, Node2Vec
from collections import Counter
import math
import random

## 0. load data

In [2]:
patent_data = DatasetDict.load_from_disk('/Users/xiaoen/Documents/科研/论文/链接预测/TOD-Code/data/AD_dataset')
patent_data.keys()

dict_keys(['AD_10', 'AD_11', 'AD_12', 'AD_13', 'AD_14', 'AD_15', 'AD_16', 'AD_17', 'AD_18', 'AD_19', 'AD_20', 'AD_21', 'AD_22', 'AD_23'])

In [3]:
patent_data

DatasetDict({
    AD_10: Dataset({
        features: ['PN', 'CPC', 'PA', 'PI', 'UPC'],
        num_rows: 52
    })
    AD_11: Dataset({
        features: ['PN', 'CPC', 'PA', 'PI', 'UPC'],
        num_rows: 70
    })
    AD_12: Dataset({
        features: ['PN', 'CPC', 'PA', 'PI', 'UPC'],
        num_rows: 92
    })
    AD_13: Dataset({
        features: ['PN', 'CPC', 'PA', 'PI', 'UPC'],
        num_rows: 132
    })
    AD_14: Dataset({
        features: ['PN', 'CPC', 'PA', 'PI', 'UPC'],
        num_rows: 256
    })
    AD_15: Dataset({
        features: ['PN', 'CPC', 'PA', 'PI', 'UPC'],
        num_rows: 460
    })
    AD_16: Dataset({
        features: ['PN', 'CPC', 'PA', 'PI', 'UPC'],
        num_rows: 1032
    })
    AD_17: Dataset({
        features: ['PN', 'CPC', 'PA', 'PI', 'UPC'],
        num_rows: 2012
    })
    AD_18: Dataset({
        features: ['PN', 'CPC', 'PA', 'PI', 'UPC'],
        num_rows: 2911
    })
    AD_19: Dataset({
        features: ['PN', 'CPC', 'PA', 'PI', 'UP

## 1. CPC2ID

In [4]:
id_data = list(patent_data.keys())[4:11]

In [5]:
cpc_list = []
for file in id_data:
    for i in range(len(patent_data[file])):
        l = patent_data[file][i]['CPC']
        cpc_list.extend(l)
cpc_set = list(set(cpc_list))

In [6]:
cpc_id = [{'CPC':cpc, 'id':id_} for id_,cpc in enumerate(cpc_set)]

In [7]:
cpc_id = Dataset.from_list(cpc_id)
cpc_id.save_to_disk('/Users/xiaoen/Documents/科研/论文/链接预测/TOD-Code/data/CPC2ID')

Saving the dataset (0/1 shards):   0%|          | 0/14670 [00:00<?, ? examples/s]

In [8]:
cpc_id = Dataset.load_from_disk('../data/CPC2ID')
cpc_id = cpc_id.to_list()
cpc_id = {c['CPC']:c['id'] for c in cpc_id}
id_cpc = {c[1]:c[0] for c in cpc_id.items()}

## 2 CPC technical characteristics learning
### 2.0 network data

In [9]:
net_data = list(patent_data.keys())[4:11]

In [10]:
cpc_data = []
for file in net_data:
    for i in range(len(patent_data[file])):
        l = patent_data[file][i]['CPC']
        e_l = []
        for cpc in l:
            if cpc in cpc_id:
                e_l.append(cpc)
        cpc_data.append(e_l)

### 2.1 build a network

In [11]:
coo_raw_list = []
for cpc in tqdm(cpc_data):
    coo_raw_list.extend(list(combinations(cpc,2))+[(i[-1],i[0]) for i in list(combinations(cpc,2))])

100%|██████████| 14276/14276 [00:00<00:00, 77296.69it/s]


In [12]:
len(coo_raw_list)

3244294

In [13]:
cpc_coo_counter = Counter(coo_raw_list)

In [14]:
len(cpc_coo_counter)

951876

### 2.2 DeepWalk Learning Embedding

In [15]:
coo_dataset = [(coo[0][0], coo[0][1], coo[1]) for coo in tqdm(cpc_coo_counter.items())]

100%|██████████| 951876/951876 [00:00<00:00, 8723993.95it/s]


In [16]:
cpc_coo_graph = Graph(coo_dataset,directed=True, weighted=True)

Reading graph:   0%|          | 0/951876 [00:00<?, ?it/s]

In [17]:
DeepWalk=Node2Vec(cpc_coo_graph, dim=64, walk_length=50, window=10, p=1, q=1, batch_walks=256, workers=12)

In [18]:
DeepWalk.train(epochs=20)

Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

In [19]:
cpc_embedding = {}
for c in tqdm(cpc_id):
    cpc_embedding[c] = DeepWalk.wv[c]

100%|██████████| 14670/14670 [00:00<00:00, 2156012.46it/s]


## 3 statistical characteristics
### 3.0 Data set preparation

In [20]:
def make_cpc_list_data(data):
    cpc_l = []
    for f in data:
        for z in range(len(patent_data[f])):
            c_l = patent_data[f][z]
            c_l['CPC'] = list(set(c_l['CPC'])-(set(c_l['CPC'])-set(cpc_set)))
            cpc_l.append(c_l)
    return cpc_l

In [21]:
train_net = make_cpc_list_data(list(patent_data.keys())[4:11])
test_net = make_cpc_list_data(list(patent_data.keys())[4:12])
val_net = make_cpc_list_data(list(patent_data.keys())[4:13])
train_net_last = make_cpc_list_data(list(patent_data.keys())[8:11])
test_net_last = make_cpc_list_data(list(patent_data.keys())[9:12])
val_net_last = make_cpc_list_data(list(patent_data.keys())[10:13])
train_net_label = make_cpc_list_data(list(patent_data.keys())[11:12])
test_net_label = make_cpc_list_data(list(patent_data.keys())[12:13])

### 3.1 CPC统计特征

In [22]:
def stat_feature(cpc_all, cpc_last):
    cpc_feature = {}
    for c in tqdm(cpc_id):
        cpc_feature[c] = {
                'last_patent':0,
                'all_patent':0,
                'grow_rate':0,
                'all_neighbor':[],
                'last_neighbor':[],
                'all_neighbor_num':0,
                'last_neighbor_num':0,
                'neighbor_grow_rate':0,
                'technology_emb':np.zeros(64),
                'assignee':[],
                'inventor':[],
                'company':[],
            }

        for cl in cpc_all:
            if c in cl['CPC']:
                cpc_feature[c]['all_patent'] += 1
                neighbor = []
                for cpc in cl['CPC']:
                    if cpc != c:
                        neighbor.append(cpc)
                cpc_feature[c]['all_neighbor'].extend(list(set(neighbor)))
                cpc_feature[c]['assignee'].extend(cl['PA'])
                cpc_feature[c]['inventor'].extend(cl['PI'])
                cpc_feature[c]['company'].extend(cl['UPC'])
        for cl in cpc_last:
            if c in cl['CPC']:
                cpc_feature[c]['last_patent'] += 1
                neighbor = []
                for cpc in cl['CPC']:
                    if cpc != c:
                        neighbor.append(cpc)
                cpc_feature[c]['last_neighbor'].extend(list(set(neighbor)))
        cpc_feature[c]['grow_rate'] = cpc_feature[c]['last_patent']/cpc_feature[c]['all_patent']
        cpc_feature[c]['all_neighbor'] = list(set(cpc_feature[c]['all_neighbor']))
        cpc_feature[c]['last_neighbor'] = list(set(cpc_feature[c]['last_neighbor']))
        cpc_feature[c]['all_neighbor_num'] = len(cpc_feature[c]['all_neighbor'])
        cpc_feature[c]['last_neighbor_num'] = len(cpc_feature[c]['last_neighbor'])
        cpc_feature[c]['neighbor_grow_rate'] = cpc_feature[c]['last_neighbor_num']/cpc_feature[c]['all_neighbor_num']
        cpc_feature[c]['assignee'] = list(set(cpc_feature[c]['assignee']))
        cpc_feature[c]['inventor'] = list(set(cpc_feature[c]['inventor']))
        cpc_feature[c]['company'] = list(set(cpc_feature[c]['company']))
        cpc_feature[c]['technology_emb'] = cpc_embedding[c]
    return cpc_feature

In [23]:
train_feature = stat_feature(train_net, train_net_last)

100%|██████████| 14670/14670 [00:34<00:00, 421.40it/s]


In [24]:
test_feature = stat_feature(test_net, test_net_last)

100%|██████████| 14670/14670 [00:37<00:00, 387.41it/s]


In [25]:
val_feature = stat_feature(val_net, val_net_last)

100%|██████████| 14670/14670 [00:43<00:00, 334.54it/s]


### 3.2 CPC feature construction

In [26]:
def make_cpc_emb(cpc_feature):
    cpc_emb = {}
    for c in tqdm(cpc_id):
        stand_last_patent = cpc_feature[c]['last_patent']
        stand_all_patent = cpc_feature[c]['all_patent']
        stand_grow_rate = cpc_feature[c]['grow_rate']
        stand_all_neighbor_num = cpc_feature[c]['all_neighbor_num']
        stand_last_neighbor_num = cpc_feature[c]['last_neighbor_num']
        stand_neighbor_growth_rate = cpc_feature[c]['neighbor_grow_rate']
        stat_emb = np.array([stand_last_patent, stand_all_patent, stand_grow_rate, stand_all_neighbor_num, stand_last_neighbor_num, stand_neighbor_growth_rate] * 6)
        tech_emb = cpc_feature[c]['technology_emb']
        cpc_emb[c]=np.concatenate([stat_emb, tech_emb])
    return cpc_emb

In [27]:
train_emb = make_cpc_emb(train_feature)

100%|██████████| 14670/14670 [00:00<00:00, 512058.12it/s]


In [28]:
test_emb = make_cpc_emb(test_feature)

100%|██████████| 14670/14670 [00:00<00:00, 486853.08it/s]


In [29]:
val_emb = make_cpc_emb(val_feature)

100%|██████████| 14670/14670 [00:00<00:00, 481191.51it/s]


### 3.3 build a network

In [30]:
def make_net(cpc_all):
    edges = []
    for c in cpc_all:
        edges.extend(list(combinations(c['CPC'],2))+[(i[-1],i[0]) for i in list(combinations(c['CPC'],2))])
    edge_counter = Counter(edges)
    return list(edge_counter.keys())

In [31]:
train_edge = make_net(train_net)

In [32]:
test_edge = make_net(test_net)

In [33]:
val_edge = make_net(val_net)

In [34]:
train_edge_label = make_net(train_net_label)

In [35]:
test_edge_label = make_net(test_net_label)

### 3.4 negative sampling

In [36]:
def make_neg_edge(edge_label):
    edge_cpc_inter = {i:[] for i in cpc_id}
    for cpc_edge in edge_label:
        edge_cpc_inter[cpc_edge[0]].append(cpc_edge[1])
        edge_cpc_inter[cpc_edge[1]].append(cpc_edge[0])
    for i in cpc_id:
        edge_cpc_inter[i] = list(set(edge_cpc_inter[i]))

    edge_cpc_not_inter = {i:[] for i in cpc_id}
    for i in tqdm(cpc_id):
        edge_cpc_not_inter[i] = list(set(cpc_id.keys())-set(edge_cpc_inter[i]+[i]))

    neg_ratio = 1
    neg_edge = []
    for cpc_edge in tqdm(edge_label):
        for i in range(neg_ratio):
            neg_cpc = random.choice(edge_cpc_not_inter[cpc_edge[0]])
            neg_edge.append((cpc_edge[0],neg_cpc))
    return neg_edge

In [37]:
train_neg_edge = make_neg_edge(train_edge_label)

100%|██████████| 14670/14670 [00:05<00:00, 2903.03it/s]
100%|██████████| 275254/275254 [00:01<00:00, 271978.82it/s]


In [38]:
len(train_neg_edge), len(train_edge_label)

(275254, 275254)

In [39]:
test_neg_edge = make_neg_edge(test_edge_label)

100%|██████████| 14670/14670 [00:05<00:00, 2678.20it/s]
100%|██████████| 252482/252482 [00:00<00:00, 295783.50it/s]


In [40]:
len(test_neg_edge), len(test_edge_label)

(252482, 252482)

### 3.5 Calculate edge characteristics

In [41]:
def cal_feature(src_node, dst_node, edge_feature):
    sn = src_node
    dn = dst_node
    ass_s = set(edge_feature[sn]['assignee'])
    ass_d = set(edge_feature[dn]['assignee'])
    inv_s = set(edge_feature[sn]['inventor'])
    inv_d = set(edge_feature[dn]['inventor'])
    com_s = set(edge_feature[sn]['company'])
    com_d = set(edge_feature[dn]['company'])
    nei_s = set(edge_feature[sn]['all_neighbor'])
    nei_d = set(edge_feature[dn]['all_neighbor'])
    same_ass = len(ass_s.intersection(ass_d))
    same_inv = len(inv_s.intersection(inv_d))
    same_com = len(com_s.intersection(com_d))
    same_nei = nei_s.intersection(nei_d)
    all_nei = nei_s.union(nei_d)
    cn = len(same_nei)
    jc = cn/len(all_nei)
    ss = 2*len(same_nei)/(len(nei_s)+len(nei_d))
    st = len(same_nei)/((len(nei_s)*len(nei_d))**0.5)
    hp = cn/min(len(nei_s),len(nei_d))
    hd = cn/max(len(nei_s),len(nei_d))
    lhn = cn/(len(nei_s)*len(nei_d))
    pa = len(nei_s)*len(nei_d)
    aa = 0
    ra = 0
    for n in same_nei:
        if len(edge_feature[n]['all_neighbor'])>1:
            aa += 1 / math.log(len(edge_feature[n]['all_neighbor']))
            ra += 1 / len(edge_feature[n]['all_neighbor'])
        elif len(edge_feature[n]['all_neighbor'])>0:
            ra += 1 / len(edge_feature[n]['all_neighbor'])
    same_group = 1 if sn.split('/')[0] == dn.split('/')[0] else 0
    same_class = 1 if sn[:3]==dn[:3] else 0
    same_section = 1 if sn[0]==dn[0] else 0
    return [same_ass, same_inv, same_com, same_group, same_class, same_section, cn, jc, ss, st, hp, hd, lhn, pa, aa, ra]

## 4 prepare data

In [42]:
def process_dataset4train(_edges, _pos_edges, _neg_edges, _x_feature, _x_emb):
    _edges_index = []
    for edge in _edges:
        _edges_index.append([cpc_id[c] for c in edge])

    label_edges = _pos_edges + _neg_edges
    label_edge_index = []
    for edge in label_edges:
        label_edge_index.append([cpc_id[c] for c in edge])

    label_edge_feature = []
    for edge in tqdm(label_edges):
        label_edge_feature.append(cal_feature(edge[0], edge[1], _x_feature))

    labels = [1]*len(_pos_edges) + [0]*len(_neg_edges)

    _x_feature_data = []
    for i in range(len(id_cpc)):
        _x_feature_data.append(_x_emb[id_cpc[i]])

    return {
        "x": _x_feature_data,
        "x_feature": [_x_emb[id_cpc[i]] for i in range(len(id_cpc))],
        "edge_index": _edges_index,
        "label_edge": label_edge_index,
        "label_edge_feature": label_edge_feature,
        "label": labels,
    }

In [43]:
train_dataset = process_dataset4train(train_edge, train_edge_label, train_neg_edge, train_feature, train_emb)

100%|██████████| 550508/550508 [00:46<00:00, 11752.83it/s]


In [44]:
test_dataset = process_dataset4train(test_edge, test_edge_label, test_neg_edge, test_feature, test_emb)

100%|██████████| 504964/504964 [00:48<00:00, 10385.49it/s]


In [45]:
dataset = Dataset.from_list(
    [
        {"type": "train", "data": train_dataset},
        {"type": "test", "data": test_dataset},
    ]
)

In [46]:
# dataset.save_to_disk('/Users/xiaoen/Documents/科研/论文/链接预测/TOD-Code/data/traindataset')

Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]