In [1]:
import numpy as np
from datasets import Dataset, DatasetDict
from itertools import combinations
from tqdm import tqdm, trange
from collections import Counter
import math
import pandas as pd

## 0. Load data

In [2]:
# Load the per-split patent DatasetDict from disk.
# NOTE(review): the path is absolute and machine-specific; it must be adjusted to run elsewhere.
patent_data = DatasetDict.load_from_disk('/Users/xiaoen/Documents/科研/论文/链接预测/TOD-Code/data/AD_dataset')
# Show the names of the available splits.
patent_data.keys()

dict_keys(['AD_10', 'AD_11', 'AD_12', 'AD_13', 'AD_14', 'AD_15', 'AD_16', 'AD_17', 'AD_18', 'AD_19', 'AD_20', 'AD_21', 'AD_22', 'AD_23'])

In [3]:
# Load the saved training dataset (absolute, machine-specific path) and take
# the entry at index 1 of its 'data' column as the reference sample.
dataset = Dataset.load_from_disk('/Users/xiaoen/Documents/科研/论文/链接预测/TOD-Code/data/traindataset')
test_dataset = dataset['data'][1]

In [4]:
def deal_cpc(cpc_list):
    """Convert a raw ' | '-separated CPC string into a list of formatted codes.

    Each raw code is read as three fixed-width fields: characters 0-3 are
    kept verbatim, characters 4-7 have their leading zeros removed, and
    everything from character 8 onwards is appended after a '/'.

    Parameters
    ----------
    cpc_list : str or missing value
        Raw cell content. Any value for which ``pd.isna`` is true is treated
        as "no CPC codes".

    Returns
    -------
    list of str
        One formatted code per non-blank raw code, in input order.
    """
    if pd.isna(cpc_list):
        return []
    cpcs = []
    for raw in cpc_list.split(' | '):
        # Strip first so stray whitespace cannot shift the fixed-width fields.
        raw = raw.strip()
        # A blank cell or a dangling separator would otherwise produce the
        # meaningless code '/'.
        if not raw:
            continue
        cpcs.append(raw[:4] + raw[4:8].lstrip('0') + '/' + raw[8:])
    return cpcs

In [5]:
def make_dataset(file_name):
    """Read a patent spreadsheet and return its multi-CPC patents as a Dataset.

    The workbook is read with ``header=1`` and every field is taken from a
    fixed column position. Missing scalar cells become the string 'NaN',
    missing list cells become ``[]`` and a missing claim count becomes 0.
    Only patents with more than one CPC code are kept.

    Parameters
    ----------
    file_name : str
        Path of the Excel file to read.

    Returns
    -------
    Dataset
        One record per retained patent.
    """
    sheet = pd.read_excel(file_name, header=1)

    def text_at(row, col):
        # Scalar cell; missing values are replaced by the literal 'NaN'.
        value = sheet.iloc[row, col]
        return 'NaN' if pd.isna(value) else value

    def items_at(row, col):
        # ' | '-separated cell split into a list; missing values give [].
        value = sheet.iloc[row, col]
        return [] if pd.isna(value) else value.split(' | ')

    def count_at(row, col):
        # Integer cell; missing values give 0.
        value = sheet.iloc[row, col]
        return 0 if pd.isna(value) else int(value)

    kept = []
    for row in trange(len(sheet)):
        record = {
            'PN': text_at(row, 4),                   # Publication_Number
            'TI': text_at(row, 0),                   # Title
            'AB': text_at(row, 5),                   # Abstract
            'CPC': deal_cpc(sheet.iloc[row, 6]),     # CPC
            'PCN': count_at(row, 1),                 # Patent Claims Number
            'PA': items_at(row, 7),                  # Patent Assignee
            'PI': items_at(row, 8),                  # Patent Inventor
            'UPC': items_at(row, 2),                 # Ultimate Parent Company
            'PAT': text_at(row, 11),                 # Patent Application Time
            'PPT': text_at(row, 3),                  # Patent Publication Time
            'PCing': items_at(row, 9),               # Patent Citing
            'PCed': items_at(row, 10),               # Patent Cited
        }
        # A patent needs at least two CPC codes to be retained.
        if len(record['CPC']) >= 2:
            kept.append(record)
    return Dataset.from_list(kept)

In [6]:
# Build the case-study patent set from the ZD_23 workbook (absolute, machine-specific path).
case_data = make_dataset('/Users/xiaoen/Documents/科研/论文/链接预测/TOD-Code/case/ZD_23.xlsx')

100%|██████████| 7395/7395 [00:00<00:00, 9264.78it/s]


## 1. CPC2ID

In [7]:
# Split names at positions 4..10 of patent_data (AD_14 .. AD_20 in the key
# listing shown when patent_data was loaded); these define the CPC vocabulary.
id_data = list(patent_data.keys())[4:11]

In [8]:
# Gather every CPC code that occurs in the selected splits, then de-duplicate.
cpc_list = []
for file in id_data:
    split_rows = patent_data[file]
    for i in range(len(split_rows)):
        cpc_list += split_rows[i]['CPC']
cpc_set = list(set(cpc_list))

In [9]:
# Load the saved CPC -> id table and derive lookups in both directions.
cpc_id = Dataset.load_from_disk('/Users/xiaoen/Documents/科研/论文/链接预测/TOD-Code/data/CPC2ID').to_list()
cpc_id = {row['CPC']: row['id'] for row in cpc_id}
id_cpc = {idx: code for code, idx in cpc_id.items()}

## 2. Statistical characteristics
### 2.0 Data set preparation

In [10]:
def make_cpc_list_data(data):
    """Collect patent records for the given splits, keeping only known CPC codes.

    Reads the module-level ``patent_data`` and ``cpc_set``.

    Parameters
    ----------
    data : iterable of str
        Keys of ``patent_data`` whose patents should be collected.

    Returns
    -------
    list of dict
        One record per patent, in split order. Each record's 'CPC' entry is
        replaced by its de-duplicated codes that also occur in ``cpc_set``,
        in their original order.
    """
    # Build the lookup set once; rebuilding it for every patent made each
    # record cost time proportional to the whole vocabulary.
    known_cpcs = set(cpc_set)
    cpc_l = []
    for f in data:
        for c_l in patent_data[f]:
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the result no longer depends on set iteration order.
            c_l['CPC'] = [c for c in dict.fromkeys(c_l['CPC']) if c in known_cpcs]
            cpc_l.append(c_l)
    return cpc_l

In [11]:
# Full observation window: splits at positions 4..12 of patent_data.
val_net = make_cpc_list_data(list(patent_data.keys())[4:13])
# Most recent part of that window: splits at positions 10..12.
val_net_last = make_cpc_list_data(list(patent_data.keys())[10:13])

In [12]:
# For every CPC code keep the last 64 entries of its node vector in the
# reference sample.
cpc_embedding = {
    code: test_dataset['x'][idx][-64:]
    for idx, code in id_cpc.items()
}

### 2.1 CPC statistical characteristics

In [13]:
def stat_feature(cpc_all, cpc_last):
    """Compute per-CPC statistics over a full window and its most recent part.

    Reads the module-level ``cpc_id`` (codes to describe) and
    ``cpc_embedding`` (code -> technology embedding).

    Parameters
    ----------
    cpc_all : iterable of dict
        Patent records of the whole window; each needs 'CPC', 'PA', 'PI'
        and 'UPC' lists.
    cpc_last : iterable of dict
        Patent records of the recent part; each needs a 'CPC' list.

    Returns
    -------
    dict
        Maps every code in ``cpc_id`` to a dict with patent counts
        ('all_patent', 'last_patent'), de-duplicated co-occurring codes
        ('all_neighbor', 'last_neighbor') and their sizes, the ratios
        'grow_rate' and 'neighbor_grow_rate' (0.0 when the denominator is
        zero), de-duplicated 'assignee' / 'inventor' / 'company' lists and
        'technology_emb'.

    Raises
    ------
    KeyError
        If a code in ``cpc_id`` is missing from ``cpc_embedding``.
    """
    cpc_feature = {
        c: {
            'last_patent': 0,
            'all_patent': 0,
            'grow_rate': 0,
            # Sets while accumulating; converted to lists before returning.
            'all_neighbor': set(),
            'last_neighbor': set(),
            'all_neighbor_num': 0,
            'last_neighbor_num': 0,
            'neighbor_grow_rate': 0,
            'technology_emb': cpc_embedding[c],
            'assignee': set(),
            'inventor': set(),
            'company': set(),
        }
        for c in cpc_id
    }

    # One pass over the patents instead of scanning every patent for every
    # code. A patent counts once per code even if the code is listed twice.
    for cl in cpc_all:
        codes = set(cl['CPC'])
        for c in codes:
            feat = cpc_feature.get(c)
            if feat is None:
                continue
            feat['all_patent'] += 1
            # The code itself is removed from its neighbours further down.
            feat['all_neighbor'].update(codes)
            feat['assignee'].update(cl['PA'])
            feat['inventor'].update(cl['PI'])
            feat['company'].update(cl['UPC'])

    for cl in cpc_last:
        codes = set(cl['CPC'])
        for c in codes:
            feat = cpc_feature.get(c)
            if feat is None:
                continue
            feat['last_patent'] += 1
            feat['last_neighbor'].update(codes)

    for c in tqdm(cpc_id):
        feat = cpc_feature[c]
        for key in ('all_neighbor', 'last_neighbor'):
            feat[key].discard(c)
            feat[key] = list(feat[key])
        for key in ('assignee', 'inventor', 'company'):
            feat[key] = list(feat[key])
        feat['all_neighbor_num'] = len(feat['all_neighbor'])
        feat['last_neighbor_num'] = len(feat['last_neighbor'])
        # Codes without patents or neighbours get ratio 0.0 instead of
        # raising ZeroDivisionError.
        feat['grow_rate'] = (
            feat['last_patent'] / feat['all_patent'] if feat['all_patent'] else 0.0
        )
        feat['neighbor_grow_rate'] = (
            feat['last_neighbor_num'] / feat['all_neighbor_num']
            if feat['all_neighbor_num'] else 0.0
        )
    return cpc_feature

In [14]:
# Per-CPC statistics over the full window (val_net) and its recent part (val_net_last).
val_feature = stat_feature(val_net, val_net_last)

100%|██████████| 14670/14670 [00:42<00:00, 348.19it/s]


### 2.2 CPC feature construction

In [15]:
def make_cpc_emb(cpc_feature):
    """Build one node vector per CPC code in the module-level ``cpc_id``.

    Each vector is the six scalar statistics of the code repeated six times
    (36 values), followed by the code's 'technology_emb' entry.

    Parameters
    ----------
    cpc_feature : dict
        Code -> statistics dict, as returned by ``stat_feature``.

    Returns
    -------
    dict
        Code -> 1-D numpy array.
    """
    stat_keys = (
        'last_patent',
        'all_patent',
        'grow_rate',
        'all_neighbor_num',
        'last_neighbor_num',
        'neighbor_grow_rate',
    )
    cpc_emb = {}
    for code in tqdm(cpc_id):
        feat = cpc_feature[code]
        stats = [feat[key] for key in stat_keys]
        # The statistics block is tiled six times before the embedding.
        cpc_emb[code] = np.concatenate([np.array(stats * 6), feat['technology_emb']])
    return cpc_emb

In [16]:
# Node vectors: tiled statistics followed by the technology embedding.
val_emb = make_cpc_emb(val_feature)

100%|██████████| 14670/14670 [00:00<00:00, 246767.30it/s]


### 2.3 Build a network

In [17]:
def make_net(cpc_all):
    """Return every distinct directed CPC co-occurrence edge.

    For each record, every unordered pair of entries in its 'CPC' list
    contributes the pair and its reverse. Edges are returned once each, in
    the order they are first produced.

    Parameters
    ----------
    cpc_all : iterable of dict
        Records with a 'CPC' list.

    Returns
    -------
    list of tuple
        Distinct ``(source, target)`` pairs.
    """
    seen = {}
    for record in cpc_all:
        forward = list(combinations(record['CPC'], 2))
        # All forward pairs of a record come first, then all reversed pairs.
        for pair in forward:
            seen.setdefault(pair)
        for left, right in forward:
            seen.setdefault((right, left))
    return list(seen)

In [18]:
# Distinct directed co-occurrence edges over the full window.
val_edge = make_net(val_net)

### 2.4 Calculate edge characteristics

In [19]:
def cal_feature(src_node, dst_node, edge_feature):
    """Compute the 16 pairwise features of a candidate edge.

    Parameters
    ----------
    src_node, dst_node : str
        CPC codes of the two endpoints; both must be keys of ``edge_feature``.
    edge_feature : dict
        Code -> dict with 'assignee', 'inventor', 'company' and
        'all_neighbor' lists. Every common neighbour of the two endpoints
        must also be a key.

    Returns
    -------
    list
        ``[same_ass, same_inv, same_com, same_group, same_class,
        same_section, cn, jc, ss, st, hp, hd, lhn, pa, aa, ra]``:
        shared assignee / inventor / company counts, three 0/1 flags
        comparing the text before '/', the first three characters and the
        first character of the codes, then neighbourhood-overlap scores.
        Any score whose denominator is zero is reported as 0.0.

    Raises
    ------
    KeyError
        If an endpoint or a common neighbour is missing from ``edge_feature``.
    """
    def ratio(numerator, denominator):
        # Isolated nodes have empty neighbourhoods; report 0.0 rather than
        # raising ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    src = edge_feature[src_node]
    dst = edge_feature[dst_node]

    same_ass = len(set(src['assignee']) & set(dst['assignee']))
    same_inv = len(set(src['inventor']) & set(dst['inventor']))
    same_com = len(set(src['company']) & set(dst['company']))

    nei_s = set(src['all_neighbor'])
    nei_d = set(dst['all_neighbor'])
    same_nei = nei_s & nei_d
    deg_s = len(nei_s)
    deg_d = len(nei_d)

    cn = len(same_nei)                          # common neighbours
    jc = ratio(cn, len(nei_s | nei_d))          # cn / size of the union
    ss = ratio(2 * cn, deg_s + deg_d)           # 2*cn / sum of degrees
    st = ratio(cn, (deg_s * deg_d) ** 0.5)      # cn / sqrt of degree product
    hp = ratio(cn, min(deg_s, deg_d))           # cn / smaller degree
    hd = ratio(cn, max(deg_s, deg_d))           # cn / larger degree
    lhn = ratio(cn, deg_s * deg_d)              # cn / degree product
    pa = deg_s * deg_d                          # degree product

    aa = 0
    ra = 0
    for n in same_nei:
        degree = len(edge_feature[n]['all_neighbor'])
        # log(1) == 0, so the log-weighted sum skips degree-1 neighbours.
        if degree > 1:
            aa += 1 / math.log(degree)
        if degree > 0:
            ra += 1 / degree

    same_group = 1 if src_node.split('/')[0] == dst_node.split('/')[0] else 0
    same_class = 1 if src_node[:3] == dst_node[:3] else 0
    same_section = 1 if src_node[0] == dst_node[0] else 0
    return [same_ass, same_inv, same_com, same_group, same_class, same_section, cn, jc, ss, st, hp, hd, lhn, pa, aa, ra]

## 3. Prepare data
### Node Characteristics X

In [20]:
# Node feature matrix, ordered by node id 0 .. len(id_cpc) - 1.
x = [val_emb[id_cpc[idx]] for idx in range(len(id_cpc))]

### Network structure edge_index

In [21]:
# Translate every CPC-code edge into a pair of node ids.
edge_index = [[cpc_id[code] for code in pair] for pair in val_edge]

### Prepare candidate links

In [22]:
# Group under study, and every known CPC code whose text before '/' equals it.
target_cpc = 'B60W60'
target_cpcs = []
for code in cpc_id:
    if code.split("/")[0] == target_cpc:
        target_cpcs.append(code)

In [23]:
last_network_index = test_dataset["label_edge"]
last_network_label = test_dataset["label"]

# Keep the id pairs whose label is truthy and translate them to CPC codes.
last_network_cpc = [
    [id_cpc[pair[0]], id_cpc[pair[1]]]
    for pos, pair in enumerate(last_network_index)
    if last_network_label[pos]
]

# Count those edges by the text before '/' of each endpoint, most frequent first.
exist_edge = Counter(
    (src.split("/")[0], dst.split("/")[0]) for src, dst in last_network_cpc
).most_common()

In [24]:
# Partner groups of the target group with their edge counts, in exist_edge order.
cand_cpc = [
    (groups[1], count)
    for groups, count in exist_edge
    if groups[0] == target_cpc
]
cand_cpc[:10]

[('B60W30', 516),
 ('G05D1', 489),
 ('B60W60', 478),
 ('B60W2050', 473),
 ('G08G1', 461),
 ('B60W2554', 375),
 ('G01C21', 316),
 ('B60W50', 303),
 ('B60W40', 262),
 ('B60W2540', 224)]

In [25]:
# Expand the first ten partner groups into their individual CPC codes.
top_groups = {group for group, _count in cand_cpc[:10]}
# One pass over cpc_id instead of one pass per group. The result is sorted
# because list(set(...)) order depends on string hash randomisation, which
# made the candidate order differ between kernel restarts.
cand_cpcs = sorted(c for c in cpc_id if c.split("/")[0] in top_groups)

In [26]:
# Directed co-occurrence edges observed in the case-study patents.
case_edges_ = []
for p in case_data:
    forward = list(combinations(p['CPC'], 2))
    case_edges_ += forward
    case_edges_ += [(right, left) for left, right in forward]
case_edge_counter = Counter(case_edges_)
new_edge = list(case_edge_counter)

In [27]:
# Candidate (target, partner) pairs and whether each one occurs in the case data.
cand_edges = []
cand_label = []
# Hash-based lookups replace membership tests against lists that grow with
# the loop; the resulting pairs, labels and their order are unchanged.
seen_pairs = set()
observed_edges = set(new_edge)
for c in cand_cpcs:
    for t in target_cpcs:
        # Skip self-pairs and pairs already recorded in either direction.
        if c == t or (t, c) in seen_pairs or (c, t) in seen_pairs:
            continue
        seen_pairs.add((t, c))
        cand_edges.append((t, c))
        is_observed = (t, c) in observed_edges or (c, t) in observed_edges
        cand_label.append(1 if is_observed else 0)

In [28]:
# Sanity check: number of target codes, partner codes, candidate pairs and positive labels.
len(target_cpcs), len(cand_cpcs), len(cand_edges), sum(cand_label)

(35, 598, 20300, 1758)

### edge_feature, edge_label

In [29]:
# Pairwise feature vector for every candidate edge, in cand_edges order.
label_edge_feature = [
    cal_feature(src, dst, val_feature) for src, dst in tqdm(cand_edges)
]

100%|██████████| 20300/20300 [00:02<00:00, 7767.63it/s]


In [30]:
# Alias only: `labels` refers to the same list object as cand_label.
labels = cand_label

In [31]:
# Candidate edges expressed as node-id pairs, aligned with cand_edges.
label_edge_index = [[cpc_id[src], cpc_id[dst]] for src, dst in cand_edges]

In [32]:
# Shape check of every component before packing them into the dataset.
np.array(x).shape, np.array(edge_index).shape, np.array(label_edge_index).shape, np.array(label_edge_feature).shape, np.array(labels).shape

((14670, 100), (1148886, 2), (20300, 2), (20300, 16), (20300,))

In [33]:
# Bundle all components of the case-study sample into one dict.
val_dataset = {
    'x': x,  # node vectors, ordered by node id
    'x_feature': val_feature,  # per-CPC statistics dict keyed by CPC code
    'edge_index': edge_index,  # observed edges as node-id pairs
    'label_edge_index': label_edge_index,  # candidate edges as node-id pairs
    'label_edge_feature': label_edge_feature,  # one feature list per candidate edge
    'label': labels  # 1 if the candidate edge occurs in the case data, else 0
}

In [34]:
# Wrap the case-study dict in a single-row Dataset.
# NOTE(review): this rebinds `dataset`, which earlier held the training
# dataset loaded from disk; re-running earlier cells afterwards is needed to
# get that object back.
dataset = Dataset.from_list(
    [
        {"type": "case", "data": val_dataset},
    ]
)

In [35]:
# dataset.save_to_disk('case_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]