In [1]:
import math
import os
import random

from datasets import Dataset
from tqdm import tqdm

In [2]:
# Seed Python's global `random` generator; `random.sample` is used further
# down when drawing negative edges.
random.seed(64)

In [3]:
# Location of the saved patent dataset. Set the PATENT_DATASET_PATH environment
# variable to point at a copy on another machine; the original author's local
# path is kept as the fallback so existing runs behave exactly as before.
patent_path = os.environ.get(
    "PATENT_DATASET_PATH",
    "/Users/xiaoen/Documents/科研/论文/GraphRAG/Code/DataDeal/PatentDataset",
)

In [4]:
# Load the patent dataset stored on disk at `patent_path`.
patent_data = Dataset.load_from_disk(patent_path)
# Bare expression: shows the dataset summary (feature names, row count) as the cell output.
patent_data

Dataset({
    features: ['Publication Number', 'Title', 'Abstract', 'Claims', 'Publication Date', 'CPC', 'Novelty', 'Purpose', 'Advantage', 'Patentee', 'Inventor', 'UPC', 'Cited Number', 'Citing Number', 'Family Number', 'Strategic Importance', 'Field Importance', 'Comprehensive Importance'],
    num_rows: 55120
})

## 原始数据

In [5]:
# One record per patent: (its CPC codes, the first four characters of its
# publication date). The four-character prefix is what later cells compare
# against year strings such as '2021'.
patent_raw = [
    (cpc_codes, pub_date[:4])
    for cpc_codes, pub_date in zip(patent_data['CPC'], patent_data['Publication Date'])
]

In [6]:
# Vocabulary of distinct CPC codes seen on any patent.
all_cpc = {code for codes, _ in patent_raw for code in codes}
# Distinct publication-year strings present in the data.
all_time = {year for _, year in patent_raw}

In [7]:
# Display the number of distinct CPC codes together with the set of years.
len(all_cpc),all_time

(27496,
 {'1999',
  '2000',
  '2001',
  '2003',
  '2004',
  '2005',
  '2006',
  '2007',
  '2008',
  '2009',
  '2010',
  '2011',
  '2012',
  '2013',
  '2014',
  '2015',
  '2016',
  '2017',
  '2018',
  '2019',
  '2020',
  '2021',
  '2022',
  '2023',
  '2024'})

## 分段数据

In [8]:
# Training split: input graph from patents dated up to and including 2021,
# labels from patents dated 2022. Years are compared as strings.
train_input_cpc = [codes for codes, year in patent_raw if year <= '2021']
train_output_cpc = [codes for codes, year in patent_raw if year == '2022']

In [9]:
# Test split: input graph from patents dated up to and including 2022,
# labels from patents dated 2023. Years are compared as strings.
test_input_cpc = [codes for codes, year in patent_raw if year <= '2022']
test_output_cpc = [codes for codes, year in patent_raw if year == '2023']

In [10]:
# CPC code lists of every patent, regardless of year.
all_input_cpc = [codes for codes, _ in patent_raw]

## 数据预处理

In [11]:
def edge_process(cpc_list):
    """Build a CPC co-occurrence adjacency mapping from per-patent code lists.

    Two codes are neighbours when they appear together on at least one patent.

    Args:
        cpc_list: Iterable of per-patent collections of CPC codes.

    Returns:
        A dict with one key per code in the module-level ``all_cpc`` (keys in
        sorted order). Each value is the sorted list of distinct codes that
        co-occur with the key, never containing the key itself. Codes that do
        not occur in ``cpc_list`` map to an empty list.

    Raises:
        KeyError: If a patent carries a code that is missing from ``all_cpc``.
    """
    # Accumulate into sets so repeated co-occurrences are deduplicated as they
    # arrive instead of piling up in ever-growing lists.
    neighbours = {code: set() for code in sorted(all_cpc)}
    for patent_codes in cpc_list:
        for code in patent_codes:
            neighbours[code].update(patent_codes)

    # Drop the self-reference and sort, so the result does not depend on set
    # iteration order (which varies with string hashing between kernels).
    return {code: sorted(linked - {code}) for code, linked in neighbours.items()}

In [12]:
# Build one co-occurrence adjacency dict per split, in the same order as the
# splits listed on the right-hand side.
(
    train_input_cpc_dict,
    train_output_cpc_dict,
    test_input_cpc_dict,
    test_output_cpc_dict,
    all_input_cpc_dict,
) = (
    edge_process(split)
    for split in (
        train_input_cpc,
        train_output_cpc,
        test_input_cpc,
        test_output_cpc,
        all_input_cpc,
    )
)

In [13]:
# One record per split: a label under 'type' and its adjacency dict under 'data'.
raw_datasets = [
    {'type': split_name, 'data': split_dict}
    for split_name, split_dict in (
        ('train_input', train_input_cpc_dict),
        ('train_label', train_output_cpc_dict),
        ('test_input', test_input_cpc_dict),
        ('test_label', test_output_cpc_dict),
        ('all_input', all_input_cpc_dict),
    )
]

In [14]:
# Convert the list of split records into a Dataset. Note that this rebinds
# `raw_datasets` from a plain list to a Dataset object.
raw_datasets = Dataset.from_list(raw_datasets)

In [15]:
# Write the per-split adjacency data to the relative directory "RawData".
raw_datasets.save_to_disk("RawData")

Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]

## 训练/测试数据

In [16]:
def process_edge(cpc_dict):
    """Flatten an adjacency mapping into a list of (source, neighbour) pairs.

    Args:
        cpc_dict: Mapping from a code to the list of its neighbouring codes.

    Returns:
        A list with one ``(source, neighbour)`` tuple per adjacency entry,
        in the mapping's iteration order.
    """
    return [
        (source, neighbour)
        for source, neighbours in cpc_dict.items()
        for neighbour in neighbours
    ]

In [17]:
def unprecedented_edge(history_edge, new_edge, only_unseen=False):
    """Pick positive edges and sample matching negative edges per source code.

    Args:
        history_edge: Iterable of ``(src, dst)`` edges from the input period.
            Only consulted when ``only_unseen`` is true.
        new_edge: Iterable of ``(src, dst)`` edges from the label period.
        only_unseen: If true, keep as positives only the edges of ``new_edge``
            that do not appear in ``history_edge``. The default (False) treats
            every edge of ``new_edge`` as a positive.

    Returns:
        ``(pos, neg)``: two lists of ``(src, dst)`` tuples. ``pos`` holds the
        distinct positive edges in sorted order. For each source in ``pos``,
        ``neg`` holds as many sampled edges as that source has positives,
        unless fewer candidates exist, in which case all remaining candidates
        are used. Negative destinations are drawn from the module-level
        ``all_cpc``, excluding the source itself and its positive
        destinations. Negatives are not checked against ``history_edge``.
    """
    positives = set(new_edge)
    if only_unseen:
        positives -= set(history_edge)

    # Sorted so the order fed to the RNG does not depend on set iteration
    # order; with a fixed seed the sample is then repeatable across kernels.
    pos = sorted(positives)

    dsts_by_src = {}
    for src, dst in pos:
        dsts_by_src.setdefault(src, []).append(dst)

    universe = sorted(all_cpc)
    neg = []
    for src, dsts in dsts_by_src.items():
        # Exclude the source as well, otherwise a self-loop (src, src) could
        # be drawn as a "negative" edge.
        excluded = set(dsts)
        excluded.add(src)
        candidates = [code for code in universe if code not in excluded]
        # random.sample raises ValueError if asked for more than is available.
        sample_size = min(len(dsts), len(candidates))
        neg.extend((src, dst) for dst in random.sample(candidates, sample_size))

    return pos, neg

In [18]:
def compute_feature(edge_tuple, cpc_dict):
    """Compute ten neighbourhood-overlap features for one candidate edge.

    Args:
        edge_tuple: ``(src, dst)`` pair; both must be keys of ``cpc_dict``.
        cpc_dict: Mapping from a code to the list of its neighbouring codes.

    Returns:
        Tuple ``(cn, jc, ss, st, hp, hd, lhn, pa, aa, ra)`` where, with
        ``S``/``D`` the neighbour sets of src/dst and ``k`` a neighbour count:
          cn  = |S & D|                      (common neighbours)
          jc  = cn / |S | D|
          ss  = 2 * cn / (k_src + k_dst)
          st  = cn / sqrt(k_src * k_dst)
          hp  = cn / min(k_src, k_dst)
          hd  = cn / max(k_src, k_dst)
          lhn = cn / (k_src * k_dst)
          pa  = k_src * k_dst
          aa  = sum over common z of 1 / log(k_z), skipping k_z <= 1
          ra  = sum over common z of 1 / k_z, skipping k_z == 0
        Any ratio whose denominator is zero is reported as 0.

    Raises:
        KeyError: If src, dst or a common neighbour is missing from ``cpc_dict``.
    """
    src, dst = edge_tuple[0], edge_tuple[1]
    src_neigh = set(cpc_dict[src])
    dst_neigh = set(cpc_dict[dst])
    common = src_neigh & dst_neigh

    k_src = len(src_neigh)
    k_dst = len(dst_neigh)
    union_size = len(src_neigh | dst_neigh)
    degree_sum = k_src + k_dst
    smaller = min(k_src, k_dst)
    larger = max(k_src, k_dst)

    cn = len(common)
    pa = k_src * k_dst
    jc = cn / union_size if union_size != 0 else 0
    ss = 2 * cn / degree_sum if degree_sum != 0 else 0
    st = cn / math.sqrt(pa) if pa != 0 else 0
    hp = cn / smaller if smaller != 0 else 0
    hd = cn / larger if larger != 0 else 0
    # Divide by the product of the degrees. Without the parentheses the
    # expression evaluated as (cn / k_src) * k_dst.
    lhn = cn / pa if pa != 0 else 0

    aa = 0
    ra = 0
    for z in common:
        k_z = len(cpc_dict[z])
        if k_z > 1:  # log(1) == 0 would divide by zero
            aa += 1 / math.log(k_z)
        if k_z > 0:  # only reachable with a non-symmetric adjacency mapping
            ra += 1 / k_z

    return cn, jc, ss, st, hp, hd, lhn, pa, aa, ra

In [19]:
def make_dataset(history_dict, last_dict):
    """Assemble labelled edge samples with their features.

    Positive and negative edges come from ``unprecedented_edge`` applied to
    the edge lists of both adjacency mappings. Features for every edge are
    computed against ``history_dict`` only.

    Args:
        history_dict: Adjacency mapping of the input period.
        last_dict: Adjacency mapping of the label period.

    Returns:
        ``(edges, features, labels)``: three parallel lists, positives first
        (label 1) followed by negatives (label 0).
    """
    history_edges = process_edge(history_dict)
    label_edges = process_edge(last_dict)
    pos, neg = unprecedented_edge(history_edges, label_edges)

    # One progress bar per group, positives first.
    pos_feature, neg_feature = (
        [compute_feature(edge, history_dict) for edge in tqdm(group)]
        for group in (pos, neg)
    )

    labels = [1] * len(pos) + [0] * len(neg)
    return pos + neg, pos_feature + neg_feature, labels

In [20]:
# Training samples: features from the training input graph, labels from the training label graph.
train_edges,train_features,train_labels = make_dataset(train_input_cpc_dict,train_output_cpc_dict)

100%|██████████| 366648/366648 [00:35<00:00, 10414.49it/s]
100%|██████████| 366648/366648 [00:05<00:00, 63802.07it/s] 


In [21]:
# Test samples: features from the test input graph, labels from the test label graph.
test_edges,test_features,test_labels = make_dataset(test_input_cpc_dict,test_output_cpc_dict)

100%|██████████| 357074/357074 [00:41<00:00, 8707.31it/s]
100%|██████████| 357074/357074 [00:06<00:00, 51882.98it/s] 


In [22]:
# One record per training sample, combining the three parallel lists.
train_data = [
    {'edge': sample_edge, 'feature': sample_feature, 'label': sample_label}
    for sample_edge, sample_feature, sample_label in zip(train_edges, train_features, train_labels)
]

In [23]:
# One record per test sample, combining the three parallel lists.
test_data = [
    {'edge': sample_edge, 'feature': sample_feature, 'label': sample_label}
    for sample_edge, sample_feature, sample_label in zip(test_edges, test_features, test_labels)
]

## 保存数据

In [24]:
# Wrap the record lists as Dataset objects so they can be saved to disk.
train_datasets = Dataset.from_list(train_data)
test_datasets = Dataset.from_list(test_data)

In [25]:
# Write both datasets to the relative directories "TrainData" and "TestData".
train_datasets.save_to_disk("TrainData")
test_datasets.save_to_disk("TestData")

Saving the dataset (0/1 shards):   0%|          | 0/733296 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/714148 [00:00<?, ? examples/s]