## 1. 加载数据和整理数据

In [1]:
import pandas as pd
import pickle

In [48]:
# Load the three bibliographic exports (ASCE, Web of Science, Scopus) as UTF-8 CSV.
# NOTE(review): the earlier comments quoted 128 / 207 / 247 rows, but the progress
# bars recorded further down report 145 / 239 / 329 items -- confirm the counts.
# (encoding="ISO-8859-1" is an alternative that covers most European scripts.)
asce_data, wos_data, scopus_data = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (
        "./data/Asce_2_Result.csv",
        "./data/WoS_2_Result.csv",
        "./data/Scopus_2_Result.csv",
    )
)

In [49]:
# Convert every Web of Science row into the common paper-dict schema
# (title, source, abstract, type, authors, year, doi).
_WOS_CHAR_FIXES = (("–", "-"), ("’", "'"), ("—", "-"), ("&nbsp;", " "), ("−", "-"))
# "publication type" codes handled here -> shared type labels.
_WOS_TYPE_LABELS = {"J": "article", "C": "proceeding"}


def _clean_wos_text(raw):
    """Lower-case *raw* and replace typographic dashes/quotes and "&nbsp;"."""
    text = raw.lower()
    for old, new in _WOS_CHAR_FIXES:
        text = text.replace(old, new)
    return text


wos_list = []
for _, row in wos_data.iterrows():
    paper = {}
    paper["title"] = _clean_wos_text(row["Article Title"])
    paper["source"] = row["Source Title"].lower()
    paper["abstract"] = _clean_wos_text(row["Abstract"])
    # Always set "type" so every record carries the same keys; codes other
    # than "J"/"C" map to None instead of leaving the key out.
    paper["type"] = _WOS_TYPE_LABELS.get(row["publication type"])
    paper["authors"] = row["Authors"].replace(" /", "; ")
    paper["year"] = row["Publication Year"]
    # Guard on the column that is actually concatenated ("DOI"). The previous
    # check looked at "DOI Link", so a row with a link but no DOI value would
    # fail on str + NaN.
    if pd.notnull(row["DOI"]):
        paper["doi"] = "https://doi.org/" + row["DOI"]
    else:
        paper["doi"] = None
    wos_list.append(paper)

In [50]:
# Convert every Scopus row into the common paper-dict schema
# (title, source, abstract, type, authors, year, doi).
def _scopus_normalise(raw):
    """Lower-case *raw* and replace typographic dashes/quotes and "&nbsp;"."""
    text = raw.lower()
    for old, new in (("–", "-"), ("’", "'"), ("—", "-"), ("&nbsp;", " "), ("−", "-")):
        text = text.replace(old, new)
    return text


scopus_list = []
for _, record in scopus_data.iterrows():
    doi_value = record["DOI"]
    paper = {
        "title": _scopus_normalise(record["Title"]),
        "source": record["Source title"].lower(),
        "abstract": _scopus_normalise(record["Abstract"]),
        # Anything that is not an "Article" is labelled as a proceeding.
        "type": "article" if record["Document Type"] == "Article" else "proceeding",
        "authors": record["authors"].replace(" /", "; "),
        "year": record["Year"],
        # A missing DOI stays None; otherwise it is turned into a resolver URL.
        "doi": None if pd.isnull(doi_value) else "https://doi.org/" + doi_value,
    }
    scopus_list.append(paper)

In [54]:
def _tidy_asce_authors(raw_authors):
    """Use ", " for newline / " and " separators and plain spaces for non-breaking spaces."""
    flattened = raw_authors.replace("\n", ", ")
    flattened = flattened.replace("\xa0", " ")
    return flattened.replace(" and ", ", ")


# Rewrite the "Authors" column in place with the flattened author strings.
asce_data["Authors"] = asce_data["Authors"].apply(_tidy_asce_authors)

In [55]:
# Spot-check one cleaned author string (row label 144).
asce_data.loc[144]["Authors"]

'Yongchao Yang, Shunlong Li, Satish Nagarajaiah, Hui Li, Peng Zhou'

In [144]:
# Persist the ASCE frame (with the cleaned "Authors" column) next to the raw export.
# No index=False is passed, so the row index is written as the first column.
asce_data.to_csv("./data/Asce_2_Result_modified.csv", encoding="utf-8")

In [57]:
# Convert every ASCE row into the common paper-dict schema
# (title, source, abstract, type, authors, year, doi).
_ASCE_CHAR_FIXES = (("–", "-"), ("’", "'"), ("—", "-"), ("&nbsp;", " "), ("−", "-"))


def _clean_asce_text(raw):
    """Lower-case *raw* and replace typographic dashes/quotes and "&nbsp;"."""
    text = raw.lower()
    for old, new in _ASCE_CHAR_FIXES:
        text = text.replace(old, new)
    return text


asce_list = []
for _, row in asce_data.iterrows():
    paper = {}
    paper["title"] = _clean_asce_text(row["title"])
    paper["source"] = row["source"].lower()
    paper["abstract"] = _clean_asce_text(row["abstract"])

    # Anything that is not a "Journal Article" is labelled as a proceeding.
    paper["type"] = "article" if row["type"] == "Journal Article" else "proceeding"

    # Remove a leading ", " separator only when it is really there. The old
    # unconditional [2:] slice also cut the first two letters of the first
    # author whenever the string had no such prefix (the recorded sample
    # 'Yongchao Yang, ...' has none).
    paper["authors"] = row["Authors"].lstrip(", ").replace(" /", "; ")
    paper["year"] = row["pubyear"]

    # The ASCE "doi" value is stored as-is (no resolver prefix is added here).
    paper["doi"] = row["doi"] if pd.notnull(row["doi"]) else None
    asce_list.append(paper)

In [62]:
# Preview the first converted ASCE record.
asce_list[0]

{'title': 'annotating 2d imagery with 3d kinematically configurable assets of construction equipment for training pose-informed activity analysis and safety monitoring algorithms',
 'source': 'computing in civil engineering 2019: visualization, information modeling, and simulation',
 'abstract': "the availability of inexpensive and high-quality cameras has enabled research towards computer vision systems for tracking construction productivity and safety by detecting, tracking, and estimating pose of construction resources and recognizing their activities. to verify these algorithms can generalize to novel visual scenes before deployment, large amounts of labelled training and real-world validation samples are needed. while automatic generation of synthetic data helps with the former, obtaining the latter is less practical. to address this gap, we introduce a tool with which designated annotators perform pose annotation by interactively aligning 3d kinematically configurable equipment m

## 2. 对title和abstract进行编码

In [2]:
import torch
from sklearn.metrics.pairwise import cosine_similarity
import random
import numpy as np

from tqdm import tqdm
import time
import copy

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds PyTorch (CPU and all CUDA devices), NumPy and the stdlib ``random``
    module, and switches cuDNN to deterministic behaviour.

    Args:
        seed: Integer seed shared by all generators.
    """
    torch.manual_seed(seed)  # CPU generator
    torch.cuda.manual_seed_all(seed)  # every GPU generator

    np.random.seed(seed)
    random.seed(seed)

    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it is switched off as well.
    torch.backends.cudnn.benchmark = False

### 2.1 用BERT进行编码

In [5]:
from transformers import BertTokenizer, BertModel

In [6]:
# Load the pretrained tokenizer and encoder for the checkpoint named below.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [7]:
def getEmbeddings(sentence, tokenizer, model, device):
    """Return mean-pooled last-layer embeddings for *sentence*.

    Args:
        sentence: A string, or a list of strings to encode as one batch.
        tokenizer: Callable tokenizer returning "input_ids" and
            "attention_mask" tensors (called with ``return_tensors="pt"``).
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Torch device (or device string) to run on.

    Returns:
        A NumPy array with one row per input sentence.
    """
    tokens = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        attention_mask = tokens["attention_mask"]
        # Pass the mask to the encoder so padded positions are not attended to.
        outputs = model(input_ids=tokens["input_ids"], attention_mask=attention_mask)
        hidden = outputs.last_hidden_state
        # Average over real tokens only; a plain mean(dim=1) would also
        # average the padding vectors of the shorter sentences in a batch.
        weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1)
        embeddings = (hidden * weights).sum(dim=1) / token_counts
    return embeddings.cpu().numpy()

In [107]:
# Encode every WoS paper with BERT: the full title plus the first 100
# characters of the abstract. The duplicate flag is reset and the pass is timed.
wos_list_ebds = []
wos_time_start = time.time()
for paper in tqdm(wos_list):
    title, abstract = paper["title"], paper["abstract"]
    paper.update(
        title_ebds=getEmbeddings(title, tokenizer, model, device),
        abstract_ebds=getEmbeddings(abstract[:100], tokenizer, model, device),
        duplicated=0,
    )
    wos_list_ebds.append(paper)
wos_time_end = time.time()
print(f"total time: {wos_time_end - wos_time_start}s")
print(f"average time: {(wos_time_end - wos_time_start)/len(wos_list)}s")

100%|████████████████████████████████████████████████████████████████████████████████| 239/239 [00:11<00:00, 21.18it/s]

total time: 11.292421340942383s
average time: 0.04724862485749951s





In [108]:
# Encode every Scopus paper with BERT: the full title plus the first 100
# characters of the abstract. The duplicate flag is reset and the pass is timed.
scopus_list_ebds = []
scopus_time_start = time.time()
for paper in tqdm(scopus_list):
    title, abstract = paper["title"], paper["abstract"]
    paper.update(
        title_ebds=getEmbeddings(title, tokenizer, model, device),
        abstract_ebds=getEmbeddings(abstract[:100], tokenizer, model, device),
        duplicated=0,
    )
    scopus_list_ebds.append(paper)
scopus_time_end = time.time()
print(f"total time: {scopus_time_end - scopus_time_start}s")
print(f"average time: {(scopus_time_end - scopus_time_start)/len(scopus_list)}s")

100%|████████████████████████████████████████████████████████████████████████████████| 329/329 [00:16<00:00, 20.26it/s]

total time: 16.245072603225708s
average time: 0.049377120374546224s





In [109]:
# Encode every ASCE paper with BERT: the full title plus the first 100
# characters of the abstract. The duplicate flag is reset and the pass is timed.
asce_list_ebds = []
asce_time_start = time.time()
for paper in tqdm(asce_list):
    title, abstract = paper["title"], paper["abstract"]
    paper.update(
        title_ebds=getEmbeddings(title, tokenizer, model, device),
        abstract_ebds=getEmbeddings(abstract[:100], tokenizer, model, device),
        duplicated=0,
    )
    asce_list_ebds.append(paper)
asce_time_end = time.time()
print(f"total time: {asce_time_end - asce_time_start}s")
print(f"average time: {(asce_time_end - asce_time_start)/len(asce_list)}s")

100%|████████████████████████████████████████████████████████████████████████████████| 145/145 [00:07<00:00, 19.32it/s]

total time: 7.507044076919556s
average time: 0.051772717771859s





In [110]:
# Snapshot the three embedded lists; deepcopy also duplicates the paper dicts
# and their embedding arrays.
wos_list_ebds_backup, scopus_list_ebds_backup, asce_list_ebds_backup = (
    copy.deepcopy(embedded)
    for embedded in (wos_list_ebds, scopus_list_ebds, asce_list_ebds)
)

In [111]:
# Restore the working lists from the snapshots. A deep copy is required here:
# list.copy() is shallow, so the working lists would share their paper dicts
# with the backups, and the de-duplication pass (which sets "duplicated" and
# overwrites "abstract" / "authors" / "doi" in place) would corrupt the backups.
wos_list_ebds = copy.deepcopy(wos_list_ebds_backup)
scopus_list_ebds = copy.deepcopy(scopus_list_ebds_backup)
asce_list_ebds = copy.deepcopy(asce_list_ebds_backup)

In [112]:
# Merge the three sources (WoS, Scopus, ASCE order) and sort by title so that
# papers with the same title end up next to each other.
combined_list_ebds = sorted(
    [*wos_list_ebds, *scopus_list_ebds, *asce_list_ebds],
    key=lambda entry: entry["title"],
)

In [116]:
# Greedy de-duplication over the title-sorted list: each not-yet-removed paper
# is compared with the papers that follow it; neighbours whose title embedding
# has cosine similarity > 0.95 are flagged as duplicates and merged into it.
deleted_idx = set()  # set membership keeps the "already removed?" check O(1)
start_time = time.time()
for i, paper_i in enumerate(combined_list_ebds):
    if i in deleted_idx:
        continue
    paper_i_title_ebds = paper_i["title_ebds"]

    for j in range(i + 1, len(combined_list_ebds)):
        paper_j = combined_list_ebds[j]
        # The encoder helpers return single-row arrays, so the similarity
        # matrix is 1x1; take the scalar instead of truth-testing an array.
        title_similarity = cosine_similarity(paper_i_title_ebds, paper_j["title_ebds"])[0, 0]
        if title_similarity > 0.95:
            deleted_idx.add(j)

            # The SHORTER abstract (and its embedding) is kept.
            # NOTE(review): authors keep the longer value below -- confirm that
            # preferring the shorter abstract is intentional.
            if len(paper_j["abstract"]) < len(paper_i["abstract"]):
                paper_i["abstract"] = paper_j["abstract"]
                paper_i["abstract_ebds"] = paper_j["abstract_ebds"]

            # Keep the longer author string.
            if len(paper_j["authors"]) > len(paper_i["authors"]):
                paper_i["authors"] = paper_j["authors"]

            # Fill in a missing DOI from the duplicate.
            if paper_i["doi"] is None and paper_j["doi"] is not None:
                paper_i["doi"] = paper_j["doi"]

            paper_j["duplicated"] = 1
        else:
            # The list is sorted by title, so the first dissimilar neighbour
            # ends the run of candidates for paper_i.
            break
end_time = time.time()
# Keep the list-typed name available; indices were added in increasing order.
deleteList = sorted(deleted_idx)

print(f"processing time: {end_time - start_time}s")
print(f"avarage processing time: {(end_time - start_time) / len(combined_list_ebds)}")
print(f"total item: {len(combined_list_ebds)}")
print(f"total filtered item: {len(combined_list_ebds) - len(deleteList)}")

# Note 1: unique papers can be selected afterwards via paper["duplicated"] == 0, e.g.:
# for paper in combined_list_ebds:
#     if paper["duplicated"] == 0:
#         print(paper["title"])

processing time: 0.5564508438110352s
avarage processing time: 0.0007804359660743831
total item: 713
total filtered item: 496


In [118]:
# Length of the first retained paper's title vector.
# NOTE: `retained_list_ebds` is assigned by the cell that follows this one
# (recorded as In [117]); run that cell first when executing top to bottom.
len(retained_list_ebds[0]["title_ebds"][0])

768

In [117]:
# Post-processing: keep the papers not flagged as duplicates and pickle them locally.
retained_list_ebds = list(
    filter(lambda entry: entry["duplicated"] == 0, combined_list_ebds)
)
with open("./results/result_2_bert.pkl", "wb") as pkl_out:
    pickle.dump(retained_list_ebds, pkl_out)

In [339]:
# Scopus + WoS = 354
# Scopus + WoS + ASCE = 354 + 74 = 428

### 2.2 用OpenAI_Embedding进行编码

In [10]:
!pip install dotenv

Collecting dotenv
  Downloading dotenv-0.0.5.tar.gz (2.4 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [78 lines of output]
  E:\software_anaconda\envs\LLM\lib\site-packages\setuptools\__init__.py:94: _DeprecatedInstaller: setuptools.installer and fetch_build_eggs are deprecated.
  !!
  
          ********************************************************************************
          Requirements should be satisfied by a PEP 517 installer.
          If you are using pip, you can try `pip install --use-pep517`.
          ********************************************************************************
  
  !!
    dist.fetch_build_eggs(dist.setup_requires)
    error: subprocess-exited-with-error
  
    python setup.py egg_info did not run successfully.
    exit code: 1
  
    [16 lines of output]
    Traceback (most recent call last):
      File "<string>", line 2, in <module>
      File "<pip-setuptools-caller>", line 14, in <module>
      File "C:\Users\HKU-i5-Oscar\AppDat

In [9]:
import openai
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment,
# then hand the OPENAI_API_KEY value (None when unset) to the openai module.
_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv("OPENAI_API_KEY")

ModuleNotFoundError: No module named 'dotenv'

In [135]:
# Module-level client used by getEmbeddings_openai.
client = openai.OpenAI()

In [136]:
def getEmbeddings_openai(sentence, model="text-embedding-3-small"):
    """Embed *sentence* through the module-level OpenAI ``client``.

    Args:
        sentence: Text passed as the ``input`` of the embeddings request.
        model: Name of the embedding model to request.

    Returns:
        The first returned embedding as a NumPy array of shape (1, dim).
    """
    # NOTE(review): the model argument used to carry an unexplained "0.0020"
    # annotation (possibly a price) -- confirm before relying on it.
    response = client.embeddings.create(
        input=sentence,
        model=model,
        encoding_format="float",
    )
    vector = response.data[0].embedding
    return np.array(vector).reshape(1, -1)

In [94]:
# Smoke test: `title` is whatever value the most recently run embedding loop left behind.
getEmbeddings_openai(title)

array([[ 0.04569226,  0.00586341,  0.06200783, ...,  0.02112128,
        -0.01910353, -0.02716368]])

In [137]:
# Encode every WoS paper with the OpenAI embedding endpoint: the full title
# plus the first 100 characters of the abstract. The duplicate flag is reset
# and the pass is timed.
wos_list_ebds = []
wos_time_start = time.time()
for paper in tqdm(wos_list):
    title, abstract = paper["title"], paper["abstract"]
    paper.update(
        title_ebds=getEmbeddings_openai(title),
        abstract_ebds=getEmbeddings_openai(abstract[:100]),
        duplicated=0,
    )
    wos_list_ebds.append(paper)
wos_time_end = time.time()
print(f"total time: {wos_time_end - wos_time_start}s")
print(f"average time: {(wos_time_end - wos_time_start)/len(wos_list)}s")

100%|████████████████████████████████████████████████████████████████████████████████| 239/239 [04:34<00:00,  1.15s/it]

total time: 274.1270213127136s
average time: 1.1469749845720236s





In [138]:
# Encode every Scopus paper with the OpenAI embedding endpoint: the full title
# plus the first 100 characters of the abstract. The duplicate flag is reset
# and the pass is timed.
scopus_list_ebds = []
scopus_time_start = time.time()
for paper in tqdm(scopus_list):
    title, abstract = paper["title"], paper["abstract"]
    paper.update(
        title_ebds=getEmbeddings_openai(title),
        abstract_ebds=getEmbeddings_openai(abstract[:100]),
        duplicated=0,
    )
    scopus_list_ebds.append(paper)
scopus_time_end = time.time()
print(f"total time: {scopus_time_end - scopus_time_start}s")
print(f"average time: {(scopus_time_end - scopus_time_start)/len(scopus_list)}s")

100%|████████████████████████████████████████████████████████████████████████████████| 329/329 [06:08<00:00,  1.12s/it]

total time: 368.5307791233063s
average time: 1.1201543438398367s





In [139]:
# Encode every ASCE paper with the OpenAI embedding endpoint: the full title
# plus the first 100 characters of the abstract. The duplicate flag is reset
# and the pass is timed.
asce_list_ebds = []
asce_time_start = time.time()
for paper in tqdm(asce_list):
    title, abstract = paper["title"], paper["abstract"]
    paper.update(
        title_ebds=getEmbeddings_openai(title),
        abstract_ebds=getEmbeddings_openai(abstract[:100]),
        duplicated=0,
    )
    asce_list_ebds.append(paper)
asce_time_end = time.time()
print(f"total time: {asce_time_end - asce_time_start}s")
print(f"average time: {(asce_time_end - asce_time_start)/len(asce_list)}s")

100%|████████████████████████████████████████████████████████████████████████████████| 145/145 [02:40<00:00,  1.10s/it]

total time: 160.12268996238708s
average time: 1.104294413533704s





In [140]:
# Snapshot the three embedded lists; deepcopy also duplicates the paper dicts
# and their embedding arrays.
wos_list_ebds_backup, scopus_list_ebds_backup, asce_list_ebds_backup = (
    copy.deepcopy(embedded)
    for embedded in (wos_list_ebds, scopus_list_ebds, asce_list_ebds)
)

In [141]:
# Restore the working lists from the snapshots. A deep copy is required here:
# list.copy() is shallow, so the working lists would share their paper dicts
# with the backups, and the de-duplication pass (which sets "duplicated" and
# overwrites "abstract" / "authors" / "doi" in place) would corrupt the backups.
wos_list_ebds = copy.deepcopy(wos_list_ebds_backup)
scopus_list_ebds = copy.deepcopy(scopus_list_ebds_backup)
asce_list_ebds = copy.deepcopy(asce_list_ebds_backup)

In [142]:
# Merge the three sources (WoS, Scopus, ASCE order) and sort by title so that
# papers with the same title end up next to each other.
combined_list_ebds = sorted(
    [*wos_list_ebds, *scopus_list_ebds, *asce_list_ebds],
    key=lambda entry: entry["title"],
)

In [143]:
# Greedy de-duplication over the title-sorted list: each not-yet-removed paper
# is compared with the papers that follow it; neighbours whose title embedding
# has cosine similarity > 0.95 are flagged as duplicates and merged into it.
deleted_idx = set()  # set membership keeps the "already removed?" check O(1)
start_time = time.time()
for i, paper_i in enumerate(combined_list_ebds):
    if i in deleted_idx:
        continue
    paper_i_title_ebds = paper_i["title_ebds"]

    for j in range(i + 1, len(combined_list_ebds)):
        paper_j = combined_list_ebds[j]
        # The encoder helpers return single-row arrays, so the similarity
        # matrix is 1x1; take the scalar instead of truth-testing an array.
        title_similarity = cosine_similarity(paper_i_title_ebds, paper_j["title_ebds"])[0, 0]
        if title_similarity > 0.95:
            deleted_idx.add(j)

            # The SHORTER abstract (and its embedding) is kept.
            # NOTE(review): authors keep the longer value below -- confirm that
            # preferring the shorter abstract is intentional.
            if len(paper_j["abstract"]) < len(paper_i["abstract"]):
                paper_i["abstract"] = paper_j["abstract"]
                paper_i["abstract_ebds"] = paper_j["abstract_ebds"]

            # Keep the longer author string.
            if len(paper_j["authors"]) > len(paper_i["authors"]):
                paper_i["authors"] = paper_j["authors"]

            # Fill in a missing DOI from the duplicate.
            if paper_i["doi"] is None and paper_j["doi"] is not None:
                paper_i["doi"] = paper_j["doi"]

            paper_j["duplicated"] = 1
        else:
            # The list is sorted by title, so the first dissimilar neighbour
            # ends the run of candidates for paper_i.
            break
end_time = time.time()
# Keep the list-typed name available; indices were added in increasing order.
deleteList = sorted(deleted_idx)

print(f"processing time: {end_time - start_time}s")
print(f"avarage processing time: {(end_time - start_time) / len(combined_list_ebds)}")
print(f"total item: {len(combined_list_ebds)}")
print(f"total filtered item: {len(combined_list_ebds) - len(deleteList)}")

# Note 1: unique papers can be selected afterwards via paper["duplicated"] == 0, e.g.:
# for paper in combined_list_ebds:
#     if paper["duplicated"] == 0:
#         print(paper["title"])

processing time: 0.24077630043029785s
avarage processing time: 0.0003376946710102354
total item: 713
total filtered item: 496


In [102]:
# Post-processing: keep the papers not flagged as duplicates and pickle them locally.
retained_list_ebds = list(
    filter(lambda entry: entry["duplicated"] == 0, combined_list_ebds)
)
with open("./results/result_2_openai.pkl", "wb") as pkl_out:
    pickle.dump(retained_list_ebds, pkl_out)

In [105]:
# Length of the first retained paper's title vector.
len(retained_list_ebds[0]["title_ebds"][0])

1536

### 2.3 用Llama 3进行编码

In [119]:
import ollama

In [120]:
def getEmbeddings_ollama(sentence, model="llama2"):
    """Embed *sentence* with a model served through ``ollama``.

    Args:
        sentence: Text sent as the embedding prompt.
        model: Name of the ollama model to query.

    Returns:
        The "embedding" field of the response as a NumPy array of shape (1, dim).
    """
    # NOTE(review): the section heading mentions Llama 3, but the default model
    # here is "llama2" -- confirm which one is intended.
    response = ollama.embeddings(prompt=sentence, model=model)
    vector = np.array(response["embedding"])
    return vector.reshape(1, -1)

In [121]:
# Encode every WoS paper through ollama: the full title plus the first 100
# characters of the abstract. The duplicate flag is reset and the pass is timed.
wos_list_ebds = []
wos_time_start = time.time()
for paper in tqdm(wos_list):
    title, abstract = paper["title"], paper["abstract"]
    paper.update(
        title_ebds=getEmbeddings_ollama(title),
        abstract_ebds=getEmbeddings_ollama(abstract[:100]),
        duplicated=0,
    )
    wos_list_ebds.append(paper)
wos_time_end = time.time()
print(f"total time: {wos_time_end - wos_time_start}s")
print(f"average time: {(wos_time_end - wos_time_start)/len(wos_list)}s")

100%|████████████████████████████████████████████████████████████████████████████████| 239/239 [00:31<00:00,  7.68it/s]

total time: 31.120624542236328s
average time: 0.13021181816835284s





In [122]:
# Encode every Scopus paper through ollama: the full title plus the first 100
# characters of the abstract. The duplicate flag is reset and the pass is timed.
scopus_list_ebds = []
scopus_time_start = time.time()
for paper in tqdm(scopus_list):
    title, abstract = paper["title"], paper["abstract"]
    paper.update(
        title_ebds=getEmbeddings_ollama(title),
        abstract_ebds=getEmbeddings_ollama(abstract[:100]),
        duplicated=0,
    )
    scopus_list_ebds.append(paper)
scopus_time_end = time.time()
print(f"total time: {scopus_time_end - scopus_time_start}s")
print(f"average time: {(scopus_time_end - scopus_time_start)/len(scopus_list)}s")

100%|████████████████████████████████████████████████████████████████████████████████| 329/329 [00:39<00:00,  8.34it/s]

total time: 39.464702129364014s
average time: 0.11995350191296053s





In [123]:
# Encode every ASCE paper through ollama: the full title plus the first 100
# characters of the abstract. The duplicate flag is reset and the pass is timed.
asce_list_ebds = []
asce_time_start = time.time()
for paper in tqdm(asce_list):
    title, abstract = paper["title"], paper["abstract"]
    paper.update(
        title_ebds=getEmbeddings_ollama(title),
        abstract_ebds=getEmbeddings_ollama(abstract[:100]),
        duplicated=0,
    )
    asce_list_ebds.append(paper)
asce_time_end = time.time()
print(f"total time: {asce_time_end - asce_time_start}s")
print(f"average time: {(asce_time_end - asce_time_start)/len(asce_list)}s")

100%|████████████████████████████████████████████████████████████████████████████████| 145/145 [00:17<00:00,  8.35it/s]

total time: 17.375646829605103s
average time: 0.11983204710072484s





In [124]:
# Snapshot the three embedded lists; deepcopy also duplicates the paper dicts
# and their embedding arrays.
wos_list_ebds_backup, scopus_list_ebds_backup, asce_list_ebds_backup = (
    copy.deepcopy(embedded)
    for embedded in (wos_list_ebds, scopus_list_ebds, asce_list_ebds)
)

In [125]:
# Restore the working lists from the snapshots. A deep copy is required here:
# list.copy() is shallow, so the working lists would share their paper dicts
# with the backups, and the de-duplication pass (which sets "duplicated" and
# overwrites "abstract" / "authors" / "doi" in place) would corrupt the backups.
wos_list_ebds = copy.deepcopy(wos_list_ebds_backup)
scopus_list_ebds = copy.deepcopy(scopus_list_ebds_backup)
asce_list_ebds = copy.deepcopy(asce_list_ebds_backup)

In [126]:
# Merge the three sources (WoS, Scopus, ASCE order) and sort by title so that
# papers with the same title end up next to each other.
combined_list_ebds = sorted(
    [*wos_list_ebds, *scopus_list_ebds, *asce_list_ebds],
    key=lambda entry: entry["title"],
)

In [127]:
# Greedy de-duplication over the title-sorted list: each not-yet-removed paper
# is compared with the papers that follow it; neighbours whose title embedding
# has cosine similarity > 0.95 are flagged as duplicates and merged into it.
deleted_idx = set()  # set membership keeps the "already removed?" check O(1)
start_time = time.time()
for i, paper_i in enumerate(combined_list_ebds):
    if i in deleted_idx:
        continue
    paper_i_title_ebds = paper_i["title_ebds"]

    for j in range(i + 1, len(combined_list_ebds)):
        paper_j = combined_list_ebds[j]
        # The encoder helpers return single-row arrays, so the similarity
        # matrix is 1x1; take the scalar instead of truth-testing an array.
        title_similarity = cosine_similarity(paper_i_title_ebds, paper_j["title_ebds"])[0, 0]
        if title_similarity > 0.95:
            deleted_idx.add(j)

            # The SHORTER abstract (and its embedding) is kept.
            # NOTE(review): authors keep the longer value below -- confirm that
            # preferring the shorter abstract is intentional.
            if len(paper_j["abstract"]) < len(paper_i["abstract"]):
                paper_i["abstract"] = paper_j["abstract"]
                paper_i["abstract_ebds"] = paper_j["abstract_ebds"]

            # Keep the longer author string.
            if len(paper_j["authors"]) > len(paper_i["authors"]):
                paper_i["authors"] = paper_j["authors"]

            # Fill in a missing DOI from the duplicate.
            if paper_i["doi"] is None and paper_j["doi"] is not None:
                paper_i["doi"] = paper_j["doi"]

            paper_j["duplicated"] = 1
        else:
            # The list is sorted by title, so the first dissimilar neighbour
            # ends the run of candidates for paper_i.
            break
end_time = time.time()
# Keep the list-typed name available; indices were added in increasing order.
deleteList = sorted(deleted_idx)

print(f"processing time: {end_time - start_time}s")
print(f"avarage processing time: {(end_time - start_time) / len(combined_list_ebds)}")
print(f"total item: {len(combined_list_ebds)}")
print(f"total filtered item: {len(combined_list_ebds) - len(deleteList)}")

# Note 1: unique papers can be selected afterwards via paper["duplicated"] == 0, e.g.:
# for paper in combined_list_ebds:
#     if paper["duplicated"] == 0:
#         print(paper["title"])

processing time: 0.5686345100402832s
avarage processing time: 0.0007975238569989946
total item: 713
total filtered item: 496


In [128]:
# Post-processing: keep the papers not flagged as duplicates and pickle them locally.
retained_list_ebds = list(
    filter(lambda entry: entry["duplicated"] == 0, combined_list_ebds)
)
with open("./results/result_2_llama.pkl", "wb") as pkl_out:
    pickle.dump(retained_list_ebds, pkl_out)

In [129]:
# Length of the first retained paper's title vector.
len(retained_list_ebds[0]["title_ebds"][0])

4096

### 保存结果

In [130]:
# Reload the de-duplicated BERT result written by this notebook.
# (pickle must only ever be used on files you produced yourself.)
with open("./results/result_2_bert.pkl", "rb") as pkl_in:
    papers_info = pickle.load(pkl_in)

In [131]:
# Build the export table in a single DataFrame call. Growing a frame with
# pd.concat once per paper copies the whole frame every iteration and starts
# from an empty frame, which newer pandas versions warn about.
# To export every field instead, use:
# columns = ["title", "source", "abstract", "type", "authors", "year", "doi"]
columns = ["year", "title", "abstract"]
papers_df = pd.DataFrame(
    [{column: paper[column] for column in columns} for paper in papers_info],
    columns=columns,  # keeps the column order (and the header when the list is empty)
)

In [133]:
# Write the year/title/abstract table as UTF-8 CSV.
# No index=False is passed, so the row index is written as the first column.
papers_df.to_csv("./results/results_2.csv", encoding="utf-8")