### 0. 引入基本的库

In [1]:
import time
import json
from tqdm import tqdm
import copy
import pickle

import pandas as pd
import numpy as np

import torch
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Prefer the GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [33]:
# All three exports are decoded as ISO-8859-1 (Latin-1), which can represent
# most Western European characters.
_CSV_ENCODING = "ISO-8859-1"

# ASCE export (128 records according to the original note).
asce_data = pd.read_csv("./data/Asce_Result.csv", encoding=_CSV_ENCODING)

# Web of Science export (207 records according to the original note).
wos_data = pd.read_csv("./data/WoS_Result.csv", encoding=_CSV_ENCODING)

# Scopus export (247 records according to the original note).
scopus_data = pd.read_csv("./data/Scopus_Result.csv", encoding=_CSV_ENCODING)

In [34]:
# Convert the raw Web of Science export into a list of uniform paper dicts
# with the keys: title, source, abstract, type, authors, year, doi.
_WOS_TYPE_BY_CODE = {"J": "article", "C": "proceeding"}
_WOS_TEXT_FIXES = (("–", "-"), ("’", "'"), ("—", "-"), ("&nbsp;", " "), ("−", "-"))


def _clean_wos_text(text):
    """Lower-case *text*, then apply each literal substitution in order."""
    text = text.lower()
    for old, new in _WOS_TEXT_FIXES:
        text = text.replace(old, new)
    return text


wos_list = []
for _, row in wos_data.iterrows():
    paper = {}
    paper["title"] = _clean_wos_text(row["Article Title"])
    paper["source"] = row["Source Title"].lower()
    paper["abstract"] = _clean_wos_text(row["Abstract"])

    # NOTE(review): the "ï»¿" prefix looks like a UTF-8 byte-order mark decoded
    # as ISO-8859-1; the key must match the header exactly as pandas read it.
    publication_type = row["ï»¿publication type"]
    if publication_type in _WOS_TYPE_BY_CODE:
        # Any other code leaves "type" unset, as before.
        paper["type"] = _WOS_TYPE_BY_CODE[publication_type]

    paper["authors"] = row["Authors"].replace(" /", "; ")
    paper["year"] = row["Publication Year"]

    # Guard on the column that is actually concatenated. The previous check
    # looked at "DOI Link", so a row with a link but no DOI raised TypeError
    # (str + NaN) and a row with a DOI but no link lost its DOI.
    doi = row["DOI"]
    paper["doi"] = None if pd.isnull(doi) else "https://doi.org/" + doi
    wos_list.append(paper)

In [35]:
# Build uniform paper dicts (title, source, abstract, type, authors, year, doi)
# from the Scopus export.
_SCOPUS_TEXT_FIXES = (("–", "-"), ("’", "'"), ("—", "-"), ("&nbsp;", " "), ("−", "-"))


def _tidy_scopus_text(raw):
    """Lower-case *raw*, then apply each literal substitution in order."""
    tidy = raw.lower()
    for needle, substitute in _SCOPUS_TEXT_FIXES:
        tidy = tidy.replace(needle, substitute)
    return tidy


scopus_list = []
for _, record in scopus_data.iterrows():
    scopus_list.append({
        "title": _tidy_scopus_text(record["Title"]),
        "source": record["Source title"].lower(),
        "abstract": _tidy_scopus_text(record["Abstract"]),
        # Everything that is not an "Article" is treated as a proceeding.
        "type": "article" if record["Document Type"] == "Article" else "proceeding",
        # The authors header is read with a "ï»¿" prefix; keep the key verbatim.
        "authors": record["ï»¿authors"].replace(" /", "; "),
        "year": record["Year"],
        # Bare DOIs are turned into resolvable URLs; missing ones become None.
        "doi": ("https://doi.org/" + record["DOI"]) if pd.notnull(record["DOI"]) else None,
    })

In [36]:
# Build uniform paper dicts (title, source, abstract, type, authors, year, doi)
# from the ASCE export.
def _asce_row_to_paper(entry):
    """Map one row of ``asce_data`` onto the shared paper-dict layout."""

    def _scrub(value):
        # Lower-case, then replace typographic dashes/quotes and "&nbsp;".
        value = value.lower()
        for target, plain in (("–", "-"), ("’", "'"), ("—", "-"), ("&nbsp;", " "), ("−", "-")):
            value = value.replace(target, plain)
        return value

    converted = {
        "title": _scrub(entry["title"]),
        "source": entry["source"].lower(),
        "abstract": _scrub(entry["abstract"]),
    }
    # Anything other than "Journal Article" is treated as a proceeding.
    converted["type"] = "article" if entry["type"] == "Journal Article" else "proceeding"
    # The first two characters of the authors field are dropped before the
    # separator is rewritten.
    converted["authors"] = entry["Authors"][2:].replace(" /", "; ")
    converted["year"] = entry["pubyear"]
    # The DOI value is stored as found in the export; a missing one becomes None.
    converted["doi"] = entry["doi"] if pd.notnull(entry["doi"]) else None
    return converted


asce_list = [_asce_row_to_paper(entry) for _, entry in asce_data.iterrows()]

### 1. 文件读取

In [37]:
from transformers import BertTokenizer, BertModel

# Checkpoint name handed to both from_pretrained() calls below.
model_name = 'bert-base-uncased'
# Tokenizer and encoder are loaded from the same checkpoint name; both are
# passed to getEmbeddings() by the embedding cells further down.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)



In [38]:
def getEmbeddings(sentence, tokenizer, model, device):
    """Return mean-pooled last-layer embeddings for ``sentence``.

    The text is tokenized (with padding and truncation enabled), run through
    ``model`` in eval mode without gradients, and ``last_hidden_state`` is
    averaged over dimension 1. Positions whose ``attention_mask`` is 0 are
    excluded from the average, so a list of sentences of different lengths is
    not skewed by padding. For a single sentence the mask is all ones and the
    result equals a plain mean.

    Args:
        sentence: Text passed straight to ``tokenizer`` (a string, or a list
            of strings for a batch).
        tokenizer: Callable returning a mapping that contains ``input_ids``
            and ``attention_mask`` tensors and supports ``.to(device)``.
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Device identifier the tokens and the model are moved to.

    Returns:
        The pooled embeddings as a NumPy array on the CPU.
    """
    tokens = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        attention_mask = tokens["attention_mask"]
        outputs = model(input_ids=tokens["input_ids"], attention_mask=attention_mask)
        hidden = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension so padded positions
        # contribute nothing to the sum.
        weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1)  # avoid division by zero
        embeddings = (hidden * weights).sum(dim=1) / token_counts
    return embeddings.cpu().numpy()

In [42]:
# Embed every WoS title and abstract prefix with BERT, timing the whole pass.
wos_list_ebds = []
wos_time_start = time.time()
for paper in tqdm(wos_list):
    # Each paper dict is extended in place and the same object is collected.
    # Only the first 100 characters of the abstract are embedded.
    paper.update(
        title_ebds=getEmbeddings(paper["title"], tokenizer, model, device),
        abstract_ebds=getEmbeddings(paper["abstract"][:100], tokenizer, model, device),
        duplicated=0,
    )
    wos_list_ebds.append(paper)
wos_time_end = time.time()
wos_elapsed = wos_time_end - wos_time_start
print(f"total time: {wos_elapsed}s")
print(f"average time: {wos_elapsed/len(wos_list)}s")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████████████████████████████████████████████████████████| 206/206 [00:15<00:00, 13.11it/s]

total time: 15.714799404144287s
average time: 0.07628543400070043s





In [43]:
# Embed every Scopus title and abstract prefix with BERT, timing the whole pass.
scopus_list_ebds = []
scopus_time_start = time.time()
for record in tqdm(scopus_list):
    # The record is mutated in place; the collected list holds the same dicts.
    record["title_ebds"] = getEmbeddings(record["title"], tokenizer, model, device)
    # Only the first 100 characters of the abstract are embedded.
    abstract_prefix = record["abstract"][:100]
    record["abstract_ebds"] = getEmbeddings(abstract_prefix, tokenizer, model, device)
    record["duplicated"] = 0
    scopus_list_ebds.append(record)
scopus_time_end = time.time()
scopus_elapsed = scopus_time_end - scopus_time_start
print(f"total time: {scopus_elapsed}s")
print(f"average time: {scopus_elapsed/len(scopus_list)}s")

100%|██████████████████████████████████████████████████████████████| 247/247 [00:23<00:00, 10.65it/s]

total time: 23.195196866989136s
average time: 0.0939076796234378s





In [45]:
# Embed every ASCE title and abstract prefix with BERT, timing the whole pass.
asce_list_ebds = []
asce_time_start = time.time()
for entry in tqdm(asce_list):
    title_vector = getEmbeddings(entry["title"], tokenizer, model, device)
    # Only the first 100 characters of the abstract are embedded.
    abstract_vector = getEmbeddings(entry["abstract"][:100], tokenizer, model, device)
    # The entry is mutated in place; the collected list holds the same dicts.
    entry["title_ebds"], entry["abstract_ebds"] = title_vector, abstract_vector
    entry["duplicated"] = 0
    asce_list_ebds.append(entry)
asce_time_end = time.time()
asce_elapsed = asce_time_end - asce_time_start
print(f"total time: {asce_elapsed}s")
print(f"average time: {asce_elapsed/len(asce_list)}s")

100%|██████████████████████████████████████████████████████████████| 128/128 [00:14<00:00,  8.94it/s]

total time: 14.32786250114441s
average time: 0.1119364257901907s





In [48]:
# Snapshot the embedded lists so later steps can be re-run from a clean state.
wos_list_ebds_backup = copy.deepcopy(wos_list_ebds)
scopus_list_ebds_backup = copy.deepcopy(scopus_list_ebds)
asce_list_ebds_backup = copy.deepcopy(asce_list_ebds)

# Work on deep copies of the snapshots. A shallow list.copy() shares the paper
# dicts with the backups, so any in-place edit of a working paper (abstract,
# authors, doi, duplicated) would silently rewrite the snapshot as well.
wos_list_ebds = copy.deepcopy(wos_list_ebds_backup)
scopus_list_ebds = copy.deepcopy(scopus_list_ebds_backup)
asce_list_ebds = copy.deepcopy(asce_list_ebds_backup)

# Pool the three sources (WoS, Scopus, ASCE — in that order) and sort by title.
# sorted() is stable, so equal titles keep their source order.
combined_list_ebds = sorted(
    wos_list_ebds + scopus_list_ebds + asce_list_ebds,
    key=lambda x: x["title"],
)

In [51]:
# Flag near-duplicate papers and fold their metadata into the first occurrence.
# Each paper is compared with the entries that follow it, stopping at the first
# one whose title embedding is not similar enough.
deleteList = []            # positions flagged as duplicates, in discovery order
deleted_positions = set()  # same positions, kept for O(1) membership tests
start_time = time.time()
for i in range(0, len(combined_list_ebds)):
    if i in deleted_positions:
        continue
    paper_i = combined_list_ebds[i]
    paper_i_title_ebds = paper_i["title_ebds"]

    for j in range(i+1, len(combined_list_ebds)):
        paper_j = combined_list_ebds[j]
        paper_j_title_ebds = paper_j["title_ebds"]
        # cosine_similarity returns a 2-D array; .item() extracts the scalar
        # and raises if the array does not hold exactly one value.
        title_similarity = cosine_similarity(paper_i_title_ebds, paper_j_title_ebds).item()
        if not title_similarity > 0.95:
            # First dissimilar follower ends the run for paper_i.
            break

        deleteList.append(j)
        deleted_positions.add(j)

        # Keep the longer abstract together with its embedding.
        if len(paper_j["abstract"]) > len(paper_i["abstract"]):
            paper_i["abstract"] = paper_j["abstract"]
            paper_i["abstract_ebds"] = paper_j["abstract_ebds"]

        # Keep the longer author string.
        if len(paper_j["authors"]) > len(paper_i["authors"]):
            paper_i["authors"] = paper_j["authors"]

        # Fill in a missing DOI from the duplicate.
        if paper_i["doi"] is None and paper_j["doi"] is not None:
            paper_i["doi"] = paper_j["doi"]

        paper_j["duplicated"] = 1
end_time = time.time()

print(f"processing time: {end_time - start_time}s")
print(f"avarage processing time: {(end_time - start_time) / len(combined_list_ebds)}")
print(f"total item: {len(combined_list_ebds)}")
print(f"total filtered item: {len(combined_list_ebds) - len(deleteList)}")

# Note 1: unique papers can later be selected with paper["duplicated"] == 0,
# as in the following snippet:
# for paper in combined_list_ebds:
#     if paper["duplicated"] == 0:
#         print(paper["title"])

processing time: 0.4702026844024658s
avarage processing time: 0.0008092989404517484
total item: 581
total filtered item: 401


In [75]:
# 1. 将数据combined_list_ebds存到本地
# with open("./results/results_2.pkl", "wb") as fp:
#     pickle.dump(combined_list_ebds, fp)

In [76]:
# 2. Load combined_list_ebds back into memory from the pickle file.
# NOTE: unpickling can execute arbitrary code — only load files you produced.
with open("./results/results_2.pkl", mode="rb") as pickle_file:
    combined_list_ebds = pickle.load(pickle_file)

In [57]:
# Collect year/title/abstract of every paper not flagged as a duplicate.
columns = ["year", "title", "abstract"]
# Build all rows first and construct the frame once: concatenating a one-row
# frame per paper copies the growing frame on every iteration (quadratic) and
# starts from an empty frame, which forces object dtype on every column.
unique_rows = [
    {column: paper[column] for column in columns}
    for paper in combined_list_ebds
    if paper["duplicated"] == 0
]
# Passing columns= keeps the expected header even when no paper qualifies.
papers_df = pd.DataFrame(unique_rows, columns=columns)

In [58]:
# 1. 将数据(title abstract)写到本地
# papers_df.to_csv("./results/results_2.csv", encoding="ISO-8859-1")

In [207]:
# 2. Load the evaluation data (title/abstract table) into memory.
# NOTE(review): the commented-out export above writes with encoding="ISO-8859-1"
# and no index label, while this read uses pandas' default encoding and expects
# a "num" column as the index — confirm the file on disk matches.
papers_df = pd.read_csv("./results/results_2.csv", index_col="num")

In [208]:
# Companion table indexed by its "num" column; its "type" column is later
# copied into test_data["groundTruth"] (presumably the hand-labelled ground
# truth — confirm).
papers_gt_df = pd.read_csv("./results/results_gt_2.csv", index_col="num")

#### 1.1 设置随机种子&提取测试数据

In [5]:
def set_seed(seed):
    """Seed NumPy's global random generator so later draws are repeatable.

    Only ``np.random`` is seeded; no other random source is touched.
    """
    np.random.seed(seed=seed)

In [228]:
# Partition the row positions of papers_df into three random folds of 100 and
# one remainder fold. The generator is re-seeded with 42 before every draw.
all_indexes = np.arange(len(papers_df))
selected_indexes = np.array([])

# Fold 1: drawn from all positions.
set_seed(42)
first_draw = np.random.choice(len(papers_df), 100, replace=False)
indexes_1 = np.sort(first_draw)
selected_indexes = np.concatenate((selected_indexes, indexes_1), axis=0)

# Fold 2: drawn from the positions not taken so far.
set_seed(42)
remaining = all_indexes[np.isin(all_indexes, selected_indexes, invert=True)]
indexes_2 = np.sort(np.random.choice(remaining, 100, replace=False))
selected_indexes = np.concatenate((selected_indexes, indexes_2), axis=0)

# Fold 3: drawn from what is left after folds 1 and 2.
set_seed(42)
remaining = all_indexes[np.isin(all_indexes, selected_indexes, invert=True)]
indexes_3 = np.sort(np.random.choice(remaining, 100, replace=False))
selected_indexes = np.concatenate((selected_indexes, indexes_3), axis=0)

# Fold 4: every position that was never drawn.
indexes_4 = all_indexes[np.isin(all_indexes, selected_indexes, invert=True)]

In [177]:
len(indexes_4)

101

In [239]:
defined_indexes =  [4,169,378]

In [243]:
# Take the fold-4 rows of the ground-truth table and of the paper table.
gt_data = papers_gt_df.iloc[indexes_4].copy()

# Attach the label column (aligned on the index) and a prediction column
# initialised to 0 for the classifier to fill in.
test_data = papers_df.iloc[indexes_4].copy().assign(
    groundTruth=gt_data["type"],
    prediction=0,
)

### 2. LLM进行相关性判断

In [1]:
import os
import openai
from dotenv import find_dotenv, load_dotenv

# Load variables from the .env file located by find_dotenv(); the return value
# is discarded.
_ = load_dotenv(find_dotenv())
# Hand the key from the OPENAI_API_KEY environment variable to the openai
# module (os.getenv yields None when the variable is not set).
openai.api_key = os.getenv("OPENAI_API_KEY")

In [6]:
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser

In [244]:
# Prompt for the relevance classifier. {title} and {abstract} are the only
# placeholders; literal JSON braces are doubled so they survive formatting.
# The worked example's answer echoes the example paper's own title, as the
# output specification in item 7 requires.
template1 = """
    Task: 
    You are required to perform a text classification task over the provided materials.
    You will be provided with the title and abstract of an research paper. 
    Based on the title and abstract, you are required to determine whether the paper meets all the following requirements： 
        Requirement 1. The research falls in the construction domain or construction site management domain.
        
        Requirement 2. The research uses visual materials (limited to video or image sequences) or point cloud as data.
        
        Requirement 3. The paper has applied object tracking method or object tracking algorithm in the research. The topic can be any one of the followings:
                a. "applying computer vision and/or machine learning to achieve object or worker or equipment tracking in construction domain" or \
                b. "using object tracking algorithm to solve part of the research problem" or \
                c. "investigating 2-dimensional and/or 3-dimensional object tracking in construction domain" or \
                d. "improving or optimizing object tracking algorithm in construction domain" or \
                e. "solving research problem using object tracking method to track object or worker or equipment" or \
                f. "supervising construction site using techniques including object tracking techniques or object tracking system".
                       
    For the research paper meets all three requirement, the paper should be classified as type 1;
    For the research paper does not meet any one of the requirements, the paper should be classified as type 2.
    
    Instructions:
    1. For paper using LiDAR, UWB, GPS, RFID, bluetooth, IOT, GIS, ZIGBEE, AR or other non-image and non-video sensors to achieve \
    object or worker tracking, it does not meet the requirement 2. Therefore, this paper should be categorized as type 2.

    2. If the research paper only focuses on monitoring or object detecting/detection or locating/localization or positioning or matching, without applying object tracking algorithm, \
    the paper is out of the scope of this task, as it does not meet the Requirement 3. Therefore, this paper should be categorized as type 2.
    If the research paper focuses on monitoring or object detecting/detection or locating/localization or positioning or matching, \
    at the same time, the paper explicitly mentioned using object tracking algorithm or object tracking method or tracking object. The paper is within the scope of this task.
    
    3. If a paper applies computer vision and/or machine learning, but not to achieve tracking function, it is out of the scope of this task.

    4. If a paper used vision-based algorithm, then the paper meets the requirement 2.

    5. If the title and abstract do not explicitly mention using "object tracking method" or mention "tracking object or worker", \
    then the paper is out of the scope of this task, as it does not meet requirement 3.

    6. Additional Information:
    (1) For paper related to "crane", it is out of the scope of this task. Therefore, the paper focusing on crane tracking should be categorized as type 2.
    (2) Domain like nuclear power plant is out of the scope of this task.
    (3) The application of SLAM technology is out of the scope of this task.
    (4) electric power monitor is out of the scope of this task.
    (5) tracking construction progress and tracking construction project are out of the scope of this task.
    (6) obtaining trajectory of the detected object and tracking object's movement are also regarded as tracking object.
    (7) The research outcomes is expected to be useful for tracking, then the paper is not related to applying tracking algorithm. Such paper is out \
    of the scope of this task.
    (8) automatic tracking camera system or camera control system is out of the scope of this task

    
    7. Please output the answer in JSON format. 
    {{
        "Title": "the title of the provided paper",
        "Type" : "the type of the provided paper",
        "RequirementsFulfilled": {{
            "Requirement_1": "Whether the paper meets requirement 1. Value should be true or false",
            "Requirement_2": "Whether the paper meets requirement 2. Value should be true or false",
            "Requirement_3": "Whether the paper meets requirement 3. Value should be true or false",
            "Reason": "When the requirement 3 is true, you should give a reason why, and provided reference from title or abstract"
        }}
    }}
    
    8. Here is an example:
    Title:
        Personnel tracking on construction sites using video cameras
    Abstract:
        This paper discusses the possibility of- and need for-tracking workforce on construction jobsites using video cameras. \
    An evaluation of algorithms and their associated results is presented. The principal objective of this paper is to test and \
    demonstrate the feasibility of tracking workers from statically placed and dynamically moving cameras. This paper also reviews \
    existing techniques to monitor workforce and describes areas where this work might be useful in engineering applications. \
    The main difficulties associated with tracking on a construction site is the significant amount of visual clutter, the changing \
    photometric visual content throughout the course of a day, and the presence of occluding and moving obstacles. The tracking of workers \
    within the field of view of the camera will involve four tracking techniques, density mean-shift, Bayesian segmentation, active contours, \
    and graph-cuts. Typical construction site video will be processed using the proposed algorithms and analyzed to determine the most appropriate \
    tracking method for the video presented.
    
    Answer:
    {{
        "Title": "Personnel tracking on construction sites using video cameras",
        "Type": "1",
        "RequirementsFulfilled": {{
            "Requirement_1": true,
            "Requirement_2": true,
            "Requirement_3": true,
            "Reason": "The abstract states that 'the objective of this paper is to test and demonstrate the feasibility of tracking workers from cameras', which meets all three requirements"
        }}
    }}
    
    Notes:
    1. Please think carefully and make sure that you get the correct answer.
    2. Do not include any explanations or apologies in your responses.
    3. Do not respond to any questions that might ask anything else than output the answer in JSON format.
    4. Please do not imply anything, answer the question based on the provided title, abstract, and Instruction.

    Title:
    {title}
    Abstract:
    {abstract}

    Now please complete the task:
"""


# Wrap the raw template text in a PromptTemplate; the {title} and {abstract}
# placeholders are supplied when the chain is invoked.
prompt1 = PromptTemplate.from_template(template=template1)
# Disabled one-off construction of the chat model and the
# prompt -> model -> JSON-parser chain (model names tried are listed inline).
# chatllm1 = ChatOpenAI(model = "gpt-4o-mini-2024-07-18") #gpt-4o-mini-2024-07-18 # gpt-3.5-turbo-1106
# lcel1 = prompt1 | chatllm1 | JsonOutputParser()


In [None]:
# (3) The using of robot or robotics is out of the scope of this task.
# "Reason": "The abstract states that the 'objective of this paper is to test and demonstrate the feasibility of tracking workers from cameras', which meets all three requirements"
# "Reason": "When the requirement 3 is true, you should give a reason why"

In [313]:
# Spot-check a single paper, addressed by its index label in test_data.
# The label lives in `paper_id` rather than `id` so the builtin id() is not
# shadowed for the rest of the session.
paper_id = 15
# Look the row up once and reuse it for both fields.
selected_paper = test_data.loc[paper_id]
result = lcel1.invoke({"title": selected_paper["title"], "abstract": selected_paper["abstract"]})
result

In [245]:
# Classify every paper in test_data with the LLM chain and record both the raw
# JSON answer (results) and the predicted type (test_data["prediction"]).

# The model client and the chain are identical for every paper, so build them
# once instead of on each iteration.
# Model in use: gpt-4o-mini-2024-07-18 (the original note also lists gpt-3.5-turbo-1106).
chatllm1 = ChatOpenAI(model = "gpt-4o-mini-2024-07-18")
lcel1 = prompt1 | chatllm1 | JsonOutputParser()

results = {}
# iterrows() is a generator with no length, so pass total= to let tqdm show a
# real progress bar and an ETA.
for index, paper in tqdm(test_data.iterrows(), total=len(test_data)):
    result = lcel1.invoke({"title": paper["title"], "abstract": paper["abstract"]})
    results[index] = result
    # "Type" arrives as a string such as "1" or "2"; store it as an integer.
    test_data.loc[index, "prediction"] = int(result["Type"])

101it [07:48,  4.64s/it]


In [246]:
# Persist the raw per-paper LLM answers as a single JSON document.
with open("./results/results_code2/result_5.json", mode="w") as json_file:
    json.dump(results, json_file)

In [247]:
# Save the evaluation table (including the groundTruth and prediction columns
# added earlier) as CSV, encoded as ISO-8859-1 like the input exports.
test_data.to_csv("./results/results_code2/results_pd_5.csv", encoding="ISO-8859-1")

In [242]:
results

{4: {'Title': 'computer vision-based video interpretation model for automated productivity analysis of construction operations',
  'Type': '2',
  'RequirementsFulfilled': {'Requirement_1': True,
   'Requirement_2': True,
   'Requirement_3': False,
   'Reason': "The abstract discusses 'detection and tracking of project resources', but does not explicitly mention using 'object tracking method' or 'tracking object or worker', which does not meet requirement 3."}},
 169: {'Title': 'predicting safety hazards among construction workers and equipment using computer vision and deep learning techniques',
  'Type': '2',
  'RequirementsFulfilled': {'Requirement_1': True,
   'Requirement_2': True,
   'Requirement_3': False,
   'Reason': 'The paper discusses monitoring and detecting locations and trajectories of workers and equipment, but it does not explicitly mention using an object tracking method or algorithm.'}},
 378: {'Title': 'efficient project management in construction sites to monitor an