#### Prompting for generate LESS4FD entities

In [1]:
import os
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
API_KEY = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=API_KEY)

def chatgpt_response(system_prompt, user_prompt):
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            stream=False
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error: {e}")
        return None

# File paths
file_path = 'news_final.xlsx'
output_file_path = 'entity_responses.xlsx'
df = pd.read_excel(file_path)

system_prompt = (
    "Extract the following entities from the given news article: "
    "1. PERSON: Person Definition. 2. DATE: DATE Definition. "
    "3. LOC: LOC Definition. 4. ORG: ORG Definition. 5. MISC: MISC Definition. "
    "Return the results in a dictionary with corresponding keys.\n\n"
    "Example 1: \"The iPhone, created by Apple Inc., was released on June 29, 2007.\"\n"
    "Output1: {\"PERSON\": [\"None\"], \"DATE\": [\"June 29, 2007\"], \"LOC\": [\"None\"], \"ORG\": [\"Apple Inc.\"], \"MISC\": [\"iPhone\"]}"
)

def process_selected_rows(df, num_rows):
    prompts = df['text'].head(num_rows).tolist()
    responses = []

    for i, prompt in enumerate(prompts):
        if i % 10 == 0:
            print(f"Processing text {i + 1}/{num_rows}...")
        user_prompt = (
            f"Given news article: <{prompt}>\n"
            "Extract entities as instructed."
        )
        response = chatgpt_response(system_prompt, user_prompt)
        responses.append(response)

    return responses

num_rows_to_process = 483  # Change to set the number of rows
responses = process_selected_rows(df, num_rows_to_process)

output_df = pd.DataFrame({
    'news_id': df['news_id'].head(num_rows_to_process),
    'text': df['text'].head(num_rows_to_process),
    'entity_response': responses
})
output_df.to_excel(output_file_path, index=False)
print(f"Processed {num_rows_to_process} prompts and saved responses to {output_file_path}")


Processing text 1/483...
Processing text 11/483...
Processing text 21/483...
Processing text 31/483...
Processing text 41/483...
Processing text 51/483...
Processing text 61/483...
Processing text 71/483...
Processing text 81/483...
Processing text 91/483...
Processing text 101/483...
Processing text 111/483...
Processing text 121/483...
Processing text 131/483...
Processing text 141/483...
Processing text 151/483...
Processing text 161/483...
Processing text 171/483...
Processing text 181/483...
Processing text 191/483...
Processing text 201/483...
Processing text 211/483...
Processing text 221/483...
Processing text 231/483...
Processing text 241/483...
Processing text 251/483...
Processing text 261/483...
Processing text 271/483...
Processing text 281/483...
Processing text 291/483...
Processing text 301/483...
Processing text 311/483...
Processing text 321/483...
Processing text 331/483...
Processing text 341/483...
Processing text 351/483...
Processing text 361/483...
Processing t

#### BERT

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased").to(device)

  from .autonotebook import tqdm as notebook_tqdm


#### 提取每個去重的entity的embeddings，並賦予entity_id

In [3]:
import unicodedata
import pandas as pd

entity_responses_file = 'entity_responses.xlsx'
df_responses = pd.read_excel(entity_responses_file)
responses = df_responses['entity_response'].tolist()
embedding_output_file = 'Entity_Embeddings.xlsx'

def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    last_hidden_state = outputs.last_hidden_state
    word_embedding = last_hidden_state[:, 0, :].squeeze(0).detach().cpu().numpy()
    return word_embedding

unique_entities = set()
all_entities = []

# check if a string contains special characters
def contains_special_characters(text):
    normalized_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    return text != normalized_text  # If normalization changes the text, it contains special characters

for response in responses:
    try:
        # Remove "Output: " prefix if it exists
        if response.startswith("Output: "):
            response = response.replace("Output: ", "", 1)

        entity_dict = eval(response)
        for key, entities in entity_dict.items():
            # Filter entities: exclude "None" and entities with special characters
            filtered_entities = [
                entity for entity in entities
                if entity != "None" and not contains_special_characters(entity)
            ]
            all_entities.extend(filtered_entities)  # 收集所有valid entities，包括重複的
    except Exception as e:
        print(f"Error parsing response: {e}")
        continue

# Identify duplicates
all_entities_set = set(all_entities)
unique_entities = list(all_entities_set)
duplicates = [entity for entity in all_entities if all_entities.count(entity) > 1]

print("Duplicate entities that were removed:")
print(list(set(duplicates)))


# Generate embeddings and assign entity IDs
entity_data = []
for idx, entity in enumerate(unique_entities):
    entity_id = f"{idx}"
    embedding = get_word_embedding(entity)
    entity_data.append({
        'entity': entity,
        'entity_id': entity_id,
        'embedding': embedding
    })

entity_df = pd.DataFrame({
    'entity': [item['entity'] for item in entity_data],
    'entity_id': [item['entity_id'] for item in entity_data],
    'embedding': [item['embedding'].tolist() for item in entity_data]
})
entity_df.to_excel(embedding_output_file, index=False)

print(f"Saved entity embeddings to {embedding_output_file}")


Error parsing response: unterminated string literal (detected at line 1) (<string>, line 1)
Error parsing response: invalid syntax. Perhaps you forgot a comma? (<string>, line 5)
Duplicate entities that were removed:
['Northwestern University', 'The Associated Press', 'Washington, DC', 'House Minority Leader', 'Biola University', 'The Hill', 'Paris Agreement', 'Nazis', 'Detroit', 'Alabama', 'Nobel Peace Prize', 'San Francisco', '1945', 'Tuesday afternoon', 'Nancy Reagan', 'Mitt Romney', 'terrorist attack', 'Iraq war', '1941', 'The Collective PAC', 'George Will', 'Fox', 'Hamburg', 'Dylann Roof', '2030', 'Atlanta', 'David Hogg', 'Minneapolis', '2001', 'New Orleans', 'Weathermen', 'Newsweek', 'Harry Truman', '1950', 'Washington', 'Friday', 'Election Day', 'CRS', 'Washington D.C.', '1966', 'Southeast Asia', 'John F. Kennedy', 'DNC', 'Michigan', 'First Lady', '1983', 'Centers for Disease Control and Prevention', 'Sunni', 'John Kerry', 'Colorado College', 'Ukraine', 'Richard Cordray', 'Schum

#### 生成news2entity.xlsx

In [4]:
import json
import unicodedata
import pandas as pd

news_to_entity_mapping = []
df = pd.read_excel('entity_responses.xlsx')
print(df.shape)  # 檢查

# Iterate through the original entity_response and their IDs
for news_id, entity_respons in enumerate(df['entity_response']):
    for entity_data_item in entity_data:
        entity = entity_data_item['entity']
        entity_id = entity_data_item['entity_id']
        
        # Check if the entity appears in entity_respons
        if entity in entity_respons:
            news_to_entity_mapping.append({
                'news_id': news_id,
                'entity_id': entity_id
            })

# Convert mapping to DataFrame and sort by news_id
news_to_entity_df = pd.DataFrame(news_to_entity_mapping)
news_to_entity_df = news_to_entity_df.sort_values(by='news_id')

news_to_entity_file = 'news2entity.xlsx'
news_to_entity_df.to_excel(f"graph/edges/{news_to_entity_file}", index=False)

# 檢查news2entity.xlsx中的entity_id欄位中的數字有沒有涵蓋所有0~unique_entities的數字，並列出沒有涵蓋的數字
entity_ids = news_to_entity_df['entity_id'].unique()
missing_ids = [str(i) for i in range(len(unique_entities)) if str(i) not in entity_ids]
print("Missing entity IDs:")
print(missing_ids)

# 將找出的missing entity IDs加入到news2entity.xlsx中的entity_id欄位中(對應到每一個news_id欄位都要加入，若news_id有0~1999，則每個missing entity ID都要加入2000次)
num_rows_to_process = 483  # Change to set the number of rows
for missing_entity_id in missing_ids:
    new_rows = pd.DataFrame([{'news_id': news_id, 'entity_id': missing_entity_id} for news_id in range(num_rows_to_process)])
    news_to_entity_df = pd.concat([news_to_entity_df, new_rows], ignore_index=True)

news_to_entity_df.to_excel(f"graph/edges/{news_to_entity_file}", index=False)


print(f"Mapping of news_id to entity_id saved to {news_to_entity_file}")


(483, 3)
Missing entity IDs:
[]
Mapping of news_id to entity_id saved to news2entity.xlsx


#### 將 entity embedding 取出存為NumPy檔案

In [5]:
import pandas as pd
import numpy as np

input_file = "Entity_Embeddings.xlsx"
df = pd.read_excel(input_file)

# 將 'embeddings' 欄位中的內容轉換為 NumPy 陣列
embeddings_list = df['embedding'].apply(eval).tolist()
embeddings_array = np.array(embeddings_list)

output_file = "Embeddings/entity_embeddings.npy"
np.save(output_file, embeddings_array)

print(f"'embeddings' 已存為 {output_file}")
print(embeddings_array.shape)
print(embeddings_array)


'embeddings' 已存為 Embeddings/entity_embeddings.npy
(5978, 768)
[[-0.12820464  0.2316062  -0.05706552 ... -0.05890086  0.30982307
   0.01092581]
 [-0.94089025  0.26359418 -0.49305624 ... -0.09286062  0.86148417
   0.19169185]
 [-0.18431158 -0.0646422  -0.12452538 ... -0.61996818  0.41249692
   0.18600465]
 ...
 [-0.13189203  0.1236266  -0.12106375 ... -0.38044012  0.14535561
   0.12809297]
 [-0.13273901  0.04550967 -0.25311995 ... -0.12791023  0.32580224
   0.27101624]
 [ 0.10960056  0.38902029 -0.23688376 ... -0.31205121  0.24298176
   0.17988455]]


### 根據已經存在的所有excel檔案生成三個 _index.npy檔，內容是字典(不重複的)

In [7]:
import numpy as np
import pandas as pd

def generate_indices(dataset, num_topics):
    
    news_df = pd.read_excel(f'/home/blueee/LESS4FD/Data/{dataset}/news_final.xlsx')
    news_ids = news_df['news_id'].tolist()
    news_index = {int(news_id): idx for idx, news_id in enumerate(news_ids)}
    np.save(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/news_index.npy', news_index)
    print("news_index.npy 已生成")

    
    entity_df = pd.read_excel(f'/home/blueee/LESS4FD/Data/{dataset}/graph/edges/news2entity.xlsx')
    entity_ids = entity_df['entity_id'].unique().tolist()
    entity_index = {int(entity_id): idx for idx, entity_id in enumerate(entity_ids)}
    np.save(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/entity_index.npy', entity_index)
    print("entity_index.npy 已生成")

    
    topic_df = pd.read_excel(f'/home/blueee/LESS4FD/Data/{dataset}/graph/edges/news2topic_{num_topics}.xlsx')
    topic_ids = topic_df['topic_id'].unique().tolist()
    topic_index = {int(topic_id): idx for idx, topic_id in enumerate(topic_ids)}
    np.save(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/topic_index_{num_topics}.npy', topic_index)
    print(f"topic_index_{num_topics}.npy 已生成")

    print(f"news_index: {news_index}")
    print(f"entity_index: {entity_index}")
    print(f"topic_index: {topic_index}")


dataset = "PolitiFact_nontrun"  # 數據集名稱
num_topics = 30  # 替換為主題數
generate_indices(dataset, num_topics)


news_index.npy 已生成
entity_index.npy 已生成
topic_index_30.npy 已生成
news_index: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 88: 88, 89: 89, 90: 90, 91: 91, 92: 92, 93: 93, 94: 94, 95: 95, 96: 96, 97: 97, 98: 98, 99: 99, 100: 100, 101: 101, 102: 102, 103: 103, 104: 104, 105: 105, 106: 106, 107: 107, 108: 108, 109: 109, 110: 110, 111: 111, 112: 112, 113: 113, 114:

### 生成全局索引

In [8]:
import numpy as np

dataset = "PolitiFact_nontrun"  # 數據集
num_topics = 30  # 主題數
news_index = np.load(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/news_index.npy', allow_pickle=True).item()
entity_index = np.load(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/entity_index.npy', allow_pickle=True).item()
topic_index = np.load(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/topic_index_{num_topics}.npy', allow_pickle=True).item()

global_index = {}

# 1. 添加新聞節點到全局索引
current_index = 0
for news_id, local_index in news_index.items():
    global_index[f"news_{news_id}"] = current_index
    current_index += 1

# 2. 添加實體節點到全局索引
for entity_id, local_index in entity_index.items():
    global_index[f"entity_{entity_id}"] = current_index
    current_index += 1

# 3. 添加主題節點到全局索引
for topic_id, local_index in topic_index.items():
    global_index[f"topic_{topic_id}"] = current_index
    current_index += 1


print(f"Global index dictionary: {global_index}")
print(f"Total nodes: {len(global_index)}")  # 應該等於 (news數) + (entity數) + (topic數)

np.save(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/global_index_{num_topics}.npy', global_index)
print(f"global_index_{num_topics}.npy 已生成！")


Global index dictionary: {'news_0': 0, 'news_1': 1, 'news_2': 2, 'news_3': 3, 'news_4': 4, 'news_5': 5, 'news_6': 6, 'news_7': 7, 'news_8': 8, 'news_9': 9, 'news_10': 10, 'news_11': 11, 'news_12': 12, 'news_13': 13, 'news_14': 14, 'news_15': 15, 'news_16': 16, 'news_17': 17, 'news_18': 18, 'news_19': 19, 'news_20': 20, 'news_21': 21, 'news_22': 22, 'news_23': 23, 'news_24': 24, 'news_25': 25, 'news_26': 26, 'news_27': 27, 'news_28': 28, 'news_29': 29, 'news_30': 30, 'news_31': 31, 'news_32': 32, 'news_33': 33, 'news_34': 34, 'news_35': 35, 'news_36': 36, 'news_37': 37, 'news_38': 38, 'news_39': 39, 'news_40': 40, 'news_41': 41, 'news_42': 42, 'news_43': 43, 'news_44': 44, 'news_45': 45, 'news_46': 46, 'news_47': 47, 'news_48': 48, 'news_49': 49, 'news_50': 50, 'news_51': 51, 'news_52': 52, 'news_53': 53, 'news_54': 54, 'news_55': 55, 'news_56': 56, 'news_57': 57, 'news_58': 58, 'news_59': 59, 'news_60': 60, 'news_61': 61, 'news_62': 62, 'news_63': 63, 'news_64': 64, 'news_65': 65, 'new