#### BERT

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased").to(device)

  from .autonotebook import tqdm as notebook_tqdm


#### Get similarity knowledge results

In [None]:
from openai import OpenAI
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from googleapiclient.discovery import build
from dotenv import load_dotenv
import os
import re
import time
from googleapiclient.errors import HttpError
from googleapiclient.http import set_user_agent
import socket

load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
client = OpenAI(api_key=API_KEY)

def generate_sub_questions(article_text):
    prompt = f"""
    Please analyze the following article and break it down into 2-3 key factual claims 
    that need to be verified. Format them as numbered questions:
    
    Article: {article_text}
    
    Please list the sub-questions in this format:
    1. [First question]
    2. [Second question]
    3. [Third question]
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert in breaking down articles into verifiable claims."},
            {"role": "user", "content": prompt},
        ]
    )
    return response.choices[0].message.content

def get_bert_embedding(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # 取 [CLS] 向量

    return cls_embedding.cpu().numpy()

def search_google(query, retries=3, timeout_sec=10):
    service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)
    for attempt in range(retries):
        try:
            # 設定 socket 全域 timeout
            socket.setdefaulttimeout(timeout_sec)
            res = service.cse().list(q=query, cx=GOOGLE_CSE_ID).execute()
            return [item['snippet'] for item in res.get('items', []) if 'snippet' in item]
        except HttpError as e:
            if e.resp.status == 500:
                print(f"Internal server error on attempt {attempt + 1}. Retrying...")
                time.sleep(1)  # 等待1秒後重試
            else:
                raise e
        except TimeoutError:
            print(f"Timeout on attempt {attempt + 1}. Retrying in 2 seconds...")
            time.sleep(2)
        except Exception as e:
            print(f"Unexpected error: {e}. Retrying in 2 seconds...")
            time.sleep(2)
    return []

# 計算相似度並選擇最相似的結果
def calculate_similarity(question, knowledge_results):
    question_embedding = get_bert_embedding(question)
    knowledge_embeddings = [get_bert_embedding(k) for k in knowledge_results]
    cosine_similarities = [cosine_similarity(question_embedding, k_emb.reshape(1, -1))[0][0] for k_emb in knowledge_embeddings]
    sorted_indices = np.argsort(cosine_similarities)[::-1]
    return [knowledge_results[i] for i in sorted_indices[:5]]  # change to 對於每個子問題取前5相似的知識結果

NUM_NEWS = 2000    # Change this to the number of news articles you want to process
df = pd.read_excel('news_final.xlsx')
df_to_process = df.head(NUM_NEWS)  # 只處理前 NUM_NEWS 篇
knowledge_data = []
for idx, row in df_to_process.iterrows():
    if idx % 50 == 0:
            print(f"Processing text {idx + 1}/{NUM_NEWS}...")
    news_id = row['news_id']
    article_text = row['text']

    # 生成子問題
    sub_questions = generate_sub_questions(article_text)
    sub_questions_list = [q.strip() for q in sub_questions.split('\n') if re.match(r"^\d+\.", q.strip())]

    # 對每個子問題進行 Google Search 和相似度計算
    for sub_question in sub_questions_list:
        search_results = search_google(sub_question)
        if search_results:
            selected_results = calculate_similarity(sub_question, search_results)
            for result in selected_results:
                knowledge_data.append({"news_id": news_id, "knowledge_result": result})

    if not sub_questions_list or not knowledge_data:
        knowledge_data.append({"news_id": news_id, "knowledge_result": "No results found"})

knowledge_df = pd.DataFrame(knowledge_data)
knowledge_df.to_excel('knowledge_results.xlsx', index=False)

print("結果已存到 knowledge_results.xlsx")


#### 提取knowledge result的embeddings，並賦予entity_id

In [2]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel

np.set_printoptions(suppress=True)  # 禁用科學記數法

def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    last_hidden_state = outputs.last_hidden_state
    word_embedding = last_hidden_state[:, 0, :].squeeze(0).detach().cpu().numpy()
    return word_embedding

knowledge_results_df = pd.read_excel("knowledge_results.xlsx")
entity_embeddings = []
for idx, row in knowledge_results_df.iterrows():
    entity = row["knowledge_result"]
    entity_id = idx

    embedding = get_word_embedding(entity)
    entity_embeddings.append({
        "entity": entity,
        "entity_id": entity_id,
        "embedding": embedding
    })
    knowledge_results_df.at[idx, "entity_id"] = entity_id

entity_embeddings_df = pd.DataFrame(entity_embeddings)
entity_embeddings_df.to_excel("Entity_Embeddings.xlsx", index=False)
knowledge_results_df.to_excel("knowledge_results.xlsx", index=False)

print("已生成 Entity_Embeddings.xlsx 和 knowledge_results.xlsx ")


已生成 Entity_Embeddings.xlsx 和 knowledge_results.xlsx 


#### 生成news2entity.xlsx(開始跑)

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased").to(device)

def get_bert_embedding(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # 取 [CLS] 向量

    return cls_embedding.cpu().numpy()

  from .autonotebook import tqdm as notebook_tqdm


## 新增llm choose 外部知識
### Gemini api

In [2]:
import os
import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv()
# 設定 Gemini API 密鑰
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

def ask_gemini_for_most_relevant(news_text, entities_list, top_k=15, news_id=None):
    """
    使用 ChatGPT API 從候選實體列表中選出最相關的實體（最多 15 個）
    如果 ChatGPT 挑不到 15 個，就補滿最相似的實體
    """
    # 準備提示
    entities_text = "\n".join([f"{i+1}. {entity}" for i, entity in enumerate(entities_list)])
    prompt = f"""
    I have a news article with the following content:
    
    {news_text}
    
    I also have 30 potentially related knowledge snippets:
    
    {entities_text}
    
    Please carefully analyze the content of the news article and select the 15 most relevant snippets from the 30 provided. Only return the snippet numbers you find most relevant, in the format of a comma-separated list of numbers, such as "1,3,5,7,9,11,13,15,17,19,21,23,25,27,29"
    """
    
    for attempt in range(3):  # 最多重試 3 次
        try:
            # 使用 Gemini API
            model = genai.GenerativeModel('gemini-2.0-flash-lite')
            response = model.generate_content(prompt)
            
            result = response.text.strip()
            # 提取數字
            selected_indices = []
            for num in result.replace(" ", "").split(","):
                try:
                    idx = int(num) - 1  # 轉換為0-based索引
                    if 0 <= idx < len(entities_list):
                        selected_indices.append(idx)
                except ValueError:
                    continue

            # 如果 Gemini 沒有提供有效數字，印出問題和原始回應
            if not selected_indices:
                print(f"\n⚠️ Warning: Gemini returned no valid indices for news_id: {news_id}, fallback to top_k.")
                print(f"📤 Gemini raw response: \"{result}\"\n")
                
            # 若少於 top_k，則補滿
            if len(selected_indices) < top_k:
                remaining_indices = [i for i in range(len(entities_list)) if i not in selected_indices]
                # 依照原本排序（也就是相似度排序）從剩下的挑前面的補
                num_to_fill = top_k - len(selected_indices)
                selected_indices += remaining_indices[:num_to_fill]
            
            return selected_indices
        except Exception as e:
            print(f"API 調用錯誤 (第 {attempt + 1} 次): {e}")
            time.sleep(2)
        
    # 三次都失敗時 fallback
    print(f"API 重試 3 次仍失敗，使用 fallback top_k for news_id {news_id}")
    return list(range(min(top_k, len(entities_list))))

In [3]:
import re
import os
import time
import numpy as np
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity


news_final_df = pd.read_excel("news_final.xlsx")
knowledge_results_df = pd.read_excel("knowledge_results.xlsx")
entity_embeddings_df = pd.read_excel("Entity_Embeddings.xlsx")

def fix_embedding_format(embedding_str):
    """修正嵌入格式"""
    try:
        # 將所有數字分隔的空格替換為逗號和空格，確保數字之間有正確分隔符號
        fixed_str = re.sub(r"(?<![,\[])\s+", ", ", embedding_str.strip())
        return eval(fixed_str)
    except Exception as e:
        print(f"Error fixing embedding: {embedding_str} -> {e}")
        return None

entity_embeddings_df["embedding"] = entity_embeddings_df["embedding"].apply(fix_embedding_format)  # 修正格式
entity_embedding_matrix = np.vstack(entity_embeddings_df["embedding"].dropna().to_numpy())  # 將嵌入轉換為矩陣
merged_df = pd.merge(knowledge_results_df, news_final_df, on="news_id")
print(f"entity共有 {len(merged_df)} 筆數據。")

# 建立 entity_id 到 entity 文本的映射
entity_id_to_text = dict(zip(entity_embeddings_df["entity_id"], entity_embeddings_df["entity"]))

news2entity_data = []
processed_news_ids = set()  # 記錄已處理過的 news_id

for idx, row in merged_df.iterrows():
    news_id = row["news_id"]
    
    # 檢查是否已處理過該 news_id
    if news_id in processed_news_ids:
        continue
    processed_news_ids.add(news_id)
    
    news_text = row["text"]
    news_embedding = get_bert_embedding(news_text)
    
    # 1. 找出該新聞原本對應的 entity_id
    original_entity_ids = knowledge_results_df[knowledge_results_df["news_id"] == news_id]["entity_id"].tolist()
    for entity_id in original_entity_ids:
        news2entity_data.append({"news_id": news_id, "entity_id": entity_id})
    
    # 2. 計算與其他非對應 entity 的相似度
    other_entity_ids = entity_embeddings_df[~entity_embeddings_df["entity_id"].isin(original_entity_ids)]["entity_id"].tolist()
    other_entity_embeddings = entity_embedding_matrix[[i for i in range(len(entity_embedding_matrix)) if i not in original_entity_ids]]
    similarities = cosine_similarity(news_embedding.reshape(1, -1), other_entity_embeddings).flatten()
    
    # 找出前30個最相似的實體
    top_30_indices = np.argsort(similarities)[-30:][::-1]
    top_30_entity_ids = [other_entity_ids[i] for i in top_30_indices]
    
    # 獲取這30個實體的文本內容
    top_30_entities_text = [entity_id_to_text.get(entity_id, f"Entity {entity_id}") for entity_id in top_30_entity_ids]
    
    # 使用 ChatGPT 選擇相關的實體
    selected_indices = ask_gemini_for_most_relevant(news_text, top_30_entities_text)
    
    # 使用ChatGPT返回的實體
    selected_entity_ids = [top_30_entity_ids[i] for i in selected_indices]
    
    # 將選定的實體添加到關聯列表
    for entity_id in selected_entity_ids:
        news2entity_data.append({"news_id": news_id, "entity_id": entity_id})

news2entity_df = pd.DataFrame(news2entity_data)
news2entity_df.to_excel("graph/edges/news2entity.xlsx", index=False)
print("news2entity.xlsx 已生成")

entity共有 28094 筆數據。

📤 Gemini raw response: "1.  28
2.  18
3.  25
4.  15
5.  29
6.  20
7.  27
8.  30
9.  10
10. 16
11. 22
12. 26
13. 11
14. 7
15. 8"


📤 Gemini raw response: "Here's my selection of the 15 most relevant snippets, based on the provided news article:

1.  (Given the "Selena Gomez" mention)
2.  (May contain relevant content due to possible celebrity focus)
3.  (May contain relevant content due to possible celebrity focus)
4.  (May contain relevant content due to possible celebrity focus)
5.  (May contain relevant content due to possible celebrity focus)
6.  (May contain relevant content due to possible celebrity focus)
7.  (May contain relevant content due to possible celebrity focus)
8.  (May contain relevant content due to possible celebrity focus)
9.  (May contain relevant content due to possible music focus)
10. (May contain relevant content due to possible celebrity focus)
11. (May contain relevant content due to possible celebrity focus)
12. (Potentially relevant due

In [4]:
# 數據處理沒問題後修改 Entity_Embeddings.xlsx
entity_embeddings_df.to_excel("Entity_Embeddings.xlsx", index=False)

#### 將 entity embedding 取出存為NumPy檔案

In [5]:
import pandas as pd
import numpy as np

input_file = "Entity_Embeddings.xlsx"
df = pd.read_excel(input_file)

# 將 'embeddings' 欄位中的內容轉換為 NumPy 陣列
embeddings_list = df['embedding'].apply(eval).tolist()
embeddings_array = np.array(embeddings_list)

output_file = "Embeddings/entity_embeddings.npy"
np.save(output_file, embeddings_array)

print(f"'embeddings' 已存為 {output_file}")
print(embeddings_array.shape)
print(embeddings_array)


'embeddings' 已存為 Embeddings/entity_embeddings.npy
(28094, 768)
[[-0.46327636 -0.42651764  0.569577   ... -0.22263594  0.6463397
   0.27724758]
 [ 0.00774809 -0.43700978  0.73024756 ... -0.6115501   0.35315475
   0.0806242 ]
 [-0.23157343  0.05381721  1.0471402  ... -0.18854885  0.08356763
   0.35302582]
 ...
 [-0.01674841  0.16828081  0.5083257  ... -0.27942416  0.6135716
   0.4093037 ]
 [-0.07317522 -0.2897664  -0.18686338 ... -0.3084366   1.0422757
   0.50079864]
 [-0.38301653  0.19434644  0.06675266 ... -0.59949386  0.9190997
  -0.0091194 ]]


### 根據已經存在的所有excel檔案生成三個 _index.npy檔，內容是字典(不重複的)

In [6]:
import numpy as np
import pandas as pd

def generate_indices(dataset, num_topics):
    
    news_df = pd.read_excel(f'/home/blueee/LESS4FD/Data/{dataset}/news_final.xlsx')
    news_ids = news_df['news_id'].tolist()
    news_index = {int(news_id): idx for idx, news_id in enumerate(news_ids)}
    np.save(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/news_index.npy', news_index)
    print("news_index.npy 已生成")

    
    entity_df = pd.read_excel(f'/home/blueee/LESS4FD/Data/{dataset}/graph/edges/news2entity.xlsx')
    entity_ids = entity_df['entity_id'].unique().tolist()
    entity_index = {int(entity_id): idx for idx, entity_id in enumerate(entity_ids)}
    np.save(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/entity_index.npy', entity_index)
    print("entity_index.npy 已生成")

    
    topic_df = pd.read_excel(f'/home/blueee/LESS4FD/Data/{dataset}/graph/edges/news2topic_{num_topics}.xlsx')
    topic_ids = topic_df['topic_id'].unique().tolist()
    topic_index = {int(topic_id): idx for idx, topic_id in enumerate(topic_ids)}
    np.save(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/topic_index_{num_topics}.npy', topic_index)
    print(f"topic_index_{num_topics}.npy 已生成")

    print(f"news_index: {news_index}")
    print(f"entity_index: {entity_index}")
    print(f"topic_index: {topic_index}")


dataset = "Knowledge_llmchoose_GossipCop-30-15"  # 數據集
num_topics = 30  # 主題數
generate_indices(dataset, num_topics)


news_index.npy 已生成
entity_index.npy 已生成
topic_index_30.npy 已生成
news_index: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 88: 88, 89: 89, 90: 90, 91: 91, 92: 92, 93: 93, 94: 94, 95: 95, 96: 96, 97: 97, 98: 98, 99: 99, 100: 100, 101: 101, 102: 102, 103: 103, 104: 104, 105: 105, 106: 106, 107: 107, 108: 108, 109: 109, 110: 110, 111: 111, 112: 112, 113: 113, 114:

### 生成全局索引

In [7]:
import numpy as np

dataset = "Knowledge_llmchoose_GossipCop-30-15"  # 數據集
num_topics = 30  # 主題數
news_index = np.load(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/news_index.npy', allow_pickle=True).item()
entity_index = np.load(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/entity_index.npy', allow_pickle=True).item()
topic_index = np.load(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/topic_index_{num_topics}.npy', allow_pickle=True).item()

global_index = {}

# 1. 添加新聞節點到全局索引
current_index = 0
for news_id, local_index in news_index.items():
    global_index[f"news_{news_id}"] = current_index
    current_index += 1

# 2. 添加實體節點到全局索引
for entity_id, local_index in entity_index.items():
    global_index[f"entity_{entity_id}"] = current_index
    current_index += 1

# 3. 添加主題節點到全局索引
for topic_id, local_index in topic_index.items():
    global_index[f"topic_{topic_id}"] = current_index
    current_index += 1


print(f"Global index dictionary: {global_index}")
print(f"Total nodes: {len(global_index)}")  # 應該等於 (news數) + (entity數) + (topic數)

np.save(f'/home/blueee/LESS4FD/Data/{dataset}/graph/nodes/global_index_{num_topics}.npy', global_index)
print(f"global_index_{num_topics}.npy 已生成！")


Global index dictionary: {'news_0': 0, 'news_1': 1, 'news_2': 2, 'news_3': 3, 'news_4': 4, 'news_5': 5, 'news_6': 6, 'news_7': 7, 'news_8': 8, 'news_9': 9, 'news_10': 10, 'news_11': 11, 'news_12': 12, 'news_13': 13, 'news_14': 14, 'news_15': 15, 'news_16': 16, 'news_17': 17, 'news_18': 18, 'news_19': 19, 'news_20': 20, 'news_21': 21, 'news_22': 22, 'news_23': 23, 'news_24': 24, 'news_25': 25, 'news_26': 26, 'news_27': 27, 'news_28': 28, 'news_29': 29, 'news_30': 30, 'news_31': 31, 'news_32': 32, 'news_33': 33, 'news_34': 34, 'news_35': 35, 'news_36': 36, 'news_37': 37, 'news_38': 38, 'news_39': 39, 'news_40': 40, 'news_41': 41, 'news_42': 42, 'news_43': 43, 'news_44': 44, 'news_45': 45, 'news_46': 46, 'news_47': 47, 'news_48': 48, 'news_49': 49, 'news_50': 50, 'news_51': 51, 'news_52': 52, 'news_53': 53, 'news_54': 54, 'news_55': 55, 'news_56': 56, 'news_57': 57, 'news_58': 58, 'news_59': 59, 'news_60': 60, 'news_61': 61, 'news_62': 62, 'news_63': 63, 'news_64': 64, 'news_65': 65, 'new