# Part 1: Config

In [None]:
import chromedriver_autoinstaller
from selenium import webdriver
import base64
import json
from google import genai
from google.genai import types
import os
from PIL import Image

In [None]:
from selenium.webdriver.common.by import By
import concurrent.futures
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException, ElementClickInterceptedException
)

In [None]:
import time
import requests
import random

In [None]:
# Install a chromedriver binary (via chromedriver_autoinstaller) before any
# webdriver.Chrome(...) call below.
chromedriver_autoinstaller.install()

# Shared Chrome options used by every webdriver.Chrome(...) call in this notebook.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # Run in headless mode
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
# Chrome expects "width,height" here; the previous "1920x1080" form is not a
# valid size value, so the intended large viewport was not being applied.
chrome_options.add_argument('--window-size=1920,1080')

chrome_options.add_argument("--start-maximized")
# NOTE(review): "accept-language" and "referer" look like HTTP header names, not
# Chrome command-line switches; they are presumably ignored by Chrome. Kept
# unchanged -- confirm whether "--lang=en-US" (or similar) was intended.
chrome_options.add_argument("accept-language=en-US,en;q=0.9")
chrome_options.add_argument("referer=https://www.google.com/")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")

# Drop the automation switch/extension that Chrome otherwise enables under
# WebDriver control.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

# Present a desktop Chrome user agent string.
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.57 Safari/537.36"
)
# chrome_options.binary_location = '/usr/bin/chromium-browser'

def load_gemini_api_keys(env_var="GEMINI_API_KEYS"):
    """Read Gemini API keys from a comma-separated environment variable.

    Credentials must not be committed to source control, so the keys are
    supplied at runtime instead of being hard-coded here.

    Args:
        env_var: Name of the environment variable holding the keys, e.g.
            ``GEMINI_API_KEYS="key1,key2,key3"``.

    Returns:
        A list of non-empty, whitespace-stripped keys; empty when the
        variable is unset or blank.
    """
    raw_value = os.environ.get(env_var, "")
    return [key.strip() for key in raw_value.split(",") if key.strip()]


# SECURITY: the keys that used to be listed here were exposed in the file and
# should be revoked and re-issued.
GEMINI_API_KEYS = load_gemini_api_keys()
if not GEMINI_API_KEYS:
    print(
        "⚠️ No Gemini API keys found: set the GEMINI_API_KEYS environment "
        "variable to a comma-separated list of keys before creating a client."
    )

def switch_api_key(current_key_index):
    """Rotate to the next Gemini API key and rebuild the shared client.

    Args:
        current_key_index: Index into ``GEMINI_API_KEYS`` of the key in use.

    Returns:
        The index of the key now in use; wraps around to 0 after the last key.
    """
    # The module-level ``client`` must be rebound. The previous code declared
    # ``global model`` and assigned a local ``client``, so the new client was
    # discarded and the key never actually changed.
    global client
    new_key_index = (current_key_index + 1) % len(GEMINI_API_KEYS)
    client = genai.Client(api_key=GEMINI_API_KEYS[new_key_index])
    return new_key_index

# Index into GEMINI_API_KEYS of the key used to build the initial client.
current_key_index = 0
# Module-level Gemini client; lable() sends its requests through this object.
client = genai.Client(api_key=GEMINI_API_KEYS[current_key_index])

# Number of search results kept by search_relevant_links(); crawl_articles()
# crawls at most MINIMUM_K + 1 pages (the source article plus these results).
MINIMUM_K = 2

# Part 2: Crawl related articles

In [None]:
def search_relevant_links(query):
    """Search Bing for ``query`` and return the first ``MINIMUM_K`` results.

    Args:
        query: Free-text search string.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts, one for each of the
        first ``MINIMUM_K`` result entries whose heading link could be read.
        May be empty.
    """
    from urllib.parse import quote_plus

    # Percent-encode the query: raw text containing '&', '#', '+' or similar
    # characters would otherwise be cut off or reinterpreted inside the URL.
    search_url = f'https://www.bing.com/search?q={quote_plus(query)}'
    print(search_url)

    link_articles = []
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(search_url)
        # Random pause so the result list has time to render.
        time.sleep(random.uniform(1, 10))

        articles = driver.find_elements(By.CSS_SELECTOR, "#b_results li.b_algo")
        print(f"Found {len(articles)} relevant links:\n{articles}")

        # Keep only the first MINIMUM_K results.
        for article in articles[:MINIMUM_K]:
            try:
                title_element = article.find_element(By.TAG_NAME, "h2").find_element(By.TAG_NAME, "a")
                title = title_element.text
                link = title_element.get_attribute('href')
                link_articles.append({
                    'title': title,
                    'link': link,
                })
                print(title)
                print(link)
            except Exception as e:
                # A malformed result entry is reported and skipped.
                print("Lỗi khi trích xuất bài viết:", e)
    finally:
        # Always release the browser, even when navigation or lookup fails.
        driver.quit()

    print(f"Found {len(link_articles)} relevant links:\n{link_articles}")

    return link_articles

In [None]:
def try_dismiss_popups(driver):
    """Best-effort dismissal of consent popups and close buttons.

    Step 1 clicks the first ``<button>`` whose text contains one of a fixed
    list of labels (button text is lower-cased for the comparison). Step 2
    clicks the first clickable button whose class contains 'close' or whose
    aria-label contains 'close'. Unexpected errors are printed, not raised.

    Args:
        driver: The Selenium driver holding the page to clean up.
    """
    button_labels = (
        "Accept Cookies", "Accept All Cookies", "I Accept",
        "Agree", "Press & Hold", "Continue",
    )
    close_xpath = "//button[contains(@class, 'close') or contains(translate(@aria-label, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'close')]"

    try:
        # Step 1: well-known consent / continue buttons; stop at the first click.
        for label in button_labels:
            label_xpath = f"//button[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{label.lower()}')]"
            try:
                driver.find_element(By.XPATH, label_xpath).click()
            except (NoSuchElementException, ElementClickInterceptedException):
                # Not present or covered by another element: try the next label.
                continue
            else:
                print(f"✅ Clicked popup button: '{label}'")
                break

        # Step 2: generic "close" buttons; stop at the first one that clicks.
        for candidate in driver.find_elements(By.XPATH, close_xpath):
            try:
                candidate.click()
                print("✅ Clicked a close button")
                break
            except Exception:
                continue

    except Exception as e:
        print(f"⚠️ Error while dismissing popup: {e}")

def process_article_link(article, max_retries=5):
    """Crawl one article page and return the text of its ``<p>`` elements.

    Args:
        article: Dict with a 'title' and a 'link' (the URL to load).
        max_retries: Maximum number of load attempts.

    Returns:
        A dict with 'title', 'src' (the URL) and 'contents' (all non-empty
        paragraph texts joined by newlines). 'contents' stays an empty string
        when every attempt fails.
    """
    article_crawl = {
        "title": article['title'],
        "src": article['link'],
        "contents": ""  # all <p> texts merged into one string
    }

    wait_time = 10  # initial page-load wait, in seconds

    for attempt in range(1, max_retries + 1):
        driver = None
        try:
            # Launch inside the try so a failed browser start counts as a failed
            # attempt instead of aborting the whole crawl.
            driver = webdriver.Chrome(options=chrome_options)
            print(f"⏳ Attempt {attempt}: Crawling {article['link']} with wait_time={wait_time}s")
            driver.get(article['link'])
            time.sleep(wait_time)
            try_dismiss_popups(driver)

            contents = []
            for element in driver.find_elements(By.XPATH, ".//p"):
                # get_attribute may return None; treat that as an empty
                # paragraph rather than failing the whole attempt.
                text_content = (element.get_attribute("innerText") or "").strip()
                if text_content:
                    contents.append(text_content)

            article_crawl["contents"] = "\n".join(contents)
            print(f'✅ Crawled content from {article["link"]}:\n{article_crawl["contents"][:500]}...')  # first 500 characters only
            break  # success: stop retrying

        except Exception as e:
            print(f"⚠️ Attempt {attempt} failed for {article['link']}: {e}")
            # Wait 300s longer on the next attempt.
            # NOTE(review): the previous comment said 10s while the code adds
            # 300s; value kept as-is -- confirm which was intended.
            wait_time += 300

        finally:
            if driver is not None:
                driver.quit()
    else:
        # Reached only when no attempt hit the ``break`` above.
        print(f"❌ Failed to crawl article from {article['link']} after {max_retries} attempts.")

    return article_crawl



def crawl_articles(query, article_url, crawl_json):
    """Crawl the source article plus related search results into ``crawl_json``.

    Args:
        query: Search text passed to ``search_relevant_links``.
        article_url: URL of the article the caption comes from; always crawled first.
        crawl_json: List that receives one result dict per crawled page
            (extended in place).
    """
    targets = [{
        'title': "Article contains caption",
        'link': article_url,
    }]

    # A failed search is reported, and only the source article is crawled.
    try:
        related_links = search_relevant_links(query)
        targets.extend(related_links)
    except Exception as e:
        print(f"⚠️ Error while searching for relevant links: {e}")

    # Cap the crawl at the source article plus MINIMUM_K related pages.
    targets = targets[:MINIMUM_K + 1]

    print(targets)

    # Pages are fetched concurrently; map() keeps the results in input order.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        crawled = [*pool.map(process_article_link, targets)]

    # Append the collected results to the caller's list.
    crawl_json.extend(crawled)

In [None]:
# query = "Julian Castro at his announcement in San Antonio, Tex., on Saturday. Mr. Castro, the former secretary of housing and urban development, would be one of the youngest presidents if elected."
# crawl_json = []
# article_url = "https://www.nytimes.com/2019/06/13/us/politics/julian-castro-fox-town-hall.html"

# crawl_articles(query, article_url , crawl_json)

In [None]:
# crawl_json

# Part 3: Fact-check

In [None]:
import time


def lable(item):
    """Ask Gemini to label one sample as "1" or "0".

    Args:
        item: Dict providing 'img_local_path' (relative to
            ``../data/images_test_mmsys/``), 'caption1', 'caption2' and
            'article_texts'.

    Returns:
        The model's text reply (the response schema restricts it to "0" or
        "1"), ``"No response or blocked prompt"`` when the reply carries no
        text, or ``"Error: <message>"`` when any exception is raised.
    """
    prompt = """A new is considered fake if the tag of the aritcle is one of these tags: 'Research In Progress', 'True', 'Mostly True', 'Mixture', 'Mostly False', 'False, Unproven', 'Unfounded', 'Outdated', 'Miscaptioned', 'Correct Attribution', 'Misattributed', 'Legend', 'Scam', 'Legit', 'Labeled Satire', 'Originated as Satire', 'Recall', 'Lost Legend', 'Fake'.
    Return 1 if the news in file is fake or the caption not included in the content, else return 0."""

    try:
        image_path = f"../data/images_test_mmsys/{item['img_local_path']}"

        # The context manager closes the image file once the request is done;
        # previously the handle stayed open for every processed sample.
        with Image.open(image_path) as image:
            caption1 = item['caption1']
            caption2 = item['caption2']
            article_texts = item['article_texts']

            response = client.models.generate_content(
                model='gemini-2.5-flash',
                contents=[article_texts, image, caption1, caption2, prompt],
                config={
                    # Constrain the reply to one of the two enum strings.
                    'response_mime_type': 'text/x.enum',
                    'response_schema': {
                        "type": "STRING",
                        "enum": ["0", "1"],
                    },
                    'safety_settings': [
                        types.SafetySetting(
                            category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                            threshold=types.HarmBlockThreshold.BLOCK_NONE,
                        ),
                    ],
                },
            )

        reply_text = getattr(response, 'text', None) if response else None
        result = reply_text if reply_text else "No response or blocked prompt"

    except Exception as e:
        # Callers receive failures as a string, never as an exception.
        result = f"Error: {str(e)}"

    return result



In [None]:
# item = {"img_local_path": "public_test_mmsys/0.jpg", "caption1": "Julian Castro at his announcement in San Antonio, Tex., on Saturday. Mr. Castro, the former secretary of housing and urban development, would be one of the youngest presidents if elected.", "caption2": "Julian Castro at his announcement in San Antonio, Tex., on Saturday, Jan. 12, 2019.", "context_label": 0, "article_url": "https://www.nytimes.com/2019/06/13/us/politics/julian-castro-fox-town-hall.html", "maskrcnn_bboxes": [[389.9706726074219, 72.9228744506836, 505.0566711425781, 373.24993896484375], [89.46248626708984, 312.29644775390625, 190.55088806152344, 396.4997253417969], [116.25189971923828, 225.38841247558594, 161.36624145507812, 288.41522216796875], [180.07785034179688, 225.37646484375, 207.3575439453125, 271.2514953613281], [579.815185546875, 193.33509826660156, 597.6293334960938, 249.89108276367188], [217.98863220214844, 225.41282653808594, 256.5491638183594, 267.5371398925781], [67.05160522460938, 237.61740112304688, 92.31876373291016, 275.8415222167969], [29.469621658325195, 240.86349487304688, 64.6895980834961, 276.5841369628906], [229.984375, 298.4461669921875, 251.81227111816406, 330.3661804199219], [89.82146453857422, 205.71160888671875, 104.25022888183594, 228.6795196533203]], "caption1_modified": "PERSON at his announcement in GPE, GPE, on DATE. Mr. PERSON, the former secretary of housing and urban development, would be one of the youngest presidents if elected.", "caption1_entities": [["Julian Castro", "PERSON"], ["San Antonio", "GPE"], ["Tex.", "GPE"], ["Saturday", "DATE"], ["Castro", "PERSON"]], "caption2_modified": "PERSON at his announcement in GPE, GPE, on DATE.", "caption2_entities": [["Julian Castro", "PERSON"], ["San Antonio", "GPE"], ["Tex.", "GPE"], ["Saturday, Jan. 12, 2019", "DATE"]], "bert_base_score": "0.5769946", "bert_large_score": "0.60118324"}
# crawl_json = [item for item in crawl_json if item is not None]

# # Nối lại toàn bộ nội dung: thêm title và content mỗi bài
# article_texts = "\n\n".join(
#     f"### {item.get('title', 'No Title')}\n{item.get('contents', '').strip()}"
#     for item in crawl_json
#     if item.get("contents")
# )

# item['article_texts'] = article_texts

# result = lable(item)

In [1]:
import json

# Replace with your actual file path
file_path = '../data/cosmos_anns_mmsys/mmsys_anns/public_test_mmsys_final.json'
output_file = "cheapfake_results.json"

START_INDEX = 0  # <- change this to start from a later sample
MAX_LABEL_ATTEMPTS = 5  # upper bound on lable() retries per sample
current_key_index = 0  # index of the API key currently in use

# Load data: the annotation file holds one JSON object per line.
data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json_object = json.loads(line)
            data.append(json_object)
        except json.JSONDecodeError as e:
            print(f"Line: {line}, error: {e}")

# Resume from an existing results file, if there is one.
try:
    with open(output_file, 'r') as f:
        results = json.load(f)
    completed_ids = {item["img_local_path"] for item in results}
except FileNotFoundError:
    results = []
    completed_ids = set()

# Accuracy counters cover only the samples processed in this run.
correct_tests = 0
total_tests = 0

for i, item in enumerate(data[START_INDEX:], start=START_INDEX):
    if item["img_local_path"] in completed_ids:
        continue  # already processed in a previous run

    crawl_json = []
    try:
        crawl_articles(item['caption2'], item['article_url'], crawl_json)
    except Exception as e:
        print(f"Error crawling caption2: {e}")

    crawl_json = [x for x in crawl_json if x is not None]

    # Concatenate every crawled page as "### title" followed by its contents.
    article_texts = "\n\n".join(
        f"### {x.get('title', 'No Title')}\n{x.get('contents', '').strip()}"
        for x in crawl_json if x.get("contents")
    )
    item['article_texts'] = article_texts

    # Retry a bounded number of times. The previous unbounded `while` loop never
    # terminated when lable() kept returning an error string (e.g. a missing
    # image file or an exhausted quota).
    for attempt in range(1, MAX_LABEL_ATTEMPTS + 1):
        lable_item = lable(item)
        if lable_item in {'0', '1'}:
            break
        print(f"Invalid label (attempt {attempt}/{MAX_LABEL_ATTEMPTS}): {lable_item}")
        # Try the next API key before the following attempt.
        current_key_index = switch_api_key(current_key_index)
    else:
        # Not saved, so a later run will pick this sample up again.
        print(f"❌ Skipping [{i}] {item['img_local_path']}: no valid label after {MAX_LABEL_ATTEMPTS} attempts.")
        continue

    item['cheapfake_label'] = int(lable_item)
    results.append(item)

    print("-" * 100)
    print(f"[{i}] Result: {item['cheapfake_label']} - Ground Truth: {item['context_label']}")

    if item['cheapfake_label'] == item['context_label']:
        correct_tests += 1
    total_tests += 1

    # Save after every sample so an interrupted run can be resumed.
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=4)

    print(f"✅ Saved {len(results)} results so far. Accuracy: {correct_tests / total_tests:.2%}")
    current_key_index = switch_api_key(current_key_index)


Error crawling caption2: name 'crawl_articles' is not defined


NameError: name 'lable' is not defined