In [1]:
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [8]:
# Configure Selenium
def setup_selenium(driver_path="chromedriver.exe", headless=True):
    """Build a Chrome WebDriver configured for scraping.

    Args:
        driver_path: Value passed to ``Service(executable_path=...)``.
            Defaults to ``"chromedriver.exe"`` (the original hard-coded value).
            Pass ``None`` to construct ``Service()`` without an explicit path
            and let Selenium locate the driver itself.
        headless: When true (default), add the ``--headless`` Chrome flag so
            no browser window is shown.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")  # Run in background
    # NOTE(review): presumably set to make the automated browser less
    # detectable by the target site -- confirm it is still required.
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    # Present a desktop Chrome-on-Windows user agent instead of the default one.
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36")

    # No driver auto-download helper is used here: either an explicit path is
    # supplied, or Selenium is left to resolve the driver on its own.
    if driver_path is None:
        service = Service()
    else:
        service = Service(executable_path=driver_path)
    return webdriver.Chrome(service=service, options=chrome_options)

In [9]:
# Scrape WHO FAQ page
def _extract_qa_pairs(container):
    """Pair each h3/h4 heading in ``container`` with the paragraphs after it.

    Args:
        container: A parsed BeautifulSoup element whose descendants are
            searched for ``h3``, ``h4`` and ``p`` tags in document order.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts. Headings with no
        non-empty paragraph before the next heading are omitted, as are
        paragraphs that appear before the first non-empty heading.
    """
    qa_pairs = []
    current_question = None
    current_answer = []

    for element in container.find_all(["h3", "h4", "p"]):
        # Collapse whitespace on the raw text rather than using
        # get_text(strip=True): that form strips every fragment and joins them
        # with "", so "safe <b>to eat</b>" would come out as "safeto eat".
        text = " ".join(element.get_text().split())

        if element.name in ("h3", "h4"):
            # A new heading closes the previous question, if it got an answer.
            if current_question and current_answer:
                qa_pairs.append({
                    "question": current_question,
                    "answer": " ".join(current_answer)
                })
            current_question = text
            current_answer = []
        elif current_question and text:
            # Empty paragraphs (spacers) are skipped so they cannot produce
            # a Q&A pair with a blank answer.
            current_answer.append(text)

    # Flush the final question, which has no following heading to close it.
    if current_question and current_answer:
        qa_pairs.append({
            "question": current_question,
            "answer": " ".join(current_answer)
        })

    return qa_pairs


def scrape_who_gmo_faq():
    """Load the WHO genetically-modified-food Q&A page and extract Q&A pairs.

    Opens the page in a browser from ``setup_selenium()``, waits up to 10
    seconds for a content element to be present, then parses the rendered
    HTML with BeautifulSoup.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts built from the
        h3/h4 headings and the paragraphs that follow them.

    Raises:
        ValueError: If neither a ``div.sf-content-block`` nor an ``article``
            element is found in the page source.

    Any exception from the browser (including the wait timing out) propagates
    to the caller; the driver is always quit first.
    """
    url = "https://www.who.int/news-room/q-a-detail/food-genetically-modified"
    driver = setup_selenium()

    try:
        driver.get(url)

        # Wait for main content to load (adjust timeout as needed)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".sf-content-block, article, .q-a-detail"))
        )

        soup = BeautifulSoup(driver.page_source, "html.parser")
        main_content = soup.find("div", class_="sf-content-block") or soup.find("article")

        if main_content is None:
            raise ValueError("Main content container not found")

        return _extract_qa_pairs(main_content)
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()


In [10]:
# Match misconceptions with rebuttals
def generate_rebuttals(qa_pairs):
    """Build one rebuttal record per misconception from scraped Q&A pairs.

    Each misconception has a list of keyword regexes. The first Q&A pair
    whose question or answer matches any keyword (case-insensitively)
    supplies the rebuttal text; misconceptions with no match are left out.

    Args:
        qa_pairs: Iterable of dicts with ``"question"`` and ``"answer"``
            string values.

    Returns:
        A list of dicts with the keys ``Claim_text``, ``Claim_type``,
        ``Rebuttal_text``, ``Source_abbreviation``, ``Tone``, ``Label`` and
        ``Rebuttal_strength``, in misconception order.
    """
    # (claim text, claim type, keyword regexes)
    misconceptions = (
        ("GMOs cause cancer", "health",
         (r'cancer', r'carcinogen')),
        ("GMOs are unsafe", "safety",
         (r'safe', r'safety', r'risk', r'unsafe')),
        ("GMOs cause diseases", "health",
         (r'disease', r'illness', r'health effect')),
        ("GMOs are not regulated", "regulation",
         (r'regulat', r'approval', r'assessment', r'authorities')),
        ("GMOs lead to infertility", "health",
         (r'infertility', r'reproductive')),
    )

    records = []

    for claim_text, claim_type, keywords in misconceptions:
        matcher = re.compile('|'.join(keywords), re.IGNORECASE)

        # Only the first matching pair is used for each misconception.
        first_hit = next(
            (
                pair for pair in qa_pairs
                if matcher.search(pair['question']) or matcher.search(pair['answer'])
            ),
            None,
        )
        if first_hit is None:
            continue

        # Collapse whitespace runs, then cap the rebuttal at 1000 characters.
        rebuttal = re.sub(r'\s+', ' ', first_hit['answer'])[:1000]

        records.append({
            "Claim_text": claim_text,
            "Claim_type": claim_type,
            "Rebuttal_text": rebuttal,
            "Source_abbreviation": "WHO",
            "Tone": "Scientific",
            "Label": "false",
            "Rebuttal_strength": 5
        })

    return records

In [11]:
# Main execution
# Run the scrape -> match -> save pipeline and report progress on stdout.
# Failures are reported rather than raised so the cell finishes, but the full
# traceback is printed as well: the one-line message alone does not show the
# exception type or where it came from.
if __name__ == "__main__":
    try:
        print("🔄 Scraping WHO GMO FAQ page...")
        qa_pairs = scrape_who_gmo_faq()
        
        if not qa_pairs:
            print("⚠️ No Q&A pairs found. Check if the page structure changed.")
        else:
            print(f"✅ Found {len(qa_pairs)} Q&A pairs")
            rebuttals = generate_rebuttals(qa_pairs)
            
            if rebuttals:
                df = pd.DataFrame(rebuttals)
                # Written relative to the current working directory.
                df.to_csv("gmo_rebuttals_selenium.csv", index=False)
                print(f"✅ Saved {len(df)} rebuttals to gmo_rebuttals_selenium.csv")
            else:
                print("⚠️ No matching rebuttals found. Adjust keywords or page selectors.")
    
    except Exception as e:
        # Imported here so this cell does not depend on the import cell.
        import traceback

        print(f"❌ Error: {e}")
        # Emit the exception type and stack so errors such as a NameError from
        # an unexecuted import cell can be traced to their source.
        traceback.print_exc()

🔄 Scraping WHO GMO FAQ page...
❌ Error: name 'Options' is not defined
