In [2]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


In [3]:
# Configure Chrome options
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36")

# Initialize driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def scrape_efsa_gmo_claims():
    """Scrape EFSA for GMO safety information and debunk common myths"""
    base_url = "https://www.efsa.europa.eu/en/topics/topic/genetically-modified-organisms"
    driver.get(base_url)
    time.sleep(3)
    
    try:
        # Accept cookies if popup appears
        try:
            cookie_btn = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
            cookie_btn.click()
            time.sleep(1)
        except:
            pass
        
        # Dictionary of common GMO myths we want to debunk
        target_myths = {
            "GMOs cause cancer": ["cancer", "carcinogen", "tumor", "onco"],
            "GMOs cause diseases": ["disease", "illness", "health risk", "pathogen"],
            "GMOs are unsafe": ["unsafe", "dangerous", "hazard", "toxic"],
            "GMOs cause allergies": ["allerg", "immune response", "hypersensitivity"],
            "GMOs harm the environment": ["environment", "ecosystem", "biodiversity"]
        }
        
        results = []
        
        # Scrape all relevant content sections
        content_sections = driver.find_elements(By.CSS_SELECTOR, ".ecl-content-block, .ecl-editor, article")
        
        for section in content_sections:
            try:
                text = section.text.strip()
                if not text or len(text) < 100:
                    continue
                
                # Check for myth rebuttals
                for myth, keywords in target_myths.items():
                    if any(keyword.lower() in text.lower() for keyword in keywords):
                        # Find the most relevant paragraph
                        paragraphs = section.find_elements(By.TAG_NAME, "p")
                        for p in paragraphs:
                            p_text = p.text.strip()
                            if any(keyword.lower() in p_text.lower() for keyword in keywords):
                                # Extract publication date if available
                                date = "Not specified"
                                try:
                                    date_el = section.find_element(By.CSS_SELECTOR, ".ecl-date-block__item")
                                    date = date_el.text.strip()
                                except:
                                    pass
                                
                                results.append({
                                    "Claim": myth,
                                    "Claim_Type": "Health" if "cancer" in myth or "disease" in myth else "Safety",
                                    "EFSA_Response": p_text,
                                    "Source_URL": driver.current_url,
                                    "Publication_Date": date,
                                    "Scientific_Basis": "EFSA Risk Assessment",
                                    "Rebuttal_Strength": 5,  # Scale 1-5
                                    "Tags": ", ".join([k for k in keywords if k in p_text.lower()])
                                })
                                break
            except Exception as e:
                print(f"Error processing section: {e}")
                continue
        
        # If no results found, try searching specific documents
        if not results:
            results = search_efsa_documents(driver, target_myths)
            
        return results
    
    except Exception as e:
        print(f"Error scraping EFSA: {e}")
        return None

SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 114
Current browser version is 136.0.7103.93 with binary path C:\Program Files (x86)\Google\Chrome\Application\chrome.exe
Stacktrace:
Backtrace:
	GetHandleVerifier [0x0078A813+48355]
	(No symbol) [0x0071C4B1]
	(No symbol) [0x00625358]
	(No symbol) [0x006461AC]
	(No symbol) [0x00641EF3]
	(No symbol) [0x00640579]
	(No symbol) [0x00670C55]
	(No symbol) [0x0067093C]
	(No symbol) [0x0066A536]
	(No symbol) [0x006482DC]
	(No symbol) [0x006493DD]
	GetHandleVerifier [0x009EAABD+2539405]
	GetHandleVerifier [0x00A2A78F+2800735]
	GetHandleVerifier [0x00A2456C+2775612]
	GetHandleVerifier [0x008151E0+616112]
	(No symbol) [0x00725F8C]
	(No symbol) [0x00722328]
	(No symbol) [0x0072240B]
	(No symbol) [0x00714FF7]
	BaseThreadInitThunk [0x75EC5D49+25]
	RtlInitializeExceptionChain [0x77C4CFFB+107]
	RtlGetAppContainerNamedObjectPath [0x77C4CF81+561]


In [None]:
def search_efsa_documents(driver, target_myths):
    """Search EFSA scientific opinions for GMO safety information"""
    search_url = "https://www.efsa.europa.eu/en/search/site/GMO%20safety"
    driver.get(search_url)
    time.sleep(3)
    
    results = []
    
    try:
        # Get all search results
        items = driver.find_elements(By.CSS_SELECTOR, ".search-results .search-result")
        
        for item in items:
            try:
                title = item.find_element(By.CSS_SELECTOR, "h3 a").text
                link = item.find_element(By.CSS_SELECTOR, "h3 a").get_attribute("href")
                date = item.find_element(By.CSS_SELECTOR, ".search-result-date").text
                snippet = item.find_element(By.CSS_SELECTOR, ".search-result-snippet").text
                
                # Check for target myths in the content
                for myth, keywords in target_myths.items():
                    if any(keyword.lower() in snippet.lower() for keyword in keywords):
                        results.append({
                            "Claim": myth,
                            "Claim_Type": "Health" if "cancer" in myth or "disease" in myth else "Safety",
                            "EFSA_Response": f"{title}: {snippet}",
                            "Source_URL": link,
                            "Publication_Date": date,
                            "Scientific_Basis": "EFSA Scientific Opinion",
                            "Rebuttal_Strength": 5,
                            "Tags": ", ".join([k for k in keywords if k in snippet.lower()])
                        })
            except:
                continue
                
    except Exception as e:
        print(f"Error searching EFSA documents: {e}")
    
    return results


In [4]:
# Main execution
if __name__ == "__main__":
    print("🔄 Scraping EFSA for GMO misinformation rebuttals...")
    data = scrape_efsa_gmo_claims()
    
    if data:
        df = pd.DataFrame(data)
        
        # Add additional metadata
        df["Source_Organization"] = "EFSA"
        df["Last_Updated"] = pd.Timestamp.now().strftime("%Y-%m-%d")
        
        # Reorder columns
        columns = [
            "Claim", "Claim_Type", "EFSA_Response", "Scientific_Basis",
            "Rebuttal_Strength", "Publication_Date", "Source_Organization",
            "Source_URL", "Tags", "Last_Updated"
        ]
        df = df[columns]
        
        # Save to CSV
        df.to_csv("efsa_gmo_rebuttals.csv", index=False, encoding='utf-8-sig')
        print(f"✅ Saved {len(df)} rebuttals to efsa_gmo_rebuttals.csv")
        print("\nSample record:")
        print(df.iloc[0].to_dict())
    else:
        print("❌ No data was scraped")
    
    driver.quit()

🔄 Scraping EFSA for GMO misinformation rebuttals...


NameError: name 'scrape_efsa_gmo_claims' is not defined