In [None]:
# Environment setup cell. Lines starting with "!" are shell commands executed by IPython.
# Python packages: selenium is imported by the crawler cell below; webdriver-manager is
# installed too, although no import of it appears in the cells visible here.
!pip install selenium
!pip install webdriver-manager
# Install Chrome and ChromeDriver
!apt-get update
!apt-get install -y chromium-chromedriver
# Copy the chromedriver binary into /usr/bin, the path setup_driver() later hands to Service().
!cp /usr/lib/chromium-browser/chromedriver /usr/bin


Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.31.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

In [None]:
# Fake News Crawler for Fact-Checking Websites (Google Colab Version)
# This version uses a special Colab-compatible approach for Selenium

# Install required packages
# (selenium and webdriver-manager are also installed by the first cell of this notebook.)
!pip install selenium pandas requests beautifulsoup4

# Additional installations specific for Colab environment
!apt-get update
# NOTE(review): unlike the first cell, this apt call has no -y flag; confirm it does not
# stop to ask for confirmation when the cell is run non-interactively.
!apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin, where setup_driver() looks for it.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin/
!pip install webdriver-manager

# Import necessary libraries
import pandas as pd
import time
import random
import re
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Configure Chrome options specifically for Colab.
# Every argument is handed to the browser process verbatim (no shell parses it),
# so values must not be wrapped in an extra pair of quotes.
chrome_options = Options()
chrome_options.add_argument('--headless')               # no display is available
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument('--mute-audio')
chrome_options.add_argument('--remote-debugging-port=9222')
# Fix: the original wrapped the value in double quotes, which became part of the
# User-Agent header that sites receive ('"Mozilla/5.0 ... Safari/537.36"').
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36')

# Colab-specific WebDriver setup
def setup_driver():
    """Create a Chrome WebDriver configured with the module-level ``chrome_options``.

    Two strategies are attempted in order:
      1. let Selenium locate the driver binary itself;
      2. point Selenium at ``/usr/bin/chromedriver`` through a ``Service`` object.

    Returns:
        The WebDriver instance from the first strategy that succeeds, or ``None``
        when both fail (callers then fall back to requests + BeautifulSoup).
    """
    # Attempt 1: rely on the driver being discoverable without an explicit path.
    try:
        return webdriver.Chrome(options=chrome_options)
    except Exception as path_error:
        print(f"First method failed: {path_error}")

    # Attempt 2: explicit path to the binary copied there by the install cell.
    try:
        explicit_service = Service('/usr/bin/chromedriver')
        return webdriver.Chrome(service=explicit_service, options=chrome_options)
    except Exception as service_error:
        print(f"Second method failed: {service_error}")

    # Both attempts failed: signal the caller to use the requests-based path.
    print("Selenium setup failed, will use requests and BeautifulSoup instead")
    return None

# Alternative approach using requests and BeautifulSoup if Selenium fails
def crawl_with_requests(url, source_domain, timeout=15):
    """Fetch one article over plain HTTP and extract its fields with generic heuristics.

    Args:
        url: Address of the article page.
        source_domain: Label stored verbatim in the ``source_domain`` field of the result.
        timeout: Seconds passed to ``requests.get``; without it a stalled server
            would block the whole crawl indefinitely.

    Returns:
        A dict with the keys ``title``, ``claim``, ``verdict``, ``article_text``,
        ``url``, ``date_published``, ``date_crawled`` and ``source_domain``,
        or ``None`` when the response status is not 200 or any error occurs
        (the error is printed, never raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve {url}, status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Title: text of the first <h1>, if the page has one.
        heading = soup.find('h1')
        title = heading.text.strip() if heading else ""

        # Extract paragraphs for content
        paragraphs = soup.find_all('p')
        article_text = ' '.join([p.text.strip() for p in paragraphs])

        # Extract date - prefer the datetime attribute of <time>, else its text
        date_published = "Unknown"
        time_tag = soup.find('time')
        if time_tag:
            date_attr = time_tag.get('datetime')
            if date_attr:
                date_published = date_attr.split('T')[0]
            else:
                date_published = time_tag.text.strip()

        # Verdict, step 1: first *non-empty* element whose class mentions
        # "rating" or "verdict". (Previously an empty first match produced an
        # empty verdict and suppressed the keyword fallback below.)
        rating_elements = soup.find_all(['span', 'div'], class_=lambda x: x and ('rating' in x.lower() or 'verdict' in x.lower()))
        rating_texts = (element.text.strip() for element in rating_elements)
        verdict = next((text for text in rating_texts if text), "Unknown")

        # Verdict, step 2: otherwise use the sentence that mentions a known rating term.
        if verdict == "Unknown":
            verdict_terms = ['false', 'true', 'pants on fire', 'mostly true', 'half true',
                             'mostly false', 'pants-on-fire', 'mixture', 'unproven']
            lower_text = article_text.lower()
            for term in verdict_terms:
                if term not in lower_text:
                    continue
                paragraph_with_term = next((p.text.strip() for p in paragraphs if term in p.text.lower()), None)
                if paragraph_with_term:
                    # Take the sentence containing the verdict term
                    for sentence in paragraph_with_term.split('.'):
                        if term in sentence.lower():
                            verdict = sentence.strip()
                            break
                    break

        # Extract claim: first <div>/<p> whose class mentions "claim"
        claim = "See article for details"
        claim_elements = soup.find_all(['div', 'p'], class_=lambda x: x and ('claim' in x.lower()))
        if claim_elements:
            claim = claim_elements[0].text.strip()

        return {
            'title': title,
            'claim': claim,
            'verdict': verdict,
            'article_text': article_text,
            'url': url,
            'date_published': date_published,
            'date_crawled': datetime.now().strftime("%Y-%m-%d"),
            'source_domain': source_domain
        }
    except Exception as e:
        print(f"Error crawling {url} with requests: {str(e)}")
        return None

# Result schema: one row per crawled article. The column names match the keys of
# the dicts returned by crawl_with_requests() and crawl_article().
columns = [
    'title',
    'claim',
    'verdict',
    'article_text',
    'url',
    'date_published',
    'date_crawled',
    'source_domain',
]
# Start from an empty table that already carries the schema.
fake_news_db = pd.DataFrame(columns=columns)

# Function to get article URLs from Snopes
def get_snopes_article_urls(num_pages=3, timeout=10):
    """Collect article links from the first ``num_pages`` Snopes fact-check listing pages.

    Args:
        num_pages: Number of listing pages to request, starting at page 1.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the crawl (the original call had no timeout).

    Returns:
        List of ``href`` values taken from the first link of every
        ``<article class="list-group-item">`` card. Pages that fail are
        reported with ``print`` and skipped.
    """
    article_urls = []
    base_url = "https://www.snopes.com/fact-check/page/{}/"

    for page in range(1, num_pages + 1):
        try:
            url = base_url.format(page)
            print(f"Fetching URLs from Snopes page {page}...")

            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                print(f"Failed to fetch Snopes page {page}, status code: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find article links
            for card in soup.find_all('article', class_='list-group-item'):
                link = card.find('a', href=True)
                if link and 'href' in link.attrs:
                    article_urls.append(link['href'])

            # Be polite with delay
            time.sleep(random.uniform(1.0, 2.0))
        except Exception as e:
            print(f"Error fetching Snopes page {page}: {str(e)}")

    return article_urls

# Function to get article URLs from PolitiFact
def get_politifact_article_urls(num_pages=3, timeout=10):
    """Collect fact-check links from the first ``num_pages`` PolitiFact listing pages.

    Args:
        num_pages: Number of listing pages to request, starting at page 1.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the crawl (the original call had no timeout).

    Returns:
        List of article URLs. Links that do not start with ``http`` are
        prefixed with ``https://www.politifact.com``. Pages that fail are
        reported with ``print`` and skipped.
    """
    article_urls = []
    base_url = "https://www.politifact.com/factchecks/?page={}"

    for page in range(1, num_pages + 1):
        try:
            url = base_url.format(page)
            print(f"Fetching URLs from PolitiFact page {page}...")

            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                print(f"Failed to fetch PolitiFact page {page}, status code: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find article links
            for item in soup.find_all('li', class_='o-listicle__item'):
                link = item.find('a', class_='m-statement__link', href=True)
                if link and 'href' in link.attrs:
                    href = link['href']
                    # Site-relative links need the host prepended.
                    if not href.startswith('http'):
                        href = 'https://www.politifact.com' + href
                    article_urls.append(href)

            # Be polite with delay
            time.sleep(random.uniform(1.0, 2.0))
        except Exception as e:
            print(f"Error fetching PolitiFact page {page}: {str(e)}")

    return article_urls

# Function to get article URLs from FactCheck.org
def get_factcheck_article_urls(num_pages=3, timeout=10):
    """Collect article links from the first ``num_pages`` FactCheck.org "fake-news" listing pages.

    Args:
        num_pages: Number of listing pages to request, starting at page 1.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the crawl (the original call had no timeout).

    Returns:
        List of ``href`` values found under ``<h3 class="entry-title">`` of each
        ``<article class="post">``. Pages that fail are reported with ``print``
        and skipped.
    """
    article_urls = []
    base_url = "https://www.factcheck.org/fake-news/page/{}/"

    for page in range(1, num_pages + 1):
        try:
            url = base_url.format(page)
            print(f"Fetching URLs from FactCheck.org page {page}...")

            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                print(f"Failed to fetch FactCheck.org page {page}, status code: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find article links
            for item in soup.find_all('article', class_='post'):
                heading = item.find('h3', class_='entry-title')
                if heading is None:
                    # Fix: a post without the expected heading used to raise
                    # AttributeError and discard every remaining link on the page.
                    continue
                link = heading.find('a', href=True)
                if link and 'href' in link.attrs:
                    article_urls.append(link['href'])

            # Be polite with delay
            time.sleep(random.uniform(1.0, 2.0))
        except Exception as e:
            print(f"Error fetching FactCheck.org page {page}: {str(e)}")

    return article_urls

# Main crawling function that tries Selenium first, falls back to requests
def _css_text(driver, selector, default):
    """Stripped text of the first element matching ``selector``; ``default`` on any lookup error."""
    try:
        return driver.find_element(By.CSS_SELECTOR, selector).text.strip()
    except Exception:
        return default


def _paragraph_text(find_paragraphs):
    """Join the ``.text`` of the elements returned by ``find_paragraphs()``; "" if the lookup fails."""
    try:
        return ' '.join([p.text for p in find_paragraphs()])
    except Exception:
        return ""


def _extract_snopes(driver):
    """Return (title, claim, verdict, article_text, date_published) from a loaded Snopes page."""
    title = driver.find_element(By.CSS_SELECTOR, "h1.title").text.strip()

    # Verdict: dedicated label first; only a *missing* label triggers the wrapper lookup.
    try:
        verdict = driver.find_element(By.CSS_SELECTOR, "span.rating-label-with-symbol").text.strip()
    except NoSuchElementException:
        verdict = _css_text(driver, "div.rating-wrapper", "Unknown")

    claim = _css_text(driver, "div.claim-text", "No claim specified")
    article_text = _paragraph_text(
        lambda: driver.find_elements(By.CSS_SELECTOR, "div.single-body p"))

    try:
        date_str = driver.find_element(By.CSS_SELECTOR, "time.date-published").get_attribute('datetime')
        date_published = date_str.split('T')[0]  # Get just the date part
    except Exception:
        date_published = "Unknown"

    return title, claim, verdict, article_text, date_published


def _extract_politifact(driver):
    """Return (title, claim, verdict, article_text, date_published) from a loaded PolitiFact page."""
    title = driver.find_element(By.CSS_SELECTOR, "h1.c-title").text.strip()

    # Verdict: alt text of the rating image, else the "rating--<name>" class on the meter.
    try:
        verdict_img = driver.find_element(By.CSS_SELECTOR, "div.c-image img.c-image__original").get_attribute('alt')
        verdict = verdict_img if verdict_img else "Unknown"
    except Exception:
        try:
            meter_class = driver.find_element(By.CSS_SELECTOR, "div.meter").get_attribute('class')
            # Fix: a missing or empty class attribute used to leave the verdict as None / "".
            verdict_match = re.search(r'rating--(\w+)', meter_class or "")
            verdict = verdict_match.group(1) if verdict_match else "Unknown"
        except Exception:
            verdict = "Unknown"

    claim = _css_text(driver, "div.statement__text", "No claim specified")
    article_text = _paragraph_text(
        lambda: driver.find_element(By.CSS_SELECTOR, "article.article__text")
                      .find_elements(By.TAG_NAME, "p"))
    date_published = _css_text(driver, "span.statement__date", "Unknown")

    return title, claim, verdict, article_text, date_published


def _extract_factcheck(driver):
    """Return (title, claim, verdict, article_text, date_published) from a loaded FactCheck.org page."""
    title = driver.find_element(By.CSS_SELECTOR, "h1.entry-title").text.strip()

    # factcheck.org doesn't always have a clear claim section
    claim = "See article for details"

    article_text = _paragraph_text(
        lambda: driver.find_element(By.CSS_SELECTOR, "div.entry-content")
                      .find_elements(By.TAG_NAME, "p"))

    # No explicit verdict label on this site: derive one from keywords in the
    # body. This is a plain substring match, checked in this priority order.
    text_lower = article_text.lower()
    if any(word in text_lower for word in ['false', 'incorrect', 'misleading', 'fake']):
        verdict = "False"
    elif any(word in text_lower for word in ['partially true', 'partly true', 'half true']):
        verdict = "Partially True"
    elif any(word in text_lower for word in ['true', 'correct', 'accurate']):
        verdict = "True"
    else:
        verdict = "Unrated"

    date_published = _css_text(driver, "time.entry-date", "Unknown")

    return title, claim, verdict, article_text, date_published


def _extract_generic(driver):
    """Return (title, claim, verdict, article_text, date_published) for an unrecognised domain."""
    title = driver.find_element(By.TAG_NAME, "h1").text.strip()
    article_text = _paragraph_text(lambda: driver.find_elements(By.TAG_NAME, "p"))
    return title, "See article for details", "Unknown", article_text, "Unknown"


def crawl_article(url, source_domain, driver=None):
    """Crawl one article, preferring Selenium and falling back to ``crawl_with_requests``.

    Args:
        url: Address of the article page.
        source_domain: ``'snopes.com'``, ``'politifact.com'`` or ``'factcheck.org'``
            selects site-specific selectors; any other value uses generic extraction.
        driver: Selenium WebDriver, or ``None`` to go straight to the requests path.

    Returns:
        A dict with the keys ``title``, ``claim``, ``verdict``, ``article_text``,
        ``url``, ``date_published``, ``date_crawled`` and ``source_domain``, or
        whatever ``crawl_with_requests`` returns when Selenium is unavailable or fails.

    Optional fields fall back to defaults on any ``Exception``. Unlike the
    original bare ``except:`` clauses, KeyboardInterrupt and SystemExit are no
    longer swallowed, so a running crawl can be interrupted.
    """
    if driver is None:
        # If no driver is provided, use requests method
        return crawl_with_requests(url, source_domain)

    site_extractors = {
        'snopes.com': _extract_snopes,
        'politifact.com': _extract_politifact,
        'factcheck.org': _extract_factcheck,
    }

    try:
        # Try with Selenium; wait until the page has rendered at least an <h1>.
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )

        extract = site_extractors.get(source_domain, _extract_generic)
        title, claim, verdict, article_text, date_published = extract(driver)

        return {
            'title': title,
            'claim': claim,
            'verdict': verdict,
            'article_text': article_text,
            'url': url,
            'date_published': date_published,
            'date_crawled': datetime.now().strftime("%Y-%m-%d"),
            'source_domain': source_domain
        }

    except Exception as e:
        print(f"Selenium crawling of {url} failed: {str(e)}")
        # Fall back to requests method
        return crawl_with_requests(url, source_domain)

# Main execution
# Both names are bound before the try so that cleanup and the DataFrame update
# below work no matter where an error occurs (no need to probe locals()).
driver = None
all_results = []

try:
    # Try to set up Selenium WebDriver
    driver = setup_driver()

    # Get article URLs from each site
    print("\nFetching article URLs from fact-checking sites...")
    snopes_urls = get_snopes_article_urls(num_pages=2)
    politifact_urls = get_politifact_article_urls(num_pages=2)
    factcheck_urls = get_factcheck_article_urls(num_pages=2)

    print(f"\nFound {len(snopes_urls)} Snopes articles")
    print(f"Found {len(politifact_urls)} PolitiFact articles")
    print(f"Found {len(factcheck_urls)} FactCheck.org articles")

    # One (label, source_domain, urls) entry per site replaces three copy-pasted loops.
    crawl_plan = [
        ("Snopes", 'snopes.com', snopes_urls),
        ("PolitiFact", 'politifact.com', politifact_urls),
        ("FactCheck.org", 'factcheck.org', factcheck_urls),
    ]

    # Crawl each article URL
    for site_label, source_domain, site_urls in crawl_plan:
        print(f"\nCrawling {site_label} articles...")
        for url in site_urls:
            print(f"Crawling: {url}")
            result = crawl_article(url, source_domain, driver)
            if result:
                all_results.append(result)
                print(f"Successfully crawled: {result['title']}")
            # Be polite with delay between article requests
            time.sleep(random.uniform(1.0, 2.0))

except Exception as e:
    print(f"Error during crawling process: {str(e)}")

finally:
    # Close the WebDriver if it was initialized
    if driver is not None:
        driver.quit()

# Update the DataFrame. Doing this after the try block keeps the articles that
# were collected before an error; previously they were discarded. An empty table
# is replaced rather than concatenated so its placeholder dtypes are not merged
# into the real data.
new_rows = pd.DataFrame(all_results, columns=columns)
if fake_news_db.empty:
    fake_news_db = new_rows
else:
    fake_news_db = pd.concat([fake_news_db, new_rows], ignore_index=True)

# Display stats
print("\n--- Crawling Complete ---")
print(f"Total articles collected: {len(fake_news_db)}")
print("Articles by source:")
print(fake_news_db['source_domain'].value_counts())
print("Articles by verdict:")
print(fake_news_db['verdict'].value_counts())

# Save to CSV
fake_news_db.to_csv('fake_news_dataset.csv', index=False)
print("\nDataset saved to 'fake_news_dataset.csv'")

# Save to SQLite database (the table is replaced on every run)
import sqlite3
conn = sqlite3.connect('fake_news.db')
try:
    fake_news_db.to_sql('fact_checks', conn, if_exists='replace', index=False)
finally:
    # Release the database file even when the write fails.
    conn.close()
print("Dataset saved to SQLite database 'fake_news.db'")

# Display sample of the collected data.
# A bare `fake_news_db.head()` in the middle of a cell displays nothing (only a
# cell's last expression is shown), so the sample is printed explicitly.
print("\n--- Sample Data ---")
print(fake_news_db.head())

# Basic data analysis: verdict distribution, then word frequencies in the
# articles whose verdict mentions "false" or "fake".
print("\n--- Basic Data Analysis ---")

# How many articles carry each verdict label
verdict_counts = fake_news_db['verdict'].value_counts()
print("Verdict distribution:")
print(verdict_counts)

# Word frequencies are only computed when there are rows and article text to read
if len(fake_news_db) > 0 and 'article_text' in fake_news_db.columns:
    print("\nAnalyzing text content...")

    # Keep rows whose verdict contains "false" or "fake" (case-insensitive; missing verdicts excluded)
    false_claims = fake_news_db[
        fake_news_db['verdict'].str.contains('false|fake', case=False, na=False)
    ]

    if len(false_claims) > 0:
        # One lower-cased blob of every selected article body
        all_text = ' '.join(false_claims['article_text'].fillna(''))

        # Very frequent function words that are left out of the counts
        common_words = ['the', 'to', 'and', 'a', 'in', 'of', 'that', 'is', 'it', 'for', 'on', 'with', 'as', 'by', 'at']

        # Tally alphabetic words of three or more letters
        word_counts = {}
        for word in re.findall(r'\b[a-zA-Z]{3,}\b', all_text.lower()):
            if word in common_words:
                continue
            word_counts[word] = word_counts.get(word, 0) + 1

        # Highest counts first; ties keep first-seen order because the sort is stable
        sorted_words = sorted(word_counts.items(), key=lambda pair: -pair[1])
        print("Most common words in false claims:")
        for word, count in sorted_words[:20]:
            print(f"  {word}: {count}")

Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading

In [None]:

# News outlets to crawl. Each entry carries the outlet's display name, its base
# URL, and the section path that is appended to the base URL to form the listing page.
news_sources = [
    {"name": outlet_name, "url": outlet_url, "section": outlet_section}
    for outlet_name, outlet_url, outlet_section in (
        # US newspapers
        ("New York Times", "https://www.nytimes.com", "/world"),
        ("Washington Post", "https://www.washingtonpost.com", "/world"),
        ("Wall Street Journal", "https://www.wsj.com", "/news/world"),
        ("USA Today", "https://www.usatoday.com", "/news/world"),
        ("Los Angeles Times", "https://www.latimes.com", "/world-nation"),

        # UK newspapers
        ("The Guardian", "https://www.theguardian.com", "/international"),
        ("BBC News", "https://www.bbc.com", "/news"),
        ("The Telegraph", "https://www.telegraph.co.uk", "/news/world/"),
        ("Financial Times", "https://www.ft.com", "/world"),
        ("The Independent", "https://www.independent.co.uk", "/news/world"),

        # Other international English newspapers
        ("Al Jazeera", "https://www.aljazeera.com", "/news"),
        ("Reuters", "https://www.reuters.com", "/world"),
        ("CNN", "https://www.cnn.com", "/world"),
        ("The Times of India", "https://timesofindia.indiatimes.com", "/world"),
        ("South China Morning Post", "https://www.scmp.com", "/news/world"),
    )
]

# Function to extract article URLs from a newspaper homepage or section
def get_article_urls(source, max_articles=10):
    """Collect likely article URLs from one news source's section page.

    Args:
        source: Dict with ``"name"``, ``"url"`` (base URL) and ``"section"``
            (path appended to the base URL).
        max_articles: Maximum number of URLs returned (default 10).

    Returns:
        Up to ``max_articles`` unique absolute URLs, in page order, that are on
        the source's own host (or a sub-domain of it) and look like articles: a
        ``/202x/`` or ``/yyyy/mm/dd/`` path segment, or ``/article/``,
        ``/story/`` or ``/news/`` in the URL. Errors are printed and yield ``[]``.
    """
    # Imported here so the function works regardless of which cell holds the
    # notebook's import block.
    from urllib.parse import urljoin, urlparse

    article_urls = []
    seen_urls = set()  # O(1) duplicate check; the list keeps page order
    source_name = source["name"]
    base_url = source["url"]
    section_url = base_url + source["section"]
    base_host = urlparse(base_url).netloc.lower()

    # Navigation-style links that are never articles
    skipped_fragments = ('/tag/', '/category/', '/section/', '/author/')
    article_markers = ('/article/', '/story/', '/news/')

    print(f"Fetching articles from {source_name} ({section_url})...")

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(section_url, headers=headers, timeout=10)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            for link in soup.find_all('a', href=True):
                href = link['href']

                # Skip in-page anchors, javascript pseudo-links and navigation pages
                if href.startswith(('#', 'javascript:')) or any(part in href for part in skipped_fragments):
                    continue

                if href.startswith('/'):
                    # urljoin also resolves protocol-relative links ("//host/path"),
                    # which plain concatenation turned into a bogus same-site URL.
                    full_url = urljoin(base_url + '/', href)
                elif href.startswith('http'):
                    full_url = href
                else:
                    continue

                # Compare hosts rather than searching the whole URL: a share link
                # such as https://twitter.com/share?url=https://<site>/... used to
                # pass because the site's domain appears in its query string.
                link_host = urlparse(full_url).netloc.lower()
                if link_host != base_host and not link_host.endswith('.' + base_host):
                    continue

                # Make sure it looks like an article (contains year or article indicators)
                looks_like_article = (
                    re.search(r'/202\d/', full_url)
                    or re.search(r'/\d{4}/\d{2}/\d{2}/', full_url)
                    or any(marker in full_url for marker in article_markers)
                )
                if looks_like_article and full_url not in seen_urls:
                    seen_urls.add(full_url)
                    article_urls.append(full_url)

            print(f"Found {len(article_urls)} potential article URLs from {source_name}")

        else:
            print(f"Failed to fetch {source_name} homepage, status code: {response.status_code}")

    except Exception as e:
        print(f"Error fetching articles from {source_name}: {str(e)}")

    # Return a limited number of articles per source to avoid overwhelming
    return article_urls[:max_articles]

# Function to extract article content using newspaper3k library
def extract_article_content(url, source_name):
    """Download and parse one article, returning its fields as a flat dict.

    NOTE(review): ``Article`` and ``config`` are not defined in the cells
    visible here; presumably they come from a newspaper3k import/configuration
    cell elsewhere in the notebook. Confirm it runs before this function is called.

    Args:
        url: Address of the article.
        source_name: Outlet name stored in the ``source`` field.

    Returns:
        Dict with ``title``, ``text``, ``authors``, ``publish_date``,
        ``top_image``, ``url``, ``source``, ``keywords``, ``summary`` and
        ``crawl_date``; ``None`` if downloading, parsing or field extraction
        raises (the error is printed).
    """
    print(f"Extracting content from: {url}")

    try:
        article = Article(url, config=config)
        article.download()
        article.parse()

        # Keyword/summary extraction is best-effort: a failure here is reported
        # but does not discard the parsed article.
        try:
            article.nlp()
        except Exception as nlp_error:
            print(f"NLP processing error (non-critical): {str(nlp_error)}")

        record = {'title': article.title, 'text': article.text}

        author_list = article.authors
        record['authors'] = ', '.join(author_list) if author_list else "Unknown"

        published = article.publish_date
        record['publish_date'] = published.strftime('%Y-%m-%d') if published else "Unknown"

        record['top_image'] = article.top_image
        record['url'] = url
        record['source'] = source_name

        # keywords / summary may be absent when the NLP step did not run
        if hasattr(article, 'keywords'):
            record['keywords'] = ', '.join(article.keywords)
        else:
            record['keywords'] = ""
        record['summary'] = getattr(article, 'summary', "")

        record['crawl_date'] = datetime.now().strftime("%Y-%m-%d")
        return record

    except Exception as e:
        print(f"Error extracting content from {url}: {str(e)}")
        return None

# Function to attempt alternative extraction if newspaper3k fails
def fallback_extract_article(url, source_name):
    """Extract an article with requests + BeautifulSoup using generic page heuristics.

    Args:
        url: Address of the article.
        source_name: Outlet name stored in the ``source`` field.

    Returns:
        Dict with the same keys as ``extract_article_content`` (``authors`` is
        always "Unknown"; ``top_image`` and ``summary`` are always empty), or
        ``None`` on a non-200 response or any error (printed, not raised).
    """
    print(f"Attempting fallback extraction for: {url}")

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)

        if response.status_code != 200:
            print(f"Failed fallback extraction, status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title
        title_tag = soup.find('h1')
        title = title_tag.text.strip() if title_tag else ""

        # Extract content - look for article body, main content (most specific first)
        content_selectors = [
            'article',
            'main',
            'div[class*="article"]',
            'div[class*="content"]',
            'div[class*="story"]',
            'div[id*="article"]',
            'div[id*="content"]',
            'div[id*="story"]'
        ]

        content = ""
        for selector in content_selectors:
            main_content = soup.select(selector)
            if main_content:
                # Extract all paragraphs from the first matching content area
                paragraphs = main_content[0].find_all('p')
                if paragraphs:
                    content = ' '.join([p.text.strip() for p in paragraphs])
                    break

        # If still no content, just grab all paragraphs
        if not content:
            paragraphs = soup.find_all('p')
            content = ' '.join([p.text.strip() for p in paragraphs[:20]])  # Limit to first 20 paragraphs

        # Try to find publication date
        date = "Unknown"
        date_patterns = [
            'time',
            'span[class*="date"]',
            'div[class*="date"]',
            'p[class*="date"]',
            'meta[property="article:published_time"]'
        ]

        for pattern in date_patterns:
            date_element = soup.select_one(pattern)
            if not date_element:
                continue
            if date_element.name == 'meta':
                candidate = (date_element.get('content') or '').split('T')[0]
            else:
                # Prefer the machine-readable datetime attribute (as the
                # fact-check crawler does); fall back to the visible text.
                candidate = (date_element.get('datetime') or '').split('T')[0] or date_element.text.strip()
            # Fix: an element that yields no date no longer ends the search with
            # an empty string; the remaining patterns are still tried.
            if candidate:
                date = candidate
                break

        # Basic keyword extraction
        keywords = ""
        keyword_meta = soup.find('meta', {'name': 'keywords'})
        if keyword_meta and keyword_meta.get('content'):
            keywords = keyword_meta['content']

        return {
            'title': title,
            'text': content,
            'authors': "Unknown",  # Difficult to reliably parse authors
            'publish_date': date,
            'top_image': "",  # Skip image extraction for fallback
            'url': url,
            'source': source_name,
            'keywords': keywords,
            'summary': "",  # Skip summary for fallback
            'crawl_date': datetime.now().strftime("%Y-%m-%d")
        }

    except Exception as e:
        print(f"Error in fallback extraction for {url}: {str(e)}")
        return None

# Main crawling function
def crawl_news_sources(sources=news_sources, articles_per_source=5):
    """Crawl every source and return the articles that were extracted successfully.

    For each source the candidate URLs come from ``get_article_urls`` and are cut
    to ``articles_per_source``. Each URL is first handed to
    ``extract_article_content``; when that yields nothing (or no text),
    ``fallback_extract_article`` is tried. A random 1-3 second pause follows
    every URL.

    Args:
        sources: Iterable of source dicts (defaults to ``news_sources``).
        articles_per_source: Maximum number of URLs processed per source.

    Returns:
        List of article dicts that have both a non-empty title and non-empty text.
    """
    all_articles = []
    banner = '=' * 40

    for source in sources:
        print(f"\n{banner}")
        outlet = source['name']
        print(f"Crawling {outlet}...")
        print(f"{banner}")

        # Candidate URLs, capped at the requested number per source
        for url in get_article_urls(source)[:articles_per_source]:
            print(f"\nProcessing: {url}")

            # Primary extractor
            article_data = extract_article_content(url, outlet)

            # Anything without text counts as a failed primary extraction
            primary_ok = article_data is not None and article_data.get('text')
            if not primary_ok:
                print("Primary extraction failed, attempting fallback...")
                article_data = fallback_extract_article(url, outlet)

            # Keep only records that carry both a title and a body
            usable = article_data and article_data.get('title') and article_data.get('text')
            if not usable:
                print(f"Failed to extract content from {url}")
            else:
                all_articles.append(article_data)
                print(f"Successfully extracted: {article_data['title']}")

            # Polite delay
            time.sleep(random.uniform(1.0, 3.0))

    return all_articles

# Function to analyze news articles for potential indicators of fake news
def analyze_news_content(df):
    """Add heuristic clickbait / emotional / sensational scores to an article table.

    The frame is modified in place and also returned. Columns added:

    * text_length: whitespace-separated word count of 'text' (int).
    * clickbait_score: 10 points per clickbait pattern found in 'title',
      capped at 100 (int).
    * emotional_score: number of distinct emotional words present in 'text',
      divided by max(word_count / 100, 1), times 100, capped at 100 (float).
    * sensational_score: same formula with the sensational phrases and
      word_count / 200 (float).
    * potential_fake_news_score: mean of the three scores; a higher value
      means more indicators were found (float).

    Words and phrases are matched case-insensitively as whole terms, so
    'rips' does not fire inside 'trips' or 'scripts'.

    Args:
        df: pandas DataFrame with 'title' and 'text' columns. Non-string
            values (None, NaN) are treated as empty text.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    # Clickbait title indicators
    clickbait_patterns = [
        r'(?i)you won\'t believe',
        r'(?i)shocking',
        r'(?i)amazing',
        r'(?i)incredible',
        r'(?i)mind[-\s]?blowing',
        r'(?i)this is why',
        r'(?i)secret',
        r'(?i)surprising',
        r'(?i)unbelievable',
        r'(?i)\d+\s+(?:things|ways|reasons|facts|tricks|ideas|tips)',
        r'(?i)what happens next',
        r'(?i)this is what',
        r'(?i)must see',
        r'(?i)here\'s why',
        r'(?i)this is how'
    ]

    # Emotional language indicators
    emotional_words = [
        'outrage', 'angry', 'fury', 'furious', 'panic', 'terrified', 'terrifying',
        'horrific', 'devastating', 'tragic', 'heartbreaking', 'shocking', 'alarming',
        'disaster', 'crisis', 'catastrophe', 'emergency', 'scandal', 'bombshell', 'slams',
        'blasts', 'condemns', 'rips', 'destroys', 'annihilates'
    ]

    # Sensational phrases
    sensational_phrases = [
        'breaking news', 'exclusive', 'sources say', 'anonymous sources',
        'according to sources', 'experts say', 'scientists claim', 'doctors reveal',
        'studies show', 'research proves', 'government officials', 'officials say'
    ]

    def _as_text(value):
        # Missing titles/bodies (None, NaN) count as empty text instead of crashing
        return value if isinstance(value, str) else ""

    def _whole_term(term):
        # \b anchors prevent substring hits such as 'rips' inside 'trips'
        return re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)

    def _hits(regexes, text):
        # Number of distinct patterns that occur at least once in text
        return sum(1 for rx in regexes if rx.search(text))

    clickbait_res = [re.compile(pattern) for pattern in clickbait_patterns]
    emotional_res = [_whole_term(word) for word in emotional_words]
    sensational_res = [_whole_term(phrase) for phrase in sensational_phrases]

    titles = df['title'].map(_as_text)
    texts = df['text'].map(_as_text)
    word_counts = texts.map(lambda t: len(t.split())).astype(int)

    clickbait_hits = titles.map(lambda t: _hits(clickbait_res, t)).astype(int)
    emotional_hits = texts.map(lambda t: _hits(emotional_res, t)).astype(int)
    sensational_hits = texts.map(lambda t: _hits(sensational_res, t)).astype(int)

    df['text_length'] = word_counts

    # 10 points per matching pattern, scaled 0-100
    df['clickbait_score'] = (clickbait_hits * 10).clip(upper=100).astype(int)

    # Hits per 100 words (texts shorter than 100 words count as one unit).
    # Computed as whole float columns so no float is written into an int column.
    emotional_units = (word_counts / 100).clip(lower=1)
    df['emotional_score'] = (
        (emotional_hits / emotional_units) * 100
    ).clip(upper=100).astype(float)

    # Hits per 200 words (texts shorter than 200 words count as one unit)
    sensational_units = (word_counts / 200).clip(lower=1)
    df['sensational_score'] = (
        (sensational_hits / sensational_units) * 100
    ).clip(upper=100).astype(float)

    # Overall score: plain mean of the three indicator scores (higher = more suspicious)
    df['potential_fake_news_score'] = (
        df['clickbait_score'] + df['emotional_score'] + df['sensational_score']
    ) / 3

    return df

# Main execution
# Crawls the configured sources, scores the collected articles, writes them to
# 'newspaper_dataset.csv' and to the 'articles' table of
# 'newspaper_articles.db', then prints a short report. Leaves `articles` (list
# of records) and `news_db` (DataFrame) defined at notebook level.

# Imported here so the SQLite export does not depend on an import made in another cell
import sqlite3

print("Starting to crawl English language newspapers...\n")

try:
    # Execute the crawling
    articles = crawl_news_sources(articles_per_source=5)  # Adjust the number as needed

    # Convert to DataFrame
    news_db = pd.DataFrame(articles)

    # Display stats
    print("\n--- Crawling Complete ---")
    print(f"Total articles collected: {len(news_db)}")

    if news_db.empty:
        # A frame built from an empty list has no columns at all, so every
        # column lookup below would raise KeyError; report it plainly instead.
        print("No articles were collected; skipping analysis and export.")
    else:
        # Basic analysis of the content
        news_db = analyze_news_content(news_db)

        print("Articles by source:")
        print(news_db['source'].value_counts())

        # Save to CSV
        news_db.to_csv('newspaper_dataset.csv', index=False)
        print("\nDataset saved to 'newspaper_dataset.csv'")

        # Save to SQLite database.
        # sqlite3 can only bind None, int, float, str and bytes, so any
        # list-like cell (if present) is flattened to a comma-separated
        # string in a copy; news_db itself and the CSV are left untouched.
        sqlite_ready = news_db.copy()
        for column in sqlite_ready.columns:
            if sqlite_ready[column].dtype == object:
                sqlite_ready[column] = sqlite_ready[column].map(
                    lambda value: ', '.join(map(str, value))
                    if isinstance(value, (list, tuple, set)) else value
                )

        conn = sqlite3.connect('newspaper_articles.db')
        try:
            sqlite_ready.to_sql('articles', conn, if_exists='replace', index=False)
        finally:
            # Always release the connection, even when the write fails
            conn.close()
        print("Dataset saved to SQLite database 'newspaper_articles.db'")

        # Display sample of the collected data (only the columns that exist)
        print("\n--- Sample Data ---")
        sample_columns = ['title', 'source', 'publish_date', 'authors', 'potential_fake_news_score']
        available_columns = [col for col in sample_columns if col in news_db.columns]
        print(news_db[available_columns].head())

        # Basic analysis results
        if 'potential_fake_news_score' in news_db.columns:
            print("\n--- Fake News Analysis Results ---")
            print(f"Average potential fake news score: {news_db['potential_fake_news_score'].mean():.2f}/100")
            print("\nArticles with highest fake news indicators:")
            high_score_articles = news_db.sort_values('potential_fake_news_score', ascending=False).head(5)
            for _, row in high_score_articles.iterrows():
                print(f"- {row['title']} ({row['source']}) - Score: {row['potential_fake_news_score']:.2f}")

            print("\nArticles with lowest fake news indicators:")
            low_score_articles = news_db.sort_values('potential_fake_news_score').head(5)
            for _, row in low_score_articles.iterrows():
                print(f"- {row['title']} ({row['source']}) - Score: {row['potential_fake_news_score']:.2f}")

except Exception as e:
    # Best-effort cell: report the failure rather than aborting the notebook run
    print(f"An error occurred during execution: {str(e)}")

print("\nCrawling process completed.")

Starting to crawl English language newspapers...


Crawling New York Times...
Fetching articles from New York Times (https://www.nytimes.com/world)...
Found 0 potential article URLs from New York Times

Crawling Washington Post...
Fetching articles from Washington Post (https://www.washingtonpost.com/world)...
Error fetching articles from Washington Post: HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Read timed out. (read timeout=10)

Crawling Wall Street Journal...
Fetching articles from Wall Street Journal (https://www.wsj.com/news/world)...
Failed to fetch Wall Street Journal homepage, status code: 401

Crawling USA Today...
Fetching articles from USA Today (https://www.usatoday.com/news/world)...
Found 20 potential article URLs from USA Today

Processing: https://www.usatoday.com/news/nation/
Extracting content from: https://www.usatoday.com/news/nation/
Error extracting content from https://www.usatoday.com/news/nation/: name 'Article' is not defined
Primary extrac

  df.at[idx, 'emotional_score'] = min((emotional_count / max(len(text.split()) / 100, 1)) * 100, 100)
  df.at[idx, 'sensational_score'] = min((sensational_count / max(len(text.split()) / 200, 1)) * 100, 100)
