# **AI Policy Document Scraper**

In [1]:
"""
AI Policy Document Scraper
Automated collection of AI-related policy documents from government sources
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from datetime import datetime
import re
from urllib.parse import urljoin, urlparse
import logging
from typing import List, Dict, Optional
import os

class PolicyScraper:
    def __init__(self, config_path: str = "config/data_sources.yaml"):
        """Initialize the policy scraper with configuration"""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Load configuration
        self.config = self._load_config(config_path)

        # AI-related keywords for document filtering
        self.ai_keywords = [
            'artificial intelligence', 'machine learning', 'deep learning',
            'AI', 'ML', 'algorithmic', 'automated decision', 'neural network',
            'intelligence artificielle', 'apprentissage automatique',  # French
            'الذكاء الاصطناعي', 'التعلم الآلي'  # Arabic
        ]

    def _load_config(self, config_path: str) -> Dict:
        """Load configuration from file or return default config"""
        default_config = {
            "sources": {
                "algeria": {
                    "base_url": "https://www.joradp.dz/",
                    "search_patterns": ["/recherche", "/documents"],
                    "language": "ar"
                },
                "eu": {
                    "base_url": "https://digital-strategy.ec.europa.eu/",
                    "search_patterns": ["/policies/artificial-intelligence"],
                    "language": "en"
                },
                "usa": {
                    "base_url": "https://www.whitehouse.gov/",
                    "search_patterns": ["/briefing-room/presidential-actions", "/ai"],
                    "language": "en"
                }
            },
            "delay": 2,  # seconds between requests
            "timeout": 30
        }

        if os.path.exists(config_path):
            try:
                import yaml
                with open(config_path, 'r') as file:
                    return yaml.safe_load(file)
            except:
                self.logger.warning(f"Could not load config from {config_path}, using defaults")

        return default_config

    def scrape_government_sites(self, country_codes: List[str]) -> List[Dict]:
        """Scrape AI policies from government websites"""
        all_documents = []

        for country in country_codes:
            if country not in self.config["sources"]:
                self.logger.warning(f"No configuration found for country: {country}")
                continue

            self.logger.info(f"Scraping documents for {country}")
            country_docs = self._scrape_country_site(country)
            all_documents.extend(country_docs)

            # Respect rate limiting
            time.sleep(self.config.get("delay", 2))

        return all_documents

    def _scrape_country_site(self, country: str) -> List[Dict]:
        """Scrape documents from a specific country's government site"""
        source_config = self.config["sources"][country]
        base_url = source_config["base_url"]
        documents = []

        try:
            # Get main page
            response = self.session.get(base_url, timeout=self.config.get("timeout", 30))
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find links that might contain AI-related documents
            links = self._extract_relevant_links(soup, base_url, country)

            # Process each link
            for i, link in enumerate(links[:10]):  # Limit to first 10 links
                try:
                    doc_info = self._process_document_link(link, country)
                    if doc_info:
                        documents.append(doc_info)

                    # Rate limiting
                    if i % 3 == 0:
                        time.sleep(1)

                except Exception as e:
                    self.logger.error(f"Error processing link {link}: {str(e)}")
                    continue

        except Exception as e:
            self.logger.error(f"Error scraping {country}: {str(e)}")

        return documents

    def _extract_relevant_links(self, soup: BeautifulSoup, base_url: str, country: str) -> List[str]:
        """Extract links that might contain AI-related documents"""
        links = []

        # Find all links on the page
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            text = link.get_text().lower()

            # Convert relative URLs to absolute
            full_url = urljoin(base_url, href)

            # Check if link text contains AI-related keywords
            if any(keyword.lower() in text for keyword in self.ai_keywords):
                links.append(full_url)

            # Check for document extensions
            if any(ext in href.lower() for ext in ['.pdf', '.doc', '.docx']):
                if any(keyword.lower() in text for keyword in self.ai_keywords):
                    links.append(full_url)

        return list(set(links))  # Remove duplicates

    def _process_document_link(self, url: str, country: str) -> Optional[Dict]:
        """Process a single document link and extract metadata"""
        try:
            response = self.session.get(url, timeout=self.config.get("timeout", 30))
            response.raise_for_status()

            # Determine if it's a PDF or HTML document
            content_type = response.headers.get('content-type', '').lower()

            if 'pdf' in content_type:
                return self._process_pdf_document(url, response.content, country)
            else:
                return self._process_html_document(url, response.text, country)

        except Exception as e:
            self.logger.error(f"Error processing document {url}: {str(e)}")
            return None

    def _process_html_document(self, url: str, content: str, country: str) -> Dict:
        """Process HTML document and extract metadata"""
        soup = BeautifulSoup(content, 'html.parser')

        # Extract title
        title = soup.find('title')
        title_text = title.get_text().strip() if title else "Unknown Title"

        # Extract main content
        main_content = self._extract_main_content(soup)

        # Check if content is AI-related
        if not self._is_ai_related(main_content):
            return None

        return {
            'url': url,
            'title': title_text,
            'country': country,
            'content_type': 'html',
            'content_preview': main_content[:500] + "..." if len(main_content) > 500 else main_content,
            'scraped_date': datetime.now().isoformat(),
            'document_type': self.classify_document_type(title_text + " " + main_content),
            'estimated_date': self._extract_date_from_content(main_content),
            'word_count': len(main_content.split()),
            'ai_relevance_score': self._calculate_ai_relevance(main_content)
        }

    def _process_pdf_document(self, url: str, content: bytes, country: str) -> Dict:
        """Process PDF document and extract metadata"""
        # For PDF processing, we'll extract basic metadata
        # In a full implementation, you'd use PyPDF2 or similar

        return {
            'url': url,
            'title': self._extract_title_from_url(url),
            'country': country,
            'content_type': 'pdf',
            'content_preview': "PDF document - content extraction requires additional processing",
            'scraped_date': datetime.now().isoformat(),
            'document_type': 'policy_document',  # Default for PDFs
            'file_size': len(content),
            'ai_relevance_score': 0.7  # Assume moderate relevance for PDFs found via AI keywords
        }

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """Extract main content from HTML"""
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        # Try to find main content areas
        main_areas = soup.find_all(['main', 'article', 'div'], class_=re.compile(r'content|main|article'))

        if main_areas:
            content = ' '.join([area.get_text() for area in main_areas])
        else:
            # Fallback to body content
            body = soup.find('body')
            content = body.get_text() if body else soup.get_text()

        # Clean up whitespace
        content = re.sub(r'\s+', ' ', content).strip()
        return content

    def _is_ai_related(self, content: str) -> bool:
        """Check if content is AI-related"""
        content_lower = content.lower()
        return any(keyword.lower() in content_lower for keyword in self.ai_keywords)

    def _extract_date_from_content(self, content: str) -> Optional[str]:
        """Extract publication date from content"""
        # Look for common date patterns
        date_patterns = [
            r'\d{1,2}/\d{1,2}/\d{4}',  # MM/DD/YYYY or DD/MM/YYYY
            r'\d{4}-\d{2}-\d{2}',      # YYYY-MM-DD
            r'\d{1,2}\s+\w+\s+\d{4}',  # DD Month YYYY
        ]

        for pattern in date_patterns:
            match = re.search(pattern, content)
            if match:
                return match.group()

        return None

    def _extract_title_from_url(self, url: str) -> str:
        """Extract title from URL"""
        parsed = urlparse(url)
        filename = os.path.basename(parsed.path)

        # Remove extension and clean up
        title = os.path.splitext(filename)[0]
        title = title.replace('-', ' ').replace('_', ' ')
        return title.title()

    def _calculate_ai_relevance(self, content: str) -> float:
        """Calculate AI relevance score based on keyword frequency"""
        content_lower = content.lower()
        total_words = len(content.split())

        if total_words == 0:
            return 0.0

        ai_word_count = sum(content_lower.count(keyword.lower()) for keyword in self.ai_keywords)
        relevance_score = min(ai_word_count / total_words * 100, 1.0)  # Cap at 1.0

        return round(relevance_score, 3)

    def classify_document_type(self, text: str) -> str:
        """Classify document type based on content"""
        text_lower = text.lower()

        # Define classification keywords
        type_keywords = {
            'law': ['act', 'law', 'regulation', 'statute', 'legal', 'loi', 'règlement'],
            'strategy': ['strategy', 'plan', 'roadmap', 'framework', 'stratégie', 'plan'],
            'guideline': ['guideline', 'guidance', 'recommendation', 'best practice', 'guide'],
            'policy': ['policy', 'directive', 'politique', 'directive'],
            'report': ['report', 'study', 'analysis', 'research', 'rapport', 'étude'],
            'white_paper': ['white paper', 'position paper', 'livre blanc']
        }

        # Count matches for each type
        type_scores = {}
        for doc_type, keywords in type_keywords.items():
            score = sum(text_lower.count(keyword) for keyword in keywords)
            type_scores[doc_type] = score

        # Return type with highest score, or 'unknown' if no matches
        if max(type_scores.values()) > 0:
            return max(type_scores, key=type_scores.get)
        else:
            return 'unknown'

    def save_results(self, documents: List[Dict], output_path: str = "datasets/scraped_policies.json"):
        """Save scraped documents to file"""
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(documents, f, indent=2, ensure_ascii=False)

        # Also save as CSV for easier analysis
        csv_path = output_path.replace('.json', '.csv')
        df = pd.DataFrame(documents)
        df.to_csv(csv_path, index=False, encoding='utf-8')

        self.logger.info(f"Saved {len(documents)} documents to {output_path} and {csv_path}")

    def generate_summary_report(self, documents: List[Dict]) -> Dict:
        """Generate summary statistics for scraped documents"""
        if not documents:
            return {"error": "No documents to analyze"}

        df = pd.DataFrame(documents)

        summary = {
            "total_documents": len(documents),
            "documents_by_country": df['country'].value_counts().to_dict(),
            "documents_by_type": df['document_type'].value_counts().to_dict(),
            "average_ai_relevance": df['ai_relevance_score'].mean() if 'ai_relevance_score' in df.columns else 0,
            "content_types": df['content_type'].value_counts().to_dict(),
            "scraping_date": datetime.now().isoformat()
        }

        return summary

# Example usage
if __name__ == "__main__":
    # Initialize scraper
    scraper = PolicyScraper()

    # Scrape documents from specified countries
    countries = ['algeria', 'eu', 'usa']
    documents = scraper.scrape_government_sites(countries)

    # Save results
    scraper.save_results(documents)

    # Generate summary
    summary = scraper.generate_summary_report(documents)
    print("Scraping Summary:")
    print(json.dumps(summary, indent=2))

Scraping Summary:
{
  "total_documents": 6,
  "documents_by_country": {
    "eu": 6
  },
  "documents_by_type": {
    "law": 6
  },
  "average_ai_relevance": 1.0,
  "content_types": {
    "html": 6
  },
  "scraping_date": "2025-07-24T18:31:16.138054"
}
