In [19]:
import re,json, networkx as nx,requests, matplotlib.pyplot as plt, math, os
import numpy as np
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin

# Scrape all session report links from French National Assembly (Legislature 14)

In [31]:
def scrape_links(url, output_path):
    headers = {'User-Agent': 'Mozilla/5.0'}

    print(f"Fetching {url}...")
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all sections with class 'tabs-content'
        content_sections = soup.find_all(class_='tabs-content')
        print(f"Found {len(content_sections)} content sections")
        
        all_links = {"asp": [], "folder": []}
        
        for section in content_sections:
            # Find 'liens-liste' within the section
            link_lists = section.find_all(class_='liens-liste')
            
            for link_list in link_lists:
                # Find all links within the list
                links = link_list.find_all('a', href=True)
                for link in links:
                    href = link.get('href')
                    if href:
                        # Make absolute URL
                        full_url = urljoin(url, href)
                        
                        # Determine key based on extension
                        if full_url.endswith('.asp'):
                            all_links["asp"].append(full_url)
                        else:
                            all_links["folder"].append(full_url)
        
        print(f"Total links collected: {len(all_links['asp']) + len(all_links['folder'])}")
        
        # Save to JSON
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(all_links, f, ensure_ascii=False, indent=2)
        
        print(f"Links saved to {output_path}")
    else:
        print(f"Failed to fetch page. Status code: {response.status_code}")


In [32]:
def process_folder_links(json_path):
    # Load existing links
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    folder_links = data.get("folder", [])
    asp_links = data.get("asp", [])
    headers = {'User-Agent': 'Mozilla/5.0'}

    print(f"Processing {len(folder_links)} folder links...")

    for folder_url in folder_links:
        print(f"Scraping {folder_url}...")
        try:
            response = requests.get(folder_url, headers=headers)
            response.encoding = 'utf-8'
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # Find all h1 tags with class 'seance' which contain the links
                seance_headers = soup.find_all('h1', class_='seance')
                
                count = 0
                for header in seance_headers:
                    link = header.find('a', href=True)
                    if link:
                        href = link.get('href')
                        # Construct absolute URL
                        full_url = urljoin(folder_url, href)
                        
                        # Avoid duplicates
                        if full_url not in asp_links:
                            asp_links.append(full_url)
                            count += 1
                print(f"  Found {count} new sessions.")
            else:
                print(f"  Failed to fetch {folder_url}: {response.status_code}")
                
        except Exception as e:
            print(f"  Error processing {folder_url}: {e}")
        
        # Be nice to the server
        time.sleep(0.5)

    # Update the data structure
    data["asp"] = asp_links
    print(f"Total ASP links: {len(asp_links)}")

    # Save updated JSON
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Updated {json_path}")

In [48]:
def create_deputy_mapping(actors_dir):
    mapping = {}
    files = [f for f in os.listdir(actors_dir) if f.endswith('.json')]
    
    print(f"Processing {len(files)} actor files...")
    
    for filename in files:
        filepath = os.path.join(actors_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
                
            actor = data.get('acteur', {})
            uid = actor.get('uid', {}).get('#text')
            ident = actor.get('etatCivil', {}).get('ident', {})
            
            civ = ident.get('civ', '')
            prenom = ident.get('prenom', '')
            nom = ident.get('nom', '')
            
            full_name = f"{civ} {prenom} {nom}".strip()
            
            if uid and full_name:
                mapping[full_name] = uid
                
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            
    return mapping

In [None]:
def scrape_all_speeches(json_path, deputees_json_path, deputy_mapping):
    # Load links
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    # Load existing deputees data
    with open(deputees_json_path, "r", encoding="utf-8") as f:
        deputees_data = json.load(f)
        
    asp_links = data.get("asp", [])
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    print(f"Starting scrape of {len(asp_links)} sessions...")
    
    for i, url in enumerate(asp_links):
        print(f"Processing {i}/{len(asp_links)}: {url} \r")
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                response.encoding = 'utf-8'
                soup = BeautifulSoup(response.text, 'html.parser')
                
                current_speaker_id = None
                
                for p in soup.find_all('p'):
                    if p.get('class'):
                        continue
                    
                    b_tag = p.find('b')
                    
                    if b_tag:
                        b_text = b_tag.get_text(strip=True)
                        p_text = p.get_text(strip=True)
                        
                        if p_text.startswith(b_text):
                            # Clean speaker name
                            raw_name = b_text.strip(" .:,")
                            
                            # Try to find ID in mapping
                            # The mapping keys are like "M. Jean-Marc Ayrault"
                            # The raw_name might be "M. Jean-Marc Ayrault" or similar
                            if raw_name in deputy_mapping:
                                current_speaker_id = deputy_mapping[raw_name]
                            else:
                                # Try fuzzy match or partial match if needed, or skip
                                # For now, strict match or skip
                                current_speaker_id = None
                                # print(f"Unknown speaker: {raw_name}") 

                            b_tag.decompose()
                    
                    if current_speaker_id and current_speaker_id in deputees_data:
                        for tag in p.find_all(['i', 'b']):
                            tag.decompose()
                        
                        text = p.get_text(strip=True)
                        
                        # Clean leading punctuation
                        #text = text.lstrip(" .,; ,.")
                        
                        if text:
                            # Add speech to the specific deputy
                            deputees_data[current_speaker_id]["speeches"].append(text)
                            
        except Exception as e:
            print(f"Error scraping {url}: {e}")

    # Save the updated deputees data
    with open(deputees_json_path, "w", encoding="utf-8") as f:
        json.dump(deputees_data, f, ensure_ascii=False, indent=1)
        
    print(f"Scraping complete. Updated {deputees_json_path}")

In [52]:
base_url = "https://www.assemblee-nationale.fr/14/debats/index.asp"
links_file = "./data/scraping/links/session_links_14.json"
deputees_json = "./data/processed/deputees_14.json"

scrape_links(base_url, links_file)
process_folder_links(links_file)
deputy_map = create_deputy_mapping("./data/all_actors/acteur")
scrape_all_speeches(links_file, deputees_json, deputy_map)

Fetching https://www.assemblee-nationale.fr/14/debats/index.asp...
Found 6 content sections
Total links collected: 19
Links saved to ./data/scraping/links/session_links_14.json
Processing 18 folder links...
Scraping https://www.assemblee-nationale.fr/14/cri/2016-2017/...
Found 6 content sections
Total links collected: 19
Links saved to ./data/scraping/links/session_links_14.json
Processing 18 folder links...
Scraping https://www.assemblee-nationale.fr/14/cri/2016-2017/...
  Found 123 new sessions.
  Found 123 new sessions.
Scraping https://www.assemblee-nationale.fr/14/cri/2015-2016/...
Scraping https://www.assemblee-nationale.fr/14/cri/2015-2016/...
  Found 232 new sessions.
  Found 232 new sessions.
Scraping https://www.assemblee-nationale.fr/14/cri/2015-2016-extra/...
Scraping https://www.assemblee-nationale.fr/14/cri/2015-2016-extra/...
  Found 18 new sessions.
  Found 18 new sessions.
Scraping https://www.assemblee-nationale.fr/14/cri/2015-2016-extra2/...
Scraping https://www.asse

In [None]:
scrape_all_speeches(links_file, deputees_json, deputy_map)

Starting scrape of 1359 sessions...
Processing 50/1359: https://www.assemblee-nationale.fr/14/cri/2016-2017/20170073.asp
Processing 50/1359: https://www.assemblee-nationale.fr/14/cri/2016-2017/20170073.asp
Error scraping https://www.assemblee-nationale.fr/14/cri/2016-2017/20170028.asp: Response ended prematurely
Error scraping https://www.assemblee-nationale.fr/14/cri/2016-2017/20170028.asp: Response ended prematurely
Error scraping https://www.assemblee-nationale.fr/14/cri/2016-2017/20170029.asp: Response ended prematurely
Error scraping https://www.assemblee-nationale.fr/14/cri/2016-2017/20170029.asp: Response ended prematurely
Error scraping https://www.assemblee-nationale.fr/14/cri/2016-2017/20170024.asp: HTTPSConnectionPool(host='www.assemblee-nationale.fr', port=443): Max retries exceeded with url: /14/cri/2016-2017/20170024.asp (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002820B3AB1D0>, 'Connection to www.assemblee-nationale.fr timed out. (c