# Crawl diputades

In [1]:
import logging
import json
import time
from pathlib import Path
import gzip
from multiprocessing.pool import ThreadPool

from sqlalchemy import Column, Integer, String, ForeignKey, Boolean, UniqueConstraint, Text, Index
from sqlalchemy.orm import declarative_base, relationship, sessionmaker
from sqlalchemy import update
from pathlib import Path
from sqlalchemy import create_engine

import requests
from selenium import webdriver
import seleniumrequests
#from seleniumrequests import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker
from sqlalchemy import and_, or_
from sqlalchemy import update
from sqlalchemy import distinct, inspect
from json_stream import streamable_list

import pypdfium2 as pdfium
from bs4 import BeautifulSoup

## Database creation

In [2]:
# Shared declarative base: each ORM model in this file subclasses Base, and
# create_database() builds the tables collected on Base.metadata.
Base = declarative_base()

class Document(Base):
    """ORM model for the ``document`` table.

    Rows are keyed by ``pdf_url``. Apart from ``texto`` (``Text``), every
    column is declared as ``String``.
    """
    __tablename__ = 'document'

    pdf_url = Column(String, primary_key=True)  # primary key
    cve = Column(String)
    fecha = Column(String)
    fecha_mensaje = Column(String)
    mensaje = Column(String)
    ndia = Column(String)
    numdoc = Column(String)
    orga = Column(String)
    seri = Column(String)
    subi = Column(String)
    texto = Column(Text)
    secc = Column(String)
    # Foreign key to Term's primary key column (term.term).
    legislatura = Column(String, ForeignKey('term.term'))
    desu = Column(String)
    desu1 = Column(String)
    desu2 = Column(String)

    # Counterpart of Term.documents.
    term = relationship("Term", back_populates="documents")
    # Counterpart of Intervention.document.
    interventions = relationship("Intervention", back_populates="document")


class Term(Base):
    """ORM model for the ``term`` table (one row per legislative term).

    ``term`` is the primary key and is the target of
    ``Document.legislatura``; ``term_id`` is the integer identifier that
    ``Diputades.term_id`` points at.
    """
    __tablename__ = 'term'
    term = Column(String, primary_key=True)
    # Diputades.term_id declares a foreign key to this column. A foreign key
    # target must be a primary key or carry a UNIQUE constraint, otherwise
    # SQLite (with foreign keys enabled) reports "foreign key mismatch" and
    # stricter backends reject the schema outright.
    term_id = Column(Integer, unique=True)
    president = Column(String)
    init_date = Column(String)
    finish_date = Column(String)

    # Counterpart of Diputades.term.
    diputades = relationship("Diputades", back_populates="term")
    # Counterpart of Document.term.
    documents = relationship("Document", back_populates="term")

class Diputades(Base):
    """ORM model for the ``diputades`` table.

    CongresoCrawler.crawl_term builds instances with ``Diputades(**record)``,
    so every key of a crawled record must match an attribute declared here.
    """
    __tablename__ = 'diputades'
    id = Column(Integer, primary_key=True)  # primary key
    apellidos = Column(String)
    formacion = Column(String)
    apellidosNombre = Column(String)
    fchBaja = Column(String)
    genero = Column(Integer)
    fchAlta = Column(String)
    idLegislatura = Column(Integer)  # set by DiputadesGenerator.__iter__ on every record
    grupo = Column(String)
    idCircunscripcion = Column(Integer)
    nombreCircunscripcion = Column(String)
    nombre = Column(String)
    codParlamentario = Column(Integer)
    charge = Column(String)

    # Foreign key to term.term_id, the integer id column of Term (Term's
    # primary key is the string column `term`, not `term_id`).
    term_id = Column(Integer, ForeignKey('term.term_id'))
    # Counterpart of Term.diputades.
    term = relationship("Term", back_populates="diputades")

    # Counterpart of Intervention.speaker.
    interventions = relationship("Intervention", back_populates="speaker")




class Intervention(Base):
    """ORM model for the ``intervention`` table.

    Each row references one speaker (``diputades.id``) and one document
    (``document.pdf_url``).
    """
    __tablename__ = 'intervention'

    id = Column(String, primary_key=True)  # primary key (string-valued)
    speaker_id = Column(Integer, ForeignKey('diputades.id'))  # FK to Diputades.id
    text = Column(String)
    document_id = Column(String, ForeignKey('document.pdf_url'))  # FK to Document.pdf_url
    fecha = Column(String)
    num_int = Column(Integer)

    # Counterpart of Document.interventions.
    document = relationship("Document", back_populates="interventions")
    # Counterpart of Diputades.interventions.
    speaker = relationship("Diputades", back_populates="interventions")



def create_database(engine):
    """Create every table registered on ``Base.metadata`` using *engine*."""
    metadata = Base.metadata
    metadata.create_all(engine)

## Class to crawl diputades

In [3]:
class DiputadesGenerator:
    """Iterable over the deputy records of one legislative term.

    ``outer`` must expose a Selenium WebDriver as ``outer.driver``; it is
    used to load the search endpoint. ``id_legislatura`` is the term
    identifier substituted into the request URL and stamped on each record.
    """

    def __init__(self, outer, id_legislatura):
        self.outer = outer
        self.id_legislatura = id_legislatura

    def __iter__(self):
        """Fetch the term's deputies and yield them one dict at a time.

        Each yielded dict is the record from the response with an added
        ``"idLegislatura"`` key holding this generator's term id.
        """
        response = self.crawl_diputades_list(self.id_legislatura)
        print("Raw response:", response)  # Debugging aid
        # The list of deputies sits under the "data" key of the response.
        records = response.get("data", [])
        print(f"Number of diputades extracted: {len(records)}")

        for record in records:
            record["idLegislatura"] = self.id_legislatura
            yield record

    def crawl_diputades_list(self, id_legislatura):
        """Load the deputy search endpoint for a term and return the parsed JSON.

        The page body text is read through the browser, newlines are
        replaced by spaces, and the result is decoded with ``json.loads``.
        """
        endpoint = "https://www.congreso.es/es/busqueda-de-diputados?p_p_id=diputadomodule&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchDiputados&p_p_cacheability=cacheLevelPage&_diputadomodule_idLegislatura={legislatura}&_diputadomodule_genero=0&_diputadomodule_grupo=all&_diputadomodule_tipo=1&_diputadomodule_nombre=&_diputadomodule_apellidos=&_diputadomodule_formacion=all&_diputadomodule_filtroProvincias=%5B%5D&_diputadomodule_nombreCircunscripcion="
        self.outer.driver.get(endpoint.format(legislatura=id_legislatura))
        body_text = self.outer.driver.find_element(By.TAG_NAME, 'body').text
        return json.loads(body_text.replace("\n", " "))


## Class to crawl the Congreso de los Diputados website

In [4]:
class CongresoCrawler:
    """Crawl legislative terms and deputies from congreso.es into a database.

    Typical use: ``create_db()`` once to create the tables, then ``crawl()``
    to open Chrome, store the terms and store the deputies of each term.
    """

    def __init__(self, db: str):
        """Store the configuration; nothing is opened until a method asks for it.

        Args:
            db: SQLAlchemy database URL passed to ``create_engine``.
        """
        self.engine = None
        self.session = None
        self.browser = None  # unused by this class; kept so existing readers of the attribute keep working
        self.driver = None  # Selenium WebDriver, set by create_driver()
        self.base_url = "https://www.congreso.es"
        self.base_dir = "doc"
        self.db = db

    def create_db(self):
        """Create the engine (with SQL echo enabled) and all model tables."""
        self.engine = create_engine(self.db, echo=True, future=True)
        create_database(self.engine)

    def connect_db(self, echo=True):
        """Create the engine and open an ORM session bound to it.

        Args:
            echo: Forwarded to ``create_engine`` to toggle SQL logging.
        """
        self.engine = create_engine(self.db, echo=echo, future=True)
        session_class = sessionmaker(bind=self.engine)
        self.session = session_class()

    def crawl(self, echo=True):
        """Run the whole crawl: terms first, then the deputies of each term.

        The browser is always shut down when this method returns or raises,
        so a failed run does not leave a Chrome process behind.

        Args:
            echo: Forwarded to ``connect_db``.
        """
        self.connect_db(echo)
        self.create_driver()
        try:
            self.click_accept_cookies()
            self.get_terms()
            terms = {term.term_id: term for term in self.session.query(Term).all()}
            # Visit the terms in ascending term_id order.
            for term_id in sorted(terms):
                self.crawl_term(terms[term_id])
            logging.info("Finished crawling Congreso de los Diputados")
        finally:
            self.close_driver()

    def create_driver(self):
        """Start a Chrome WebDriver and keep it on ``self.driver``."""
        options = Options()
        #options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    def close_driver(self):
        """Quit the WebDriver if one is open; safe to call more than once."""
        driver, self.driver = self.driver, None
        if driver is not None:
            driver.quit()

    def click_accept_cookies(self):
        """Open the site's base URL and click the 'Aceptar todas' link."""
        self.driver.get(self.base_url)
        # Accept cookies
        continue_link = self.driver.find_element(By.LINK_TEXT, 'Aceptar todas')
        continue_link.click()

    def get_terms(self):
        """Read the term options from the deputy search page and store them.

        Every ``<option>`` with a positive integer value is merged into the
        session as a ``Term``; one commit is issued at the end.
        """
        self.driver.get("https://www.congreso.es/es/busqueda-de-diputados")
        # Element whose <option> children list the available terms.
        e = self.driver.find_element(By.XPATH, '//*[@id="_diputadomodule_legislatura"]')
        for term_option in e.find_elements(By.TAG_NAME, 'option'):
            term_id = int(term_option.get_attribute("value"))  # Integer ID
            term_name = term_option.text.split(" ")[0]  # First word of the label; presumably the Roman numeral
            if term_id > 0:
                t = Term(term_id=term_id, term=term_name)
                self.session.merge(t)  # Merge to avoid duplicates
        self.session.commit()
        logging.info("Terms committed to DB")

    def crawl_term(self, term):
        """Fetch and store every deputy of ``term``.

        Args:
            term: ``Term`` row; its ``term_id`` drives the request and is
                written to each stored deputy.
        """
        logging.info(f"Starting crawling term {term.term}")
        it = DiputadesGenerator(self, term.term_id)
        # Count from 1 so a commit happens after every tenth row instead of
        # also firing right after the very first one.
        for i, diputada_dict in enumerate(it, start=1):
            diputada_dict["term_id"] = term.term_id
            d = Diputades(**diputada_dict)
            print("Inserting into DB:", diputada_dict)
            self.session.add(d)
            if i % 10 == 0:
                self.session.commit()
        # Flush whatever is left from the last partial batch.
        self.session.commit()
        logging.info(f"Finished crawling term {term.term}")
        

In [None]:
# Show INFO-level records so the crawler's logging.info progress messages appear.
logging.basicConfig(level=logging.INFO)

# Initialize and run the crawler
cc = CongresoCrawler("sqlite:///interventions.db")  # SQLite database URL
cc.create_db()  # create the model tables
cc.crawl()  # start Chrome, then store the terms and the deputies of each term