# Data segmentation

This notebook attempts to segment the documents stored as dictionaries in `Data extraction/clean-data/`.

In [1]:
from congreso import congreso as c 
import pandas as pd
from matplotlib import pyplot as plt
import json
import os
import re

In [2]:
# The cleaned yearly dumps live one directory up, under 'Data extraction/clean-data'.
folder_path = os.path.join('..', 'Data extraction', 'clean-data')
documents = {}

for year in range(2015, 2023):
    file_path = os.path.join(folder_path, f"d_{year}.json")

    # Report and skip a missing year instead of failing the whole load.
    if not os.path.exists(file_path):
        print(f"File for year {year} not found at {file_path}")
        continue

    with open(file_path, "r", encoding="utf-8") as handle:
        documents[year] = json.load(handle)
    print(f"Loaded data for year {year} into documents[{year}]")


Loaded data for year 2015 into documents[2015]
Loaded data for year 2016 into documents[2016]
Loaded data for year 2017 into documents[2017]
Loaded data for year 2018 into documents[2018]
Loaded data for year 2019 into documents[2019]
Loaded data for year 2020 into documents[2020]
Loaded data for year 2021 into documents[2021]
Loaded data for year 2022 into documents[2022]


In [3]:
# Per-year shortcuts into the loaded corpus; a KeyError here means that year's file was not loaded.
d_2015, d_2016, d_2017, d_2018 = documents[2015], documents[2016], documents[2017], documents[2018]
d_2019, d_2020, d_2021, d_2022 = documents[2019], documents[2020], documents[2021], documents[2022]

Now let's start with the actual segmentation. First, we segment a single document to define the formats.

## Segmentation

### End start segmentation

In [4]:
def segment_docs(docs):
    """Trim each session transcript to the span between its opening and closing marks.

    The kept span starts at the earliest intervention of the chair
    ("El señor PRESIDENTE:", "La señora VICEPRESIDENTA (...):", ...) and ends
    at the furthest "Se levanta/suspende la sesión ... Eran las ..." closing
    formula. When no closing formula is found, the span ends at the last
    period of the text, or at the end of the text if it has no period.

    Args:
        docs: collection of document dicts. The transcript is read from the
            'texto' key; every other key is copied to the result unchanged.

    Returns:
        A tuple ``(segmented_docs, total_success)``. ``segmented_docs`` has
        one dict per input document, in input order, whose 'texto' is the
        trimmed transcript, or '' when no opening mark was found.
        ``total_success`` is the number of documents with an opening mark
        divided by ``c.num_docs_term(docs)`` (0.0 when that count is 0).
    """
    # Opening marks: the chair taking the floor, optionally followed by "(Name)".
    start_patterns = [
        r"El señor (PRESIDENTE|VICEPRESIDENTE)(?: \([^\)]*\))?:",
        r"La señora (PRESIDENTA|VICEPRESIDENTA)(?: \([^\)]*\))?:",
    ]

    # Closing marks: the session is adjourned/suspended and the time is stated.
    # The second variant also swallows a trailing parenthesised remark.
    end_patterns = [
        r"[Ss]e (suspende|levanta) la sesión.*?(Eran|Era) (las|la) .*?(\.|\?|\!)",
        r"[Ss]e (suspende|levanta) la sesión.*?(Eran|Era) (las|la) .*?(\.|\?)\s*(?:\([^)]*\))?\s*[\.\?!]",
    ]

    segmented_docs = []
    successfully_segmented_count = 0

    for doc in docs:
        # Copy the metadata only; the raw transcript is replaced below.
        segmented_doc = {key: value for key, value in doc.items() if key != 'texto'}

        # A missing or null transcript is treated as empty text.
        text = doc.get('texto') or ''

        # Earliest opening mark across ALL patterns: a session may be opened by
        # either a "señor" or a "señora", so pattern order must not decide.
        start_candidates = [
            found.start()
            for found in (re.search(pattern, text) for pattern in start_patterns)
            if found
        ]
        start_match = min(start_candidates) if start_candidates else None

        # Furthest closing mark across ALL patterns, so a later pattern cannot
        # replace a match that ends further into the text.
        end_candidates = [
            found.end()
            for pattern in end_patterns
            for found in re.finditer(pattern, text)
        ]
        if end_candidates:
            end_match = max(end_candidates)
        else:
            # No closing formula: cut after the last period, else keep everything.
            last_period = text.rfind('.')
            end_match = last_period + 1 if last_period != -1 else len(text)

        if start_match is not None:
            segmented_doc['texto'] = text[start_match:end_match]
            successfully_segmented_count += 1
        else:
            segmented_doc['texto'] = ''  # No opening mark: leave the text empty
        segmented_docs.append(segmented_doc)

    # Guard the ratio so an empty collection does not raise ZeroDivisionError.
    total_docs = c.num_docs_term(docs)
    total_success = successfully_segmented_count / total_docs if total_docs else 0.0
    return segmented_docs, total_success


In [5]:
# Segment every year in chronological order, printing each year's success ratio.
_segmented_per_year = []
for _year_docs in (d_2015, d_2016, d_2017, d_2018, d_2019, d_2020, d_2021, d_2022):
    _segmented, success = segment_docs(_year_docs)
    print(success)
    _segmented_per_year.append(_segmented)

(ds_2015, ds_2016, ds_2017, ds_2018,
 ds_2019, ds_2020, ds_2021, ds_2022) = _segmented_per_year

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


### Segment interventions

In [35]:
def extract_interventions(doc):
    """Split a session transcript into one record per speaker intervention.

    An intervention starts at a speaker header such as "El señor PRESIDENTE:"
    or "La señora MINISTRA DE ... (Name Surname):" and runs until the next
    header (or the end of the text). Text before the first header is dropped.

    Args:
        doc: dict with the transcript under 'texto' and, optionally, the
            session date under 'fecha' (defaults to "unknown_date").

    Returns:
        A list of dicts, in order of appearance, with keys:
        'id' ("<date>.i<n>", n starting at 1), 'autor' (the name in
        parentheses when present, otherwise the upper-case header name),
        'charge' (capitalised role title found in the header, or None),
        'gender' ("M"/"F" from señor/señora), 'text' (the stripped
        intervention, header included) and 'date'.
        An empty list is returned, after printing a notice, when the text is
        empty/missing or contains no speaker header.
    """
    interventions = []
    # A missing or null transcript is treated as empty text.
    text = doc.get("texto") or ""
    date = doc.get("fecha", "unknown_date")

    # Speaker header: optional article, señor/señora, upper-case name or title,
    # optional "(real name)" and a closing colon.
    intervention_pattern = r"(El|La)? ?(señor|señora) ([A-ZÀ-ÿ]+(?: [A-ZÀ-ÿ-]+)*)(?: \((.*?)\))?:"

    # Role titles, including full forms like "PRESIDENTE DEL GOBIERNO" or "MINISTRA DE ...".
    role_pattern = r"(PRESIDENTE(?: DEL GOBIERNO)?|PRESIDENTA(?: DEL GOBIERNO)?|VICEPRESIDENTE(?: DEL GOBIERNO)?|VICEPRESIDENTA(?: DEL GOBIERNO)?|MINISTRO|MINISTRA)(?: DE [A-ZÀ-ÿ ]+)?"

    if not text:
        print(f"Warning: 'texto' field is empty or missing in document with date {date}.")
        return interventions

    matches = list(re.finditer(intervention_pattern, text))

    if not matches:
        print(f"No interventions found in document with date {date}.")
        return interventions

    gender_by_title = {"señor": "M", "señora": "F"}

    for i, match in enumerate(matches):
        # Each intervention runs up to the next header, or to the end of the text.
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        intervention_text = text[start:end].strip()

        gender = gender_by_title.get(match.group(2), "N")

        # Prefer the name in parentheses over the upper-case title when available.
        header_name = match.group(3).strip()
        alt_author = match.group(4).strip() if match.group(4) else None
        author = alt_author if alt_author else header_name

        # The role pattern's trailing class includes the space, so a title
        # followed by " (Name)" would keep a trailing blank without strip().
        charge_match = re.search(role_pattern, match.group(0))
        charge = charge_match.group(0).strip().capitalize() if charge_match else None

        interventions.append({
            "id": f"{date}.i{i + 1}",
            "autor": author,
            "charge": charge,
            "gender": gender,
            "text": intervention_text,
            "date": date
        })

    return interventions


In [40]:
# Try the intervention splitter on one segmented 2015 session (index 4) and inspect the records.
# NOTE(review): the variable name suggests 2015-01-20, but the recorded output of this
# cell shows ids/dates of 20150212 — confirm which session index 4 is and rename if needed.
int_20150120 = extract_interventions(ds_2015[4])
display(int_20150120)

[{'id': '20150212.i1',
  'autor': 'PRESIDENTE',
  'charge': 'Presidente',
  'gender': 'M',
  'text': 'El señor PRESIDENTE: Se reanuda la sesión.Punto del orden del día relativo a los dictámenes de la Comisión de Asuntos Exteriores sobre convenios internacionales con números de expediente 110/139 y 110/140. Algunos grupos han manifestado su deseo de intervenir. Les recuerdo quetienen cinco minutos por grupo. El Grupo Mixto, en principio, parece que no lo desea. Por el Grupo Vasco, PNV, tiene la palabra el señor Agirretxea.',
  'date': '20150212'},
 {'id': '20150212.i2',
  'autor': 'AGIRRETXEA URRESTI',
  'charge': None,
  'gender': 'M',
  'text': 'El señor AGIRRETXEA URRESTI: Gracias, señor presidente. Muy brevemente, respetando los cinco minutos asignados.En cuanto al convenio con Arabia Saudí, voy a empezar con un nombre: Raif Badawi. Sé que muchos de ustedes dirán que el convenio que hoy ratificamos no tiene nada que ver con eso, que es diferente y que lo de hoy es otra cosa, pero pa

In [41]:
# `pd` is already imported in the notebook's first cell, so it is not re-imported here.
# Show every row when displaying frames (pandas truncates long tables by default).
pd.set_option("display.max_rows", None)

ints = extract_interventions(ds_2015[1])
num_interventions = len(ints)

print(f"Number of interventions: {num_interventions}")

# Tabulate the interventions without the bulky 'text' column.
# errors="ignore" keeps this working when no interventions were found (empty frame).
df = pd.DataFrame(ints).drop(columns=["text"], errors="ignore")

# Display as a table
display(df)


Number of interventions: 333


Unnamed: 0,id,autor,charge,gender,date
0,20150121.i1,PRESIDENTE,Presidente,M,20150121
1,20150121.i2,GARZÓN ESPINOSA,,M,20150121
2,20150121.i3,PRESIDENTE,Presidente,M,20150121
3,20150121.i4,Rajoy Brey,Presidente del gobierno,M,20150121
4,20150121.i5,PRESIDENTE,Presidente,M,20150121
5,20150121.i6,GARZÓN ESPINOSA,,M,20150121
6,20150121.i7,PRESIDENTE,Presidente,M,20150121
7,20150121.i8,Rajoy Brey,Presidente del gobierno,M,20150121
8,20150121.i9,PRESIDENTE,Presidente,M,20150121
9,20150121.i10,PRESIDENTE,Presidente,M,20150121
