# Data segmentation

This notebook will attempt to segment the documents stored ind dictionaris stored in Data\ extraction/clean-data/..

In [1]:
from congreso import congreso as c 
import pandas as pd
from matplotlib import pyplot as plt
import json
import os
import re

In [2]:
folder_path = os.path.join('..', 'Data extraction', 'clean-data')  # Goes one level up, then into 'Data extraction/clean-data'
documents = {}

for year in range(2015, 2023):
    file_path = os.path.join(folder_path, f"d_{year}.json")
    
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            documents[year] = json.load(f)
        print(f"Loaded data for year {year} into documents[{year}]")
    else:
        print(f"File for year {year} not found at {file_path}")


Loaded data for year 2015 into documents[2015]
Loaded data for year 2016 into documents[2016]
Loaded data for year 2017 into documents[2017]
Loaded data for year 2018 into documents[2018]
Loaded data for year 2019 into documents[2019]
Loaded data for year 2020 into documents[2020]
Loaded data for year 2021 into documents[2021]
Loaded data for year 2022 into documents[2022]


In [3]:
d_2015 = documents[2015]
d_2016 = documents[2016]
d_2017 = documents[2017]
d_2018 = documents[2018]
d_2019 = documents[2019]
d_2020 = documents[2020]
d_2021 = documents[2021]
d_2022 = documents[2022]

Now let's start with the actual segmentation, first, some segmentation for a single doc to define the formats.

## Segmentation

### End start segmentation

In [4]:
def segment_docs(docs):
    segmented_docs = []
    successfully_segmented_count = 0

    # Define start patterns (handles cases where there's no space after a period)
    start_patterns = [
        r"(?:\.|\s|^)El señor (PRESIDENTE|VICEPRESIDENTE)(?: \([^\)]*\))?:",  
        r"(?:\.|\s|^)La señora (PRESIDENTA|VICEPRESIDENTA)(?: \([^\)]*\))?:", 
    ]
    
    for doc in docs:
        segmented_doc = {key: value for key, value in doc.items() if key != 'text'}
        text = doc.get('texto', '')

        # Find the first occurrence of any start pattern
        start_match = None
        start_positions = []

        for pattern in start_patterns:
            matches = list(re.finditer(pattern, text))
            if matches:
                start_positions.append(matches[0].start())  # Get the first occurrence of each pattern

        if start_positions:
            start_match = min(start_positions)  # Get the earliest match
        
        # Use the last period in the text as fallback end
        end_match = text.rfind('.')
        if end_match == -1:
            end_match = len(text)  # Fallback to full text if no period found
        
        # Extract the segment if a start is found
        if start_match is not None:
            segmented_text = text[start_match:end_match]
            segmented_doc['texto'] = segmented_text
            segmented_docs.append(segmented_doc)
            successfully_segmented_count += 1
        else:
            segmented_doc['texto'] = ''  # Leave empty if no valid start is found
            segmented_docs.append(segmented_doc)

    total_success = successfully_segmented_count / len(docs) if docs else 0
    return segmented_docs, total_success


In [5]:
ds_2015, success = segment_docs(d_2015)
print(success)
ds_2016, success = segment_docs(d_2016)
print(success)
ds_2017, success = segment_docs(d_2017)
print(success)
ds_2018, success = segment_docs(d_2018)
print(success)
ds_2019, success = segment_docs(d_2019)
print(success)
ds_2020, success = segment_docs(d_2020)
print(success)
ds_2021, success = segment_docs(d_2021)
print(success)
ds_2022, success = segment_docs(d_2022)
print(success)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


### Segment interventions

In [6]:
def extract_interventions(doc):
    interventions = []
    text = doc.get("texto", "")
    date = doc.get("fecha", "unknown_date")
    
    
    # Define patterns
    # Captures names with or without parentheses
    intervention_pattern = r"(El|La)? ?(señor|señora) ([A-ZÀ-ÿ]+(?: [A-ZÀ-ÿ-]+)*)(?: \((.*?)\))?:"
    
    # Capture full titles like "PRESIDENTE DEL GOBIERNO"
    role_pattern = r"(PRESIDENTE(?: DEL GOBIERNO)?|PRESIDENTA(?: DEL GOBIERNO)?|VICEPRESIDENTE(?: DEL GOBIERNO)?|VICEPRESIDENTA(?: DEL GOBIERNO)?|MINISTRO|MINISTRA)(?: DE [A-ZÀ-ÿ ]+)?"

    if not text:
        print(f"Warning: 'texto' field is empty or missing in document with date {date}.")
        return interventions

    # Find all matches for interventions
    matches = list(re.finditer(intervention_pattern, text))
    
    if not matches:
        print(f"No interventions found in document with date {date}.")
        return interventions

    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        intervention_text = text[start:end].strip()
        
        gender = "M" if match.group(2) == "señor" else "F" if match.group(2) == "señora" else "N"
        author = match.group(3).strip()
        alt_author = match.group(4).strip() if match.group(4) else None  
        
        if alt_author:
            author = alt_author  # Prefer the name in parentheses if available
        
        # Extract full role title
        charge_match = re.search(role_pattern, match.group(0))
        charge = charge_match.group(0).capitalize() if charge_match else None
        
        # Generate ID
        intervention_id = f"{date}.i{i + 1}"
        
        # Append to interventions
        interventions.append({
            "id": intervention_id,
            "autor": author,
            "charge": charge,
            "gender": gender,
            "text": intervention_text,
            "date": date
        })
    
    return interventions

In [9]:
pd.set_option("display.max_rows", None)  # Show all rows
ints = extract_interventions(ds_2022[0])
num_interventions = len(ints)

print(f"Number of interventions: {num_interventions}")

# Convert to DataFrame and exclude 'text' column
df = pd.DataFrame(ints).drop(columns=["text"])

# Display as a table
display(df)


Number of interventions: 83


Unnamed: 0,id,autor,charge,gender,date
0,20220125.i1,PRESIDENTA,Presidenta,F,20220125
1,20220125.i2,Montero Cuadrado,Ministra de hacienda y función pública,F,20220125
2,20220125.i3,PRESIDENTA,Presidenta,F,20220125
3,20220125.i4,QUEVEDO ITURBE,,M,20220125
4,20220125.i5,PRESIDENTA,Presidenta,F,20220125
5,20220125.i6,ORAMAS GONZÁLEZ-MORO,,F,20220125
6,20220125.i7,PRESIDENTA,Presidenta,F,20220125
7,20220125.i8,MARTÍNEZ GRANADOS,,F,20220125
8,20220125.i9,PRESIDENTA,Presidenta,F,20220125
9,20220125.i10,GUIJARRO GARCÍA,,M,20220125


Problemas: 
1. la intervencio del bolaños no esta
2. no hi ha diferencia entre vicepresident i vicepresidenta de .... un es de la camara l'altra es del gobern!

3. Intervencions amb punts sense espai: Hola.El señor PRESIDENTE: no es reconeixes perque no hi ha el espai
4. els guions molt amples s'han transcrit com a \x97

Edito el text i afegeixo aquells espais i canvio els guions?

In [None]:
pd.set_option("display.max_rows", None)  # Show all rows
ints = extract_interventions(ds_2015[3])
num_interventions = len(ints)

print(f"Number of interventions: {num_interventions}")

# Convert to DataFrame and exclude 'text' column
df = pd.DataFrame(ints).drop(columns=["text"])

# Display as a table
display(df)

In [12]:
display(extract_interventions(ds_2015[3]))

[{'id': '20150211.i1',
  'autor': 'PRESIDENTE',
  'charge': 'Presidente',
  'gender': 'M',
  'text': 'El señor PRESIDENTE: Se reanuda la sesión.Preguntas dirigidas al señor presidente del Gobierno. En primer lugar, pregunta el diputado don Cayo Lara, de Izquierda Plural.',
  'date': '20150211'},
 {'id': '20150211.i2',
  'autor': 'LARA MOYA',
  'charge': None,
  'gender': 'M',
  'text': 'El señor LARA MOYA: Gracias, presidente.Señor Rajoy, ¿cree más urgente para los ciudadanos españoles firmar un acuerdo contra el terrorismo yihadista en lugar de uno contra la pobreza?',
  'date': '20150211'},
 {'id': '20150211.i3',
  'autor': 'PRESIDENTE',
  'charge': 'Presidente',
  'gender': 'M',
  'text': 'El señor PRESIDENTE: Gracias, señor Lara.Señor presidente.',
  'date': '20150211'},
 {'id': '20150211.i4',
  'autor': 'Rajoy Brey',
  'charge': 'Presidente del gobierno',
  'gender': 'M',
  'text': 'El señor PRESIDENTE DEL GOBIERNO (Rajoy Brey): Que el Gobierno y los dos principales partidos de la