# Embedding parliamentary dialogue

We want to measure manifesto alignment (the distance between party manifestos and the respective party's utterances over election cycles). The present notebook embeds each party's utterances per day using a pre-trained BERT model for the Danish language. The output of the code is a set of .npy embedding files, to be loaded later in order to derive manifesto alignment.

In [3]:
# Local absolute path to the project root; the data paths in the following cells are built from it.
directory_path = "/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/data_sci/data_sci_project/predicting_manifesto_alignment"

In [5]:
# Load the merged parliamentary dialogue table from disk
import pandas as pd

merged_csv_path = f"{directory_path}/data/preprocessed/clean/merged/parliamentary_dialogue_1997_2022.csv"
df = pd.read_csv(merged_csv_path)

# Discard every column whose name starts with 'Unnamed'
unnamed_columns = df.columns[df.columns.str.startswith('Unnamed')]
df = df.drop(columns=unnamed_columns)

# Display the resulting frame
df

  df = pd.read_csv(f"{directory_path}/data/preprocessed/clean/merged/parliamentary_dialogue_1997_2022.csv")


Unnamed: 0,ID,Date,Start time_colFromDPC,End time_colFromDPC,Time_colFromDPC,Agenda item_colFromDPC,Case no_colFromDPC,Case type_colFromDPC,Agenda_colFromDPC,Subject 1_colFromDPC,...,antielite_salience,corrupt_salience,members_vs_leadership,mip_one,mip_two,mip_three,chesversion,party_name_full,processed_text,token_count
0,,1997-10-07,,,,,,,Statsministerens redegørelse i henhold til gru...,,...,,,,,,,,,år siden sagde flertal befolkningen ja europa ...,10600
1,,1997-10-09,,,,,,,2) Forhandling om redegørelse nr. R 1.,,...,,,,,,,,,står afgørende folketingssamling to valg vigti...,1339
2,,1997-10-09,,,,,,,2) Forhandling om redegørelse nr. R 1.,,...,,,,,,,,,taler indgreb imod stigende forbrug anledning...,62
3,,1997-10-09,,,,,,,2) Forhandling om redegørelse nr. R 1.,,...,,,,,,,,,ligesom åbningstalen kom udlændingeafsnittet s...,81
4,,1997-10-09,,,,,,,2) Forhandling om redegørelse nr. R 1.,,...,,,,,,,,,kan forstå udlændingepolitikken kraftigt nedpr...,74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549013,ParlaMint-DK_20220607215832,2022-06-07,,,,,,,,,...,,,,,,,,,bemærkningerne starten indlægget ja samle for...,25
549014,ParlaMint-DK_20220607215911,2022-06-07,,,,,,,,,...,,,,,,,,,se frem givtigt ny måske fik liste egentlig ...,33
549015,ParlaMint-DK_20220607215941,2022-06-07,,,,,,,,,...,,,,,,,,,både forhold tal refereret ellers ligger egne...,32
549016,ParlaMint-DK_20220607220032,2022-06-07,,,,,,,,,...,,,,,,,,,gange ens hjerte fyldt ord spilder teksten ...,105


In [10]:
# Show the dtype of every column in df
df.dtypes

ID                               object
Date                     datetime64[ns]
Start time_colFromDPC            object
End time_colFromDPC              object
Time_colFromDPC                 float64
                              ...      
mip_three                       float64
chesversion                     float64
party_name_full                  object
processed_text                   object
token_count                       int64
Length: 111, dtype: object

Since not every MP speaks every day, the data is grouped per party p and day d for the analyses.

In [12]:
import pandas as pd

# Ensure datetime format
df['Date'] = pd.to_datetime(df['Date'])

# First and last logged day in the dataset
min_date = df['Date'].min()
max_date = df['Date'].max()

# Keep only rows that have both a date and a text.
# - Rows without a date cannot be assigned to a (day, party) group.
# - Rows without text are dropped *before* the string cast below; otherwise
#   astype(str) would turn the missing value into the literal word "nan",
#   which would then be joined into the party's document for that day.
# .copy() makes filtered_df an independent frame, so the column assignment
# below does not write into a slice of df (avoids SettingWithCopyWarning).
filtered_df = df.dropna(subset=['Date', 'processed_text']).copy()

# Ensure 'processed_text' is a string
filtered_df['processed_text'] = filtered_df['processed_text'].astype(str)

# Group data by full date and party: one space-joined document per (day, party)
grouped_df = (
    filtered_df.groupby(['Date', 'Speaker_party_name'])['processed_text']
    .agg(' '.join)
    .reset_index()
    .rename(columns={"Speaker_party_name": "Party_Name"})  # This is the one we'll use
)

# Pivot the data to have one row per date and one column per party, just to see
pivoted_df = grouped_df.pivot(index='Date', columns='Party_Name', values='processed_text').reset_index()

# Parties that did not speak on a given day get an empty string instead of NaN
pivoted_df = pivoted_df.fillna('')

# Take a look at grouped_df (which is the one we'll use)
grouped_df

Unnamed: 0,Date,Party_Name,processed_text
0,1997-10-07,Socialdemokratiet,år siden sagde flertal befolkningen ja europa ...
1,1997-10-09,Dansk Folkeparti,ligesom åbningstalen kom udlændingeafsnittet s...
2,1997-10-09,Det Radikale Venstre,lad allerførst starte konstatere bennedsens v...
3,1997-10-09,Enhedslisten,taler indgreb imod stigende forbrug anledning...
4,1997-10-09,Konservative Folkeparti,oprigtig talt dybt beskæmmende kan prøve bort...
...,...,...,...
20040,2022-06-07,Konservative Folkeparti,egentlig bare helt grundlæggende godt tænke vi...
20041,2022-06-07,Nye Borgerlige,nye debat set haft siden valgt folketinget ...
20042,2022-06-07,Socialdemokratiet,simpelt hen glad dag simpelt hen nødt orde...
20043,2022-06-07,Socialistisk Folkeparti,ordet muligvis velset inden sommerferien forl...


### Embed each day, each party
We'll treat the embeddings separately for each party per day. This should allow for a nuanced analysis of each party's adherence to its manifesto independently (with all the caveats that come with that of course).

In [None]:
import os
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
from danlp.models import load_bert_base_model

# Load the Danish BERT model
print("Starting load of load_bert_base_model()")
model = load_bert_base_model()

# Load the corresponding tokenizer
# NOTE(review): this tokenizer is loaded independently of the danlp model above and is
# only used by get_embedding() to count and split tokens. Confirm that its token counts
# match what model.embed_text() sees internally — TODO verify.
print("Start load of tokenizer.")
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased') # Same tokenizer used as in documentation

# Define chunk size and overlap (both measured in tokenizer tokens)
CHUNK_SIZE = 512
OVERLAP = 50

def get_embedding(text, model, tokenizer, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Embed `text`, splitting it into overlapping token windows when it is too long.

    Texts of at most `chunk_size` tokens are passed to `model.embed_text` in one
    piece. Longer texts are cut into windows of `chunk_size` tokens that start
    every `chunk_size - overlap` tokens; each window is embedded separately and
    the window embeddings are averaged.

    Args:
        text: The text to embed.
        model: Object exposing `embed_text(text)` that returns a 3-tuple whose
            second element is used as the embedding.
        tokenizer: Object exposing `tokenize(text)` and
            `convert_tokens_to_string(tokens)`.
        chunk_size: Maximum number of tokens per window.
        overlap: Number of tokens shared by two consecutive windows.

    Returns:
        The embedding returned by the model for short texts, the element-wise
        mean of the window embeddings for long texts, or None when no window
        could be embedded.

    Raises:
        ValueError: If a long text must be chunked and `overlap >= chunk_size`
            (the windows would never advance).
    """
    # NOTE(review): token counts come from `tokenizer`, not from the model; if
    # the model tokenizes differently or adds special tokens, a window may still
    # be longer than the model accepts — confirm against the model in use.
    tokens = tokenizer.tokenize(text)

    if len(tokens) <= chunk_size:
        _, embedding, _ = model.embed_text(text)
        return embedding

    stride = chunk_size - overlap
    if stride <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    chunk_embeddings = []
    for start in range(0, len(tokens), stride):
        end = start + chunk_size
        # Slicing clamps at the end of the list, so the last window is simply shorter.
        chunk = tokens[start:end]

        chunk_text = tokenizer.convert_tokens_to_string(chunk)
        # Re-tokenising the reconstructed string can yield a different token count;
        # skip the window if it no longer fits.
        if len(tokenizer.tokenize(chunk_text)) <= chunk_size:
            _, chunk_embedding, _ = model.embed_text(chunk_text)
            chunk_embeddings.append(chunk_embedding)

        # Once a window reaches the end of the text, any further window would lie
        # entirely inside this one and would over-weight the tail in the average.
        if end >= len(tokens):
            break

    # Check if chunk_embeddings is empty
    if not chunk_embeddings:
        print("No valid chunks were processed. Returning None.")
        return None

    # Average the embeddings from each chunk
    return np.mean(np.stack(chunk_embeddings), axis=0)

def save_embedding_speech(embedding, index, date, party_name, output_dir, filenames):
    """Write one embedding to `output_dir` as a .npy file and record its filename.

    The file is named "<index>_<date>_<party_name>.npy". On success the filename
    (not the full path) is appended to `filenames`. When `embedding` is None
    nothing is written and `filenames` is left untouched.
    """
    if embedding is not None:
        filename = f"{index}_{date}_{party_name}.npy"
        np.save(os.path.join(output_dir, filename), embedding)
        # Keep track of what was written so the caller can log it
        filenames.append(filename)
        print(f"Saving embedding to {filename}")  # Debugging print

def process_and_save_embeddings(df, model, tokenizer, output_dir):
    """Embed every row of `df` and save each embedding to `output_dir`.

    For each row, the 'processed_text' column is embedded with `get_embedding`
    and written with `save_embedding_speech`, using the row index and the
    'Date' and 'Party_Name' columns to build the filename. Errors on a single
    row are printed and do not stop the run.

    Args:
        df: Frame with 'processed_text', 'Date' and 'Party_Name' columns.
        model: Embedding model forwarded to `get_embedding`.
        tokenizer: Tokenizer forwarded to `get_embedding`.
        output_dir: Directory the .npy files are written to.

    Returns:
        A list with exactly one entry per row of `df`, in row order: the saved
        filename, or None when no file was saved for that row (embedding was
        None or an error occurred). Keeping the list aligned with the rows lets
        the caller assign it directly as a column of `df`.
    """
    start_time = time.time()
    filenames = []  # One entry per row: filename or None
    with tqdm(total=len(df)) as pbar:
        for index, row in df.iterrows():
            saved_before = len(filenames)
            try:
                print(f"Processing row {index}")  # Debugging print
                embedding = get_embedding(row['processed_text'], model, tokenizer)
                save_embedding_speech(embedding, index, row['Date'], row['Party_Name'], output_dir, filenames)
                pbar.set_description(f"Processed and saved embedding for index {index}")
            except Exception as e:
                # Best effort: report the failing row and carry on with the next one
                print(f"Error processing index {index}: {e}")
            finally:
                # save_embedding_speech appends a filename only on success; add a
                # placeholder otherwise so the list stays aligned with df's rows.
                if len(filenames) == saved_before:
                    filenames.append(None)
                pbar.update(1)
    print(f"Total time elapsed: {time.time() - start_time} seconds.")
    return filenames

# Make sure the target directory for the .npy embedding files exists
output_dir = f"{directory_path}/data/preprocessed/embeddings/speech1997_2022_BERT"
os.makedirs(output_dir, exist_ok=True)

# Embed every (day, party) document and write the embeddings to disk
filenames = process_and_save_embeddings(grouped_df, model, tokenizer, output_dir)

# Attach the filenames to the df as a new column (needs one filename per row)
grouped_df['embedding_filename'] = filenames

# Write the updated df to a CSV file for logging
filenames_csv_path = f"{directory_path}/data/preprocessed/clean/merged/parliamentary_dialogue_1997_2022_with_embedding_filenames.csv"
grouped_df.to_csv(filenames_csv_path, index=False)

# Display the result
grouped_df