# Embedding Party Manifestos

We want to measure manifesto alignment: the distance between each party's manifestos and that party's utterances over election cycles. This notebook embeds every manifesto published by each party in the years 1997-2022 using a pre-trained BERT model for Danish. The output is a set of .npy embedding files that are loaded later in order to derive manifesto alignment.

In [1]:
# Absolute path to the project root on the author's machine. The input CSV path
# and the embedding/CSV output paths in this notebook are all built from it, so
# it has to be adjusted when the notebook is run elsewhere.
directory_path = "/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/data_sci/data_sci_project/predicting_manifesto_alignment"

In [3]:
# Read the cleaned party manifestos in (semicolon-separated CSV)
import pandas as pd

manifesto_csv = f"{directory_path}/data/preprocessed/clean/manifestos/clean_manifestos.csv"
df = pd.read_csv(manifesto_csv, sep=";")

# Keep every column except those whose name starts with 'Unnamed'
unnamed_mask = df.columns.str.startswith('Unnamed')
df = df.loc[:, ~unnamed_mask]

# Display the resulting frame
df

Unnamed: 0,Party_Name,Year,text,processed_text,token_count
0,Alternativet,2015,Alternativet Alternativet er klar til valg Dan...,klar valg danmark står tre alvorlige kriser kl...,1858
1,Alternativet,2019,Vores Politik På denne side finder du Alternat...,politik side finder politik politiske visione...,15879
2,Dansk Folkeparti,1998,UDLÆNDINGEPOLITIKKEN UD med særlove og hovsa-l...,udlændingepolitikken særlove hovsaløsninger fo...,288
3,Dansk Folkeparti,2001,Fælles værdier – fælles ansvar Arbejdsprogram...,fælles værdier – fælles ansvar arbejdsprogram ...,19173
4,Dansk Folkeparti,2005,Vi vil have et trygt land Danmark skal være et...,trygt land danmark trygt sikkert sted land bor...,304
...,...,...,...,...,...
57,Venstre,2005,Valgløfter Danmark skal være verdensmestre i v...,valgløfter danmark verdensmestre viden frem af...,1879
58,Venstre,2007,Valggrundlag Folketingsvalg 13. november 2007 ...,valggrundlag folketingsvalg november endnu bed...,1169
59,Venstre,2011,Nye tider. Varig velfærd Der er udskrevet folk...,nye tider varig velfærd udskrevet folketingsva...,1729
60,Venstre,2015,DET VIL VENSTRE Ydelser til flygtninge skal ne...,ydelser flygtninge suniveau gør mindre attrak...,730


### Embed each manifesto
We'll treat the embeddings separately for each manifesto for each party. This should allow for a nuanced analysis of each party's adherence to its manifesto independently (with all the caveats that come with that of course).

In [None]:
import os
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
from danlp.models import load_bert_base_model

# Load the Danish BERT model (provided by danlp's load_bert_base_model)
print("Starting load of load_bert_base_model()")
model = load_bert_base_model()

# Load the tokenizer that is used below to count tokens and cut long texts
# into chunks before they are handed to `model.embed_text`.
# NOTE(review): this is the 'bert-base-multilingual-cased' checkpoint, which is
# not visibly tied to the Danish model loaded above. If the model tokenizes
# text with a different vocabulary, the token counts computed with this
# tokenizer are only an approximation of what the model sees -- confirm, and
# revisit CHUNK_SIZE together with any change made here.
print("Start load of tokenizer.")
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

# Chunking parameters, both counted in tokens produced by `tokenizer`:
# the maximum length of one chunk and the number of tokens that two
# consecutive chunks share.
CHUNK_SIZE = 512
OVERLAP = 50

def get_embedding(text, model, tokenizer, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Return a single embedding for ``text``, chunking it when it is too long.

    A text of at most ``chunk_size`` tokens is passed to ``model.embed_text``
    unchanged. A longer text is cut into windows of ``chunk_size`` tokens that
    advance by ``chunk_size - overlap`` tokens; each window is converted back
    to a string, embedded, and the window embeddings are combined with an
    unweighted mean.

    Args:
        text: The text to embed.
        model: Object whose ``embed_text(str)`` returns a 3-tuple; the second
            element is used as the embedding.
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_string(tokens)``.
        chunk_size: Maximum number of tokens per window.
        overlap: Number of tokens shared by consecutive windows.

    Returns:
        The embedding returned by the model for a short text, the mean of the
        window embeddings for a long text, or ``None`` when every window had
        to be skipped.

    Raises:
        ValueError: If the text needs chunking and ``overlap`` is not smaller
            than ``chunk_size`` (the windows could never advance).
    """
    tokens = tokenizer.tokenize(text)

    if len(tokens) <= chunk_size:
        _, embedding, _ = model.embed_text(text)
        return embedding

    stride = chunk_size - overlap
    if stride <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    chunk_embeddings = []
    for start in range(0, len(tokens), stride):
        end = start + chunk_size
        chunk = tokens[start:end]  # slicing clamps at the end of the token list
        print(f"Chunk start: {start}, Chunk end: {end}, Chunk length: {len(chunk)}")  # Debugging print

        chunk_text = tokenizer.convert_tokens_to_string(chunk)
        # Converting tokens to a string and tokenizing again is not guaranteed
        # to give the same number of tokens, so the limit is checked once more.
        retokenized_length = len(tokenizer.tokenize(chunk_text))
        if retokenized_length > chunk_size:
            print(
                f"Skipping chunk starting at token {start}: re-tokenized length "
                f"{retokenized_length} exceeds limit of {chunk_size}."
            )
        else:
            _, chunk_embedding, _ = model.embed_text(chunk_text)
            chunk_embeddings.append(chunk_embedding)

        # Once a window reaches the end of the text, any further window would
        # only repeat tokens already covered and would distort the mean.
        if end >= len(tokens):
            break

    # Check if chunk_embeddings is empty
    if not chunk_embeddings:
        print("No valid chunks were processed. Returning None.")
        return None

    # Average the embeddings from each chunk
    return np.mean(np.stack(chunk_embeddings), axis=0)

def save_embedding_speech(embedding, index, date, party_name, output_dir, filenames):
    """Write ``embedding`` to ``output_dir`` as ``<index>_<date>_<party_name>.npy``.

    Nothing is written and ``filenames`` is left untouched when ``embedding``
    is ``None``. Otherwise the array is saved with ``np.save`` and the bare
    file name (without directory) is appended to ``filenames``.
    """
    if embedding is not None:
        target_name = f"{index}_{date}_{party_name}.npy"
        np.save(os.path.join(output_dir, target_name), embedding)
        # Record the name only after the file has been written
        filenames.append(target_name)
        print(f"Saving embedding to {target_name}")  # Debugging print

def process_and_save_embeddings(df, model, tokenizer, output_dir):
    """Embed each row's ``processed_text`` and save the result under ``output_dir``.

    Rows are handled one at a time; an exception raised for a row is printed
    and the loop moves on to the next row. The progress bar advances once per
    row either way.

    Returns:
        The list of file names appended by ``save_embedding_speech``. Rows
        that raised, or whose embedding was ``None``, add no entry.
    """
    filenames = []  # Names of the files written so far
    began = time.time()
    with tqdm(total=len(df)) as progress:
        for index, row in df.iterrows():
            try:
                print(f"Processing row {index}")  # Debugging print
                vector = get_embedding(row['processed_text'], model, tokenizer)
                # Manifestos are identified by 'Year'; per the original note,
                # the other variant of this loop uses 'Date' here instead.
                save_embedding_speech(
                    vector, index, row['Year'], row['Party_Name'], output_dir, filenames
                )
                progress.update(1)
                progress.set_description(f"Processed and saved embedding for index {index}")
            except Exception as e:
                print(f"Error processing index {index}: {e}")
                progress.update(1)
    print(f"Total time elapsed: {time.time() - began} seconds.")
    return filenames

# Directory for saving embeddings
output_dir = f"{directory_path}/data/preprocessed/embeddings/manifesto_BERT_embeddings_1997_2022"

os.makedirs(output_dir, exist_ok=True)

# Embed every manifesto in df and save one .npy file per row
filenames = process_and_save_embeddings(df, model, tokenizer, output_dir)

# Add the filenames to the df.
# `filenames` only lists rows that were actually saved, so assigning it
# directly as a column raises a length-mismatch error as soon as one row
# failed or was skipped. Each file name starts with "<row index>_", which is
# used here to put every name on its own row; rows without a file get None.
saved_by_row = {name.split("_", 1)[0]: name for name in filenames}
df['embedding_filename'] = [saved_by_row.get(str(row_label)) for row_label in df.index]

# Saving the updated df to a csv file
df.to_csv(f"{directory_path}/data/preprocessed/clean/manifestos/clean_manifestos_1997_2022_with_embedding_filenames.csv", index=False)

# Take a look
df