# Data Cleaning 🌟

This notebook is the first notebook used in data-cleaning (before annotation). It has the following sections (and some chunks in between for testing / inspection purposes):

- **XML Handling**: Parse files, fixing XML/text errors
- **Metadata & Content Extraction**: Pulling session info, agenda items, speaker details, timestamps, and utterances  
- **Making DebateType**: Tag each segment as Q&A, deliberation, bill reading, party leader debates, or other
- **Topics**: Clean titles, run BERTopic (to get policy topics), map to policy categories
- **Role and Turn Structure**: Infer roles (chair, asker, proponent, minister), assign turn and unit IDs
- **Anonymization**: Replace party names with pseudonyms and speaker mentions with generic labels
- **Export**: Save cleaned datasets (full and by debate type) for further analysis

In [None]:
import os
import re
import pandas as pd
import lxml.etree as ET

# Get home directory dynamically
HOME_DIR = os.path.expanduser("~")

# Define input XML folder
XML_FOLDER = os.path.join(HOME_DIR, "Desktop", "AARHUS_UNIVERSITY", "kandidat", "thesis_work", "data", "parliament", "debates_in_chamber") # Starting with all debates, taking from 2020-2025 later on

# Define output folder
OUTPUT_FOLDER = os.path.join(HOME_DIR, "Desktop", "AARHUS_UNIVERSITY", "kandidat", "thesis_work", "data_cleaning", "output")

# Ensure output dir exists
os.makedirs(OUTPUT_FOLDER, 
            exist_ok=True)

# Define output CSV path
OUTPUT_CSV = os.path.join(OUTPUT_FOLDER, 
                          "debates_2009_2025.csv")

# Debug print to check paths
print(f"📂 XML Input Folder: {XML_FOLDER}")
print(f"📂 Output Folder: {OUTPUT_FOLDER}")
print(f"📄 Output CSV File: {OUTPUT_CSV}")

# Regex pattern to find timestamps in format "Kl. xx:xx"
TIME_PATTERN = re.compile(r"\(Kl\. \d{2}:\d{2}\)")

def clean_text_repetitions(text):
    """
    Detects and removes erroneous duplicate fragments in XML text.
    """
    duplicate_patterns = [
        r"(som trods alt er kriteriet for at få lov at komme ind\.)",  # Detects exact phrase repetition
    ]

    for pattern in duplicate_patterns:
        matches = re.findall(pattern, text)
        if len(matches) > 1:  
            text = re.sub(pattern, "", text, count=1)  

    return text.strip()

def fix_xml_file(input_file, output_file):
    """
    Reads an XML file, fixes duplicated text errors, and saves the corrected version.
    """
    parser = ET.XMLParser(recover=True)  
    tree = ET.parse(input_file, parser)
    
    for char_elem in tree.findall(".//Char"):
        if char_elem.text:
            original_text = char_elem.text
            cleaned_text = clean_text_repetitions(original_text)
            if original_text != cleaned_text:
                print(f"🛠 FIXED: {original_text} -> {cleaned_text}")
            char_elem.text = cleaned_text  

    tree.write(output_file, encoding="utf-8", pretty_print=True)
    print(f"✅ Fixed XML saved as '{output_file}'")

def extract_agenda_items(root):
    """
    Extracts agenda items (ItemNo and ShortTitle) from the XML structure.
    """
    agenda_items = []
    
    for agenda in root.findall(".//DagsordenPunkt"):
        item_no = agenda.findtext(".//MetaFTAgendaItem/ItemNo", default="Unknown")
        short_title = agenda.findtext(".//MetaFTAgendaItem/ShortTitle", default="No title provided")
        
        agenda_items.append((item_no, short_title, agenda))
        
        print(f"📝 DEBUG: Extracted Agenda Item {item_no} -> {short_title}")

    return agenda_items

def determine_debate_type(agenda_title):
    """
    Categorizes the debate type based on keywords in the agenda title.
    """
    agenda_lower = agenda_title.lower()

    if "spørgetime" in agenda_lower or "spørgetid" in agenda_lower:
        return "question-answering"
    
    if "forhandling af" in agenda_lower:  # Detects 'Forhandling af'
        return "deliberation"
    
    if "behandling" in agenda_lower:
        return "reading of bill"
    
    return "other"

def extract_debate_data(xml_file):
    """
    Parses an XML file and extracts relevant debate information, including timestamps and agenda items.
    """
    print(f"🔍 DEBUG: Processing file {xml_file}...")

    try:
        tree = ET.parse(xml_file)
    except ET.XMLSyntaxError:
        print(f"❌ XML ERROR in {xml_file} -> Attempting to fix...")
        fixed_file = xml_file.replace(".xml", "_fixed.xml")
        fix_xml_file(xml_file, fixed_file)  
        tree = ET.parse(fixed_file)  
        print(f"🔄 Retrying parsing with fixed file: {fixed_file}")

    root = tree.getroot()

    session_id = root.findtext(".//ParliamentarySession", default="Unknown")
    meeting_number = root.findtext(".//MeetingNumber", default="Unknown")
    date_of_sitting = root.findtext(".//DateOfSitting", default="Unknown")
    location = root.findtext(".//Location", default="Unknown")

    agenda_items = extract_agenda_items(root)

    debates = []
    
    for agenda_no, agenda_title, agenda_element in agenda_items:
        turn_number = 1  

        debate_type = determine_debate_type(agenda_title)  

        for speech in agenda_element.findall(".//Tale"):
            speaker_first_name = speech.findtext(".//OratorFirstName", default="")
            speaker_last_name = speech.findtext(".//OratorLastName", default="")
            speaker_name = f"{speaker_first_name} {speaker_last_name}".strip()
            speaker_party = speech.findtext(".//GroupNameShort", default="Unknown")
            speaker_role = speech.findtext(".//OratorRole", default="Unknown")

            utterances = [elem.text.strip() for elem in speech.findall(".//TekstGruppe//Exitus//Linea//Char") if elem.text]
            full_text = " ".join(utterances)

            time_match = TIME_PATTERN.search(full_text)
            time_value = time_match.group(0) if time_match else ""

            clean_text = TIME_PATTERN.sub("", full_text).strip()

            print(f"🔍 DEBUG: Assigning to Agenda {agenda_no} -> {agenda_title}, DebateType: {debate_type}, Turn {turn_number}: {clean_text[:100]}...")

            if clean_text:
                debates.append([
                    session_id, 
                    meeting_number, 
                    date_of_sitting, 
                    location,
                    agenda_no, 
                    agenda_title,
                    debate_type,  
                    turn_number,  
                    speaker_name, 
                    speaker_party, 
                    speaker_role, 
                    time_value, 
                    clean_text
                ])
                
                turn_number += 1  

    return debates

all_debates = []
for filename in os.listdir(XML_FOLDER):
    if filename.endswith(".xml"):
        file_path = os.path.join(XML_FOLDER, filename)
        debates = extract_debate_data(file_path)
        all_debates.extend(debates)

df = pd.DataFrame(all_debates, columns=[
    "SessionID", 
    "MeetingNumber", 
    "Date", 
    "Location",
    "AgendaItemNo", 
    "AgendaTitle",
    "DebateType",  
    "TurnNo",  
    "Speaker", 
    "Party", 
    "Role", 
    "Time", 
    "Utterance"
])

OUTPUT_CSV = os.path.join(OUTPUT_FOLDER, "debates_2020_2025.csv")

df.to_csv(OUTPUT_CSV, sep=";", index=False, encoding="utf-8")

print("✅ Debate data saved successfully!")

In [None]:
forhandling_rows = df[df["AgendaTitle"].str.contains("Forhandling", case=False, na=False)]
forhandling_rows

## Checking out topics 💫

In [None]:
# Read the file
import os
import pandas as pd

HOME_DIR = os.path.expanduser("~")

# Define output folder
OUTPUT_FOLDER = os.path.join(HOME_DIR, "Desktop", "AARHUS_UNIVERSITY", "kandidat", "thesis_work", "data_cleaning", "output")

# Define output CSV path
OUTPUT_CSV = os.path.join(OUTPUT_FOLDER, 
                          "debates_2009_2025.csv")

OUTPUT_CSV

# get file
df = pd.read_csv(OUTPUT_CSV, sep=";", encoding="utf-8")
df

In [None]:
print("\n📝 Unique Agenda Titles:")
for title in df['AgendaTitle'].unique():
    print(f"- {title}")


In [None]:
import re
import pandas as pd
from bertopic import BERTopic

# Define introductory phrases to remove
intro_phrases = [
    r"^\d+\.\ behandling af",  # Matches "1. behandling af", "2. behandling af", etc.
    r"^førstebehandling af", 
    r"^andenbehandling af", 
    r"^spørgsmål til", 
    r"^debat om", 
    r"^orientering om", 
    r"^lovforslag om",
    r"^forhandling af"
]

# Function to clean agenda titles
def clean_agenda_title(title):
    title = title.lower().strip()
    for pattern in intro_phrases:
        title = re.sub(pattern, "", title).strip()
    title = re.sub(r"\d+", "", title)  # Remove numbers
    title = re.sub(r":", "", title)  # Remove colons
    title = re.sub(r"\s+", " ", title).strip()  # Remove extra spaces
    return title

# Function to assign categories based on topic keywords (substring matching)
def categorize_topic(topic_words):
    for category, keywords in categories.items():
        for keyword in keywords:
            if any(keyword in word for word in topic_words):  # Substring match
                return category
    return "Other"

# Categories with keywords
categories = {
    "Agriculture": ["landbrug", "fødevarer", "farming", "husdyr", "kvæg", "gødning", "mejeri", "planteavl", "frugt", "grøntsager"],
    "Business": [ "handel", "virksomhed", "økonomi", "marked", "arbejdsmarked", "erhverv",
        "investering", "industri", "e-handel", "konkurrenceevne", "iværksætteri",
        "eksport", "import", "forsyningskæde", "detailhandel", "grossisthandel",
        "SMV", "forretningsstrategi", "forbrugerbeskyttelse", "selskabsskat",
        "arbejdsmarkedspolitik", "arbejdspladser", "virksomhedsdrift", "offentlig auktion", 
        "iværksætteri", "konkurrenceevne", "pelsavl", "mink", "producentansvar",
        "lokalhandel", "gig-økonomi", "turistindtægter", "landbrugspolitik",
        "platformsarbejde", "producentansvar", "konkurrencevilkår", "små- og mellemstore virksomheder", "bæredygtig detailhandel",
        "nettoprisindekset", "forretningsmodeller", "arbejdsmarkedsdynamik", "markedsregulering", "forbrugerrettigheder",
        "handelsaftaler", "eksportstrategier", "økonomisk globalisering", "markedstilpasning",
        "forretningsudvikling", "digital handel", "produktinnovation", "markedsanalyse", "konkurrencelovgivning",
        "Det Danske Klasselotteri A/S", "testkøbere", "erhverv", "industri", "handel", "virksomhed", "iværksætteri", "detailhandel", "økosystem", "produktion"],
    "Culture": [ "medie", "kultur", "kunst", "journalistik", "film", "tv", "reklame",
        "bøger", "censur", "musik", "teater", "radio", "nyhedsformidling",
        "digitalisering", "mediepolitik", "streamingtjenester",
        "aviser", "sociale medier", "trykte medier", "public service",
        "ytringsfrihed", "tv-produktion", "scenekunst", "litteratur", "spiludvikling",
        "tekstiler", "bogafgift", "filmstøtte", "lokalradio", "biblioteksudlån","spilindustri", "digitale medier", "kulturarv",
        "koranafbrændinger", "folkekirken", "religion", "folkekirken", "stormoskéer", "præstestillinger",
        "moskéer","religiøse", "Nyborg Slot", "trossamfund", "Det Islamiske Trossamfund", "islam", "kristendom", "kristne",
        "medieejerstruktur", "ytringsfrihedsdebatter", "kulturpolitik", "filmfinansiering",
        "journalistisk etik", "public service-regulering", "indholdsproduktion", "digital kultur", "kunststøtte",
        "bogmarkedet", "kulturel identitet", "biblioteksdrift", "digitalisering af medier", "opretholdelse af pressefrihed",
        "Kirkeministeriets", "hooliganer", "begrebet køn", "kultur", "medier", "kunst", "musik", "bøger", "scenekunst", "tv", "radio", "journalistik", "presse", "bibliotek"],
    "Defence": ["forsvar", "militær", "krig", "sikkerhedspolitik", "cybersikkerhed",
        "terrorbekæmpelse", "nationale sikkerhed", "efterretningstjeneste",
        "våbenhandel", "fredsbevaring", "militærstrategi", "grænsekontrol",
        "anti-terror", "overvågning", "beredskabsstyrken", "krigspolitik",
        "dronekrig", "forsvarsbudget", "sikkerhedssamarbejde",
        "strategisk forsvar", "hjemmeværnet", "våbenproduktion", "cybersikkerhed", "terror",
        "våbenforbud", "beredskabsstyring", "terrortrussel", "cyberangreb", "grænsesikring", "våbenkontrol",
        "militærnær industri", "efterretningstjenester", "fredsbevarende missioner",
        "forsvarspolitik", "national sikkerhed", "militære operationer", "cyberforsvar", "trusselsanalyse",
        "strategisk afskrækkelse", "overvågningsteknologier", "anti-terror strategier", "krigsforebyggelse", "trussel",
        "ekstremismebekæmpelse", "efterretningstjenester", "forsvarsindustri", "grænsesikkerhed", "fredsbevarende missioner", "forsvar", "militær", "sikkerhedspolitik", "krig", "nato", "våben", "efterretning", "cyberforsvar"],
    "Economy": ["økonomi", "budget", "skat", "finanslov", "afgifter", "kapital", "gæld",
        "fradrag", "beskatning", "arbejdsmarked", "inflation", "valutakurs", "rente",
        "aktie", "finanspolitik", "bank", "investering", "pensionsopsparing",
        "børs", "løn", "privatøkonomi", "statsfinanser", "økonomisk vækst",
        "moms", "konjunkturer", "udgiftslofter", "finansår", "økonomi", "grundvederlag", "budget", "finans",
        "årpenge", "partistøtte", "producentansvar",
        "kryptovaluta", "hvidvask", "skatteunddragelse", "inflationshjælp", "økonomisk stabilitet", "grøn skattepolitik", 
        "kapitalindkomst", "pensionsrefrm", "digitale betalingsmidler", "realkreditmarkedet",
        "kreditforeningen", "fribeløb", "revision", "ATP-bidrag", "andelsboligforeninger", "bogføring", "revisionsbestemmelse", 
        "værdiansættelsesprincipper", "finansiel stabilitet", "rentesatser", "kapitalforvaltning", "lønstruktur", "skattepolitik",
        "økonomiske incitamenter", "monetær politik", "økonomisk regulering", "centralbank",
        "pensionssystem", "valutamarkeder", "offentlige finanser", "arbejdsgiverafgifter", "skattekontrol", "beskatningssystem",
        "indtægt", "konkursloven", "afskrivningssatsen for bygninger", "ejendomskreditaftalerne", "afskrivningssatsen", "udlejningsejendomme",
        "ejendomsvurderinger", "økonomi", "skat", "budget", "finans", "afgifter", "inflation", "rente", "børs", "marked", "invester"],
    "Education": ["læretid", "lære", "folkeskole", "praktik", "uddannelse", "universitet", "skole", "elever", "lærer", "eksamen", "forskning",
        "pædagogik", "studier", "bachelor", "doktorgrad", "gymnasium",
        "videnskab", "skolepolitik", "uddannelsesinstitution", "forskermiljø",
        "vidensdeling", "lærlinge", "pædagog", "videnskabelig", "studenterpolitik",
        "klasseundervisning", "læringsmetoder", "studerende", "videnskab", "forskning", "uddannelsespolitik",
        "seksualundervisning", "sprogmodel", "uddannelsesloft", "ph.d.-ordning", "kompetenceløft", "digitale læremidler",
        "SU-fribeløb","klassetrin", "folkeskole", "praktikpladstaxametertilskud", "SU-lånemuligheder", "SU",
        "uddannelsesreformer", "skolepolitik", "studiefinansiering", "vidensøkonomi",
        "efteruddannelse", "skoleintegration", "universitetsakkreditering", "elevernes trivsel", "forskningstilskud",
        "akademiske normer", "ph.d.-uddannelse", "studievejledning", "erhvervsuddannelser", "livslang læring", "ph.d",
        "udflytningen og nedskaleringen af studiepladser", "studiepladser", "studieplads","uddannelse", "universitet", "skole", "lærer", "elever", "eksamen", "folkeskole", "phd", "forskning", "studie"],
    "Environment and Energy": ["golfstrømmen", "energi", "klimaforandringer", "kulstof", "vindmøller", "solceller",
        "energipolitik", "grøn omstilling", "biobrændsel", "CO2-afgift",
        "olie", "naturgas", "atomkraft", "bæredygtig energi", "fossilfri",
        "energirenovering", "energisektor", "energiforbrug", "varmeforsyning",
        "brintteknologi", "energilagring", "vedvarende energikilder",
        "netto-nul-emission", "kernekraft", "energipolitik", "CO2-afgift", "industriområde", "industri", 
        "energiproduktion", "elforsyningsloven", "bioenergi", "brintstrategi", "CO2-lagring", "fjernvarmenet","grøn industri", 
        "energikapacitet", "elafgift","bæredygtig energiforsyning", "industriel udvikling", "grøn energiomstilling", "energimarkedet",
        "ressourceforvaltning", "kulstofaftryk", "emissionspolitik", "energinetværk", "fossilfri fremtid",
        "energiinnovation", "klimainitiativer", "industriel automatisering", "produktionskapacitet", "energieffektivisering", "brint", "emission", "klima", "emission", "miljø", "bæredygtighed", "drivhusgas", "nulemissionszone",
        "forurening", "biodiversitet", "vedvarende energi", "genbrug", "genanvendelse",
        "klimapolitik", "naturskydd", "naturgenopretning", "affaldshåndtering",
        "grøn omstilling", "co2", "fossilfri", "skovrejsning", "havmiljø", "økosystemer", "røggasvand", 
        "pesticider", "PFAS", "havvindmøllepark", "grøn", "randzoner", "tørke", "muslingeopdræt", "naturnationalparker", "parker",
        "biodynamisk", "kvælstofudledning", "vådområder", "havplan", "vådområder", "klimarisiko", "jordforurening", "genopretning af natur", 
        "bæredygtigt byggeri", "grøn finansiering", "resiliens over for klimaforandringer", "havbrug", "vådområder", "stormfloder", "økomord", 
        "naturnationalpark", "drikkevandsdirektivet", "plantebeskyttelsesmidler", "råstofstrategi",
        "udfasning af oksekød og lammekød i offentlige kantiner",
        "bæredygtige løsninger", "klimatilpasning", "bæredygtig udvikling", "naturbeskyttelse", "regnskove",
        "kulstofneutralitet", "bæredygtig landbrug", "øko-byer", "genopretning af vådområder", "klimakrise", "affaldsreduktion",
        "klimaaftaler", "parisaftalen", "klimamodstand", "naturkapital", "genbrugssystemer", "plastikforurening",
        "marine", "Plantebaserede Fødevarer","brystbenet hos æglæggende høns", "Om kvæg på græs",
        "bomtrawl", "Golfstrømmens mulige kollaps", "bæredygtige og avancerede biobrændstoffer", "hegningspligt", "miljø", "bæredygtighed", "forurening", "biodiversitet", "naturbeskyttelse", "naturnationalpark", "co2", "affald"],
    #"European Integration": ["eu", "brexit", "schengen", "europa", "euro", "kommission", "parlament"],
    "Foreign Affairs": ["eu", "brexit", "schengen", "europa", "euro", "kommission", "parlament", "eu", "udenrigs", "international", "traktat", "geopolitik",
        "handelstraktater", "diplomati", "udenrigspolitik", "nato",
        "sikkerhedspolitik", "FN", "globalisering", "menneskerettigheder",
        "asylpolitik", "internationale relationer", "embargo", "udenrigshandel",
        "grænsepolitik", "international bistand", "militære alliancer",
        "diplomatiske forbindelser", "udenrigsministeriet","Rusland", "Belarus", "NATO",
        "Ukraine", "Tysklandsstrategi", "udviklingssamarbejdet", "nordiske samarbejde",
        "strategisk autonomi", "cirkulær migration", "udviklingsmidler", "geopolitisk stabilitet",
        "grænseoverskridende samarbejde", "sanktioner", "de lande, som Danmark samarbejder med", "retsstatsprincippet",
        "Gaza til listen over konfliktområder", "Gaza","rigsfællesskabet", "regionale og kommunale hverv",
        "udviklingsbistand til de palæstinensiske selvstyreområder", "efterretningsmateriale fra Afghanistan",
        "geopolitisk strategi", "udenrigshandelspolitik", "udenrigssikkerhed", "diplomatisk samarbejde",
        "mellemstatslige aftaler", "internationale organisationer", "global økonomi", "menneskerettighedsovervågning",
        "konfliktløsning", "sanktioner", "ekspatpolitik", "udenrigsministeriet", "udenrigsøkonomi", "geopolitisk konkurrence",
        "passikkerhed","Schengensamarbejdet", "Israel-Palæstina-konflikten", "samarbejdet i Arktis", "Schengeninformationssystemet",
        "Schengen", "Danmarks ratifikation af protokoller om Finlands og Sveriges tiltrædelse", "Vestbredden""udenrigspolitik", "diplomati", "sikkerhed", "udviklingsbistand", "grænsekontrol", "geopolitik", "ambassade"],
    "Health Care": ["sundhed", "pleje", "hospital", "epidemi", "trivsel", "vaccination", "læger",
        "medicin", "sygdomsbekæmpelse", "patienter", "sygehus", "forebyggelse",
        "sygesikring", "mental sundhed", "sygdom", "sygepleje", "fysioterapi",
        "apotek", "lægemiddel", "hospitalsbehandling", "ældrepleje", "plejehjem",
        "behandlingsgaranti", "sundhedssektor",  "fri abort", "abort", "patientklager", "mental sundhed", "mental",
        "jordemødre", "fødsler", "fødsel", "stofbrugere", "stofmisbrug", "socialt frikort",
        "flergangsfødende", "psykiatriplan", "lægemiddelovervågning", "senfølger", "sygdomsregistrering", 
        "ufrivilligt barnløse", "tryghed for kvinder", "frisørers og bioanalytikeres skader", "befrugtede og ubefrugtede æg",
        "rugemoderskab", "misbrugsbehandling", "psykologhjælp", "dødshjælp", "kønsskiftebehandling","ordblindhed",
        "nikotinholdige", "smittetrykket", "videncenter for alternativ behandling", "organdonor", "lægedækning",
        "dødelighed", "sundhedsteknologi", "epidemisk beredskab", "sundhedsforvaltning", "medicinske fremskridt",
        "offentlige sundhedsordninger", "sygeforsikring", "kræftbehandling", "sundhedskompetence",
        "psykisk sygdom", "lægemiddeludvikling", "handicapkompensation", "plejesektor", "ergoterapi", "demenspleje",
        "opioider", "nødlicenser til vacciner", "vaccine", "kræftfremkaldende", "kræft", "endometriose","covid-19",
        "patientklagesystemet", "psykologbehandling", "psykiatrien", "lægemangel", "Psykolognævnet",
        "fertilitetsbehandling", "menstruation", "levende donor", "næsespray", "sundhed", "hospital", "læger", "medicin", "psykiatri", "sygdom", "vaccination", "behandling", "sygepleje"],
    "Housing": ["bolig", "leje", "ejendom", "byggeri", "real estate", "husleje", "ejerbolig"],
    "Immigration": ["udlænding", "integration",
        "arbejdsintegration", "flygtning", "indvandring", "indfødsrets", "indfødsret",
        "opholdstilladelse", "statsborgerskab", "naturalisation", 
        "udlændingeloven", "asyl", "grænsepolitik", 
        "migrantarbejde", "familiesammenføring", "borgerskab", 
        "opholdsstatus", "indrejseforbud", "deportation",
        "migrantkrise", "cirkulær migration", "medborgerskab",
        "statsborgerretskonventionen", "statsløse", "udvisning", "ghettoområder", "udenlandske tiggere", "dansk indfødsret", "indfødsretsprøven",
        "indvandringspolitik", "flygtningekriser", "asylansøgningsprocesser", "arbejdsmigrationsregler",
        "indfødsretsprøve", "statsborgerskabslovgivning", "integrationstiltag", "flersprogethed", "migrantbeskyttelse",
        "arbejdsmarkedsintegration", "grænsekontrol", "menneskehandel", "opholdstilladelser", "udvisningsregler", "naturalisation","indfødsretslovforslag",
        "Udrejsecenter", "udrejsecenter", "udrejsecentre", "hjemløshed", " udrejsecenter i Afrika", "ghettolister", "ghetto",
        "udenlandske ægtefæller", "vende tilbage til for syriske statsborgere", "indvandring", "integration", "asyl", "opholdstilladelse", "statsborgerskab", "naturalisation", "migrant"],
    "Justice": ["retsvæsen", "lovgivning", "domstole", "kriminalitet", "strafferet", "politi", "retssag", "anklager", "dom",  "domstol", "retsvæsen", "politi", "strafferet", "voldtægt", "dommer",
        "retssikkerhed", "kriminalitet", "lovgivning", "retshåndhævelse",
        "domstolsbehandling", "retskendelse", "anklager", "forsvarer", "retsmøde",
        "afsoning", "kriminel", "lovovertrædelse", "retspleje", "retssag",
        "menneskerettigheder", "varetægtsfængsling", "anklagemyndighed", "retsinstans", 
        "retsforskrifter", "varetægtsarrestanter", "kriminalisering", "psykisk vold", "bødestørrelser", "straffuldbyrdelse",
        "grooming", "stalking", "chikanesager", "chikane", "angreb med genstand på personer", "retsopgør","whistleblower",
        "tvangsfjernelser", "elektronisk overvågning", 
        "retsopgør", "whistleblower", "tvangsfjernelser", "elektronisk overvågning", "retsforfølgning", 
        "retsmægling", "anklagemyndighedens kompetence", "retslægeråd", "bødeudmåling", "kriminologisk forskning",
        "dansk statsborger", "straffesagskæden", "bandegrupperinger",
        "fængselsstraf", "minimumsstraffe","strafforfølgning", "lovtidende", "vold", "chikaneparagraffen", "Dna-profilregister",
        "forsætligt drab på flere personer og gravide",
        "retsinstanser", "anklage", "tiltalefrafald", "retspraksis", "bøde", "lovovertræder", "afstraffelse",
        "retshåndhævende myndigheder", "retsmedicinsk", "kriminalteknisk", "retsstat", "lovbrud", "myndighedsmisbrug",
        "domsafsigelse", "retsmidler", "politimyndigheder", "anholdelse", "fængselsvæsen", "retsplejeloven",
        "kvinders selvbestemmelse over egne kønsceller", "pebersprayordningen",
        "skærpet straf", "prøveløsladelse", "fængsel", "livstidsdømte", "livstidsdømt", "straffesager",
        "strafbart", "straf", "partnerdrabskommission", "offentlige straffeattester", "straffeattest", "løsladelsespraksis ved forbrydelser"],
    "Labour": ["arbejdsmarked", "løn", "arbejdsløshed", "fagforening", "overenskomst", "ansættelse", "strejke"],
    "Local and Regional Affairs": ["kommune", "region", "byråd", "lokalpolitik", "planlægning", "udviklingsstrategi"],
    "Social Affairs": ["børn", "familie", "pension", "arbejdsløshed", "handicap", "velfærd",
        "ældrepleje", "socialpolitik", "sociale ydelser", "boligsikring",
        "dagpenge", "kontanthjælp", "førtidspension", "børnetilskud",
        "udsatte grupper", "boligpolitik", "børnepasning",
        "familiepolitik", "fødevarehjælp", "socialrådgivning", "udsatte børn", "ligestillingspolitik", "minimumsnormering", 
        "ligestilling", "barnets lov", "arbejdslivet", "arbejdsliv",
        "ensomhedsstrategi", "anbringelsesreform", "socialøkonomi", "flerbørnsydelse", "barnets lov", "anbringelsesområdet", "ældreområdet",
        "jobparate ledige", "sexarbejdere", "prostitution", "adopteret",
        "velfærdsstatens fremtid", "social mobilitet", "fattigdomsbekæmpelse", "arbejdsløshedsunderstøttelse",
        "social inklusion", "socialt boligbyggeri", "socialrådgivning", "mindstelønspolitik", "boligstøtte",
        "handicaprettigheder", "børnefattigdom", "ligestilling på arbejdsmarkedet", "anbringelsesområdet", "udsatte grupper",
        "indsatsen mod hjemløshed", "hjemløse","hjemløs", "negativ social kontrol", "social kontrol", "socialområdet",
        "lotterier og liberalisering af landbaseret bingo", "pasning af eget barn", "jobrettet indsats for unge", "LGBTI-personer mod forskelsbehandling", 
        "prostituerede", "udslusningsboliger", "ældrelov", "ældretilsyn", "Hurtig jobstart for kandidatdimittender",
        "Center for Boligsocial Udvikling", "forlænge barslen", "barsel", "socialområdet", "velfærd", "pension", "dagpenge", "socialpolitik", "handicap", "ældrepleje", "børnepasning", "kontanthjælp"],
    "Infrastructure": ["transport", "jernbane", "vej", "motorvej", "metro", "cykelsti",
        "offentlig transport", "trafik", "havne", "bro", "lufthavn",
        "jernbanenet", "tog", "biltrafik", "vejafgifter", "busdrift",
        "trafiksikkerhed", "infrastruktur", "færger", "parkering", "mobilitet",
        "vejvedligeholdelse", "trafikregulering", "rute", "vejafgifter", "jernbanenet", "cykelstier",
        "partybusser", "motorområdet", "banelov", "havneinfrastruktur", "jernbanesikkerhed", "luftfartsområdet",
        "sensorbaseret trafikregulering", "letbane", "styrthjelm", "knallert", "motorcykler",
        "mobilbaseret varslingssystem", "luftfart","luftfartøj", "Bygningsfornyelse", "styrthjelm", "taxiloven",
        "ombygning af Aarhus H", "luftfartsstrategi", "skiltningsreglerne", "MitID og NemLog-in",
        "mobilitetsplanlægning", "offentlig transportreform", "vejrenovering", "jernbanekapacitet",
        "lufthavnsinfrastruktur", "bilafgifter", "byplanlægning", "byudvikling", "trafikovervågning",
        "bæredygtig transport", "elbilopladning", "cykelinfrastruktur", "mobilitetspolitik", "byrum",
        "bedre afvikling af køre- og teoriprøver", "digital selvbetjening","Limfjordsforbindelse",
        "digital postløsning", "færgebetjening", "transport", "vej", "jernbane", "metro", "cykelsti", "cykelstier", "it", "fiber", "internet", "mobilnet"],
    "Territories": ["grønland", "færøerne", "territorium", "selvstyre", "arktisk", "riksfællesskab"],
    "Other": ["Meddelelser fra formanden", "Punkt 0", "offentlighed", "lovtidende", "meta-politik"],
    "Elections & Parliamentary Processes": ["valg", "folketing", "partilederdebat", "regeringsdannelse",
        "stemmeret", "parlament", "lovgivning", "vælgere", "demokrati",
        "valgkamp", "politiske partier", "folkeafstemning", "regeringsforhandling",
        "stemmesystem", "politisk debat", "regeringspolitik",
        "politiske reformer", "lovgivningsproces", "koalition", "mandatfordeling", 
        "valglov", "stemmesystem", "folketing", "statsministerens åbningsredegørelse", "flagning", "spørgetid", "spørgetime",
        "partiskift", "ministeransvarlighed", "valgbureaukrati",
        "meddelelse af orlov","lovtidende", "offentlighedslovens bestemmelser",
        "borgmestres og regionsformænds ekstraindtægter fra bestyrelsesarbejde",
        "valgstrategier", "demokratiske principper", "lovgivningsprocesser", "politikudvikling",
        "parlamentarisk debat", "folketingets beføjelser", "regeringsaftaler", "partisystemer",
        "stemmedeltagelse", "valgprognoser", "koalitionsdannelser", "offentlighedsloven", "politikertroværdighed",
        "øget offentlighed ved Rigsrettens offentlige forhandlinger", "magtudredningen",
        "genskabe slettede sms"]
}

# Cleaning the AgendaTitle
df["CleanAgendaTitle"] = df["AgendaTitle"].apply(clean_agenda_title)

# Train BERTopic
topic_model = BERTopic(language="danish")  # Uses Danish embeddings
topics, probs = topic_model.fit_transform(df['CleanAgendaTitle'])

# Assign BERTopic topics
df["BERTopic_Topic"] = topics
df["BERTopic_Keywords"] = df["BERTopic_Topic"].apply(lambda t: topic_model.get_topic(t))
df["Assigned_Category"] = df["BERTopic_Keywords"].apply(lambda words: categorize_topic([w[0] for w in words] if words else []))

# Create unique identifier for each agenda topic on a given date
df["AgendaTitleDateID"] = df["AgendaTitle"].astype(str) + "_" + df["Date"].astype(str)


# Resolve conflicts - Assign the most frequent category per unique debate
most_common_category = (
    df.groupby("AgendaTitleDateID")["Assigned_Category"]
    .agg(lambda x: x.mode()[0] if not x.mode().empty else "Other")  # Get most frequent category
    .reset_index()
)

# Merge back into original df
df = df.drop(columns=["Assigned_Category"]).merge(most_common_category, on="AgendaTitleDateID", how="left")

# Print results
print(df[["AgendaTitle", "BERTopic_Topic", "Assigned_Category"]])


In [None]:
# VERIFICATION
import pandas as pd

# Ensuring `AgendaTitleDateID` exists
if "AgendaTitleDateID" not in df.columns or "Assigned_Category" not in df.columns:
    raise ValueError("❌ The required columns 'AgendaTitleDateID' and 'Assigned_Category' are missing from the DF!")

# Counting unique Assigned_Category values per AgendaTitleDateID
agenda_category_counts = df.groupby("AgendaTitleDateID")["Assigned_Category"].nunique()

# Identifying any conflicts (cases where more than 1 unique category exists)
conflicts = agenda_category_counts[agenda_category_counts > 1]

# OUtput results
if conflicts.empty:
    print("✅ All AgendaTitleDateIDs have only one unique Assigned_Category.")
else:
    print("⚠️ Some AgendaTitleDateIDs are associated with multiple Assigned_Categories.")
    print("🔍 Conflicting Entries:")
    print(df[df["AgendaTitleDateID"].isin(conflicts.index)])

# Lets check occurrences of each unique category in 'Assigned_Category'
print(df['Assigned_Category'].unique())

df['Assigned_Category'].value_counts()



In [None]:
# Save output
OUTPUT_FOLDER_versions = os.path.join(OUTPUT_FOLDER, "versions")
os.makedirs(OUTPUT_FOLDER_versions, exist_ok=True)


In [None]:
# READINng in TOPIC DF
import os
import pandas as pd

HOME_DIR = os.path.expanduser("~")

OUTPUT_FOLDER = os.path.join(HOME_DIR, "Desktop", "AARHUS_UNIVERSITY", "kandidat", "thesis_work", "data_cleaning", "output")

OUTPUT_FOLDER_versions = os.path.join(OUTPUT_FOLDER, "versions")

# Define output folder
OUTPUT_CSV_t = os.path.join(OUTPUT_FOLDER_versions, "debates_2009_2025_w_topicmod_s.csv")

# get file
df = pd.read_csv(OUTPUT_CSV_t, sep=";", encoding="utf-8")
df

Unnamed: 0,SessionID,MeetingNumber,Date,Location,AgendaItemNo,AgendaTitle,DebateType,TurnNo,Speaker,Party,Role,Time,Utterance,CleanAgendaTitle,BERTopic_Topic,BERTopic_Keywords,AgendaTitleDateID,Assigned_Category
0,20121,70,2013-03-15T10:00:00,Folketingssalen,0,Punkt 0,other,1,Mogens Lykketoft,S,formand,,Mødet er åbnet.,punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2013-03-15T10:00:00,Other
1,20121,70,2013-03-15T10:00:00,Folketingssalen,1,1. behandling af L 157: Om ændret organisering...,reading of bill,1,Mogens Lykketoft,S,formand,,Forhandlingen er åbnet. Hr. Jan E. Jørgensen s...,l om ændret organisering af statsforvaltninger...,661,"[('statsforvaltningerne', 0.3566967119848391),...",1. behandling af L 157: Om ændret organisering...,Other
2,20121,70,2013-03-15T10:00:00,Folketingssalen,1,1. behandling af L 157: Om ændret organisering...,reading of bill,2,Jan E. Jørgensen,V,medlem,,"Det lovforslag, vi skal behandle nu, er en udm...",l om ændret organisering af statsforvaltninger...,661,"[('statsforvaltningerne', 0.3566967119848391),...",1. behandling af L 157: Om ændret organisering...,Other
3,20121,70,2013-03-15T10:00:00,Folketingssalen,1,1. behandling af L 157: Om ændret organisering...,reading of bill,3,Mogens Lykketoft,S,formand,,Tak til Venstres ordfører. Så er det hr. Simon...,l om ændret organisering af statsforvaltninger...,661,"[('statsforvaltningerne', 0.3566967119848391),...",1. behandling af L 157: Om ændret organisering...,Other
4,20121,70,2013-03-15T10:00:00,Folketingssalen,1,1. behandling af L 157: Om ændret organisering...,reading of bill,4,Simon Kollerup,S,medlem,,"Siden kommunalreformen blev indført i 2007, ha...",l om ændret organisering af statsforvaltninger...,661,"[('statsforvaltningerne', 0.3566967119848391),...",1. behandling af L 157: Om ændret organisering...,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764424,20211,95,2022-04-22T10:00:00,Folketingssalen,0,Punkt 0,other,2,MødeSlut MødeSlut,MødeSlut,MødeSlut,(Kl. 15:34),Mødet er hævet. .,punkt,-1,"[('indfødsrets', 0.005173124539146184), ('medd...",Punkt 0_2022-04-22T10:00:00,Other
764425,20211,80,2022-03-29T13:00:00,Folketingssalen,0,Punkt 0,other,1,Henrik Dam Kristensen,S,formand,,Mødet er åbnet.,punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2022-03-29T13:00:00,Other
764426,20211,80,2022-03-29T13:00:00,Folketingssalen,1,Meddelelser fra formanden,other,1,Henrik Dam Kristensen,S,formand,,I dag er der følgende anmeldelser: Erhvervsmin...,meddelelser fra formanden,4602,"[('formanden', 1.1242230993492057), ('meddelel...",Meddelelser fra formanden_2022-03-29T13:00:00,Other
764427,20211,80,2022-03-29T13:00:00,Folketingssalen,0,Punkt 0,other,1,Henrik Dam Kristensen,S,formand,,"Så er der ikke mere at foretage i dette møde, ...",punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2022-03-29T13:00:00,Other


In [None]:
# Convert to datetime if not already
df['Date'] = pd.to_datetime(df['Date'])

# Get min and max date
date_min = df['Date'].min()
date_max = df['Date'].max()

print(f"Date range: {date_min} to {date_max}")

Date range: 2009-10-06 12:00:00 to 2025-02-21 09:00:00


In [3]:
# How many utterances per type?
print(df['Assigned_Category'].value_counts())

# Ensure MeetingDateID uniquely identifies each meeting across years
df['MeetingDateID'] = df['MeetingNumber'].astype(str) + "_" + df['Date'].astype(str)



Assigned_Category
Other                                  145998
Elections & Parliamentary Processes    137748
Economy                                 70087
Foreign Affairs                         53302
Health Care                             50831
Business                                47685
Education                               35769
Immigration                             35750
Environment and Energy                  35281
Social Affairs                          35083
Justice                                 26708
Infrastructure                          26691
Culture                                 22619
Defence                                 18122
Housing                                  7473
Local and Regional Affairs               7016
Agriculture                              5885
Labour                                   1723
Territories                               658
Name: count, dtype: int64


In [4]:
# Create a unique identifier for each agenda topic on a given date
df['AgendaTitleDateID'] = df['AgendaTitle'].astype(str) + "_" + df['Date'].astype(str)

# Verify if any AgendaTitleDateID has multiple AgendaCategories
agenda_category_counts = df.groupby('AgendaTitleDateID')['Assigned_Category'].nunique()

# Find cases where the same AgendaTitleDateID has more than one unique category
conflicts = agenda_category_counts[agenda_category_counts > 1]

if conflicts.empty:
    print("✅ All AgendaTitleDateIDs have only one unique Assigned_Category.")
else:
    print("⚠️ Some AgendaTitleDateIDs are associated with multiple AgendaCategories.")
    print(df[df['AgendaTitleDateID'].isin(conflicts.index)])

# Count unique meetings per AgendaCategory
meeting_counts = df[['AgendaTitleDateID',  # Could call it DebateI
                     'Assigned_Category']].drop_duplicates().groupby('Assigned_Category').size().reset_index(name="ID Count")
meeting_counts

✅ All AgendaTitleDateIDs have only one unique Assigned_Category.


Unnamed: 0,Assigned_Category,ID Count
0,Agriculture,127
1,Business,1530
2,Culture,553
3,Defence,367
4,Economy,1727
5,Education,925
6,Elections & Parliamentary Processes,1022
7,Environment and Energy,972
8,Foreign Affairs,894
9,Health Care,1325


In [None]:
# Sort to check
df = df.sort_values(by=["Date", "AgendaItemNo", "TurnNo"], ascending=[True, True, True])
df

Unnamed: 0,SessionID,MeetingNumber,Date,Location,AgendaItemNo,AgendaTitle,DebateType,TurnNo,Speaker,Party,Role,Time,Utterance,CleanAgendaTitle,BERTopic_Topic,BERTopic_Keywords,AgendaTitleDateID,Assigned_Category,MeetingDateID
743741,20091,1,2009-10-06 12:00:00,Folketingssalen,0,Punkt 0,other,1,Niels Helveg Petersen,RV,aldersformanden,,Mødet er åbnet. I henhold til grundloven er Fo...,punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2009-10-06 12:00:00,Other,1_2009-10-06 12:00:00
743742,20091,1,2009-10-06 12:00:00,Folketingssalen,0,Punkt 0,other,2,Thor Pedersen,V,formand,,"Jeg vil gerne takke for den tillid, som Tinget...",punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2009-10-06 12:00:00,Other,1_2009-10-06 12:00:00
743743,20091,1,2009-10-06 12:00:00,Folketingssalen,0,Punkt 0,other,3,Lars Løkke Rasmussen,,minister,,Danmark er et godt land at leve i. Vi har bygg...,punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2009-10-06 12:00:00,Other,1_2009-10-06 12:00:00
743744,20091,1,2009-10-06 12:00:00,Folketingssalen,0,Punkt 0,other,4,Thor Pedersen,V,formand,,Tak til statsministeren. Jeg kan oplyse Folket...,punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2009-10-06 12:00:00,Other,1_2009-10-06 12:00:00
743745,20091,1,2009-10-06 12:00:00,Folketingssalen,0,Punkt 0,other,5,MødeSlut MødeSlut,MødeSlut,MødeSlut,(Kl. 12:51),Mødet er hævet. .,punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2009-10-06 12:00:00,Other,1_2009-10-06 12:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16029,20241,58,2025-02-21 09:00:00,Folketingssalen,5,1. behandling af L 135: Om permanent ordning m...,reading of bill,94,Per Larsen,KF,medlem,,(Talen er under udarbejdelse),l om permanent ordning med medicinsk cannabis ...,1871,"[('notifikation', 0.17859902307522937), ('komm...",1. behandling af L 135: Om permanent ordning m...,Foreign Affairs,58_2025-02-21 09:00:00
16030,20241,58,2025-02-21 09:00:00,Folketingssalen,5,1. behandling af L 135: Om permanent ordning m...,reading of bill,95,Søren Gade,V,formand,,(Talen er under udarbejdelse),l om permanent ordning med medicinsk cannabis ...,1871,"[('notifikation', 0.17859902307522937), ('komm...",1. behandling af L 135: Om permanent ordning m...,Foreign Affairs,58_2025-02-21 09:00:00
16031,20241,58,2025-02-21 09:00:00,Folketingssalen,5,1. behandling af L 135: Om permanent ordning m...,reading of bill,96,Peder Hvelplund,EL,medlem,,(Talen er under udarbejdelse),l om permanent ordning med medicinsk cannabis ...,1871,"[('notifikation', 0.17859902307522937), ('komm...",1. behandling af L 135: Om permanent ordning m...,Foreign Affairs,58_2025-02-21 09:00:00
16032,20241,58,2025-02-21 09:00:00,Folketingssalen,5,1. behandling af L 135: Om permanent ordning m...,reading of bill,97,Søren Gade,V,formand,,(Talen er under udarbejdelse),l om permanent ordning med medicinsk cannabis ...,1871,"[('notifikation', 0.17859902307522937), ('komm...",1. behandling af L 135: Om permanent ordning m...,Foreign Affairs,58_2025-02-21 09:00:00


In [6]:
# To check which are left in "Other" category
print("\n📝 Remaining 'Other' Agenda Titles:")
other_titles = df[df['Assigned_Category'] == "Other"]['AgendaTitle'].unique()
for title in other_titles:
    print(f"- {title}")



📝 Remaining 'Other' Agenda Titles:
- Punkt 0
- 1) Spørgsmål om meddelelse af orlov til og indkaldelse af stedfortræder.
- Forhandling af R 1: Om statsministerens åbningsredegørelse.
- Fremme af F 1: Om mangel på arbejdskraft og sikring af velstand.
- 1. behandling af L 21: Om »farm out« og Nordsøfonden.
- Spørgsmål om meddelelse af orlov til og indkaldelse af stedfortræder for
- 1. behandling af L 6: Om forenkling af procedure ved udenlandske autorisationsansøgninger.
- 1. behandling af L 33: Om sommerhuse og campering m.v.
- 1. behandling af L 32: Om udstykning og anden registrering i matriklen m.m.
- Fremme af F 8: Om modeller i modelbranchen.
- 1. behandling af L 28: Om hasardspil i turneringsform.
- 1. behandling af L 40: Om bemyndigelse til indførelse af frivillige ordninger m.v.
- Fremme af F 10: Om hævdelse og udvikling af det danske sprog.
- 1. behandling af L 12: Om registrering af ledningsejere.
- 1. behandling af B 17: Om ændring af den kommunale styrelseslov.
- 1. behandli

In [7]:
# Rename
df.rename(columns={"Assigned_Category": "AgendaCategory"}, inplace=True)
df

Unnamed: 0,SessionID,MeetingNumber,Date,Location,AgendaItemNo,AgendaTitle,DebateType,TurnNo,Speaker,Party,Role,Time,Utterance,CleanAgendaTitle,BERTopic_Topic,BERTopic_Keywords,AgendaTitleDateID,AgendaCategory,MeetingDateID
743741,20091,1,2009-10-06 12:00:00,Folketingssalen,0,Punkt 0,other,1,Niels Helveg Petersen,RV,aldersformanden,,Mødet er åbnet. I henhold til grundloven er Fo...,punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2009-10-06 12:00:00,Other,1_2009-10-06 12:00:00
743742,20091,1,2009-10-06 12:00:00,Folketingssalen,0,Punkt 0,other,2,Thor Pedersen,V,formand,,"Jeg vil gerne takke for den tillid, som Tinget...",punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2009-10-06 12:00:00,Other,1_2009-10-06 12:00:00
743743,20091,1,2009-10-06 12:00:00,Folketingssalen,0,Punkt 0,other,3,Lars Løkke Rasmussen,,minister,,Danmark er et godt land at leve i. Vi har bygg...,punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2009-10-06 12:00:00,Other,1_2009-10-06 12:00:00
743744,20091,1,2009-10-06 12:00:00,Folketingssalen,0,Punkt 0,other,4,Thor Pedersen,V,formand,,Tak til statsministeren. Jeg kan oplyse Folket...,punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2009-10-06 12:00:00,Other,1_2009-10-06 12:00:00
743745,20091,1,2009-10-06 12:00:00,Folketingssalen,0,Punkt 0,other,5,MødeSlut MødeSlut,MødeSlut,MødeSlut,(Kl. 12:51),Mødet er hævet. .,punkt,4,"[('punkt', 0.1416696397766679), ('', 1e-05), (...",Punkt 0_2009-10-06 12:00:00,Other,1_2009-10-06 12:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16029,20241,58,2025-02-21 09:00:00,Folketingssalen,5,1. behandling af L 135: Om permanent ordning m...,reading of bill,94,Per Larsen,KF,medlem,,(Talen er under udarbejdelse),l om permanent ordning med medicinsk cannabis ...,1871,"[('notifikation', 0.17859902307522937), ('komm...",1. behandling af L 135: Om permanent ordning m...,Foreign Affairs,58_2025-02-21 09:00:00
16030,20241,58,2025-02-21 09:00:00,Folketingssalen,5,1. behandling af L 135: Om permanent ordning m...,reading of bill,95,Søren Gade,V,formand,,(Talen er under udarbejdelse),l om permanent ordning med medicinsk cannabis ...,1871,"[('notifikation', 0.17859902307522937), ('komm...",1. behandling af L 135: Om permanent ordning m...,Foreign Affairs,58_2025-02-21 09:00:00
16031,20241,58,2025-02-21 09:00:00,Folketingssalen,5,1. behandling af L 135: Om permanent ordning m...,reading of bill,96,Peder Hvelplund,EL,medlem,,(Talen er under udarbejdelse),l om permanent ordning med medicinsk cannabis ...,1871,"[('notifikation', 0.17859902307522937), ('komm...",1. behandling af L 135: Om permanent ordning m...,Foreign Affairs,58_2025-02-21 09:00:00
16032,20241,58,2025-02-21 09:00:00,Folketingssalen,5,1. behandling af L 135: Om permanent ordning m...,reading of bill,97,Søren Gade,V,formand,,(Talen er under udarbejdelse),l om permanent ordning med medicinsk cannabis ...,1871,"[('notifikation', 0.17859902307522937), ('komm...",1. behandling af L 135: Om permanent ordning m...,Foreign Affairs,58_2025-02-21 09:00:00


## Remaining preprocessing

In [None]:
import re
import pandas as pd

# Ensuring data is sorted properly
df_preprocessed = df.sort_values(by=["Date", "AgendaTitleDateID", "TurnNo"])

# Creating lagged columns for Formand's previous statements
df_preprocessed["PrevUtterance"] = df_preprocessed["Utterance"].shift(1).str.lower()
df_preprocessed["PrevSpeaker"] = df_preprocessed["Speaker"].shift(1)

# Initializing TurnRole column
df_preprocessed["TurnRole"] = "unknown"

# 1.1: Assigning "chair" to all Formand entries
df_preprocessed.loc[df_preprocessed["Role"].str.lower().str.contains("formand", na=False), "TurnRole"] = "chair"

# 1.2: Assiging "minister" to all minister entries
df_preprocessed.loc[df_preprocessed["Role"].str.lower().str.contains("minister", na=False), "TurnRole"] = "minister"

# 2: Assigning "asker" and "proponent" based on explicit mentions in Formand’s utterance
df_preprocessed.loc[
    (df_preprocessed["PrevUtterance"].str.contains("spørgeren", na=False, regex=True)) &
    (~df_preprocessed["Role"].str.lower().str.contains("formand", na=False)),
    "TurnRole"
] = "asker"

df_preprocessed.loc[
    (df_preprocessed["PrevUtterance"].str.contains("ordføreren", na=False, regex=True)) &
    (~df_preprocessed["Role"].str.lower().str.contains("formand", na=False)),
    "TurnRole"
] = "proponent"

# Extract first proponent if introduced in Formand’s utterance ---
first_proponent = None

for index, row in df_preprocessed.iterrows():
    if row["TurnRole"] == "chair":
        match = re.search(
            r"(?:er|den første, der får ordet, er|næste ordfører er)\s+(?:hr\.|fru)\s+([\w\s\-]+?)\s+[,]", 
            row["Utterance"], 
            re.IGNORECASE
        )
        if match:
            first_proponent = match.group(1).strip()
            print(f"🔍 Extracted first proponent: {first_proponent}")

    # Assign first_proponent role
    if first_proponent and row["Speaker"] == first_proponent and row["TurnRole"] == "unknown":
        df_preprocessed.at[index, "TurnRole"] = "proponent"
        print(f"✅ First proponent assigned: {row['Speaker']} (Row {index})")
        first_proponent = None  # Reset

# Detect new askers (including "korte bemærkninger") ---
new_asker = None

for index, row in df_preprocessed.iterrows():
    if row["TurnRole"] == "chair":
        # Look for new askers from different patterns
        asker_match = re.search(r"så er det\s+(?:hr\.|fru)\s+([\w\s\-]+)", row["Utterance"], re.IGNORECASE)
        bemærkning_match = re.search(r"bemærkning.*?(?:hr\.|fru)\s+([\w\s\-]+)", row["Utterance"], re.IGNORECASE)

        if asker_match:
            new_asker = asker_match.group(1).strip()
            print(f"🔍 New asker detected (pattern 1): {new_asker}")
        elif bemærkning_match:
            new_asker = bemærkning_match.group(1).strip()
            print(f"🔍 New asker detected (pattern 2 - korte bemærkninger): {new_asker}")

    # Assign detected askers if they match next speaker
    if new_asker and row["Speaker"] == new_asker and row["TurnRole"] == "unknown":
        df_preprocessed.at[index, "TurnRole"] = "asker"
        print(f"✅ New asker assigned: {row['Speaker']} (Row {index})")
        new_asker = None  # Reset

# Final check for "short format" introductions ---
for index, row in df_preprocessed.iterrows():
    if row["TurnRole"] == "chair":
        # Look for very short introductions like "Hr. Thomas Skriver Jensen, Socialdemokratiet."
        short_intro_match = re.search(r"(?:hr\.|fru)\s+([\w\s\-]+),", row["Utterance"], re.IGNORECASE)

        if short_intro_match:
            extracted_name = short_intro_match.group(1).strip()
            print(f"🔍 Short format detected: {extracted_name}")

            # Get next row (should be the unknown speaker)
            if index + 1 < len(df_preprocessed):
                next_row = df_preprocessed.iloc[index + 1]
                
                if next_row["Speaker"] == extracted_name and next_row["TurnRole"] == "unknown":
                    df_preprocessed.at[index + 1, "TurnRole"] = "asker"
                    print(f"✅ Short format asker assigned: {next_row['Speaker']} (Row {index + 1})")

# Assign "asker" if surrounded by chair entries ---
df_preprocessed = df_preprocessed.reset_index(drop=True)

for i in range(1, len(df_preprocessed) - 2):  # Ensure valid index range
    if (
        df_preprocessed.at[i, "TurnRole"] == "unknown" and
        df_preprocessed.at[i - 1, "TurnRole"] == "chair" and
        df_preprocessed.at[i + 1, "TurnRole"] == "chair" and
        df_preprocessed.at[i + 2, "TurnRole"] == "proponent"
    ):
        df_preprocessed.at[i, "TurnRole"] = "asker"
        print(f"✅ Final fallback asker assigned: {df_preprocessed.at[i, 'Speaker']} (Row {i})")

# Assign "proponent" if unknown is surrounded by specific pattern ---
df_preprocessed = df_preprocessed.reset_index(drop=True)

for i in range(1, len(df_preprocessed) - 4):  # Ensure valid index range
    if (
        df_preprocessed.at[i, "TurnRole"] == "unknown" and
        df_preprocessed.at[i - 1, "TurnRole"] == "chair" and
        df_preprocessed.at[i + 1, "TurnRole"] == "chair" and
        df_preprocessed.at[i + 2, "TurnRole"] == "asker" and
        df_preprocessed.at[i + 3, "TurnRole"] == "chair" and
        df_preprocessed.at[i + 4, "TurnRole"] == "proponent"
    ):
        df_preprocessed.at[i, "TurnRole"] = "proponent"
        print(f"✅ Final fallback proponent assigned: {df_preprocessed.at[i, 'Speaker']} (Row {i})")


# Assign "asker" if unknown is surrounded by the new pattern ---
df_preprocessed = df_preprocessed.reset_index(drop=True)

for i in range(6, len(df_preprocessed) - 3):  # Ensure valid index range
    if (
        df_preprocessed.at[i, "TurnRole"] == "unknown" and
        df_preprocessed.at[i - 6, "TurnRole"] == "minister" and
        df_preprocessed.at[i - 5, "TurnRole"] == "chair" and
        df_preprocessed.at[i - 4, "TurnRole"] == "asker" and
        df_preprocessed.at[i - 3, "TurnRole"] == "chair" and
        df_preprocessed.at[i - 2, "TurnRole"] == "minister" and
        df_preprocessed.at[i - 1, "TurnRole"] == "chair" and
        df_preprocessed.at[i + 1, "TurnRole"] == "chair" and
        df_preprocessed.at[i + 2, "TurnRole"] == "minister" and
        df_preprocessed.at[i + 3, "TurnRole"] == "chair"
    ):
        df_preprocessed.at[i, "TurnRole"] = "asker"
        print(f"✅ New asker assigned: {df_preprocessed.at[i, 'Speaker']} (Row {i})")

# Assign "asker" if unknown is surrounded by the new pattern ---
df_preprocessed = df_preprocessed.reset_index(drop=True)

for i in range(4, len(df_preprocessed) - 2):  # Ensure valid index range
    if (
        df_preprocessed.at[i, "TurnRole"] == "unknown" and
        df_preprocessed.at[i - 4, "TurnRole"] == "asker" and
        df_preprocessed.at[i - 3, "TurnRole"] == "chair" and
        df_preprocessed.at[i - 2, "TurnRole"] == "minister" and
        df_preprocessed.at[i - 1, "TurnRole"] == "chair" and
        df_preprocessed.at[i + 1, "TurnRole"] == "chair" and
        df_preprocessed.at[i + 2, "TurnRole"] == "minister"
    ):
        df_preprocessed.at[i, "TurnRole"] = "asker"
        print(f"✅  New asker assigned: {df_preprocessed.at[i, 'Speaker']} (Row {i})")

# Force assign "asker" if chair says "Ordføreren." ---
for i in range(1, len(df_preprocessed)):  # Ensure valid index range
    if (
        df_preprocessed.at[i, "TurnRole"] == "chair" and
        "ordføreren." in df_preprocessed.at[i, "Utterance"].lower()
    ):
        df_preprocessed.at[i - 1, "TurnRole"] = "asker"
        print(f"✅ Forced 'asker' at Row {i-1} because chair said 'Ordføreren.' at Row {i}")

# Assign proponent if introduced by chair using "Næste ordfører er hr./fru. [name]" ---
for index, row in df_preprocessed.iterrows():
    if row["TurnRole"] == "chair":
        # Look for different patterns of introducing proponents
        match = re.search(
            r"(?:er|den første, der får ordet, er|næste ordfører er)\s+(?:hr\.|fru)\s+([\w\s\-]+?)\s+[,]", 
            row["Utterance"], 
            re.IGNORECASE
        )
        if match:
            next_proponent = match.group(1).strip()
            print(f"🔍 Extracted next proponent: {next_proponent}")

            # Get next row (should be the proponent)
            if index + 1 < len(df_preprocessed):
                next_row = df_preprocessed.iloc[index + 1]
                
                # Only assign if TurnRole is still "unknown"
                if next_row["Speaker"] == next_proponent and next_row["TurnRole"] == "unknown":
                    df_preprocessed.at[index + 1, "TurnRole"] = "proponent"
                    print(f"✅ Assigned 'proponent' to: {next_row['Speaker']} (Row {index + 1})")


# Assign "minister" to all minister entries
df_preprocessed.loc[df_preprocessed["Role"].str.lower().str.contains("minister", na=False), "TurnRole"] = "minister"

# assign mødeslut if it says that and pause
df_preprocessed.loc[df_preprocessed["Role"].str.lower().str.contains("MødeSlut", na=False), "TurnRole"] = "MødeSlut"
df_preprocessed.loc[df_preprocessed["Role"].str.lower().str.contains("Pause", na=False), "TurnRole"] = "Pause"

# Assign "asker" for remaining unknowns in question-answering debates ---
df_preprocessed.loc[
    (df_preprocessed["TurnRole"] == "unknown") & 
    (df_preprocessed["DebateType"] == "question-answering"), 
    "TurnRole"
] = "asker"

print("✅: Assigned 'asker' to remaining 'unknown' values in question-answering debates.")

# Dropping some temp cols
df_preprocessed.drop(columns=["PrevUtterance", "PrevSpeaker"], inplace=True)

# Get desired column order
final_column_order = [
    "SessionID", "MeetingNumber", "Date", "Location", "AgendaItemNo", 
    "AgendaTitle", "DebateType", "TurnNo", "Speaker", "Party", "Role", 
    "TurnRole", "Time", "Utterance", "AgendaCategory", "MeetingDateID", "AgendaTitleDateID"
]

# Reorder the df
df_preprocessed = df_preprocessed[final_column_order]

# Check it
df_preprocessed.head(30)



🔍 New asker detected (pattern 2 - korte bemærkninger): Peter Christensen
🔍 New asker detected (pattern 1): Margrethe Vestager for en kort bemærkning
🔍 New asker detected (pattern 1): Johanne Schmidt-Nielsen
✅ New asker assigned: Johanne Schmidt-Nielsen (Row 452782)
🔍 New asker detected (pattern 1): Simon Emil Ammitzbøll
✅ New asker assigned: Simon Emil Ammitzbøll (Row 452790)
🔍 New asker detected (pattern 1): Ole Sohn for en kort bemærkning
🔍 New asker detected (pattern 1): Morten Østergaard for en kort bemærkning
🔍 New asker detected (pattern 1): Morten Bødskov for en kort bemærkning
🔍 New asker detected (pattern 1): Klaus Hækkerup for en kort bemærkning
🔍 New asker detected (pattern 1): Frank Aaen for en kort bemærkning
🔍 New asker detected (pattern 1): Pernille Vigsø Bagge for en kort bemærkning
🔍 New asker detected (pattern 1): Kirsten Brosbøl for en kort bemærkning
🔍 New asker detected (pattern 1): Niels Helveg Petersen for en kort bemærkning
🔍 New asker detected (pattern 1): Jesp

#### Fixing debatetypes

In [None]:
# Fix debatetypes
def update_debate_type_before_debateunitid(df):
    """
    Updates DebateType:
    - If 'Forhandling' appears in the AgendaTitle and DebateType is 'other', update to 'deliberation'.
    - If 'partilederdebat' appears in the AgendaTitle, update to 'party leader debate'.
    This function must run before DebateUnitIDs are created.
    """
    df = df.copy()  # Avoid modifying the original Df
    
    # Update DebateType for deliberation
    df.loc[
        (df["AgendaTitle"].str.contains("Forhandling", case=True, na=False)) & 
        (df["DebateType"] == "other"), 
        "DebateType"
    ] = "deliberation"

    # Update DebateType for party leader debates
    df.loc[
        df["AgendaTitle"].str.contains("partilederdebat", case=False, na=False), 
        "DebateType"
    ] = "party_leader_debate"
    
    return df

# Apply function BEFORE creating DebateUnitID
df_preprocessed = update_debate_type_before_debateunitid(df_preprocessed)

# Check
df_preprocessed['DebateType'].unique()

df_preprocessed.to_csv("df_preprocessed_w_topics_before_turns.csv")


In [None]:
# Filter out rows where DebateType is "other"
filtered_df = df_preprocessed[df_preprocessed["DebateType"].str.lower() != "other"]

# Count "unknown" instances in TurnRole after filtering
unknown_count = filtered_df["TurnRole"].str.lower().eq("unknown").sum()

print(f"🔍 Number of 'unknown' instances in TurnRole (excluding DebateType 'other'): {unknown_count}")

# 🔍 Number of 'unknown' instances in TurnRole (excluding DebateType 'other'): 24002



### Assigning turns

In [None]:

# Ensure data is sorted properly
df_w_turnsequence = df_preprocessed.sort_values(
    by=["Date", "AgendaItemNo", "TurnNo"]).reset_index(drop=True)

# Update TurnRole based on Role for party_leader_debate
df_w_turnsequence.loc[
    (df_w_turnsequence["DebateType"] == "party_leader_debate") & 
    (df_w_turnsequence["Role"] == "minister"), 
    "TurnRole"
] = "minister"

df_w_turnsequence.loc[
    (df_w_turnsequence["DebateType"] == "party_leader_debate") & 
    (df_w_turnsequence["Role"] == "medlem"), 
    "TurnRole"
] = "member"

df_w_turnsequence.loc[
    (df_w_turnsequence["Role"] == "formand"), 
    "TurnRole"
] = "chair"
df_w_turnsequence


In [None]:
# Add asker for party_leader_debates
# Define window size for checking past speaker occurrences
window_size = 12

# Ensure data is sorted properly
df_w_turnsequence = df_w_turnsequence.sort_values(
    by=["Date", "MeetingNumber", "AgendaItemNo", "AgendaTitle", "TurnNo"]
).reset_index(drop=True)

# Check if DebateType column exists
if "DebateType" in df_w_turnsequence.columns:
    # Identify rows belonging to party leader debates
    mask_party_leader = df_w_turnsequence["DebateType"] == "party_leader_debate"

    # Iterate over the df
    for index in df_w_turnsequence[mask_party_leader].index:
        speaker = df_w_turnsequence.at[index, "Speaker"]

        # Get the last `window_size` rows excluding current row
        start_idx = max(0, index - window_size)
        past_speakers = df_w_turnsequence.loc[start_idx:index - 1, "Speaker"].tolist()

        # If speaker is not in the past `window_size` speakers, set TurnRole to 'asker'
        if speaker not in past_speakers:
            df_w_turnsequence.at[index, "TurnRole"] = "asker"

df_w_turnsequence[df_w_turnsequence["DebateType"]== 'party_leader_debate']


In [None]:

for (date, meeting, agenda_item, agenda_title, debate_type), group in df_w_turnsequence.groupby(
    ["Date", "MeetingNumber", "AgendaItemNo", "AgendaTitle", "DebateType"]
):
    current_turn = 0  # Start TurnSequence per debate
    last_speakers = []  # Rolling list to track the last three unique speakers (including chair)

    for index, row in group.iterrows():
        speaker = row["Speaker"]
        role = row["TurnRole"]
        party = str(row.get("Party", ""))  # in case Party column exists

        # 1) Check for "MødeSlut" in Speaker or Party columns
        if ("MødeSlut" in str(speaker)) or ("MødeSlut" in party):
            #print(f"\nMødeSlut detected in row {index}. Resetting context.")
            current_turn = 0
            last_speakers = []
            df_w_turnsequence.at[index, "TurnSequence"] = current_turn
            continue

        # Debug: show current processing state
        #print(f"\nProcessing row {index}: {speaker} ({role})")
        #print(f"Before processing: last_speakers = {last_speakers}, current_turn = {current_turn}")

        # Handle first row of the group/debate
        if index == group.index[0]:
            df_w_turnsequence.at[index, "TurnSequence"] = current_turn  # Should be 0
            last_speakers.append(speaker)
            #print("First row detected. TurnSequence set to 0 and added speaker to last_speakers.")
            continue

        # Normal processing based on the count of unique speakers
        if len(set(last_speakers)) < 3:
            if speaker not in last_speakers:
                last_speakers.append(speaker)
                #print(f"Added {speaker} to last_speakers: {last_speakers}")
            current_turn += 1  # Increment the turn sequence
        else:
            if speaker not in last_speakers:
                #print(f"New speaker detected: {speaker}. Resetting TurnSequence and last_speakers.")
                current_turn = 0
                last_speakers = [speaker]
            else:
                current_turn += 1

        # Debug: show state after processing current row
        #print(f"After processing: last_speakers = {last_speakers}, current_turn = {current_turn}")
        df_w_turnsequence.at[index, "TurnSequence"] = current_turn

        # 2) Check for consecutive 'chair' roles:
        # Get the current position within the group
        pos = group.index.get_loc(index)
        if pos < len(group) - 1:
            next_row = group.iloc[pos + 1]
            next_role = next_row["TurnRole"]
            if role == "chair" and next_role == "chair":
                #print(f"Consecutive chairs detected at row {index} and row {group.index[pos + 1]}. Resetting context.")
                current_turn = 0
                last_speakers = []



In [None]:
# Now, assign the turns to fix the things that did not get assigned above (there were still some unknowns)
'''
Following this structured order:
chair → asker → chair → proponent (and repeat)

When a new asker is introduced (TurnRole = asker after a chair), reset to 0.
Otherwise, continue numbering sequentially within the same cycle.

'''
# Ensure data is sorted properly
df_w_turnsequence = df_preprocessed.sort_values(
    by=["Date", 
        "MeetingNumber",
        "AgendaItemNo", 
        "AgendaTitle",
        "TurnNo"]).reset_index(drop=True)

# Initialize TurnSequence column
df_w_turnsequence["TurnSequence"] = 0  
current_turn = -1  # Start at -1 so the first new asker lands at 0
prev_role = None  # Track the last role
prev_asker = None  # Track the last asker (by name)

# Iterate through rows, grouping within AgendaItemNo and AgendaTitle
for (date, meeting, agenda_item, agenda_title), group in df_w_turnsequence.groupby(["Date", "MeetingNumber", "AgendaItemNo", "AgendaTitle"]):
    current_turn = -1  # Reset TurnSequence for each unique AgendaItemNo and AgendaTitle
    prev_role = None
    prev_asker = None

    for index, row in group.iterrows():
        role = row["TurnRole"]
        speaker = row["Speaker"]

        # If a new asker (different person) appears after a chair, reset the sequence
        if role == "asker" and prev_role == "chair":
            if speaker != prev_asker:
                df_w_turnsequence.at[index - 1, "TurnSequence"] = 0  # Chair introducing the asker should be 0
                current_turn = 1  # Start the sequence from 1 for the asker
                prev_asker = speaker
            else:
                current_turn += 1  # Continue sequence if same asker
        elif role in ["proponent", "minister", "unknown"]:  
            current_turn += 1  # Continue sequence for proponent, minister, or unknown
        elif role == "chair":  
            current_turn += 1  # Continue sequence for chair (only if it's not a reset point)

        # Assign TurnSequence
        df_w_turnsequence.at[index, "TurnSequence"] = current_turn
        prev_role = role  # Update previous role


# Display sample
df_w_turnsequence.head(30)


## Replacing party names in all utterances with pseudonyms

In [None]:
import re

# Defining mapping of party names (including historical names) to pseudonyms
party_pseudonyms = {
    # Socialdemokratiet
    "Socialdemokratiet": "Parti_A",
    "Socialdemokraterne": "Parti_A",
    "Socialdemokraternes": "Parti_As",
    "Socialdemokratiets": "Parti_As",
    "Socialdemokratisk": "Parti_As",
    "Socialdemokrater": "Parti_A",

    # Venstre
    "Venstre": "Parti_B",
    "Venstres": "Parti_Bs",

    # Radikale Venstre
    "Radikale Venstre": "Parti_C",
    "Det Radikale Venstre": "Parti_C",
    #"Radikale": "Parti_C",
    "Radikales": "Parti_Cs",
    "De Radikale": "Parti_C",
    "De Radikales": "Parti_Cs",

    # Konservative Folkeparti
    "Konservative Folkeparti": "Parti_D",
    "Det Konservative Folkeparti": "Parti_D",
    "Konservative": "Parti_D",
    "Konservatives": "Parti_Ds",
    "De Konservative": "Parti_D",
    "De Konservatives": "Parti_Ds",
    "konservativ side": "Parti_Ds side",

    # Socialistisk Folkeparti
    "Socialistisk Folkeparti": "Parti_E",
    "Socialistisk Folkepartis": "Parti_Es",
    "Socialistiske Folkeparti": "Parti_E",
    "Socialistisk": "Parti_E",
    "Socialistiskes": "Parti_Es",

    # Dansk Folkeparti
    "Dansk Folkeparti": "Parti_F",
    "Dansk Folkepartis": "Parti_Fs",

    # Fremskridtspartiet (Historisk DF-navn)
    "Fremskridtspartiet": "Parti_F",
    "Fremskridtspartiets": "Parti_Fs",

    # Enhedslisten
    "Enhedslisten": "Parti_G",
    "Enhedslistens": "Parti_Gs",
    "Rød-Grøn Alliance": "Parti_G",
    "Rød-Grønne Alliance": "Parti_G",

    # Liberal Alliance
    "Liberal Alliance": "Parti_H",
    "Liberale Alliance": "Parti_H",
    "Liberal Alliances": "Parti_Hs",
    "Liberales": "Parti_Hs",  # Genitive form

    # Ny Alliance (Historisk før LA)
    "Ny Alliance": "Parti_H",
    "Ny Alliances": "Parti_Hs",

    # Alternativet
    "Alternativet": "Parti_I",
    "Alternativets": "Parti_Is",

    # Danmarksdemokraterne
    "Danmarksdemokraterne": "Parti_J",
    "Danmarksdemokraternes": "Parti_Js",

    # Nye Borgerlige
    "Nye Borgerlige": "Parti_K",
    "Nye Borgerliges": "Parti_Ks",

    # Frie Grønne
    "Frie Grønne": "Parti_L",
    "De Frie Grønne": "Parti_L",
    "Frie Grønnes": "Parti_Ls",

    # Kristendemokraterne
    "Kristendemokraterne": "Parti_M",
    "Kristendemokraternes": "Parti_Ms",
    "De Kristne Demokrater": "Parti_M",
    "Kristendemokratiet": "Parti_M",
    "Kristendemokratiets": "Parti_Ms",
}


# Compile regex pattern to match any of the party names (case insensitive)
party_pattern = re.compile(r'\b(' + '|'.join(re.escape(party) for party in party_pseudonyms.keys()) + r')\b', re.IGNORECASE)

# Function to replace party names with pseudonyms
def replace_party_names(text):
    if pd.isna(text):  # Handle missing values
        return text
    
    # Perform case-insensitive replacement while preserving original case
    return party_pattern.sub(lambda match: party_pseudonyms.get(match.group(0), 
                                                                party_pseudonyms.get(match.group(0).title(), 
                                                                match.group(0))), text)

# Apply function to the "Utterance" column
df_w_turnsequence["Utterance"] = df_w_turnsequence["Utterance"].astype(str).apply(replace_party_names)

print("✅ Party names replaced with pseudonyms, including historical names.")

# Check it
df_w_turnsequence.head(30)


## Replacing names in the utterances with 'spørgeren' or 'ordføreren' (depending on the situation)

In [None]:
import re
import pandas as pd

# Function to replace mentioned speaker names (with Hr./Fru. included) in utterances
def replace_mentioned_names(row, name_list):
    text = row["Utterance"]
    role = row["TurnRole"]
    
    if pd.isna(text):  # Handle missing values
        return text

    # Set replacement term based on current speaker's role
    if role in ["minister", "proponent"]:
        replacement = "Spørgeren"
    elif role == "asker":
        replacement = "Ordføreren"
    else:
        replacement = "Taleren"
        #return text  # No replacement for chair or undefined roles

    # Replace all names found in the name list
    for name in name_list:
        if isinstance(name, str):  # Ensure it's a valid string
            # Match both "Hr./Fru. Name" and just "Name"
            name_pattern = re.compile(r'\b(?:hr\.|fru)\s*' + re.escape(name) + r'\b', re.IGNORECASE)
            text = name_pattern.sub(replacement, text)

    return text

# Processing each unique (Date, AgendaItemNo, AgendaTitle) tuple
for (date, agenda_item, agenda_title), group in df_w_turnsequence.groupby(["Date", "AgendaItemNo", "AgendaTitle"]):
    unique_names = group["Speaker"].dropna().unique().tolist()  # Get unique speaker names

    # Apply the replacement function to all utterances in this group
    df_w_turnsequence.loc[group.index, "Utterance"] = group.apply(
        lambda row: replace_mentioned_names(row, unique_names), axis=1
    )

print("✅ Speaker names in utterances replaced with generic references")

# check it
df_w_turnsequence.head(30)

## Assigning DebateUnitIDs

In [None]:
# Filter out rows where the utterance contains "(Talen er under udarbejdelse)"
df_w_turnsequence = df_w_turnsequence[~df_w_turnsequence["Utterance"].str.contains(r"\(Talen er under udarbejdelse\)", na=False, regex=True)]

df_w_turnsequence_sort = df_w_turnsequence.sort_values(
    by=["Date", "AgendaItemNo", "TurnNo"]).reset_index(drop=True)


In [None]:
# Now, lets ensure each DebateUnitID is assigned uniquely within Date, MeetingNumber, AgendaItemNo, and AgendaTitle --

# Ensure the df is sorted correctly
df_w_turnsequence_sort_new = df_w_turnsequence.sort_values(
    by=["Date", "MeetingNumber", "AgendaItemNo", "AgendaTitle", "TurnNo"]
).reset_index(drop=True)

# Initialize DebateUnitID
debate_unit_id = 1
debate_unit_ids = []

# Iterate through rows, grouping within Date, MeetingNumber, AgendaItemNo, and AgendaTitle
for (date, meeting, agenda_item, agenda_title), group in df_w_turnsequence_sort_new.groupby(["Date", "MeetingNumber", "AgendaItemNo", "AgendaTitle"]):
    debate_unit_id += 1  # Reset DebateUnitID for each unique AgendaItemNo and AgendaTitle

    for index, row in group.iterrows():
        if row["TurnSequence"] == 0 and index > 0:  # New debate unit starts at every 0 (except first row)
            debate_unit_id += 1
        debate_unit_ids.append(debate_unit_id)

# Assign DebateUnitID to df
df_w_turnsequence_sort_new["DebateUnitID"] = debate_unit_ids

In [None]:
# Again, I found some unknowns - so need to rerun some turnrole code to get the last ones assigned:
import re

# Ensure data is sorted properly
df_w_turnsequence_sort_new = df_w_turnsequence_sort_new.sort_values(
    by=["Date", "MeetingNumber", "AgendaItemNo", "AgendaTitle", "TurnNo"]
).reset_index(drop=True)

# Regex pattern to detect "Vi går videre til [anden, tredje, Xte] runde"
pattern_vi_videres = re.compile(r"Vi går videre til (anden|tredje|\d+\.?) runde", re.IGNORECASE)

# Iterate through rows, focusing only on "question-answering" DebateType
if {"DebateType", "DebateUnitID", "Utterance", "TurnRole"}.issubset(df_w_turnsequence_sort_new.columns):
    mask_question_answering = df_w_turnsequence_sort_new["DebateType"] == "question-answering"

    for debate_unit, group in df_w_turnsequence_sort_new[mask_question_answering].groupby("DebateUnitID"):
        current_turn = -1  # Reset TurnSequence per debate unit

        # Loop through rows while tracking previous four roles dynamically
        for i in range(len(group)):
            index = group.index[i]
            role = group.at[index, "TurnRole"]
            utterance = group.at[index, "Utterance"] if "Utterance" in group.columns else ""

            # Fetch previous roles and utterances safely
            prev_role_1 = group.at[group.index[i - 1], "TurnRole"] if i - 1 >= 0 else None
            prev_utterance_1 = group.at[group.index[i - 1], "Utterance"] if i - 1 >= 0 else ""

            prev_role_2 = group.at[group.index[i - 2], "TurnRole"] if i - 2 >= 0 else None
            prev_role_3 = group.at[group.index[i - 3], "TurnRole"] if i - 3 >= 0 else None
            prev_role_4 = group.at[group.index[i - 4], "TurnRole"] if i - 4 >= 0 else None

            # Debugging: Print previous utterance for verification
            if prev_role_1 == "chair":
                print(f"Row {i}, Previous Chair Utterance: {prev_utterance_1}")

            # Check for reset patterns
            if prev_role_2 == "chair" and prev_role_1 == "chair" and role in ["asker", "proponent"]:
                current_turn = 0  # Reset for Pattern 1 & 2
            elif prev_role_4 == "chair" and prev_role_3 == "chair" and prev_role_2 == "minister" and prev_role_1 == "chair" and role in ["asker", "proponent"]:
                current_turn = 0  # Reset for Pattern 3 & 4
            elif prev_role_1 == "chair" and pattern_vi_videres.search(prev_utterance_1):
                current_turn = 0  # Reset TurnSequence if chair says "Vi går videre til X runde"
            else:
                current_turn += 1  # Otherwise, continue incrementing

            # Assign TurnSequence
            df_w_turnsequence_sort_new.at[index, "TurnSequence"] = current_turn


In [None]:
# Cool - now we redefine

# Ensure the df is sorted correctly
df_w_turnsequence_sort_new = df_w_turnsequence_sort_new.sort_values(
    by=["Date", "MeetingNumber", "AgendaItemNo", "AgendaTitle", "TurnNo"]
).reset_index(drop=True)

# Initialize DebateUnitID
debate_unit_id = 1
debate_unit_ids = []

# Iterate through rows, grouping within Date, MeetingNumber, AgendaItemNo, and AgendaTitle
for (date, meeting, agenda_item, agenda_title), group in df_w_turnsequence_sort_new.groupby(["Date", "MeetingNumber", "AgendaItemNo", "AgendaTitle"]):
    debate_unit_id += 1  # Reset DebateUnitID for each unique AgendaItemNo and AgendaTitle

    for index, row in group.iterrows():
        if row["TurnSequence"] == 0 and index > 0:  # New debate unit starts at every 0 (except first row)
            debate_unit_id += 1
        debate_unit_ids.append(debate_unit_id)

# Assign DebateUnitID to df
df_w_turnsequence_sort_new["DebateUnitID"] = debate_unit_ids

In [None]:
# Ensure the necessary columns exist before filtering
if {"DebateType", "TurnSequence", "DebateUnitID"}.issubset(df_w_turnsequence_sort_new.columns):
    # Create a subset where DebateType is "question-answering" and TurnSequence exceeds 20
    subset_question_answering = df_w_turnsequence_sort_new[
        (df_w_turnsequence_sort_new["DebateType"] == "question-answering") &
        (df_w_turnsequence_sort_new["TurnSequence"] > 20)
    ]
else:
    subset_question_answering = None  # Handle case where columns are missing

subset_question_answering


# Make versions of different types


In [None]:
# Check that no debate has more types
def check_debate_type_consistency(df):
    """
    Checks that each DebateUnitID has only one unique DebateType.
    If any DebateUnitID has multiple DebateTypes, it prints the problematic cases.
    """
    # Count unique DebateTypes per DebateUnitID
    debate_type_counts = df.groupby("DebateUnitID")["DebateType"].nunique()

    # Find DebateUnitIDs with more than one unique DebateType
    inconsistent_units = debate_type_counts[debate_type_counts > 1].index

    if inconsistent_units.any():
        print(f"⚠️ Warning: {len(inconsistent_units)} DebateUnitIDs have multiple DebateTypes!")
        inconsistent_df = df[df["DebateUnitID"].isin(inconsistent_units)]
        print(inconsistent_df[["DebateUnitID", "AgendaTitle", "DebateType"]].drop_duplicates().sort_values("DebateUnitID"))
    else:
        print("✅ All DebateUnitIDs have a consistent DebateType.")

# Lets run the check
check_debate_type_consistency(df_w_turnsequence_sort_new)


In [None]:
# For the party_leader_debate, there are a lot of unknowns - could we replace the TurnSequence with a rule for partyleaderdebates thats like if theres a new name entering/new row with new name (Speaker col has names) after two other people have debated for at least 2 turns, then the turn resets? in this df the chair has been removed, so we need to redefine it anyway
# df_w_turnsequence_sort_new.DebateType.unique() # make a df for each array(['other', 'question-answering', 'reading of bill', 'deliberation', 'party_leader_debate'], dtype=object)

# Convert TurnRole to Danish role names
role_mapping = {
    "asker": "Spørger",
    "minister": "Minister",
    "proponent": "Ordfører",
    "chair": "Mødeleder",
    "unknown": "Ukendt"
}

df_w_turnsequence_sort_new["TurnRole_Danish"] = df_w_turnsequence_sort_new["TurnRole"].map(role_mapping).fillna("Ukendt")

import os


##### --- Lets go
# Defining base output dir
base_output_dir = "/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/data_cleaning/output/clean"

# Ensure the output directories exist
debate_types = df_w_turnsequence_sort_new["DebateType"].unique()

# Save each subset to the corresponding folder
for debate_type in debate_types:
    # Format folder name (replace spaces and hyphens with underscores)
    folder_name = debate_type.replace(" ", "_").replace("-", "_")
    output_path = os.path.join(base_output_dir, folder_name)
    
    # Create folder if it doesn't exist
    os.makedirs(output_path, exist_ok=True)
    
    # Filter df for the current debate type
    df_subset = df_w_turnsequence_sort_new[df_w_turnsequence_sort_new["DebateType"] == debate_type]
    
    # Save as CSV
    file_path = os.path.join(output_path, f"{folder_name}_data_2009_2025.csv")
    df_subset.to_csv(file_path, index=False)

# Confirmation message
print("✅ DFs saved in their respective folders")


## Save versions without 'chair' (+ reset the TurnSequence first)

In [None]:
import pandas as pd

# First we remove 'chair' entries
df_no_chair = df_w_turnsequence_sort_new[df_w_turnsequence_sort_new["TurnRole"] != "chair"].copy()
df_no_chair = df_w_turnsequence_sort_new[df_w_turnsequence_sort_new["Role"] != "formand"].copy()

# For the party_leader_debate and deliberation debatetype, there are a lot of unknowns - could we replace the TurnSequence with a rule for partyleaderdebates thats like if theres a new name entering/new row with new name (Speaker col has names) after two other people have debated for at least 2 turns, then the turn resets? in this df the chair has been removed, so we need to redefine it anyway

# First ensure data is sorted properly
df_no_chair = df_no_chair.sort_values(by=["Date", "AgendaItemNo", "TurnNo"]).reset_index(drop=True)

# Convert TurnRole to Danish role names
role_mapping = {
    "asker": "Spørger",
    "minister": "Minister",
    "proponent": "Ordfører",
    "chair": "Mødeleder",
    "unknown": "Ukendt"
}

df_no_chair["TurnRole_Danish"] = df_no_chair["TurnRole"].map(role_mapping).fillna("Ukendt")

# Initialize TurnSequence column
df_no_chair["TurnSequence"] = 0  

# Group by DebateUnitID and reset TurnSequence within each group
for debate_unit, group in df_no_chair.groupby("DebateUnitID"):
    df_no_chair.loc[group.index, "TurnSequence"] = range(len(group))


# Check that no debate has more types
def check_debate_type_consistency(df):
    """
    Checks that each DebateUnitID has only one unique DebateType.
    If any DebateUnitID has multiple DebateTypes, it prints the problematic cases.
    """
    # Count unique DebateTypes per DebateUnitID
    debate_type_counts = df.groupby("DebateUnitID")["DebateType"].nunique()

    # Find DebateUnitIDs with more than one unique DebateType
    inconsistent_units = debate_type_counts[debate_type_counts > 1].index

    if inconsistent_units.any():
        print(f"⚠️ Warning: {len(inconsistent_units)} DebateUnitIDs have multiple DebateTypes!")
        inconsistent_df = df[df["DebateUnitID"].isin(inconsistent_units)]
        print(inconsistent_df[["DebateUnitID", "AgendaTitle", "DebateType"]].drop_duplicates().sort_values("DebateUnitID"))
    else:
        print("✅ All DebateUnitIDs have a consistent DebateType.")

# Run the check
check_debate_type_consistency(df_no_chair)


In [None]:
# Save files
import os

# Define base output directory
base_output_dir = "/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/data_cleaning/output/clean"

# Ensure the output directories exist
debate_types = df_no_chair["DebateType"].unique()

# Save each subset to the corresponding folder
for debate_type in debate_types:
    # Format folder name (replace spaces and hyphens with underscores)
    folder_name = debate_type.replace(" ", "_").replace("-", "_")
    output_path = os.path.join(base_output_dir, folder_name)
    
    # Create folder if it doesn't exist
    os.makedirs(output_path, exist_ok=True)
    
    # Filter df for the current debate type
    df_subset = df_no_chair[df_no_chair["DebateType"] == debate_type]
    
    # Save as CSV
    file_path = os.path.join(output_path, f"{folder_name}_nochair_data_2009_2025.csv")
    df_subset.to_csv(file_path, index=False)

print("✅ DFs without chair entires and with new turnroles saved in their respective folders")

In [None]:
# Check that no debate has more types
def check_debate_type_consistency(df):
    """
    Checks that each DebateUnitID has only one unique DebateType.
    If any DebateUnitID has multiple DebateTypes, it prints the problematic cases.
    """
    # Count unique DebateTypes per DebateUnitID
    debate_type_counts = df.groupby("DebateUnitID")["DebateType"].nunique()

    # Find DebateUnitIDs with more than one unique DebateType
    inconsistent_units = debate_type_counts[debate_type_counts > 1].index

    if inconsistent_units.any():
        print(f"⚠️ Warning: {len(inconsistent_units)} DebateUnitIDs have multiple DebateTypes!")
        inconsistent_df = df[df["DebateUnitID"].isin(inconsistent_units)]
        print(inconsistent_df[["DebateUnitID", "AgendaTitle", "DebateType"]].drop_duplicates().sort_values("DebateUnitID"))
    else:
        print("✅ All DebateUnitIDs have a consistent DebateType.")

# Run the check
check_debate_type_consistency(df_no_chair)

# Generating user ids for annotators

In [None]:
import random
import string

def generate_random_user_ids(num_users=50, length=8):
    """Generate a list of random user IDs."""
    user_ids = []
    for _ in range(num_users):
        user_id = ''.join(random.choices(string.ascii_uppercase + string.digits, k=length))
        user_ids.append(user_id)
    return user_ids

# Generate 35 random user IDs
random_user_ids = generate_random_user_ids()

# Display the generated user IDs
df_users = pd.DataFrame({"User ID": random_user_ids})
df_users.to_csv("/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/data_cleaning/data/user_ids/user_ids.csv")