# Feature engineering for gold-spans 🛠️ 

This notebook reads in the consolidated gold-span dataset (with turn context and metadata), computes label and quality distributions, and engineers linguistic, contextual, and metadata-based features (e.g., token counts, rhetorical markers, government/opposition status, TF-IDF bigrams) for each span. It then exports a curated, feature-augmented table ready for downstream model training.


In [None]:
from pathlib import Path

import pandas as pd

# Get data
df = pd.read_csv("output/coalesced_gold_aug_w_turns_w_context.csv")

# CHECK LABEL DIST
# Row count of the loaded table (assumes one row per unique labeled span — TODO confirm)
num_unique = len(df)
print(f"🔢 Total unique labeled spans: {num_unique}")


# Name the index *before* resetting it so the category column is always called
# 'Label'. A post-hoc rename of 'index' only works on pandas < 2.0; from 2.0 on,
# value_counts() names the index after the source column and that rename
# silently does nothing.
label_dist = (
    df['label']
    .value_counts()                                  # raw counts
    .rename_axis('Label')                            # category column -> 'Label'
    .reset_index(name='Count')                       # df with columns ['Label','Count']
)
# Add percentage column
label_dist['Percentage'] = (
    label_dist['Count']
    .div(label_dist['Count'].sum())                  # fraction of total
    .mul(100)                                        # convert to percent
    .round(2)                                        # round to 2 decimals
)

print("\n📊 Label Distribution (% of unique labeled spans):")
print(label_dist)

# Distribution of quality_status (percentage of each category).
# The column names are set via rename_axis/reset_index(name=...) rather than by
# renaming 'index'/'quality_status' afterwards: on pandas >= 2.0 reset_index()
# yields ['quality_status', 'proportion'], so the old rename labelled the
# *category* column 'Percentage' and left the numbers under 'proportion'.
quality_status_dist = (
    df['quality_status']
    .value_counts(normalize=True)                    # fraction per category
    .mul(100.0)                                      # convert to percent
    .round(2)
    .rename_axis('Quality Status')
    .reset_index(name='Percentage')
)

print("\n📊 Quality Status Distribution (% of unique labeled spans):")
print(quality_status_dist)


# Share of spans labelled 'answer' versus every other label.
# The label is lower-cased first, so the match is case-insensitive.
answer_mask = df['label'].str.lower().eq('answer')
perc_answer = 100 * answer_mask.mean()  # mean of a boolean mask == proportion of True
perc_non_answer = 100 - perc_answer
print(
    "\n📊 'Answer' vs Other Labels (% of unique labeled spans):",
    f"Answer: {perc_answer:.2f}%",
    f"Other Labels: {perc_non_answer:.2f}%",
    sep="\n",
)


print(df.columns)
df

In [6]:
# Get more statistics

# 1. Token/word counts: number of whitespace-separated pieces in each span
df['num_tokens'] = df['span'].str.split().str.len()
df['num_words'] = df['num_tokens']  # tokens and words are the same thing here

# 2. Mean span length per label, rounded to two decimals
avg_span_length = (
    df.groupby('label')[['num_tokens', 'num_words']]
      .mean()
      .rename(columns={'num_tokens': 'avg_tokens', 'num_words': 'avg_words'})
      .round(2)
      .reset_index()
)
print("\nAverage span length per label (tokens & words):")
print(avg_span_length)

def _count_distribution(values, axis_name):
    """Tabulate how often each distinct value occurs.

    Parameters
    ----------
    values : pandas.Series
        Column to tabulate.
    axis_name : str
        Name given to the column holding the distinct values.

    Returns
    -------
    pandas.DataFrame
        Columns ``[axis_name, 'Count', 'Percentage']``, ordered as returned by
        ``value_counts()``; ``Percentage`` is the share of the summed counts,
        rounded to two decimals.
    """
    dist = values.value_counts().rename_axis(axis_name).reset_index(name='Count')
    dist['Percentage'] = (dist['Count'] / dist['Count'].sum() * 100).round(2)
    return dist


# 3. Distribution by Debate Type
debate_type_dist = _count_distribution(df['METADATA_DebateType'], 'DebateType')
print("\nDistribution by Debate Type:")
print(debate_type_dist)

# 4. Distribution by Agenda Category
agenda_category_dist = _count_distribution(df['METADATA_AgendaCategory'], 'AgendaCategory')
print("\nDistribution by Agenda Category:")
print(agenda_category_dist)


Average span length per label (tokens & words):
                       label  avg_tokens  avg_words
0                     answer       37.43      37.43
1                     attack       29.19      29.19
2    attack + self_promotion       31.25      31.25
3           attack + stretch       35.25      35.25
4                    evasion       43.62      43.62
5           evasion + attack       36.24      36.24
6   evasion + self_promotion       43.74      43.74
7          evasion + stretch       44.55      44.55
8             self_promotion       34.45      34.45
9   self_promotion + stretch       26.67      26.67
10                   stretch       54.00      54.00

Distribution by Debate Type:
            DebateType  Count  Percentage
0  party_leader_debate    550       92.59
1         deliberation     28        4.71
2      reading of bill     16        2.69

Distribution by Agenda Category:
                        AgendaCategory  Count  Percentage
0  Elections & Parliamentary Processe

In [8]:
# And even more
# 1. Counts and percentage share per speaker role (METADATA_Role)
role_dist = (
    df['METADATA_Role']
      .value_counts()
      .rename_axis('METADATA_Role')
      .reset_index(name='Count')
      .assign(Percentage=lambda d: (d['Count'] / d['Count'].sum() * 100).round(2))
)

print("\nDistribution by Speaker Role (METADATA_Role):")
print(role_dist)

# 2. Counts and percentage share per turn role (METADATA_TurnRole)
turnrole_dist = (
    df['METADATA_TurnRole']
      .value_counts()
      .rename_axis('METADATA_TurnRole')
      .reset_index(name='Count')
      .assign(Percentage=lambda d: (d['Count'] / d['Count'].sum() * 100).round(2))
)

print("\nDistribution by Turn Role (METADATA_TurnRole):")
print(turnrole_dist)


Distribution by Speaker Role (METADATA_Role):
         METADATA_Role  Count  Percentage
0               medlem    562       94.61
1             minister     30        5.05
2  fungerende minister      2        0.34

Distribution by Turn Role (METADATA_TurnRole):
  METADATA_TurnRole  Count  Percentage
0            member    515       86.70
1         proponent     30        5.05
2          minister     30        5.05
3             asker     19        3.20


In [7]:
# Number of distinct debate units (a missing id, if any, counts as one value)
df['debate_unit_id'].nunique(dropna=False)

199

## Feature engineering 

In [78]:
# Inspect the distinct parties, dates and speakers. Only the last expression of a
# cell is rendered, so just the speaker array is displayed.
df['METADATA_Party'].unique()
df['METADATA_Date'].unique()
df['METADATA_Speaker'].unique()


array(['Peter Kofod Poulsen', 'Mette Frederiksen',
       'Jakob Ellemann-Jensen', 'Pernille Vermund',
       'Kristian Thulesen Dahl', 'Morten Østergaard', 'Pia Olsen Dyhr',
       'Pernille Skipper', 'Søren Pape Poulsen', 'Uffe Elbæk',
       'Alex Vanopslagh', 'Kristian Hegaard', 'Karsten Hønge',
       'Søren Søndergaard', 'Sofie Carsten Nielsen', 'Torsten Gejl',
       'Michael Aastrup Jensen', 'Aaja Chemnitz Larsen',
       'Aki-Matilda Høegh-Dam', 'Mai Villadsen', 'Edmund Joensen',
       'Peter Juel-Jensen', 'Morten Messerschmidt', 'Sikandar Siddique',
       'Troels Lund Poulsen', 'Inger Støjberg', 'Lars Løkke Rasmussen',
       'Martin Lidegaard', 'Franciska Rosenkilde', 'Aaja Chemnitz',
       'Alexander Ryle', 'Henrik Frandsen', 'Rasmus Jarlov',
       'Kim Edberg Andersen', 'Niels Flemming Hansen', 'Pelle Dragsted',
       'Peter Kofod', 'Tobias Grotkjær Elmstrøm',
       'Flemming Møller Mortensen', 'Katrine Daugaard', 'Vivi Kier',
       'Kaare Dybvad Bek', 'Frederik Vad

In [79]:
# Some preprocessing because I found some missing party labels

# Step 1: Define speaker-to-party mapping for politicians who have not switched parties
consistent_speaker_parties = {
    'Mette Frederiksen': 'S',   # Social Democrats
    'Troels Lund Poulsen': 'V', # Venstre
    'Kaare Dybvad Bek': 'S',
    'Peter Hummelgaard': 'S',
    'Vivi Kier': 'KF'
}

# Step 2: Assign party for consistent cases (except Løkke).
# Note: this sets the party on *every* row of these speakers, i.e. it also
# overwrites a value that was already present.
has_fixed_party = df['METADATA_Speaker'].isin(list(consistent_speaker_parties))
df.loc[has_fixed_party, 'METADATA_Party'] = (
    df.loc[has_fixed_party, 'METADATA_Speaker'].map(consistent_speaker_parties)
)

# Step 3: Assign party for Lars Løkke based on year.
# Parse the date once; values that cannot be parsed become NaT.
df['METADATA_Date'] = pd.to_datetime(df['METADATA_Date'], errors='coerce')
speech_year = df['METADATA_Date'].dt.year
is_lokke = df['METADATA_Speaker'] == 'Lars Løkke Rasmussen'

df.loc[is_lokke & (speech_year < 2021), 'METADATA_Party'] = 'V'   # Venstre
df.loc[is_lokke & (speech_year >= 2021), 'METADATA_Party'] = 'M'  # Moderaterne

# Filter rows from 2016-01-01 and onward. Rows with an unparseable date (NaT)
# fail the comparison and are dropped as well. reset_index() materialises the
# result as an independent frame with a contiguous 0..n-1 index, so later cells
# can add columns without SettingWithCopyWarning and can concatenate
# positionally-built feature frames without misaligning rows.
df = df[speech_year >= 2016].reset_index(drop=True)



In [None]:
import re
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import spacy

# Load the Danish spaCy pipeline "da_core_news_sm". The resulting `nlp` object is
# what the feature pipeline passes to extract_rhetorical_features as `nlp_model`.
nlp = spacy.load("da_core_news_sm")

# Government/opposition status by party and year.
# Structure: party abbreviation -> {year -> 'government' | 'support' | 'opposition'}.
# Only the years listed are covered (there are no entries for 2017-2019);
# add_metadata_features falls back to 'unknown' for any party/year pair that is
# missing from this table.
party_status_by_year = {
    'S': {
        2016: 'opposition',  # V minority gov until 2019
        2020: 'government',  # S minority gov supported by red bloc
        2021: 'government',
        2022: 'government',
        2023: 'government',  # S in centrist coalition with V + M
        2024: 'government'
    },
    'V': {
        2016: 'government',
        2020: 'opposition',
        2021: 'opposition',
        2022: 'opposition',
        2023: 'government',
        2024: 'government'
    },
    'DF': {
        2016: 'support',     # Supported V gov (not formally in it)
        2020: 'opposition',
        2021: 'opposition',
        2022: 'opposition',
        2023: 'opposition',
        2024: 'opposition'
    },
    'NB': {
        2020: 'opposition',
        2021: 'opposition',
        2022: 'opposition',
        2023: 'opposition',
        2024: 'opposition'
    },
    'RV': {
        2016: 'opposition',
        2020: 'support',     # Supported S minority gov
        2021: 'support',
        2022: 'support',
        2023: 'opposition',  # Not part of new S-V-M centrist coalition
        2024: 'opposition'
    },
    'SF': {
        2016: 'opposition',
        2020: 'support',
        2021: 'support',
        2022: 'support',
        2023: 'opposition',  # Not in S-V-M coalition
        2024: 'opposition'
    },
    'EL': {
        2016: 'opposition',
        2020: 'support',
        2021: 'support',
        2022: 'support',
        2023: 'opposition',
        2024: 'opposition'
    },
    'KF': {
        2016: 'support',     # Supported V minority gov
        2020: 'opposition',
        2021: 'opposition',
        2022: 'opposition',
        2023: 'opposition',  # Not in S-V-M coalition
        2024: 'opposition'
    },
    'ALT': {
        2016: 'opposition',
        2020: 'opposition',
        2021: 'opposition',
        2022: 'opposition',
        2023: 'opposition',
        2024: 'opposition'
    },
    'LA': {
        2016: 'support',     # Supported V minority gov
        2020: 'opposition',
        2021: 'opposition',
        2022: 'opposition',
        2023: 'opposition',  # Not in coalition, despite being right-wing
        2024: 'opposition'
    },
    'IA': {
        2020: 'support',     # Greenlandic party supporting S
        2021: 'support',
        2022: 'support',
        2023: 'support',
        2024: 'support'
    },
    'SIU': {
        2020: 'support',     # Greenlandic Siumut
        2021: 'support',
        2022: 'support',
        2023: 'support',
        2024: 'support'
    },
    'SP': {
        2020: 'opposition',  # Faroese Republic party
        2021: 'opposition',
        2022: 'opposition',
        2023: 'opposition',
        2024: 'opposition'
    },
    'FG': {
        2020: 'opposition',  # Faroese Social Democrats
        2021: 'opposition',
        2022: 'opposition',
        2023: 'support',     # Supporting S-V-M coalition (confirmed for 2022–2024)
        2024: 'support'
    },
    'DD': {
        2023: 'opposition',
        2024: 'opposition'
    },
    'M': {
        2023: 'government',  # Moderates (Lars Løkke) formed coalition with S and V
        2024: 'government'
    }
} # NOTE(review): the source these assignments are based on is not recorded here (the original comment was left unfinished) — TODO cite it

# Gender mapping by speaker name - some genders were missing.
# Keys are exact speaker-name strings; name variants of what looks like the same
# person (e.g. 'Aaja Chemnitz Larsen' / 'Aaja Chemnitz', 'Peter Kofod Poulsen' /
# 'Peter Kofod') are listed as separate keys. add_metadata_features applies this
# with Series.map, so a speaker that is not listed here gets a missing gender.
name_gender_map = {
    'Peter Kofod Poulsen': 'male',
    'Mette Frederiksen': 'female',
    'Jakob Ellemann-Jensen': 'male',
    'Pernille Vermund': 'female',
    'Kristian Thulesen Dahl': 'male',
    'Morten Østergaard': 'male',
    'Pia Olsen Dyhr': 'female',
    'Pernille Skipper': 'female',
    'Søren Pape Poulsen': 'male',
    'Uffe Elbæk': 'male',
    'Alex Vanopslagh': 'male',
    'Kristian Hegaard': 'male',
    'Karsten Hønge': 'male',
    'Søren Søndergaard': 'male',
    'Sofie Carsten Nielsen': 'female',
    'Torsten Gejl': 'male',
    'Michael Aastrup Jensen': 'male',
    'Aaja Chemnitz Larsen': 'female',
    'Aki-Matilda Høegh-Dam': 'female',
    'Mai Villadsen': 'female',
    'Edmund Joensen': 'male',
    'Peter Juel-Jensen': 'male',
    'Morten Messerschmidt': 'male',
    'Sikandar Siddique': 'male',
    'Troels Lund Poulsen': 'male',
    'Inger Støjberg': 'female',
    'Lars Løkke Rasmussen': 'male',
    'Martin Lidegaard': 'male',
    'Franciska Rosenkilde': 'female',
    'Aaja Chemnitz': 'female',
    'Alexander Ryle': 'male',
    'Henrik Frandsen': 'male',
    'Rasmus Jarlov': 'male',
    'Kim Edberg Andersen': 'male',
    'Niels Flemming Hansen': 'male',
    'Pelle Dragsted': 'male',
    'Peter Kofod': 'male',
    'Tobias Grotkjær Elmstrøm': 'male',
    'Flemming Møller Mortensen': 'male',
    'Katrine Daugaard': 'female',
    'Vivi Kier': 'female',
    'Kaare Dybvad Bek': 'male',
    'Frederik Vad': 'male',
    'Peter Hummelgaard': 'male'
}

def extract_rhetorical_features(text, nlp_model, prefix=""):
    """Compute token-level rhetorical features for a single text.

    Parameters
    ----------
    text : str or any
        Text to analyse. Anything that is not a ``str`` (``None``, NaN, ...) is
        treated as missing and yields the neutral feature set.
    nlp_model : callable
        Called as ``nlp_model(text)``. The result must be iterable over tokens
        exposing ``text``, ``pos_``, ``dep_`` and ``lemma_``, support ``len()``,
        and expose ``ents`` whose items have ``label_``.
    prefix : str, optional
        Prepended to every key of the returned dict.

    Returns
    -------
    dict
        Keys (each with ``prefix``), always in this order: ``num_tokens``,
        ``num_nouns``, ``num_verbs``, ``num_adjs``, ``has_modal``,
        ``modal_count``, ``entity_count``, ``has_entity``,
        ``pronoun_iwe_count``, ``pronoun_you_count``, ``root_verb``.
        Counts are ints, the ``has_*`` values are bools, and ``root_verb`` is
        the lemma of the first token whose dependency label is ``ROOT`` (or
        ``None``). For missing text the counts are 0, the flags ``False`` and
        ``root_verb`` ``None``.
    """
    modal_words = {"kan", "skal", "måske", "må", "bør", "vil"}
    pronouns_i_we = {'jeg', 'vi'}
    pronouns_you = {'du', 'de', 'dig'}
    entity_labels = ("ORG", "PERSON", "LOC")

    if isinstance(text, str):
        doc = nlp_model(text)
        lowered = [token.text.lower() for token in doc]
        pos_tags = [token.pos_ for token in doc]
        modal_count = sum(word in modal_words for word in lowered)
        entity_count = sum(ent.label_ in entity_labels for ent in doc.ents)
        values = {
            'num_tokens': len(doc),
            'num_nouns': pos_tags.count("NOUN"),
            'num_verbs': pos_tags.count("VERB"),
            'num_adjs': pos_tags.count("ADJ"),
            'has_modal': modal_count > 0,
            'modal_count': modal_count,
            'entity_count': entity_count,
            'has_entity': entity_count > 0,
            'pronoun_iwe_count': sum(word in pronouns_i_we for word in lowered),
            'pronoun_you_count': sum(word in pronouns_you for word in lowered),
            'root_verb': next((token.lemma_ for token in doc if token.dep_ == "ROOT"), None),
        }
    else:
        # Missing text: keep the flag columns boolean (False, not 0) so they do
        # not end up as a mixed bool/int column once rows are stacked.
        values = {
            'num_tokens': 0,
            'num_nouns': 0,
            'num_verbs': 0,
            'num_adjs': 0,
            'has_modal': False,
            'modal_count': 0,
            'entity_count': 0,
            'has_entity': False,
            'pronoun_iwe_count': 0,
            'pronoun_you_count': 0,
            'root_verb': None,
        }

    return {f"{prefix}{name}": value for name, value in values.items()}

def keyword_presence(text, keywords):
    """Return True if any keyword occurs as a substring of the lower-cased text.

    Non-string ``text`` is treated as the empty string. Keywords are compared
    as given, so they are expected to be lower-case already.
    """
    haystack = text.lower() if isinstance(text, str) else ""
    for keyword in keywords:
        if keyword in haystack:
            return True
    return False

def compute_overlap(a, b):
    """Case-insensitive similarity ratio of two strings via difflib.SequenceMatcher.

    Returns 0.0 when either argument is not a string.
    """
    if isinstance(a, str) and isinstance(b, str):
        matcher = SequenceMatcher(None, a.lower(), b.lower())
        return matcher.ratio()
    return 0.0

def add_text_features(df, col_name="span", prefix="span_"):
    """Add keyword, opener and modal-density features for one text column.

    Columns are added to ``df`` in place (each named ``prefix`` + suffix) and
    ``df`` is returned:

    * ``has_attack_words`` / ``has_evasion_words`` / ``has_selfpromo_words``:
      substring match against the keyword lists below (via keyword_presence).
    * ``starts_with_jamen`` / ``_altså`` / ``_ja`` / ``_nej`` and
      ``starts_with_ja_or_nej``: prefix test on the stripped, lower-cased text.
    * ``contains_ja_or_nej``: whole-word, case-insensitive match of ja/nej.
    * ``modal_density``: ``modal_count / num_tokens`` when both prefixed columns
      already exist (i.e. extract_rhetorical_features ran first), otherwise NaN.

    Missing text yields False for every boolean feature.
    """
    attack_words = ['utroligt', 'kritiserer', 'fejl', 'svigtet', 'mangler', 'ministeren']
    evasion_words = ['som sagt', 'vi har en plan', 'lad os se', 'måske', 'proces', 'nu skal vi huske']
    selfpromo_words = ['vi har gjort', 'jeg har sikret', 'første gang', 'historisk', 'styrket', 'løftet',
                       'jeg har arbejdet', 'jeg har hele tiden']

    text = df[col_name]
    df[f'{prefix}has_attack_words'] = text.apply(keyword_presence, args=(attack_words,))
    df[f'{prefix}has_evasion_words'] = text.apply(keyword_presence, args=(evasion_words,))
    df[f'{prefix}has_selfpromo_words'] = text.apply(keyword_presence, args=(selfpromo_words,))

    # Normalise once instead of once per opener. na=False makes a missing text
    # produce False rather than NaN, matching the str.contains-based features
    # and keeping these columns (and the OR below) strictly boolean.
    normalized = text.str.strip().str.lower()
    # NOTE(review): this is a plain prefix test, so "ja" is also True for any
    # text starting with "jamen" (or e.g. "januar") — confirm that is intended.
    for opener in ("jamen", "altså", "ja", "nej"):
        df[f'{prefix}starts_with_{opener}'] = normalized.str.startswith(opener, na=False)
    df[f'{prefix}starts_with_ja_or_nej'] = df[f'{prefix}starts_with_ja'] | df[f'{prefix}starts_with_nej']
    df[f'{prefix}contains_ja_or_nej'] = text.str.contains(r'\b(?:ja|nej)\b', case=False, na=False)

    # Modal density (requires extract_rhetorical_features to be run first).
    # Rows with zero tokens divide 0 by 0 and therefore come out as NaN.
    if f'{prefix}modal_count' in df.columns and f'{prefix}num_tokens' in df.columns:
        df[f'{prefix}modal_density'] = df[f'{prefix}modal_count'] / df[f'{prefix}num_tokens']
    else:
        df[f'{prefix}modal_density'] = np.nan

    return df

def role_mentions(text):
    """Flag which role/party tokens occur (as substrings) in the lower-cased text.

    Returns a dict with one ``mentions_<role>`` boolean per role, in a fixed
    order. Non-string input is treated as empty text, so every flag is False.
    """
    lowered = text.lower() if isinstance(text, str) else ""
    flags = {}
    for role in ('ministeren', 'ordføreren', 'spørgeren', 'medlemmet', 'parti_', 'taleren'):
        flags[f'mentions_{role}'] = role in lowered
    return flags

def add_number_features(df, col_name="span", prefix="span_"):
    """Add digit-based features for one text column (in place) and return ``df``.

    ``<prefix>has_number`` is True when the text contains at least one digit
    run; ``<prefix>number_count`` is the number of digit runs. Missing or
    non-string values give False and 0.
    """
    digit_run = r'\d+'

    def _count_digit_runs(value):
        if isinstance(value, str):
            return len(re.findall(digit_run, value))
        return 0

    texts = df[col_name]
    df[f'{prefix}has_number'] = texts.str.contains(digit_run, regex=True, na=False)
    df[f'{prefix}number_count'] = texts.apply(_count_digit_runs)
    return df

def add_negation_question_features(df, col_name="span", prefix="span_"):
    """Add negation and question-mark flags for one text column (in place).

    ``<prefix>has_negation`` is a case-insensitive whole-word match of a fixed
    list of Danish negation words; ``<prefix>contains_question_mark`` is True
    when the text contains a literal "?". Missing text gives False for both.
    Returns ``df``.
    """
    negation_pattern = r"\b(?:ikke|ingen|nej|aldrig|intet|nægtet)\b"
    texts = df[col_name]
    df[f'{prefix}has_negation'] = texts.str.contains(negation_pattern, case=False, na=False)
    df[f'{prefix}contains_question_mark'] = texts.str.contains(r"\?", na=False)
    return df


def add_metadata_features(df):
    """Add year, government/opposition status and gender columns (in place).

    * ``year``: calendar year parsed from ``METADATA_Date`` (unparseable -> missing).
    * ``gov_opp_status``: looked up in ``party_status_by_year`` by party and
      year; 'unknown' when the party or the year has no entry.
    * ``is_government``: True where the status equals 'government'.
    * ``gender``: ``METADATA_Speaker`` mapped through ``name_gender_map``.

    Returns ``df``.
    """
    def _status_for(row):
        by_year = party_status_by_year.get(row['METADATA_Party'], {})
        return by_year.get(row['year'], 'unknown')

    df['year'] = pd.to_datetime(df['METADATA_Date'], errors='coerce').dt.year
    df['gov_opp_status'] = df.apply(_status_for, axis=1)
    df['is_government'] = df['gov_opp_status'].eq('government')
    df['gender'] = df['METADATA_Speaker'].map(name_gender_map)
    return df

def add_overlap_feature(df):
    """Add ``question_overlap``: similarity between each row's prev_turn and span.

    The score comes from compute_overlap; a row lacking either field falls back
    to the empty string for that field. The column is added in place and ``df``
    is returned.
    """
    def _row_overlap(row):
        previous_turn = row.get('prev_turn', '')
        span_text = row.get('span', '')
        return compute_overlap(previous_turn, span_text)

    df['question_overlap'] = df.apply(_row_overlap, axis=1)
    return df

def add_any_role_mentions(df):
    """Collapse the per-role mention flags into one "any mention" flag per text.

    For the span and for the previous turn, the six ``<scope>_mentions_<role>``
    columns are OR-ed row-wise into ``span_any_mention_of_role_or_party`` and
    ``prev_turn_any_mention_of_role_or_party``. Columns are added in place and
    ``df`` is returned.
    """
    role_suffixes = ('ministeren', 'ordføreren', 'spørgeren', 'medlemmet', 'parti_', 'taleren')
    targets = {
        'span_any_mention_of_role_or_party': 'span',
        'prev_turn_any_mention_of_role_or_party': 'prev_turn',
    }

    for target_column, scope in targets.items():
        mention_cols = [f'{scope}_mentions_{suffix}' for suffix in role_suffixes]
        df[target_column] = df[mention_cols].any(axis=1)

    return df

# Ensure METADATA_Date is in datetime format (unparseable values become NaT).
# assign() builds a new frame instead of writing a column into `df`: at this
# point `df` can be the boolean-filtered slice left by the preprocessing cell,
# and assigning a column into such a slice raises SettingWithCopyWarning.
df = df.assign(METADATA_Date=pd.to_datetime(df['METADATA_Date'], errors='coerce'))

# Filter rows from 2016-01-01 and onward (NaT rows drop out too) and renumber
# the index 0..n-1, which the positional concatenations in the feature pipeline
# rely on.
df = df[df['METADATA_Date'].dt.year >= 2016].reset_index(drop=True)


In [81]:
def run_full_feature_engineering_pipeline(df):
    """Run every feature-engineering step and return the augmented frame.

    Steps, in order (each text step runs for ``span`` and then ``prev_turn``):
    rhetorical features from spaCy, role-mention flags, number features,
    keyword/opener/modal-density features, negation/question flags, metadata
    features, span/prev-turn overlap, "any role mention" flags, and finally the
    100 highest-ranked TF-IDF bigram columns fitted on ``span``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain at least ``span``, ``prev_turn``, ``METADATA_Date``,
        ``METADATA_Party`` and ``METADATA_Speaker``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index holding the input columns
        followed by all engineered columns.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    # The feature frames below are built positionally and joined with
    # concat(axis=1), which aligns on the index. Renumbering up front (and
    # building every feature frame on df.index) keeps rows aligned even if the
    # caller passes a filtered frame with gaps in its index. This also yields a
    # new object, so the caller's frame is never mutated by the add_* helpers.
    df = df.reset_index(drop=True)
    text_columns = (("span", "span_"), ("prev_turn", "prev_turn_"))

    for column, prefix in text_columns:
        rows = [extract_rhetorical_features(text, nlp, prefix=prefix) for text in df[column]]
        df = pd.concat([df, pd.DataFrame(rows, index=df.index)], axis=1)

    for column, prefix in text_columns:
        mentions = pd.DataFrame([role_mentions(text) for text in df[column]], index=df.index)
        df = pd.concat([df, mentions.add_prefix(prefix)], axis=1)

    # add_text_features must run after the rhetorical features above, because
    # its modal density reads the <prefix>modal_count / <prefix>num_tokens columns.
    for add_features in (add_number_features, add_text_features, add_negation_question_features):
        for column, prefix in text_columns:
            df = add_features(df, col_name=column, prefix=prefix)

    df = add_metadata_features(df)
    df = add_overlap_feature(df)

    df = add_any_role_mentions(df)

    # NOTE(review): the vectorizer is fitted on every row passed in; if the
    # result is later split into train/test, these columns have seen the test rows.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2), max_features=100)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df["span"].fillna("").astype(str))
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=tfidf_vectorizer.get_feature_names_out(),
        index=df.index,
    )
    return pd.concat([df, tfidf_df], axis=1)

# Run pipeline: `df` is passed in and the returned frame (input columns followed
# by the engineered feature and TF-IDF columns) is bound to `df_engineered`.
df_engineered = run_full_feature_engineering_pipeline(df)

In [None]:
# Display the frame returned by the feature pipeline for a visual check
df_engineered

In [83]:
df_engineered['gov_opp_status'].unique()

# Identification and provenance columns shown for rows with an unresolved status
cols_to_keep = [
    'debate_unit_id', 'span', 'label', 'quality_status', 'span_speaker',
    'METADATA_Date', 'METADATA_Party', 'METADATA_Speaker',
    'gov_opp_status',
]

# Rows whose gov_opp_status is missing or fell back to 'unknown'
unknown_status_df = df_engineered.loc[
    lambda frame: frame['gov_opp_status'].isna() | frame['gov_opp_status'].eq('unknown'),
    cols_to_keep,
]

#unknown_status_df['METADATA_Date'].unique()
unknown_status_df


Unnamed: 0,debate_unit_id,span,label,quality_status,span_speaker,METADATA_Date,METADATA_Party,METADATA_Speaker,gov_opp_status


In [84]:
# Full list of column names in the engineered frame
df_engineered.columns.tolist()


['debate_unit_id',
 'label',
 'span',
 'quality_status',
 'source_annotators',
 'construction_note',
 'full_text',
 'span_turn',
 'span_speaker',
 'prev_turn',
 'prev_turn_speaker',
 'next_turn',
 'next_turn_speaker',
 'METADATA_SessionID',
 'METADATA_MeetingNumber',
 'METADATA_Date',
 'METADATA_Location',
 'METADATA_AgendaItemNo',
 'METADATA_AgendaTitle',
 'METADATA_DebateType',
 'METADATA_TurnNo',
 'METADATA_Speaker',
 'METADATA_Party',
 'METADATA_Role',
 'METADATA_TurnRole',
 'METADATA_Time',
 'METADATA_Utterance',
 'METADATA_AgendaCategory',
 'METADATA_MeetingDateID',
 'METADATA_AgendaTitleDateID',
 'METADATA_TurnRole_Danish',
 'MATCH_FLAG',
 'METADATA_PREV_SessionID',
 'METADATA_PREV_MeetingNumber',
 'METADATA_PREV_Date',
 'METADATA_PREV_Location',
 'METADATA_PREV_AgendaItemNo',
 'METADATA_PREV_AgendaTitle',
 'METADATA_PREV_DebateType',
 'METADATA_PREV_TurnNo',
 'METADATA_PREV_Speaker',
 'METADATA_PREV_Party',
 'METADATA_PREV_Role',
 'METADATA_PREV_TurnRole',
 'METADATA_PREV_Time'

In [None]:
# Not the last expression of the cell, so this listing is not displayed.
list(df_engineered.columns)

# Curated export selection: the source/metadata columns followed by the
# hand-engineered features, selected explicitly by name. The TF-IDF bigram
# columns that the pipeline appends are not part of this list, so they are not
# carried into df_subset.
df_subset = df_engineered[['debate_unit_id',
 'label',
 'span',
 'quality_status',
 'source_annotators',
 'construction_note',
 'full_text',
 'span_turn',
 'span_speaker',
 'prev_turn',
 'prev_turn_speaker',
 'next_turn',
 'next_turn_speaker',
 'METADATA_SessionID',
 'METADATA_MeetingNumber',
 'METADATA_Date',
 'METADATA_Location',
 'METADATA_AgendaItemNo',
 'METADATA_AgendaTitle',
 'METADATA_DebateType',
 'METADATA_TurnNo',
 'METADATA_Speaker',
 'METADATA_Party',
 'METADATA_Role',
 'METADATA_TurnRole',
 'METADATA_Time',
 'METADATA_Utterance',
 'METADATA_AgendaCategory',
 'METADATA_MeetingDateID',
 'METADATA_AgendaTitleDateID',
 'METADATA_TurnRole_Danish',
 'MATCH_FLAG',
 'METADATA_PREV_SessionID',
 'METADATA_PREV_MeetingNumber',
 'METADATA_PREV_Date',
 'METADATA_PREV_Location',
 'METADATA_PREV_AgendaItemNo',
 'METADATA_PREV_AgendaTitle',
 'METADATA_PREV_DebateType',
 'METADATA_PREV_TurnNo',
 'METADATA_PREV_Speaker',
 'METADATA_PREV_Party',
 'METADATA_PREV_Role',
 'METADATA_PREV_TurnRole',
 'METADATA_PREV_Time',
 'METADATA_PREV_Utterance',
 'METADATA_PREV_AgendaCategory',
 'METADATA_PREV_MeetingDateID',
 'METADATA_PREV_AgendaTitleDateID',
 'METADATA_PREV_TurnRole_Danish',
 'PREV_MATCH_FLAG',
 'METADATA_NEXT_SessionID',
 'METADATA_NEXT_MeetingNumber',
 'METADATA_NEXT_Date',
 'METADATA_NEXT_Location',
 'METADATA_NEXT_AgendaItemNo',
 'METADATA_NEXT_AgendaTitle',
 'METADATA_NEXT_DebateType',
 'METADATA_NEXT_TurnNo',
 'METADATA_NEXT_Speaker',
 'METADATA_NEXT_Party',
 'METADATA_NEXT_Role',
 'METADATA_NEXT_TurnRole',
 'METADATA_NEXT_Time',
 'METADATA_NEXT_Utterance',
 'METADATA_NEXT_AgendaCategory',
 'METADATA_NEXT_MeetingDateID',
 'METADATA_NEXT_AgendaTitleDateID',
 'METADATA_NEXT_TurnRole_Danish',
 'NEXT_MATCH_FLAG',
 'span_num_tokens',
 'span_num_nouns',
 'span_num_verbs',
 'span_num_adjs',
 'span_has_modal',
 'span_modal_count',
 'span_entity_count',
 'span_has_entity',
 'span_pronoun_iwe_count',
 'span_pronoun_you_count',
 'span_root_verb',
 'prev_turn_num_tokens',
 'prev_turn_num_nouns',
 'prev_turn_num_verbs',
 'prev_turn_num_adjs',
 'prev_turn_has_modal',
 'prev_turn_modal_count',
 'prev_turn_entity_count',
 'prev_turn_has_entity',
 'prev_turn_pronoun_iwe_count',
 'prev_turn_pronoun_you_count',
 'prev_turn_root_verb',
 'span_mentions_ministeren',
 'span_mentions_ordføreren',
 'span_mentions_spørgeren',
 'span_mentions_medlemmet',
 'span_mentions_parti_',
 'span_mentions_taleren',
 'prev_turn_mentions_ministeren',
 'prev_turn_mentions_ordføreren',
 'prev_turn_mentions_spørgeren',
 'prev_turn_mentions_medlemmet',
 'prev_turn_mentions_parti_',
 'prev_turn_mentions_taleren',
 'span_has_number',
 'span_number_count',
 'prev_turn_has_number',
 'prev_turn_number_count',
 'span_has_attack_words',
 'span_has_evasion_words',
 'span_has_selfpromo_words',
 'span_starts_with_jamen',
 'span_starts_with_altså',
 'span_starts_with_ja',
 'span_starts_with_nej',
 'span_starts_with_ja_or_nej',
 'span_contains_ja_or_nej',
 'span_modal_density',
 'prev_turn_has_attack_words',
 'prev_turn_has_evasion_words',
 'prev_turn_has_selfpromo_words',
 'prev_turn_starts_with_jamen',
 'prev_turn_starts_with_altså',
 'prev_turn_starts_with_ja',
 'prev_turn_starts_with_nej',
 'prev_turn_starts_with_ja_or_nej',
 'prev_turn_contains_ja_or_nej',
 'prev_turn_modal_density',
 'span_has_negation',
 'span_contains_question_mark',
 'prev_turn_has_negation',
 'prev_turn_contains_question_mark',
 'year',
 'gov_opp_status',
 'is_government',
 'gender',
 'question_overlap',
 'span_any_mention_of_role_or_party',
 'prev_turn_any_mention_of_role_or_party']]

# Write the curated table. The target folder is created first so that a fresh
# checkout (where output/feature_engineered/ may not exist yet) does not fail
# inside to_csv.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; pass index=False if downstream readers do not rely on that column.
feature_table_path = Path("output/feature_engineered/df_coalesced_labels_feature_engineered_18_april.csv")
feature_table_path.parent.mkdir(parents=True, exist_ok=True)
df_subset.to_csv(feature_table_path)
df_subset

In [None]:
## Leakage check: do any of the pretraining corpora contain debate units from the gold dataset?
import pandas as pd

# Paths
# NOTE(review): these are absolute paths into one user's home directory, while
# the export above writes to a path relative to the working directory — confirm
# both point at the same folder before running this elsewhere.
fe_path    = "/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/feature_engineered/df_coalesced_labels_feature_engineered_18_april.csv"
all_path   = "/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/for_pretraining/ALL_debates_with_turns_and_anon_18_april_debates_wo_labelledrows.csv"
pld_path   = "/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/for_pretraining/PLD_debates_with_turns_and_anon_18_april_debates_wo_labelledrows.csv"
pldqa_path = "/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/for_pretraining/PLD_QA_debates_with_turns_and_anon_18_april_debates_wo_labelledrows.csv"

# Only the id column is read from each file (faster & less memory)
id_column = ["debate_unit_id"]
fe    = pd.read_csv(fe_path,    usecols=id_column)
all_  = pd.read_csv(all_path,   usecols=id_column)
pld   = pd.read_csv(pld_path,   usecols=id_column)
pldqa = pd.read_csv(pldqa_path, usecols=id_column)

# Distinct ids per file
ids_fe    = set(fe["debate_unit_id"].tolist())
ids_all   = set(all_["debate_unit_id"].tolist())
ids_pld   = set(pld["debate_unit_id"].tolist())
ids_pldqa = set(pldqa["debate_unit_id"].tolist())

# Gold ids that also appear in each pretraining corpus
overlap_all   = ids_fe.intersection(ids_all)
overlap_pld   = ids_fe.intersection(ids_pld)
overlap_pldqa = ids_fe.intersection(ids_pldqa)

# Report
print(f"Total IDs in feature-engineered df: {len(ids_fe)}")
print(f"Overlap with ALL_debates:    {len(overlap_all)}")
print(f"Overlap with PLD_debates:    {len(overlap_pld)}")
print(f"Overlap with PLD_QA_debates: {len(overlap_pldqa)}\n")

# Show up to ten offending ids per corpus, if there are any
for _corpus_name, _overlap in (
    ("ALL_debates", overlap_all),
    ("PLD_debates", overlap_pld),
    ("PLD_QA_debates", overlap_pldqa),
):
    if _overlap:
        print(f"Example overlaps in {_corpus_name}:", list(_overlap)[:10])


Total IDs in feature-engineered df: 198
Overlap with ALL_debates:    0
Overlap with PLD_debates:    0
Overlap with PLD_QA_debates: 0



In [None]:
# This was the code used to ensure the above:

# import pandas as pd 

# # new one
# debates = pd.read_csv("/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/src/data_from_other_nb/coalesced_gold_df_with_turns_and_anon_18_april.csv")

# debates["debate_unit_id"] = debates["debate_unit_id"].astype(int)
# debates = debates.drop(columns=[col for col in debates.columns if col.startswith("Unnamed:")])

# # Remove the rows from this
# df = pd.read_csv("/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/coalesced_gold_aug_w_turns_w_context.csv") # putting the new one

# # Remove labeled rows (e.g. using debate_unit_id or text match)
# if 'debate_unit_id' in debates.columns:
#     debates_ex = debates[~debates['debate_unit_id'].isin(df['debate_unit_id'])]

# This is all the data without the labelled rows
#debates_ex.to_csv("/Users/pbrams/Desktop/AARHUS_UNIVERSITY/kandidat/thesis_work/annotations/output/for_pretraining/ALL_debates_with_turns_and_anon_18_april_debates_wo_labelledrows.csv")

# debates_ex

Unnamed: 0,SessionID,MeetingNumber,Date,Location,AgendaItemNo,AgendaTitle,DebateType,TurnNo,Speaker,Party,...,TurnRole,Time,Utterance,AgendaCategory,MeetingDateID,AgendaTitleDateID,TurnSequence,debate_unit_id,TurnRole_Danish,Utterance_anon
0,20091,4,2009-10-20 13:00:00,Folketingssalen,6,1. behandling af L 27: Om Europol.,reading of bill,6,Dennis Flydtkjær,DF,...,proponent,,Dette lovforslag handler om at gennemføre Råde...,Foreign Affairs,4_2009-10-20 13:00:00,1. behandling af L 27: Om Europol._2009-10-20 ...,0,187,Ordfører,Dette lovforslag handler om at gennemføre Råde...
1,20091,4,2009-10-20 13:00:00,Folketingssalen,6,1. behandling af L 27: Om Europol.,reading of bill,8,Karen Hækkerup,S,...,asker,,"[Lydudfald] … den her frygt, som Parti_F har f...",Foreign Affairs,4_2009-10-20 13:00:00,1. behandling af L 27: Om Europol._2009-10-20 ...,1,187,Spørger,"[Lydudfald] … den her frygt, som Parti_F har f..."
2,20091,4,2009-10-20 13:00:00,Folketingssalen,6,1. behandling af L 27: Om Europol.,reading of bill,10,Dennis Flydtkjær,DF,...,proponent,,"Jeg er ikke overrasket over, at Parti_A er uen...",Foreign Affairs,4_2009-10-20 13:00:00,1. behandling af L 27: Om Europol._2009-10-20 ...,2,187,Ordfører,"Jeg er ikke overrasket over, at Parti_A er uen..."
3,20091,4,2009-10-20 13:00:00,Folketingssalen,6,1. behandling af L 27: Om Europol.,reading of bill,12,Karen Hækkerup,S,...,asker,,Nu samarbejder man jo også med politimyndighed...,Foreign Affairs,4_2009-10-20 13:00:00,1. behandling af L 27: Om Europol._2009-10-20 ...,3,187,Spørger,Nu samarbejder man jo også med politimyndighed...
4,20091,4,2009-10-20 13:00:00,Folketingssalen,6,1. behandling af L 27: Om Europol.,reading of bill,14,Dennis Flydtkjær,DF,...,proponent,,"Jeg kan meddele Spørgeren, at jeg bestemt er e...",Foreign Affairs,4_2009-10-20 13:00:00,1. behandling af L 27: Om Europol._2009-10-20 ...,4,187,Ordfører,"Jeg kan meddele Spørgeren, at jeg bestemt er e..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301200,20121,68,2013-03-13 13:00:00,Folketingssalen,2,Besvarelse af oversendte spørgsmål til ministrene,other,153,Manu Sareen,,...,minister,,"Nu er det god latin herinde, at man forholder ...",Other,68_2013-03-13 13:00:00,Besvarelse af oversendte spørgsmål til ministr...,48,27880,Minister,"Nu er det god latin herinde, at man forholder ..."
301201,20191,88,2020-03-31 16:00:00,Folketingssalen,0,Punkt 0,other,2,MødeSlut MødeSlut,MødeSlut,...,unknown,(Kl. 22:44),Mødet er hævet. .,Other,88_2020-03-31 16:00:00,Punkt 0_2020-03-31 16:00:00,0,81458,Ukendt,Mødet er hævet. .
301202,20191,88,2020-03-31 16:00:00,Folketingssalen,0,Punkt 0,other,4,Pause Pause,Pause,...,unknown,(Kl. 18:30),Mødet er udsat. .,Other,88_2020-03-31 16:00:00,Punkt 0_2020-03-31 16:00:00,1,81458,Ukendt,Mødet er udsat. .
301203,20191,88,2020-03-31 16:00:00,Folketingssalen,0,Punkt 0,other,6,Pause Pause,Pause,...,unknown,(Kl. 20:30),Mødet er udsat. .,Other,88_2020-03-31 16:00:00,Punkt 0_2020-03-31 16:00:00,2,81458,Ukendt,Mødet er udsat. .


In [None]:
# Show the column index of `df`, the frame that was fed into the feature pipeline
df.columns

Index(['debate_unit_id', 'label', 'span', 'quality_status',
       'source_annotators', 'construction_note', 'full_text', 'span_turn',
       'span_speaker', 'prev_turn', 'prev_turn_speaker', 'next_turn',
       'next_turn_speaker', 'METADATA_SessionID', 'METADATA_MeetingNumber',
       'METADATA_Date', 'METADATA_Location', 'METADATA_AgendaItemNo',
       'METADATA_AgendaTitle', 'METADATA_DebateType', 'METADATA_TurnNo',
       'METADATA_Speaker', 'METADATA_Party', 'METADATA_Role',
       'METADATA_TurnRole', 'METADATA_Time', 'METADATA_Utterance',
       'METADATA_AgendaCategory', 'METADATA_MeetingDateID',
       'METADATA_AgendaTitleDateID', 'METADATA_TurnRole_Danish', 'MATCH_FLAG',
       'METADATA_PREV_SessionID', 'METADATA_PREV_MeetingNumber',
       'METADATA_PREV_Date', 'METADATA_PREV_Location',
       'METADATA_PREV_AgendaItemNo', 'METADATA_PREV_AgendaTitle',
       'METADATA_PREV_DebateType', 'METADATA_PREV_TurnNo',
       'METADATA_PREV_Speaker', 'METADATA_PREV_Party', 'METADAT