## 📙 00 Preprocessing

This notebook includes the preparation of Hellenic Parliament Proceedings dataset for analysis. This preprocessing steps follow best practices in political text analysis (Denny & Spirling 2017) to accomodate the specific needs for each downstream analysis of each Research Question.

In [1]:
#data-cleaning-processing-core-libs
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time
tqdm.pandas()
import ipywidgets as widgets
widgets.IntSlider()

#text-processing
import re
import string
import unicodedata

#visualization
import matplotlib.pyplot as plt
import seaborn as sns

#greek-nlp-toolkit-Loukas(2024)
from gr_nlp_toolkit import Pipeline

#greek-stopwords
import stopwordsiso as stopwords_iso
greek_stopwords = list(stopwords_iso.stopwords("el"))

from nltk.corpus import stopwords
greek_stopwords_nltk = stopwords.words("greek")
combined = list(set(greek_stopwords + greek_stopwords_nltk))

  import pkg_resources


In [35]:
df = pd.read_csv('par10_20c.csv')

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536446 entries, 0 to 536445
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   member_name            522524 non-null  object
 1   sitting_date           536446 non-null  object
 2   parliamentary_period   536446 non-null  object
 3   parliamentary_session  536446 non-null  object
 4   parliamentary_sitting  536446 non-null  object
 5   political_party        536327 non-null  object
 6   government             536446 non-null  object
 7   member_region          513121 non-null  object
 8   roles                  522524 non-null  object
 9   member_gender          522524 non-null  object
 10  speech                 536446 non-null  object
dtypes: object(11)
memory usage: 45.0+ MB


### Data handling & Cleaning

In [37]:
columns_to_drop = ["parliamentary_session", "parliamentary_sitting", "parliamentary_period","member_region"]
df = df.drop(columns=columns_to_drop)

In [38]:
missing_percent = df.isnull().mean().sort_values(ascending=False) * 100
print(missing_percent.round(2))

member_name        2.60
roles              2.60
member_gender      2.60
political_party    0.02
sitting_date       0.00
government         0.00
speech             0.00
dtype: float64


In [39]:
df = df.dropna(subset=["member_name", "roles", "member_gender", "political_party"])

In [40]:
df['sitting_date'] = pd.to_datetime(df['sitting_date'], errors='coerce')

In [41]:
df['year'] = df['sitting_date'].dt.year

In [42]:
df['political_party'].unique()

array(['πανελληνιο σοσιαλιστικο κινημα', 'νεα δημοκρατια',
       'εξωκοινοβουλευτικός', 'κομμουνιστικο κομμα ελλαδας',
       'λαικος ορθοδοξος συναγερμος',
       'συνασπισμος ριζοσπαστικης αριστερας',
       'ανεξαρτητοι (εκτος κομματος)', 'δημοκρατικη αριστερα',
       'ανεξαρτητοι ελληνες - πανος καμμενος',
       'λαικος συνδεσμος - χρυση αυγη',
       'ανεξαρτητοι δημοκρατικοι βουλευτες', 'το ποταμι',
       'ανεξαρτητοι ελληνες εθνικη πατριωτικη δημοκρατικη συμμαχια',
       'λαικη ενοτητα',
       'δημοκρατικη συμπαραταξη (πανελληνιο σοσιαλιστικο κινημα - δημοκρατικη αριστερα)',
       'ενωση κεντρωων', 'κινημα αλλαγης',
       'ελληνικη λυση - κυριακος βελοπουλος',
       'μετωπο ευρωπαικης ρεαλιστικης ανυπακοης (μερα25)'], dtype=object)

In [43]:
df['government'].unique()

array(["['παπανδρεου α. γεωργιου(06/10/2009-11/11/2011)']",
       "['παπαδημου λουκα δ.(11/11/2011-17/05/2012)']",
       "['πικραμμενου παναγιωτη οθ. (υπηρεσιακη)(17/05/2012-21/06/2012)']",
       "['σαμαρα κ. αντωνιου(21/06/2012-26/01/2015)']",
       "['τσιπρα π. αλεξιου(26/01/2015-27/08/2015)']",
       "['τσιπρα π. αλεξιου(21/09/2015-08/07/2019)']",
       "['μητσοτακη κυριακου(08/07/2019-28/07/2020)']"], dtype=object)

In [44]:
# Define the government coalition mapping (PM name substring, years, and parties)
gov_coalitions = [
    {
        'pm': 'παπανδρεου α. γεωργιου',
        'years': [2009, 2010, 2011],
        'parties': ['πανελληνιο σοσιαλιστικο κινημα']
    },
    {
        'pm': 'παπαδημου λουκα δ.',
        'years': [2011, 2012],
        'parties': ['πανελληνιο σοσιαλιστικο κινημα', 'νεα δημοκρατια', 'λαικος ορθοδοξος συναγερμος']
    },
    {
        'pm': 'σαμαρα κ. αντωνιου',
        'years': [2012, 2013, 2014, 2015],
        'parties': ['πανελληνιο σοσιαλιστικο κινημα', 'νεα δημοκρατια']
    },
    {
        'pm': 'τσιπρα π. αλεξιου',
        'years': [2015, 2016, 2017, 2018, 2019],
        'parties': ['συνασπισμος ριζοσπαστικης αριστερας', 'ανεξαρτητοι ελληνες εθνικη πατριωτικη δημοκρατικη συμμαχια']
    }
]

In [45]:
# Function to check if a speaker was in government
def is_government(row):
    gov_name = row['government'].lower()
    party = row['political_party'].lower()
    year = row['year']

    for coalition in gov_coalitions:
        if (coalition['pm'] in gov_name) and (year in coalition['years']) and (party in coalition['parties']):
            return 1
    return 0

In [46]:
df['is_government'] = df.apply(is_government, axis=1)

In [47]:
gov_titles = [
    'πρωθυπουργος',
    'αντιπροεδρος της κυβερνησης',
    'αναπληρωτης υπουργος',
    'υπουργος',
    'υφυπουργος'
]


In [48]:
def extract_speaker_role(role_text):
    role_text = str(role_text).lower()
    for title in gov_titles:
        if title in role_text:
            return title
    return None

In [49]:
df['speaker_gov_role'] = df['roles'].apply(extract_speaker_role)

In [50]:
leadership_titles = [
    'αρχηγος κομματος',
    'αρχηγος αξιωματικης αντιπολιτευσης'
]

In [51]:
def extract_leadership_role(role_text):
    role_text = str(role_text).lower()
    for title in leadership_titles:
        if title in role_text:
            return title
    return None

In [52]:
df['leadership_role'] = df['roles'].apply(extract_leadership_role)

In [53]:
df = df[~df['roles'].str.contains('αντιπροεδρος βουλης|προεδρος βουλης|αντιπροεδρος', case=False, na=False)]

<b style="color:red;">Checkpoint-Saved</b>

In [54]:
df.to_csv("processed01_par10-20.csv", index=False)

### Preprocessing Speech Text Pipeline

In [11]:
p_df = pd.read_csv('processed01_par10-20.csv')

In [12]:
p_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341805 entries, 0 to 341804
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Unnamed: 0        341805 non-null  int64 
 1   member_name       341805 non-null  object
 2   sitting_date      341805 non-null  object
 3   political_party   341805 non-null  object
 4   government        341805 non-null  object
 5   roles             341805 non-null  object
 6   member_gender     341805 non-null  object
 7   speech            341805 non-null  object
 8   year              341805 non-null  int64 
 9   is_government     341805 non-null  int64 
 10  speaker_gov_role  60473 non-null   object
 11  leadership_role   10689 non-null   object
 12  speech_clean      341668 non-null  object
dtypes: int64(3), object(10)
memory usage: 33.9+ MB


In [13]:
p_df.head()

Unnamed: 0.1,Unnamed: 0,member_name,sitting_date,political_party,government,roles,member_gender,speech,year,is_government,speaker_gov_role,leadership_role,speech_clean
0,0,τσιαρας αλεξανδρου κωνσταντινος,2010-01-11,νεα δημοκρατια,['παπανδρεου α. γεωργιου(06/10/2009-11/11/2011)'],['βουλευτης'],male,"Σας ευχαριστώ πολύ, κύριε Πρόεδρε. Κυρίες και...",2010,0,,,"σας πολυ, . και , μιας και ειναι η πρωτη μερα ..."
1,1,ζωης κωνσταντινου χρηστος,2010-01-11,νεα δημοκρατια,['παπανδρεου α. γεωργιου(06/10/2009-11/11/2011)'],['βουλευτης'],male,"Ευχαριστώ, κύριε Πρόεδρε.Επιτρέψτε μου κι εμέ...",2010,0,,,", .επιτρεψτε μου κι εμενα, πριν απ’ ολα, να απ..."
2,2,ζωης κωνσταντινου χρηστος,2010-01-11,νεα δημοκρατια,['παπανδρεου α. γεωργιου(06/10/2009-11/11/2011)'],['βουλευτης'],male,Εσείς δυστυχώς πρέπει να αναθεωρήσετε τις θέσ...,2010,0,,,εσεις δυστυχως πρεπει να αναθεωρησετε τις θεσε...
3,3,ταλιαδουρος αθανασιου σπυριδων,2010-01-11,νεα δημοκρατια,['παπανδρεου α. γεωργιου(06/10/2009-11/11/2011)'],['βουλευτης'],male,"Κύριε Πρόεδρε, όπως επισημάνθηκε και από τους...",2010,0,,,", οπως επισημανθηκε και απο τους συναδελφους μ..."
4,4,χαρακοπουλος παντελη μαξιμος,2010-01-11,νεα δημοκρατια,['παπανδρεου α. γεωργιου(06/10/2009-11/11/2011)'],['βουλευτης'],male,"Ευχαριστώ πολύ, κύριε Πρόεδρε.Κυρίες και κύρι...",2010,0,,,"πολυ, . και , η ερωτηση που καταθεσαμε οι βουλ..."


In [11]:
#removing ntonation/diacritics (accent marks) used in written greek
def remove_greek_diacritics(text):
    text = str(text)
    text = unicodedata.normalize("NFD", text)
    text = ''.join([char for char in text if not unicodedata.combining(char)])
    return text

In [14]:
def basic_cleaning(text):
    # Remove Greek diacritics
    text = remove_greek_diacritics(text)
    
    # Lowercase
    text = text.lower()
    
    # Remove formal expressions and honorifics
    patterns_to_remove = [
        r'κυριε\s+(υπουργε|υφυπουργε|προεδρε|αντιπροεδρε)',
        r'\bκ\.\s*',
        r'\bευχαριστω\b',
        r'\bαγαπητοι συναδελφοι\b',
        r'\bκυριοι συναδελφοι\b',
        r'\bκυριες\b'
    ]
    for pattern in patterns_to_remove:
        text = re.sub(pattern, '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [8]:
#stopwords_removal-function
def stopwords_removal(text):
    #remove stopwords
    words = text.split()
    words = [word for word in words if word not in greek_stopwords]
    return ' '.join(words)

In [17]:
p_df['speech_clean'] = p_df['speech'].progress_apply(basic_cleaning)

  0%|          | 0/341805 [00:00<?, ?it/s]

<b style="color:red;">Checkpoint-Saved</b>

In [18]:
p_df.to_csv('processed01_par10-20.csv')

## 📒01 Analysis RQ1 - 6 Sentiment Lexicon (Dritsa, 2018)
✅ diacritics removal\
✅ formal phrases and honorifics\
✅ remove extra white space\
✅ tokenize speech\
✅ lemmatize speech\
❌ keep stop-words because included in Drista 2018 6 sentiments lexicon

In [2]:
from collections import defaultdict
import jellyfish

In [3]:
import spacy

In [4]:
rq1_df = pd.read_csv('processed01_par10-20.csv',index_col=0)

In [5]:
rq1_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 341805 entries, 0 to 341804
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Unnamed: 0        341805 non-null  int64 
 1   member_name       341805 non-null  object
 2   sitting_date      341805 non-null  object
 3   political_party   341805 non-null  object
 4   government        341805 non-null  object
 5   roles             341805 non-null  object
 6   member_gender     341805 non-null  object
 7   speech            341805 non-null  object
 8   year              341805 non-null  int64 
 9   is_government     341805 non-null  int64 
 10  speaker_gov_role  60473 non-null   object
 11  leadership_role   10689 non-null   object
 12  speech_clean      338501 non-null  object
dtypes: int64(3), object(10)
memory usage: 36.5+ MB


In [59]:
nlp = spacy.load("el_core_news_sm")

def tokenize_and_lemmatize(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_space]

In [76]:
# Load lexicon into dictionary
lexicon_df = pd.read_csv("out_lexicon_6sent.csv")

In [77]:
lexicon_df.head()

Unnamed: 0,term,anger,disgust,fear,happiness,sadness,surprise
0,αβαφτιστος,4.0,4.5,1.0,1.0,2.5,4.5
1,Χριστος,4.5,3.75,4.25,4.0,4.0,4.5
2,α,3.75,4.0,4.0,4.0,3.75,4.75
3,αβαπτιστος,4.0,4.5,1.0,1.0,2.5,4.5
4,αβεβαιοτητα,1.0,1.0,2.5,1.0,1.5,1.0


In [78]:
lexicon = {
    row['term'].strip(): [
        float(row['anger']),
        float(row['disgust']),
        float(row['fear']),
        float(row['happiness']),
        float(row['sadness']),
        float(row['surprise'])
    ]
    for _, row in lexicon_df.iterrows()
}

In [29]:
def sent6_vec(text, lexicon, tokenize_fn):
    word_vecs = []

    for word in tokenize_fn(text):
        if word in lexicon:
            word_vecs.append(lexicon[word])

    if not word_vecs:
        return [0] * 6  
    
    word_vecs = np.array(word_vecs)

    rms = np.sqrt(np.mean(np.square(word_vecs), axis=0))
    return [round(v, 3) for v in rms]

<b style="color:blue;">Test to Sample - Started</b>

In [80]:
sample_df = rq1_df.sample(n=1000, random_state=32).copy()

In [79]:
print("αγαπημενη" in lexicon)
print(lexicon.get("αγαπημενη"))

True
[1.0, 1.0, 1.0, 4.75, 1.0, 2.75]


In [44]:
sample_df['speech_clean'] = sample_df['speech_clean'].fillna('').astype(str)

In [82]:
sample_df['sent6_vec'] = sample_df['speech_clean'].progress_apply(
    lambda text: sent6_vec(text, lexicon, tokenize_and_lemmatize)
)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [83]:
sample_df[['member_name', 'political_party', 'year', 'roles', 'is_government', 'sent6_vec']].head()


Unnamed: 0,member_name,political_party,year,roles,is_government,sent6_vec
318719,αναγνωστοπουλου πετρου αθανασια (σια),συνασπισμος ριζοσπαστικης αριστερας,2019,['αναπληρωτης υπουργος εξωτερικων(15/02/2019-0...,0,"[3.142, 2.915, 2.016, 1.581, 1.0, 3.142]"
126402,κωνσταντοπουλου ν. ζωη,συνασπισμος ριζοσπαστικης αριστερας,2014,['βουλευτης'],0,"[0, 0, 0, 0, 0, 0]"
321719,μελας παναγιωτη ιωαννης,νεα δημοκρατια,2019,['βουλευτης'],0,"[0, 0, 0, 0, 0, 0]"
257082,βαρδακης δημητριου σωκρατης,συνασπισμος ριζοσπαστικης αριστερας,2017,['βουλευτης'],1,"[2.63, 2.517, 1.744, 2.457, 1.0, 3.131]"
65452,χρυσοχοιδης βασιλειου μιχαηλ,πανελληνιο σοσιαλιστικο κινημα,2012,['υπουργος αναπτυξης ανταγωνιστικοτητας και να...,1,"[3.651, 3.391, 2.363, 1.969, 1.225, 3.582]"


In [84]:
sentiment_cols = ['anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']
sentiment_df = pd.DataFrame(sample_df['sent6_vec'].tolist(), columns=sentiment_cols)

In [85]:
sample_df_sent = pd.concat([sample_df, sentiment_df], axis=1)

In [86]:
sample_df_sent[sentiment_cols].describe()

Unnamed: 0,anger,disgust,fear,happiness,sadness,surprise
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.769585,1.588783,1.115906,1.067624,0.714237,1.915119
std,1.644246,1.50585,1.040254,0.995051,0.668034,1.72734
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,1.5,1.0,1.0,1.0,2.5
75%,3.261,3.026,2.109,1.969,1.145,3.5
max,4.33,4.0,4.25,4.5,3.5,4.33


<b style="color:blue;">Test to Sample - END</b>

In [90]:
rq1_df['sent6_vec'] = rq1_df['speech_clean'].progress_apply(
    lambda text: sent6_vec(text, lexicon, tokenize_and_lemmatize)
)

  0%|          | 0/341805 [00:00<?, ?it/s]

In [91]:
sentiment_cols = ['anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']
sentiment_redf = pd.DataFrame(rq1_df['sent6_vec'].tolist(), columns=sentiment_cols)

In [92]:
meta_cols = ['member_name', 'political_party', 'year', 'is_government', 'roles']
meta_df = rq1_df[meta_cols].reset_index(drop=True)

In [93]:
rq1_results_df = pd.concat([meta_df, sentiment_redf], axis=1)

<b style="color:red;">Checkpoint-Saved</b>

In [94]:
rq1_results_df.to_csv('RQ1_results_df.csv')

In [99]:
rq1_results_df.head()

Unnamed: 0,member_name,political_party,year,is_government,roles,anger,disgust,fear,happiness,sadness,surprise
0,τσιαρας αλεξανδρου κωνσταντινος,νεα δημοκρατια,2010,0,['βουλευτης'],3.394,3.118,2.192,2.021,1.084,3.512
1,ζωης κωνσταντινου χρηστος,νεα δημοκρατια,2010,0,['βουλευτης'],3.275,3.1,2.085,1.74,1.067,3.413
2,ζωης κωνσταντινου χρηστος,νεα δημοκρατια,2010,0,['βουλευτης'],3.913,3.522,2.366,1.803,1.173,3.965
3,ταλιαδουρος αθανασιου σπυριδων,νεα δημοκρατια,2010,0,['βουλευτης'],3.386,3.188,2.176,2.101,1.101,3.674
4,χαρακοπουλος παντελη μαξιμος,νεα δημοκρατια,2010,0,['βουλευτης'],2.779,2.68,2.08,1.857,1.28,3.037


## 📒02 Analysis RQ2 - SMOG Index Greek Dritsa, (2018)
✅ diacritics removal\
✅ formal phrases and honorifics\
✅ remove extra white space

In [6]:
rq2_df = pd.read_csv('processed01_par10-20.csv',index_col=0)

In [7]:
rq2_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 341805 entries, 0 to 341804
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Unnamed: 0        341805 non-null  int64 
 1   member_name       341805 non-null  object
 2   sitting_date      341805 non-null  object
 3   political_party   341805 non-null  object
 4   government        341805 non-null  object
 5   roles             341805 non-null  object
 6   member_gender     341805 non-null  object
 7   speech            341805 non-null  object
 8   year              341805 non-null  int64 
 9   is_government     341805 non-null  int64 
 10  speaker_gov_role  60473 non-null   object
 11  leadership_role   10689 non-null   object
 12  speech_clean      338501 non-null  object
dtypes: int64(3), object(10)
memory usage: 36.5+ MB


In [9]:
rq2_df['speech_clean'][100]

'με συγχωρειτε πολυ η ελληνικη κυβερνηση ειναι αυτη η οποια το απεσυρε απο τα προγραμματα χρηματοδοτησεως της ευρωπαικης ενωσης για ενα και μονο λογο για να μην αλλαξει ο ρολος και ο στοχος του εργου γιατι ακριβως αν αλλαζε ο στοχος του εργου τοτε θα επρεπε να καλυφθει απο τις οδηγιες οι οποιες ηταν μεταγενεστερες'

In [10]:
rq2_df = rq2_df.drop(columns=['Unnamed: 0', 'speech_clean'])

In [10]:
def prepare_text_for_smog(text):
    if not isinstance(text, str):
        return None

    #Remove Greek diacritics
    text = remove_greek_diacritics(text)

    #Normalize to lowercase
    text = text.lower()

    #Keep sentence punctuation: periods (.), question marks (?), exclamation (!)
    #Replace them all with periods for SMOG sentence splitting
    text = text.replace('!', '.').replace('?', '.')

    #Remove formal/honorific expressions
    patterns_to_remove = [
        r'κυριε\s+(υπουργε|υφυπουργε|προεδρε|αντιπροεδρε)',
        r'\bκ\.\s*', 
        r'\bευχαριστω\b',
        r'\bαγαπητοι συναδελφοι\b',
        r'\bκυριοι συναδελφοι\b',
        r'\bκυριες\b'
    ]
    for pattern in patterns_to_remove:
        text = re.sub(pattern, '', text)

    #Convert digits to text using invoice library (e.g., "2023" → "δυο χιλιαδες είκοσι τρία")
    def normalize_numbers(match):
        try:
            return invoice.num_to_text(int(match.group()))
        except:
            return match.group()

    text = re.sub(r'\d+', normalize_numbers, text)

    #Remove unwanted symbols (keep Greek letters, spaces, periods)
    text = re.sub(r'[^α-ωΑ-Ωά-ώΆ-Ώ0-9. ]+', ' ', text)

    #Collapse multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [13]:
rq2_df['speech_for_smog'] = rq2_df['speech'].progress_apply(prepare_text_for_smog)

  0%|          | 0/341805 [00:00<?, ?it/s]

<b style="color:red;">Checkpoint-Saved</b>

In [15]:
rq2_df.to_csv('processed0rq2_par10-20.csv.csv', index=False)

In [2]:
rq2_df = pd.read_csv('processed0rq2_par10-20.csv.csv',index_col=0)

In [4]:
!pip install greek-accentuation invoice nltk



In [5]:
import math
import spacy
from greek_accentuation.syllabify import syllabify
import invoice

In [6]:
nlp = spacy.load("el_core_news_sm")

In [7]:
def compute_smog(text):
    if not isinstance(text, str):
        return None

    #Greek sentence tokenizer
    sentences = [sent.text for sent in nlp(text).sents]

    #SMOG needs at least 60 sentences to sample 30 | Dritsa 2018 followed
    if len(sentences) < 60:
        return None

    #get 20 middle sentences from a chunk
    def middle_20(sent_list):
        center = len(sent_list) // 2
        return sent_list[center - 10 : center + 10]

    #sample 10 from each third of the speech
    part1 = middle_20(sentences[:len(sentences)//3])
    part2 = middle_20(sentences[len(sentences)//3:2*len(sentences)//3])
    part3 = middle_20(sentences[2*len(sentences)//3:])
    sample_sentences = part1 + part2 + part3

    polysyllables = 0

    for sentence in sample_sentences:
        words = sentence.split()  # token-level accuracy is not critical for SMOG
        for word in words:
            if word.isalpha():
                try:
                    if len(syllabify(word)) > 2:
                        polysyllables += 1
                except:
                    continue

    # SMOG formula (based on 30-sentence sample)
    smog = 1.0430 * math.sqrt((polysyllables * 30) / len(sample_sentences)) + 3.1291
    return round(smog, 2)

<b style="color:blue;">Test to Sample - Started</b>

In [15]:
sample_df = rq2_df[rq2_df['speech'].notna()].sample(1000, random_state=42).copy()

In [16]:
sample_df['smog_score'] = sample_df['speech_for_smog'].apply(compute_smog)

In [17]:
sample_df['smog_score'].describe()

count    19.000000
mean     17.904737
std       1.535442
min      14.120000
25%      17.110000
50%      17.470000
75%      19.215000
max      20.570000
Name: smog_score, dtype: float64

<b style="color:blue;">Test to Sample - END</b>

In [18]:
rq2_df['smog_score'] = rq2_df['speech_for_smog'].progress_apply(compute_smog)

  0%|          | 0/341805 [00:00<?, ?it/s]

In [19]:
columns_to_keep = [
    'member_name', 'year', 'political_party', 'is_government',
    'speaker_gov_role', 'leadership_role', 'roles', 'smog_score'
]
rq2_results_df = rq2_df[columns_to_keep].copy()

<b style="color:red;">Checkpoint-Saved</b>

In [20]:
rq2_results_df.to_csv('RQ2_results_df.csv', index=False)

In [22]:
rq2_results_df['smog_score'].describe()

count    8347.000000
mean       18.787255
std         2.469414
min        12.040000
25%        17.020000
50%        18.620000
75%        20.410000
max        31.490000
Name: smog_score, dtype: float64

## 📒03 Analysis RQ3 - Semantic Drift - Greek-BERT