In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Stop-word corpus consumed by preprocess() via the `stop_words` set below.
nltk.download('stopwords')

# Raw event table; the free-text `notes` column is what gets cleaned and modelled.
file_name = '../data/filtered_events.csv'
df = pd.read_csv(file_name, sep=',')

# WordNet data backs the lemmatizer used inside preprocess().
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# A set makes the per-token "is this a stop word?" test a hash lookup.
stop_words = set(stopwords.words('english'))
def preprocess(text):
    """Normalise one free-text note into a space-separated string of lemmas.

    Steps: lower-case the text, replace every run of non-word characters with
    a single space, split on whitespace, drop tokens found in ``stop_words``,
    lemmatize the remaining tokens with ``lemmatizer`` and re-join them.

    Parameters
    ----------
    text : object
        The note to clean. Non-string values are stringified first; missing
        values (``None`` or a float NaN) yield an empty string.

    Returns
    -------
    str
        The cleaned note, or ``''`` when nothing is left (or the input was
        missing).
    """
    # Without this guard str(NaN) / str(None) would become the literal tokens
    # "nan" / "none", which are not stop words and would be kept as real text.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'\W+', ' ', str(text).lower())
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)
# Leading prefix to strip from each note: an optional "[", one optional
# arbitrary character, four digits, then an optional ",", "." or ":" followed
# by a whitespace character.
# NOTE(review): presumably this removes a leading year/date tag - confirm
# against the raw data.
regex_pattern = r"^(?:\[)?.?\d{4}(?:[,.:]?\s)?"

# Item assignment (rather than attribute assignment) is used for every column
# write so that existing and new columns are handled the same way.
df['notes'] = df['notes'].str.replace(regex_pattern, '', regex=True).str.strip()
df['clean_notes'] = df['notes'].apply(preprocess)

# Drop notes that are empty after cleaning. The explicit .copy() makes `df` an
# independent frame, so the columns added to it further down do not raise
# SettingWithCopyWarning on a filtered slice.
df = df[df['clean_notes'].str.strip() != ''].copy()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\semvv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\semvv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Bag-of-words counts over the cleaned notes; scikit-learn's built-in English
# stop-word list is applied on top of the NLTK filtering done in preprocess().
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.9,
)
X = vectorizer.fit_transform(df['clean_notes'])

In [3]:
from sklearn.decomposition import LatentDirichletAllocation

# 12-topic LDA on the document-term counts; the fixed random_state keeps the
# fitted topics reproducible between runs.
lda = LatentDirichletAllocation(
    random_state=42,
    n_components=12,
)
lda.fit(X)

In [4]:
def print_topics(model, vectorizer, top_n=11):
    """Print the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``; each row is iterated as
        one topic and must support ``argsort()``.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()``, indexable by
        the positions returned from ``argsort()``.
    top_n : int, default 11
        Number of terms to show per topic. Values larger than the vocabulary
        show every term; zero or negative values show none.

    Notes
    -----
    Topic headers are 1-based (``Topic #1`` is row 0 of ``components_``).
    Terms are printed in ascending weight order, so the strongest term of a
    topic is the last word on its line.
    """
    words = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        ranked = topic.argsort()
        # Clamp the count to [0, len(ranked)]. A bare ``ranked[-top_n:]`` would
        # return the entire vocabulary for top_n == 0, because -0 == 0.
        keep = max(0, min(top_n, len(ranked)))
        top_indices = ranked[len(ranked) - keep:]
        print(f"Topic #{idx + 1}:")
        print(" ".join([words[i] for i in top_indices]))
        print()

# Show the default number of top terms for each fitted LDA topic.
print_topics(model=lda, vectorizer=vectorizer)

Topic #1:
environmental palestinian demand palestine 2023 demonstrated protest people action climate activist

Topic #2:
2021 2022 including far protested gathered measure coronavirus right protest people

Topic #3:
citizen 2021 2020 government protest demanding protesting rally activist protested gathered

Topic #4:
prison protester member people force athens arrested group officer demonstrator police

Topic #5:
called 2021 people gathered coronavirus protested government student school protest 2020

Topic #6:
union working better 2023 held condition protested 2022 demand worker protest

Topic #7:
fsu russian 2024 union ukraine 2022 including people cgt demonstrated gathered

Topic #8:
protested 2023 area 2025 resident people outside 2024 protester held protest

Topic #9:
cgt government day protest national event reform pension people police 2023

Topic #10:
price movement eu february demonstration protest event nationwide agricultural 2024 farmer

Topic #11:
demand staged union healt

In [5]:
# Per-document topic weights from the fitted model, one row per note.
topic_distributions = lda.transform(X)

# Label each note with the column index of its largest weight. These ids are
# 0-based, whereas print_topics() numbers its headers from 1.
dominant_topic = topic_distributions.argmax(axis=1)
df['inferred_topic_LDA'] = dominant_topic

In [6]:
# Hand-written keyword table for the rule-based labeller. Insertion order is
# significant: assign_topic() walks the table top to bottom and returns the
# first topic that has a matching keyword.
# NOTE(review): "idf" is listed under both Palestine and Israel; because
# Palestine is checked first, the Israel entry can never decide a label.
# NOTE(review): "protest" under Anti-government matches any note containing
# that substring, so Peace and LGBT rights are only reachable for notes
# without it - confirm this is intended.
topic_keywords = {
    "Palestine": [
        "gaza", "west bank", "hamas", "palestinian", "idf",
    ],
    "Israel": [
        "israel", "netanyahu", "zionist", "idf",
    ],
    "Climate": [
        "climate", "global warming", "carbon", "emissions", "sustainability", "greenhouse",
    ],
    "Consumerism": [
        "consumerism", "shopping", "advertising", "materialism", "brands",
    ],
    "Migration": [
        "migration", "refugee", "immigrant", "asylum", "border",
    ],
    "Labor rights": [
        "labor", "strike", "union", "wage", "workers rights", "employment",
    ],
    "Racism": [
        "racism", "racist", "discrimination", "hate crime", "racial",
    ],
    "Higher education": [
        "university", "college", "student loan", "tuition", "campus",
    ],
    "Corruption": [
        "corruption", "bribery", "embezzlement", "fraud", "scandal",
    ],
    "Anti-government": [
        "protest", "anti-government", "coup", "dictator", "regime",
    ],
    "Peace": [
        "peace", "ceasefire", "nonviolence", "diplomacy", "truce",
    ],
    "LGBT rights": [
        "lgbt", "gay", "trans", "queer", "pride", "same-sex",
    ],
}

# Normalise every keyword to lower case so it can be compared directly with
# the lower-cased note text.
for topic in list(topic_keywords):
    topic_keywords[topic] = list(map(str.lower, topic_keywords[topic]))

def assign_topic(note, keywords_by_topic=None, whole_words=False):
    """Label a note with the first topic whose keyword list matches it.

    Parameters
    ----------
    note : object
        The text to classify; it is stringified and lower-cased before
        matching.
    keywords_by_topic : dict[str, list[str]] or None, default None
        Mapping of topic label to keywords, checked in iteration order.
        ``None`` uses the module-level ``topic_keywords`` table.
    whole_words : bool, default False
        ``False`` keeps plain substring matching, so a keyword also matches
        inside longer words ("trans" hits "transport", "union" hits
        "reunion"). ``True`` requires the keyword not to be directly preceded
        or followed by a word character, which removes those mid-word hits
        but also stops matching inflected forms such as "israeli" for
        "israel".

    Returns
    -------
    str
        The first matching topic label, or ``"Unknown"`` when no keyword
        matches.
    """
    if keywords_by_topic is None:
        keywords_by_topic = topic_keywords
    note_lower = str(note).lower()

    def _hit(keyword):
        # Keywords are lower-cased here so caller-supplied tables do not have
        # to be normalised in advance.
        keyword = keyword.lower()
        if not whole_words:
            return keyword in note_lower
        # Look-arounds instead of \b so keywords that start or end with a
        # non-word character are still handled sensibly.
        pattern = r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"
        return re.search(pattern, note_lower) is not None

    for topic, keywords in keywords_by_topic.items():
        if any(_hit(kw) for kw in keywords):
            return topic
    return "Unknown"

# Rule-based label for every note, computed from the prefix-stripped `notes`
# text rather than from the lemmatised `clean_notes`.
hardcoded_labels = df["notes"].apply(assign_topic)
df["inferred_topic_hardcoded"] = hardcoded_labels

In [7]:
# Persist the notes with both inferred-topic columns; the index is not written.
df.to_csv(
    "../data/notes_with_inferred_topics.csv",
    index=False,
)