In [8]:
!pip install -r requirements.txt




In [9]:
import chardet
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from collections import Counter
from matplotlib.backends.backend_pdf import PdfPages
import emoji
from pathlib import Path
import warnings
import unicodedata

from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.util import ngrams

nltk.download('stopwords')
warnings.filterwarnings('ignore')
nltk.download('punkt', quiet=True)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

<font size = 5><b><u>Preprocessing</u></b></font>

In [15]:



def clean_text(s):
    """Clean a single text entry."""
    if not isinstance(s, str):
        return ""
    # fix encoding weirdness and accents
    s = unicodedata.normalize("NFKC", s)
    # lowercase
    s = s.lower()
    # remove URLs
    s = re.sub(r"http\S+|www\.\S+", " ", s)
    # remove mentions and hashtags
    s = re.sub(r"@\w+|#\w+", " ", s)
    # keep letters (English + Spanish + German)
    s = re.sub(r"[^a-zA-ZáéíóúñüäößÄÖÜ ]", " ", s)
    # collapse multiple spaces
    s = re.sub(r"\s+", " ", s).strip()
    s = emoji.replace_emoji(s, replace='')
    return s

def preprocess_file(input_path, output_path):
    """Load, clean, and save CSV."""
    print(f"\nProcessing {input_path}...")
    df = pd.read_csv(input_path, encoding="utf-8")

    # detect text column
    possible = [c for c in df.columns if c.lower() in ("text")]
    text_col = possible[0] if possible else df.columns[0]

    # keep text + label if available
    cols = ['id',text_col]
    if 'polarization' in df.columns:
        cols.append('polarization')
    df = df[[c for c in cols if c in df.columns]]

    # clean text
    df['text'] = df[text_col].apply(clean_text)

    # drop blanks + duplicates
    df = df[df['text'].str.strip() != ""]
    df = df.drop_duplicates(subset=['text'])

    # save cleaned file
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"Saved cleaned file: {output_path} ({len(df)} rows)")

# run for all three
files = [
    ("data/eng.csv", "data/eng_clean.csv"),
    ("data/deu.csv", "data/deu_clean.csv"),
    ("data/spa.csv", "data/spa_clean.csv")
]

for inp, out in files:
    preprocess_file(inp, out)



Processing data/eng.csv...
Saved cleaned file: data/eng_clean.csv (2671 rows)

Processing data/deu.csv...
Saved cleaned file: data/deu_clean.csv (3179 rows)

Processing data/spa.csv...
Saved cleaned file: data/spa_clean.csv (3302 rows)


<font size = 5>data_loader.py</font>

In [35]:

df_eng_train = pd.read_csv("data/eng_clean.csv", encoding="utf-8-sig")
df_eng_train = df_eng_train[['text', 'polarization']]
df_eng_test= pd.read_csv("data/eng_test.csv", encoding="utf-8-sig")
df_eng_test = df_eng_test[['text', 'polarization']]

df_deu_train = pd.read_csv("data/deu_clean.csv", encoding="utf-8-sig")
df_deu_train = df_deu_train[['text', 'polarization']]
df_deu_test= pd.read_csv("data/deu_test.csv", encoding="utf-8-sig")
df_deu_test = df_deu_test[['text', 'polarization']]

df_spa_train = pd.read_csv("data/spa_clean.csv", encoding="utf-8-sig")
df_spa_train = df_spa_train[['text', 'polarization']]
df_spa_test= pd.read_csv("data/spa_test.csv", encoding="utf-8-sig")
df_spa_test = df_spa_test[['text', 'polarization']]
print(df_eng_train.head())
print(df_deu_train.head())
print(df_spa_train.head())

                                                text  polarization
0           is defending imperialism in the dnd chat             0
1  still playing with this i am now following rac...             0
2  senate gov theres groups out there republicans...             0
3  abc md david anderson said the additional fund...             0
4  bad people i have some conservative values so ...             0
                                                text  polarization
0    natürlich bin ich linksgrün ich habe herz u n d             0
1  schuld sind habeck die grünen und diese linksg...             1
2  vielleicht ist da ja tatsächlich was dran höch...             1
3  so noch schnell alle linksgrün versifften demo...             1
4  ich drücke der störchin die daumen berlin ist ...             1
                                                text  polarization
0  bueno tirando y si hay repregunta entonces pal...             0
1                  caimos en su retorica de indigena          

In [40]:
import pandas as pd
import pickle
import re
from nltk.util import ngrams
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# ====== Load your cleaned data ======
df_eng = pd.read_csv("data/eng_clean.csv", encoding="utf-8-sig")[['text', 'polarization']]
df_deu = pd.read_csv("data/deu_clean.csv", encoding="utf-8-sig")[['text', 'polarization']]
df_spa = pd.read_csv("data/spa_clean.csv", encoding="utf-8-sig")[['text', 'polarization']]

# ====== Split into train/test ======
df_eng_train, df_eng_test = train_test_split(df_eng, test_size=0.2, random_state=42, stratify=df_eng['polarization'])
df_deu_train, df_deu_test = train_test_split(df_deu, test_size=0.2, random_state=42, stratify=df_deu['polarization'])
df_spa_train, df_spa_test = train_test_split(df_spa, test_size=0.2, random_state=42, stratify=df_spa['polarization'])

# Optional: save these splits for future use
df_eng_train.to_csv("data/eng_train.csv", index=False)
df_eng_test.to_csv("data/eng_test.csv", index=False)
df_deu_train.to_csv("data/deu_train.csv", index=False)
df_deu_test.to_csv("data/deu_test.csv", index=False)
df_spa_train.to_csv("data/spa_train.csv", index=False)
df_spa_test.to_csv("data/spa_test.csv", index=False)

# ====== Function to generate vocab ======
def get_combined_vocab(texts, lang='english'):
    stop_words = set(stopwords.words(lang))
    if lang != 'english':
        stop_words.update(stopwords.words('english'))

    tokens_list = []
    for txt in texts:
        tokens = re.findall(r'\b\w+\b', str(txt).lower())
        tokens = [t for t in tokens if t not in stop_words and len(t) > 2 and t.isalpha()]
        tokens_list.extend(tokens)

    vocab_set = set()
    # unigrams
    for token in tokens_list:
        vocab_set.add(token)
    # bigrams and trigrams
    for n in [2, 3]:
        for gram in ngrams(tokens_list, n):
            if len(set(gram)) > 1:
                vocab_set.add(' '.join(gram))
    return list(vocab_set)

# ====== Generate vocab per class and save ======
datasets = {
    "english": df_eng_train,
    "german": df_deu_train,
    "spanish": df_spa_train
}

for lang, df in datasets.items():
    vocab_dict = {}
    for label, label_name in [(0, 'nonpolarized'), (1, 'polarized')]:
        texts = df[df['polarization'] == label]['text']
        vocab_dict[label_name] = get_combined_vocab(texts, lang=lang)

    # save vocab for this language
    with open(f"{lang}_vocab.pkl", "wb") as f:
        pickle.dump(vocab_dict, f)

    print(f"✅ Saved {lang.capitalize()} vocabulary (unigrams+bigrams+trigrams) by class as {lang}_vocab.pkl")


✅ Saved English vocabulary (unigrams+bigrams+trigrams) by class as english_vocab.pkl
✅ Saved German vocabulary (unigrams+bigrams+trigrams) by class as german_vocab.pkl
✅ Saved Spanish vocabulary (unigrams+bigrams+trigrams) by class as spanish_vocab.pkl


In [36]:
import pickle
with open("english_vocab.pkl", "rb") as f:
    english_vocab = pickle.load(f)

print(english_vocab['polarized'][:10])  # first 10 ngrams for polarized class
print(english_vocab['nonpolarized'][:10])


['breathes fox', 'essence trumpism', 'gaza israel woke', 'crisis', 'thwarting people', 'thats right', 'canadian literature cant', 'fever', 'cold', 'states screwed emboldened']
['think middle east', 'gallego announce', 'crisis', 'saw mike', 'putins relationship', 'husband goes', 'even bill clinton', 'ceasefire deal', 'dunno tell', 'immigration aiming']


In [41]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# ====== Load vocabularies ======
with open("english_vocab.pkl", "rb") as f:
    eng_vocab = pickle.load(f)
with open("german_vocab.pkl", "rb") as f:
    deu_vocab = pickle.load(f)
with open("spanish_vocab.pkl", "rb") as f:
    spa_vocab = pickle.load(f)
eng_vocab = list(set(eng_vocab['polarized'] + eng_vocab['nonpolarized']))
deu_vocab =list(set( deu_vocab['polarized'] + deu_vocab['nonpolarized']))
spa_vocab =list(set( spa_vocab['polarized'] + spa_vocab['nonpolarized']))
# ====== Function to train and evaluate TF-IDF + Logistic Regression ======
def train_eval(df_train, df_test, vocab, lang_name):
    print(f"\n===== {lang_name.upper()} =====")

    # TF-IDF vectorizer using your saved vocabulary
    tfidf = TfidfVectorizer(vocabulary=vocab)

    # Fit & transform train set
    X_train = tfidf.fit_transform(df_train['text'])
    y_train = df_train['polarization']

    # Transform test set
    X_test = tfidf.transform(df_test['text'])
    y_test = df_test['polarization']

    # Logistic Regression classifier
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)

    # Predictions
    y_pred = clf.predict(X_test)

    # Evaluation
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    return tfidf, clf

# ====== Train & evaluate for each language ======
tfidf_eng, clf_eng = train_eval(df_eng_train, df_eng_test, eng_vocab, "English")
tfidf_deu, clf_deu = train_eval(df_deu_train, df_deu_test, deu_vocab, "German")
tfidf_spa, clf_spa = train_eval(df_spa_train, df_spa_test, spa_vocab, "Spanish")



===== ENGLISH =====
Accuracy: 0.7102803738317757

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.88      0.79       334
           1       0.69      0.42      0.52       201

    accuracy                           0.71       535
   macro avg       0.70      0.65      0.66       535
weighted avg       0.71      0.71      0.69       535


===== GERMAN =====
Accuracy: 0.6729559748427673

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.70      0.69       334
           1       0.66      0.64      0.65       302

    accuracy                           0.67       636
   macro avg       0.67      0.67      0.67       636
weighted avg       0.67      0.67      0.67       636


===== SPANISH =====
Accuracy: 0.6656580937972768

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.71      0.68       329
           1 