# Grok (Synonym replacement augmentation)

In [None]:
# Install required packages
!pip install transformers datasets torch evaluate accelerate nltk optuna scikit-learn iterative-stratification emoji pandarallel sentencepiece huggingface_hub transformers[sentencepiece] pyarrow jsonlines nlpaug

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting pandarallel
  Downloading pandarallel-1.6.5.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70

In [None]:
# Import libraries
import os
import re
import emoji
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import nltk
import optuna
from pandarallel import pandarallel
from collections import Counter

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)

from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    classification_report
)
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

from torch.nn import BCEWithLogitsLoss
from torch.utils.data import Dataset
import nlpaug.augmenter.word as naw

In [None]:
# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    # Exported as a string because environment values must be text.
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

set_seed()

# Initialize parallel processing for pandas
pandarallel.initialize(progress_bar=True, nb_workers=4)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
# Configure paths and constants
# Everything a reader might tune for the transformer run lives in this cell.
DATA_PATH = "."  # Adjust to your data path
SAVE_PATH = "./models"  # Trainer output_dir and parent of the final saved model
LOGS_PATH = "./logs"  # Trainer logging_dir; the confusion-matrix image is written here too
MODEL_NAME = "xlm-roberta-large"  # checkpoint used for the tokenizer and the classifier
MAX_LENGTH = 256  # tokenizer truncation length
BATCH_SIZE = 8  # per-device batch size for both training and evaluation
ACCUMULATION_STEPS = 4  # gradient accumulation steps passed to TrainingArguments
# Earlier epoch settings kept for reference:
# EPOCHS = 20
# EPOCHS = 8  # original note was not in English; presumably "this one does well, 35%" — TODO confirm
EPOCHS = 10
LR = 2e-5
WARMUP_RATIO = 0.1
# WEIGHT_DECAY = 0.01
WEIGHT_DECAY = 0.1

# Create the output directories up front so later cells can write into them.
os.makedirs(SAVE_PATH, exist_ok=True)
os.makedirs(LOGS_PATH, exist_ok=True)

In [None]:
!git clone https://github.com/unlp-workshop/unlp-2025-shared-task.git

Cloning into 'unlp-2025-shared-task'...
remote: Enumerating objects: 90, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 90 (delta 8), reused 10 (delta 5), pack-reused 67 (from 1)[K
Receiving objects: 100% (90/90), 4.60 MiB | 7.47 MiB/s, done.
Resolving deltas: 100% (30/30), done.


In [None]:
%cd unlp-2025-shared-task/data
import os
os.listdir()
%cd span_detection
os.listdir()

/content/unlp-2025-shared-task/data
/content/unlp-2025-shared-task/data/span_detection


['test.csv', 'train.parquet', 'solution.csv']

In [None]:
# Load dataset
print("Loading dataset...")
# Earlier attempt kept for reference. Note that os.path.join discards DATA_PATH
# when the second component is an absolute path such as "/content/train.parquet".
# df = pd.read_parquet(os.path.join(DATA_PATH, "/content/train.parquet"))
import pandas as pd

# Relative path: resolved against the current working directory, which the
# preceding cell changes with %cd.
df = pd.read_parquet("train.parquet")

# Display the first few rows
df.head()


Loading dataset...


Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,"[euphoria, loaded_language]","[[27, 63], [65, 88], [90, 183], [186, 308]]"
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]"
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,"[loaded_language, euphoria]","[[55, 100]]"
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,[loaded_language],"[[114, 144]]"


In [None]:
def convert_techniques(x):
    """Normalize one raw ``techniques`` cell into a plain Python list.

    Args:
        x: The raw cell value. Handled forms are ``None``, a NumPy array,
            a list, or a string holding a JSON array.

    Returns:
        list: The technique labels. An empty list is returned for ``None``,
        for strings that are not valid JSON, for JSON that does not decode
        to a list (for example a bare number or an object), and for any
        other unsupported type.
    """
    if x is None:
        return []
    if isinstance(x, np.ndarray):
        return x.tolist()
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            parsed = json.loads(x)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; catching only it
            # keeps KeyboardInterrupt and real bugs visible.
            return []
        # Callers iterate over the result, so anything but a list is unusable.
        return parsed if isinstance(parsed, list) else []
    return []

In [None]:
df['techniques'] = df['techniques'].apply(convert_techniques)

In [None]:
df['techniques'].head()

Unnamed: 0,techniques
0,"[euphoria, loaded_language]"
1,"[loaded_language, cherry_picking]"
2,"[loaded_language, euphoria]"
3,[]
4,[loaded_language]


In [None]:
# Summarize the dataset: row count, language split and per-technique frequency.
print(f"Dataset loaded with {len(df)} samples")
print(f"Languages: {df['lang'].value_counts().to_dict()}")
# Flatten every row's technique list into a single list of label occurrences.
all_techniques = [t for techs in df['techniques'] for t in techs]
techniques_counter = Counter(all_techniques)
print("Techniques distribution:")
for technique, count in techniques_counter.most_common():
    # The percentage is relative to the number of rows, not to the number of
    # label occurrences, so the percentages need not sum to 100.
    print(f"  {technique}: {count} ({count/len(df)*100:.2f}%)")

Dataset loaded with 3822 samples
Languages: {'uk': 2147, 'ru': 1675}
Techniques distribution:
  loaded_language: 1973 (51.62%)
  cherry_picking: 512 (13.40%)
  glittering_generalities: 483 (12.64%)
  cliche: 463 (12.11%)
  euphoria: 462 (12.09%)
  fud: 385 (10.07%)
  appeal_to_fear: 300 (7.85%)
  whataboutism: 158 (4.13%)
  bandwagon: 157 (4.11%)
  straw_man: 138 (3.61%)


In [None]:
# Mapping from an emoji to the Ukrainian word(s) that replace it in text.
#
# Every key must appear exactly once: in a dict literal a repeated key silently
# keeps only the LAST value. Three keys used to be listed twice ("🤝", "📢",
# "💻"); the entries that never took effect are kept below as comments so the
# runtime mapping is unchanged.
# NOTE(review): confirm that the surviving value is the intended one for each.
ukrainian_emoji_dict = {
    # ⚡ Urgency & Breaking News
    "⚡": "терміново",  # "urgent" / "breaking news"
    "⚠️": "попередження",  # "warning"
    "🚨": "сирена",  # "alert"
    "❗": "важливо",  # "important"
    "❓": "питання",  # "questionable"

    # 🎭 Emotional Manipulation
    "😭": "плач",  # "crying" (used to exaggerate sadness)
    "😢": "смуток",  # "sadness"
    "😂": "сміх",  # "laughter" (mockery in propaganda)
    "😡": "гнів",  # "anger" (provoking emotions)
    "🤬": "обурення",  # "rage" (hate speech & provocation)
    "😱": "паніка",  # "panic" (fear appeal)
    "😨": "переляк",  # "fear"

    # 🔥 Propaganda & Exaggeration
    "🔥": "сенсація",  # "sensationalism" (exaggeration)
    "💥": "вибухова новина",  # "explosive news"
    "💣": "загроза",  # "threat" (fear-based manipulation)
    "⚔️": "війна",  # "war"

    # ❤️ Patriotism & Unity Appeals
    "🇺🇦": "україна",  # "Ukraine" (nationalism & unity)
    "🇷🇺": "росія",  # "Russia" (propaganda context)
    "❤️": "любов",  # "love" (nationalistic appeals)
    # "🤝": "єдність" ("unity") was listed here; the Gestures entry below wins.
    "💪": "сила",  # "strength" (mobilization)

    # 🎯 Call to Action & Persuasion
    "🚀": "перемога",  # "victory" (used in war propaganda)
    "🏆": "успіх",  # "success"
    "💰": "корупція",  # "corruption" (often used in political posts)

    # 🏛️ Government & Authority
    "🏛️": "влада",  # "government"
    # "📢": "пропаганда" ("propaganda") was listed here; the Media entry below wins.
    "📜": "закон",  # "law"

    # 📡 Media & Information Manipulation
    "📢": "фейк",  # "fake news"
    "📺": "телебачення",  # "media influence"
    "📰": "змі",  # "mass media"
    # "💻": "кібервійна" ("cyberwar") was listed here; the Technology entry below wins.

    # 🎭 Distrust & Confusion
    "🤔": "сумнів",  # "doubt"
    "🙄": "скептицизм",  # "skepticism"
    "🧐": "маніпуляція",  # "manipulation"

    # **Restoring Full Set of Emojis Below**

    # People & Actions
    "👨": "чоловік",
    "👩": "жінка",
    "👮": "поліцейський",
    "👷": "будівельник",
    "🧑‍⚕️": "лікар",
    "🧑‍🎓": "студент",
    "🧑‍🏫": "вчитель",

    # Love & Affection
    "😍": "захоплення любов",  # Heart eyes emoji
    "🥰": "кохання ніжність",  # Smiling with hearts emoji
    "🤩": "зіркові очі",  # "star eyes" (amazement, awe)
    "😘": "поцілунок",
    "😗": "поцілунок",
    "😚": "поцілунок",
    "😙": "поцілунок",
    "💙": "синє серце",
    "💛": "жовте серце",
    "💜": "фіолетове серце",
    "💔": "розбите серце",
    "💕": "кохання",
    "💖": "захоплення",
    "💘": "закоханість",
    "😊": "посмішка",  # "smiling face with smiling eyes" (happiness)
    "😉": "підморгування",  # "winking face" (playfulness)
    "😆": "сміх до сліз",  # "laughing face with tightly closed eyes"
    "😄": "щасливе обличчя",  # "grinning face with smiling eyes"
    "😬": "криве обличчя",  # "grimacing face" (nervousness)
    "😋": "смачно",  # "face savoring food"
    "🤤": "смак",  # "drooling face" (delicious, food-related)
    "😎": "крутість",  # "smiling face with sunglasses" (cool)
    "😔": "сумний",  # "pensive face"
    "😞": "невдоволення",  # "disappointed face"
    "😒": "невдоволення",  # "unamused face"
    "🤧": "чихання",  # "sneezing face" (illness, sickness)

    # Gestures
    "👍": "схвалення",
    "👎": "несхвалення",
    "✊": "кулак",
    "🤛": "лівий кулак",
    "🤜": "правий кулак",
    "👏": "аплодисменти",
    "👐": "розкриті руки",
    "🤲": "молитва",
    "🤝": "рукостискання",
    "👉": "вказівний палець",  # "pointing finger"
    "👈": "вказівний палець лівий",  # "left pointing finger"
    "👌": "окей",  # "ok" (thumb and index forming a circle)
    "✋": "стоп",  # "stop" (raised hand)
    "👋": "привітання",  # "waving hand"
    "🖕": "середній палець",  # "middle finger" (disrespect)
    "🤞": "палець схрещений",  # "fingers crossed" (wish for good luck)
    "🙏": "молитва",

    # Fire, Explosion, War
    "🏹": "лук",
    "🛡️": "щит",

    # Money & Economy
    "💳": "кредитна картка",
    "🏦": "банк",

    # Nature & Weather
    "☀️": "сонце",
    "🌞": "сонячний день",
    "🌧️": "дощ",
    "⛈️": "гроза",
    "❄️": "сніг",
    "🌊": "хвиля",

    # Animals
    "🐶": "собака",
    "🐱": "кішка",
    "🦁": "лев",
    "🐺": "вовк",
    "🦊": "лисиця",
    "🐴": "кінь",
    "🐻": "ведмідь",

    # Food & Drinks
    "🍞": "хліб",
    "🥖": "батон",
    "🥔": "картопля",
    "🥕": "морква",
    "🍎": "яблуко",
    "🍇": "виноград",
    "🍓": "полуниця",
    "🥩": "м'ясо",
    "🍗": "курка",
    "🍻": "пиво",
    "🥂": "шампанське",

    # Transport
    "🚗": "автомобіль",
    "🚕": "таксі",
    "🚙": "джип",
    "🚌": "автобус",
    "🚆": "поїзд",
    "✈️": "літак",

    # Technology & Media
    "📱": "телефон",
    "💻": "ноутбук",
    "🎥": "кіно",
    "📸": "камера",
}

import emoji

def replace_emojis(text, lang):
    """Replace emoji in ``text`` with words, according to ``lang``.

    * ``'ru'``: each emoji becomes the Russian name supplied by
      ``emoji.demojize``, with underscores turned into spaces and colons
      dropped from the name.
    * ``'uk'``: each emoji that is a key of ``ukrainian_emoji_dict`` becomes
      the mapped Ukrainian text; other emoji are left as they are.
    * any other value: ``text`` is returned unchanged.

    Only the emoji are rewritten. Colons and underscores that belong to the
    surrounding text (times such as "12:30", ratios, snake_case tokens) are
    preserved.

    Args:
        text: The string to process.
        lang: Language code of the text.

    Returns:
        str: The text with emoji replaced as described above.
    """
    if lang == 'ru':
        # Wrap each emoji name in private-use code points instead of ":" so the
        # names can be found afterwards without touching ordinary punctuation.
        open_mark, close_mark = "\ue000", "\ue001"
        marked = emoji.demojize(text, language="ru", delimiters=(open_mark, close_mark))
        return re.sub(
            "\ue000([^\ue000\ue001]*)\ue001",
            lambda match: match.group(1).replace(":", "").replace("_", " "),
            marked,
        )

    elif lang == 'uk':
        # Longest keys first, so a multi-code-point sequence is never broken up
        # by a shorter key that happens to be contained in it.
        ordered_items = sorted(
            ukrainian_emoji_dict.items(),
            key=lambda item: len(item[0]),
            reverse=True,
        )
        for emoji_symbol, replacement in ordered_items:
            text = text.replace(emoji_symbol, replacement)
        return text

    return text  # Neither RU nor UK: nothing to do



In [None]:
# Clean text
nltk.download('punkt', quiet=True)

def clean_text(text, lang):
    """Normalize one post for the transformer pipeline.

    URLs, @mentions and #hashtags are swapped for the placeholders ``[URL]``,
    ``[USER]`` and ``[TAG]``; emoji are handed to ``replace_emojis`` together
    with ``lang``; finally runs of whitespace collapse to a single space and
    the ends are stripped.

    Args:
        text: Raw post content.
        lang: Language code forwarded to ``replace_emojis``.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        placeholder_rules = (
            (r'http\S+|www\S+|https\S+', '[URL]'),
            (r'@\w+', '[USER]'),
            (r'#\w+', '[TAG]'),
        )
        for pattern, placeholder in placeholder_rules:
            text = re.sub(pattern, placeholder, text)
        worded = replace_emojis(text, lang)
        return re.sub(r'\s+', ' ', worded).strip()
    return ""

df['cleaned_content'] = df.parallel_apply(
    lambda row: clean_text(row['content'], row['lang']), axis=1
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=956), Label(value='0 / 956'))), HB…

In [None]:
# Set up synonym replacement augmenter
# NOTE(review): the augmenter is nlpaug's ContextualWordEmbsAug, so the
# substitutes presumably come from the language model rather than from a
# synonym dictionary — confirm that "synonym replacement" is the right label.
print("Setting up synonym replacement augmenter...")
# Run the augmentation model on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
aug = naw.ContextualWordEmbsAug(
    model_path='xlm-roberta-large',
    action="substitute",
    top_k=5,
    # aug_p=0.2,  # 20% of words to be augmented
    aug_p=0.3,  # Increased to 30% for more augmentation
    device=device
)

Setting up synonym replacement augmenter...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [None]:
# Define synonym replacement function
def synonym_replace(text, aug):
    """Return an augmented variant of ``text`` produced by ``aug``.

    Args:
        text: The text to augment.
        aug: An object whose ``augment(text)`` call returns a sequence; the
            first element is used.

    Returns:
        The first augmentation result. ``text`` itself is returned untouched
        when it is not a string, is shorter than 10 characters, or when the
        augmenter raises (the error is printed, not propagated).
    """
    skip = not isinstance(text, str) or len(text) < 10
    if skip:
        return text
    try:
        candidates = aug.augment(text)
        return candidates[0]
    except Exception as e:
        # Best effort: a failed augmentation must not stop the whole run.
        print(f"Synonym replacement error: {e}")
        return text

In [None]:
# Identify rare classes and augment data
# A technique counts as "rare" when it is attached to fewer than 5% of the rows.
class_counts = Counter(all_techniques)
total_samples = len(df)
# Share of rows carrying each technique (rows can carry several techniques).
class_percentages = {k: v/total_samples for k, v in class_counts.items()}
RARE_CLASSES = {k for k, v in class_percentages.items() if v < 0.05}
print(f"Rare classes targeted for augmentation: {RARE_CLASSES}")

Rare classes targeted for augmentation: {'straw_man', 'bandwagon', 'whataboutism'}


In [None]:
print("Augmenting data for rare classes...")
# Collects one dict per accepted augmented sample; turned into a DataFrame later.
augmented_data = []
NUM_AUGMENTATIONS = 7  # Augmentation attempts per rare-class sample (an earlier note says 5 gave the best result)

Augmenting data for rare classes...


In [None]:
# Keep every row that carries at least one of the rare techniques.
rare_class_samples = df[df['techniques'].apply(lambda x: any(label in RARE_CLASSES for label in x))]
print(f"Found {len(rare_class_samples)} samples with rare classes")

Found 413 samples with rare classes


In [None]:
# Try NUM_AUGMENTATIONS times per rare-class row; keep a result only when the
# augmenter actually changed the text.
for _, row in tqdm(rare_class_samples.iterrows(), total=len(rare_class_samples)):
    for _attempt in range(NUM_AUGMENTATIONS):
        try:
            original_text = row['cleaned_content']
            augmented_content = synonym_replace(original_text, aug)
            if augmented_content == original_text:
                continue
            # The augmented text is stored as both raw and cleaned content;
            # labels and metadata are copied from the source row.
            augmented_data.append(dict(
                content=augmented_content,
                cleaned_content=augmented_content,
                techniques=row['techniques'],
                lang=row['lang'],
                manipulative=row['manipulative'],
            ))
        except Exception as e:
            print(f"Augmentation error: {e}")

augmented_df = pd.DataFrame(augmented_data)
print(f"Generated {len(augmented_df)} augmented samples")


100%|██████████| 413/413 [00:18<00:00, 22.19it/s]

Generated 455 augmented samples





In [None]:
df = pd.concat([df, augmented_df], ignore_index=True)
print(f"Final dataset size: {len(df)}")

Final dataset size: 4277


In [None]:
# Prepare labels
# Multi-hot encode the technique lists; CLASSES fixes the column order that the
# model outputs and the saved thresholds follow.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df['techniques'])
CLASSES = list(mlb.classes_)
print(f"Number of unique techniques: {len(CLASSES)}")

# Perform stratified split
# NOTE(review): df already contains the augmented rows at this point, so an
# augmented text and the original it was derived from can land on opposite
# sides of the split. Validation scores, and thresholds tuned on them, may be
# optimistic — consider splitting before augmenting.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so this loop body runs once; train_idx/val_idx stay bound afterwards.
for train_idx, val_idx in msss.split(df, labels):
    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)

# Label rows aligned with train_df / val_df.
train_labels = labels[train_idx]
val_labels = labels[val_idx]

Number of unique techniques: 10


In [None]:
# Set up tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize ``examples['cleaned_content']`` with the module-level tokenizer.

    No padding is applied here; sequences are truncated to ``MAX_LENGTH``.

    Args:
        examples: Mapping with a ``cleaned_content`` entry holding the text(s).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    tokenizer_options = dict(padding=False, truncation=True, max_length=MAX_LENGTH)
    texts = examples['cleaned_content']
    return tokenizer(texts, **tokenizer_options)

In [None]:
# Custom Dataset class
class MultiLabelDataset(Dataset):
    """Torch dataset pairing tokenized ``cleaned_content`` with multi-hot labels.

    All texts are tokenized once in the constructor with ``padding=True`` and
    truncation to ``MAX_LENGTH``; labels are stored as a float32 tensor.
    """

    def __init__(self, df, labels, tokenizer):
        """Tokenize ``df['cleaned_content']`` and store ``labels`` as a tensor."""
        texts = df['cleaned_content'].tolist()
        self.encodings = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
        )
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoding fields for row ``idx`` as tensors, plus ``labels``."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = self.labels[idx]
        return item

train_dataset = MultiLabelDataset(train_df, train_labels, tokenizer)
val_dataset = MultiLabelDataset(val_df, val_labels, tokenizer)

In [None]:
# Define focal loss
class FocalLoss(torch.nn.Module):
    """Focal loss built on element-wise binary cross-entropy with logits.

    Each element's BCE is scaled by ``alpha * (1 - pt) ** gamma`` where
    ``pt = exp(-BCE)``, and the mean over all elements is returned.

    Args:
        alpha: Constant scale applied to every element. Defaults to 0.25.
        gamma: Exponent of the ``(1 - pt)`` modulating factor. Defaults to 2.0.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha, self.gamma = alpha, gamma

    def forward(self, inputs, targets):
        """Return the scalar focal loss for raw ``inputs`` (logits) and ``targets``."""
        bce = torch.nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        pt = torch.exp(-bce)
        # Down-weight elements the model already gets right (pt close to 1).
        focal = self.alpha * (1 - pt) ** self.gamma * bce
        return focal.mean()


In [None]:
# Custom model with focal loss
class CustomModel(torch.nn.Module):
    """Sequence-classification model whose training loss is ``FocalLoss``.

    Wraps ``AutoModelForSequenceClassification`` (configured for multi-label
    classification) and replaces its built-in loss with ``FocalLoss``.

    Args:
        model_name: Checkpoint name or path given to ``from_pretrained``.
        num_labels: Number of output labels.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            problem_type="multi_label_classification",
        )
        self.loss_fn = FocalLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and, when ``labels`` is given, add the loss.

        Returns:
            dict: ``{'logits': ...}`` without labels, otherwise
            ``{'loss': ..., 'logits': ...}``.
        """
        # Labels are deliberately not passed on, so the wrapped model never
        # computes its own loss.
        logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
        if labels is None:
            return {'logits': logits}
        return {'loss': self.loss_fn(logits, labels), 'logits': logits}

In [None]:
# Initialize model
# NOTE(review): this module-level `config` does not appear to be read by any
# later visible cell — confirm whether it is still needed.
config = AutoConfig.from_pretrained(MODEL_NAME)
# One output per technique in CLASSES.
model = CustomModel(MODEL_NAME, num_labels=len(CLASSES))

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Custom Trainer
class CustomTrainer(Trainer):
    """Trainer that takes the loss straight from the model's output dict."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the model-computed loss, optionally together with the outputs.

        ``labels`` is removed from ``inputs`` (the dict is modified in place)
        and passed to the model as an explicit keyword argument.
        ``num_items_in_batch`` is accepted but not used.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs, labels=targets)
        if return_outputs:
            return outputs['loss'], outputs
        return outputs['loss']

In [None]:
# Training arguments
# All tunable values come from the configuration cell's constants.
training_args = TrainingArguments(
    output_dir=SAVE_PATH,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=ACCUMULATION_STEPS,
    learning_rate=LR,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    logging_dir=LOGS_PATH,
    logging_steps=10,
    # 'eval_strategy' is used here in place of the older 'evaluation_strategy' name
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",  # one of the keys returned by compute_metrics
    greater_is_better=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none"
)

In [None]:
# Compute metrics function
def compute_metrics(pred):
    """Compute multi-label metrics at a fixed 0.5 probability threshold.

    Args:
        pred: Evaluation output exposing ``predictions`` (raw logits) and
            ``label_ids`` (multi-hot ground truth).

    Returns:
        dict: ``f1_micro``, ``f1_macro``, micro-averaged ``precision`` and
        micro-averaged ``recall``.
    """
    logits = pred.predictions
    labels = pred.label_ids
    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
    # zero_division=0 yields the same score as the default for a label with no
    # predicted (or no true) positives, but without a warning at every
    # evaluation — common here for the rare techniques early in training.
    f1_micro = f1_score(labels, preds, average='micro', zero_division=0)
    f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
    precision = precision_score(labels, preds, average='micro', zero_division=0)
    recall = recall_score(labels, preds, average='micro', zero_division=0)
    return {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Wire the model, datasets, metrics and collator into the custom trainer.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # Patience reduced from 3 to 2
    data_collator=DataCollatorWithPadding(tokenizer)
)

In [None]:
# Train the model
print("Training the model...")
trainer.train()

Training the model...


Epoch,Training Loss,Validation Loss


In [None]:
# Find optimal thresholds
# Collect raw validation logits and ground truth once; the threshold search and
# the final report in the following cells both read these two arrays.
print("Finding optimal thresholds...")
val_preds = trainer.predict(val_dataset)
logits = val_preds.predictions
true_labels = val_preds.label_ids

In [None]:
def objective(trial):
    """Optuna objective: negative macro-F1 on the validation set.

    One threshold in [0.1, 0.9] is sampled per class in ``CLASSES`` and
    applied to the sigmoid of the module-level validation ``logits``.

    Args:
        trial: The Optuna trial used to sample the thresholds.

    Returns:
        float: ``-macro_f1``, so that minimizing it maximizes macro-F1.
    """
    thresholds = [trial.suggest_float(f"threshold_{i}", 0.1, 0.9) for i in range(len(CLASSES))]
    preds = (torch.sigmoid(torch.tensor(logits)) > torch.tensor(thresholds)).int().numpy()
    return -f1_score(true_labels, preds, average='macro')

# Seed the sampler so the threshold search is repeatable between runs, like the
# rest of the notebook (which seeds its other generators with 42).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)
# Thresholds in CLASSES order.
optimal_thresholds = [study.best_params[f"threshold_{i}"] for i in range(len(CLASSES))]
print("Optimal thresholds:", dict(zip(CLASSES, optimal_thresholds)))

In [None]:
# Final evaluation
val_preds_adjusted = (torch.sigmoid(torch.tensor(logits)) > torch.tensor(optimal_thresholds)).int().numpy()
print("Final evaluation on validation set:")
print(classification_report(true_labels, val_preds_adjusted, target_names=CLASSES))

In [None]:
# Confusion matrix over every (sample, label) decision pooled together.
# These three names are imported here because no earlier visible cell imports them.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Flatten the labels and predictions for a single 2x2 confusion matrix
true_flat = true_labels.ravel()
pred_flat = val_preds_adjusted.ravel()

# labels=[0, 1] keeps the matrix 2x2 even if one value never occurs.
cm = confusion_matrix(true_flat, pred_flat, labels=[0, 1])

# Plot the matrix as a heatmap and save it next to the training logs.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['0', '1'],
    yticklabels=['0', '1'],
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix (All Labels Combined)')
fig.savefig(os.path.join(LOGS_PATH, 'confusion_matrix.png'))
plt.close(fig)
print("Confusion matrix saved as 'confusion_matrix.png' in LOGS_PATH")

In [None]:
# Step 18: Save the model, tokenizer, and thresholds
print("Saving model, tokenizer, and thresholds...")
final_model_path = os.path.join(SAVE_PATH, "final-model")
os.makedirs(final_model_path, exist_ok=True)

In [None]:
# Save model
model.model.save_pretrained(final_model_path)

# Save tokenizer
tokenizer.save_pretrained(final_model_path)

In [None]:
# Re-define label_names before saving
label_names = mlb.classes_

In [None]:
# Save thresholds and label mapping
# Both lists are in the same class order; the prediction function reads this
# file back to pair each model output with its label and threshold.
with open(os.path.join(final_model_path, "thresholds.json"), "w") as f:
    json.dump({
        "thresholds": optimal_thresholds,  # already a plain Python list
        "labels": label_names.tolist()  # array of class names -> list so it is JSON serializable
    }, f)

print("Training complete! Model and configurations saved.")

In [None]:
# Colab-only cell: mounts Google Drive and loads the test set from it.
from google.colab import drive
import pandas as pd
import re

# Mount Google Drive to access files
drive.mount('/content/drive')

# Path to the test.csv file in Google Drive
# NOTE(review): user-specific absolute path with a browser-download style name
# ("test (4).csv"); it must be adjusted on any other account.
file_path = '/content/drive/MyDrive/Colab Notebooks/test (4).csv'  # Modify this path according to where your file is located

# Load the raw test data; text cleaning is applied in a later cell.
test_df = pd.read_csv(file_path)

In [None]:
# If you need to clean text like removing URLs, mentions, etc. (you did this for train)
import re

def clean_text(text, lang=None):
    """Clean inference text with the same normalization used for training.

    The classifier was trained on text where URLs, @mentions and #hashtags
    were replaced by ``[URL]``, ``[USER]`` and ``[TAG]``, whitespace was
    collapsed and the original letter case was kept. Applying the same steps
    here keeps test inputs in the form the model saw during training.

    Args:
        text: Raw post content.
        lang: Optional language code. When given, emoji are replaced through
            ``replace_emojis(text, lang)`` exactly as in training. When
            omitted (for example in ``Series.apply(clean_text)``), emoji are
            left untouched.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string
        (for example NaN from an empty CSV cell).
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+|www\S+|https\S+', '[URL]', text)
    text = re.sub(r'@\w+', '[USER]', text)
    text = re.sub(r'#\w+', '[TAG]', text)
    if lang is not None:
        text = replace_emojis(text, lang)
    # No lower-casing: the training text kept its case.
    return re.sub(r'\s+', ' ', text).strip()

test_df['content'] = test_df['content'].apply(clean_text)

In [None]:
def predict_manipulation(test_df, model_path, thresholds_path=None, batch_size=16,
                         output_path='/content/drive/MyDrive/Colab Notebooks/submission4.csv',
                         max_length=256):
    """Predict manipulation techniques for ``test_df`` in submission format.

    Args:
        test_df: DataFrame with an ``id`` column and a ``content`` column
            holding the text to classify.
        model_path: Directory holding the saved model and tokenizer.
        thresholds_path: JSON file with ``"thresholds"`` and ``"labels"``
            lists in model output order. When omitted, a 0.5 threshold and
            placeholder ``label_<i>`` names are used.
        batch_size: Number of rows scored per forward pass.
        output_path: Where the submission CSV is written. Pass ``None`` to
            skip writing. Defaults to the path the notebook used originally.
        max_length: Tokenizer truncation/padding length.

    Returns:
        pandas.DataFrame: One row per input row with ``id`` followed by one
        0/1 column per required technique.

    Raises:
        ValueError: If the label names do not match the required submission
            labels. This always happens when ``thresholds_path`` is omitted,
            because the placeholder names never match.
    """
    # Load the trained model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Load thresholds and labels if provided
    if thresholds_path:
        with open(thresholds_path, "r") as f:
            config = json.load(f)
        thresholds = np.array(config["thresholds"])
        labels = config["labels"]
    else:
        thresholds = np.array([0.5] * model.config.num_labels)
        labels = [f"label_{i}" for i in range(model.config.num_labels)]

    # Column order required by the submission format
    required_labels = [
        'straw_man', 'appeal_to_fear', 'fud', 'bandwagon', 'whataboutism',
        'loaded_language', 'glittering_generalities', 'euphoria', 'cherry_picking', 'cliche'
    ]

    # Verify that the model's labels match the required labels
    if set(labels) != set(required_labels):
        raise ValueError(f"Model labels {labels} do not match required submission labels {required_labels}")

    # Device placement and eval mode do not change between batches: do them once.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    all_predictions = []

    # Process data in smaller batches
    for start in range(0, len(test_df), batch_size):
        batch = test_df.iloc[start:start + batch_size]
        inputs = tokenizer(batch['content'].tolist(), truncation=True, padding='max_length',
                           max_length=max_length, return_tensors="pt")
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits

        # Sigmoid gives independent per-label probabilities; thresholds are per label.
        probabilities = torch.sigmoid(logits).cpu().numpy()
        predictions = (probabilities > thresholds).astype(int)

        # Index predictions by position within the batch, not by DataFrame index.
        for row_pos, row_id in enumerate(batch['id']):
            pred_dict = {'id': row_id}
            for label_pos, label in enumerate(labels):
                pred_dict[label] = int(predictions[row_pos, label_pos])
            all_predictions.append(pred_dict)

    # Passing `columns` both orders the columns for submission and yields a
    # correctly shaped empty frame when test_df has no rows.
    submission_df = pd.DataFrame(all_predictions, columns=['id'] + required_labels)

    # Save the predictions to a CSV file
    if output_path:
        submission_df.to_csv(output_path, index=False)
    print("Prediction complete! ")
    return submission_df

In [None]:

predict_manipulation(test_df, model_path='./models/final-model', thresholds_path='./models/final-model/thresholds.json', batch_size=16)

In [None]:
# NOTE(review): this reads submission2.csv, while predict_manipulation above
# writes submission4.csv — confirm that inspecting the older file is intended.
df2 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/submission2.csv")

In [None]:
df2.head()

In [None]:
df2.shape

## baseline models - *2*

In [None]:
# Install required packages
!pip install transformers datasets torch evaluate accelerate nltk optuna scikit-learn iterative-stratification emoji pandarallel sentencepiece huggingface_hub transformers[sentencepiece] pyarrow jsonlines

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting pandarallel
  Downloading pandarallel-1.6.5.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metada

In [None]:
# Import necessary libraries
import os
import re
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import xgboost as xgb
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Define constants and paths
# NOTE: these rebind DATA_PATH / SAVE_PATH (and, below, df) that the
# transformer section of this notebook also defines.
DATA_PATH = "."
SAVE_PATH = "./baseline_models"
os.makedirs(SAVE_PATH, exist_ok=True)

# List of techniques (target labels)
# Each name becomes one binary target column for the baseline models.
TECHNIQUES = [
    "straw_man",
    "appeal_to_fear",
    "fud",
    "bandwagon",
    "whataboutism",
    "loaded_language",
    "glittering_generalities",
    "euphoria",
    "cherry_picking",
    "cliche",
]

# Set random seed for reproducibility
np.random.seed(42)

# Load dataset
# NOTE(review): absolute Colab path; DATA_PATH defined above is not used here.
print("Loading dataset...")
df = pd.read_parquet("/content/train.parquet")
print(f"Dataset loaded with {len(df)} samples")



Loading dataset...
Dataset loaded with 3822 samples


In [None]:
# Function to convert techniques to list format
def convert_techniques(x):
    """Normalize one raw ``techniques`` cell into a plain Python list.

    Args:
        x: The raw cell value. Handled forms are ``None``, a NumPy array,
            a list, or a string holding a JSON array.

    Returns:
        list: The technique labels. An empty list is returned for ``None``,
        for strings that are not valid JSON, for JSON that does not decode
        to a list (for example a bare number or an object), and for any
        other unsupported type.
    """
    if x is None:
        return []
    if isinstance(x, np.ndarray):
        return x.tolist()
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            parsed = json.loads(x)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; catching only it
            # keeps KeyboardInterrupt and real bugs visible.
            return []
        # Callers test membership on the result, so only a list is usable.
        return parsed if isinstance(parsed, list) else []
    return []

# Normalize the raw techniques column into Python lists first.
df['techniques'] = df['techniques'].apply(convert_techniques)

# Create binary labels for each technique
# One 0/1 column per technique: 1 when the row's technique list contains it.
for technique in TECHNIQUES:
    df[technique] = df['techniques'].apply(lambda x: 1 if technique in x else 0)

In [None]:
# Text cleaning function
def clean_text(text, lang):
    """Normalize one post for the TF-IDF baselines.

    URLs, @mentions and #hashtags become ``[URL]``, ``[USER]`` and ``[TAG]``;
    emoji are removed; runs of whitespace collapse to one space.

    Args:
        text: Raw post content.
        lang: Language code. Accepted but not used by this version.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        # Replace URLs, user mentions, and hashtags
        placeholder_rules = (
            (r'http\S+|www\S+|https\S+', '[URL]'),
            (r'@\w+', '[USER]'),
            (r'#\w+', '[TAG]'),
        )
        for pattern, placeholder in placeholder_rules:
            text = re.sub(pattern, placeholder, text)

        # Remove emojis
        without_emoji = emoji.replace_emoji(text, '')

        # Clean extra whitespace
        return re.sub(r'\s+', ' ', without_emoji).strip()
    return ""

# Apply text cleaning
print("Cleaning text...")
df['cleaned_content'] = df.apply(
    lambda row: clean_text(row['content'], row['lang']), axis=1
)

Cleaning text...


In [None]:
# Feature extraction
def extract_features(df_train, df_test):
    """Fit a TF-IDF vectorizer on the training text and transform both splits.

    Args:
        df_train: DataFrame with a 'cleaned_content' column; used to fit the
            vectorizer and then transformed.
        df_test: DataFrame with a 'cleaned_content' column; only transformed.

    Returns:
        tuple: ``(X_train, X_test, tfidf)`` - the two feature matrices and the
        fitted vectorizer.
    """
    print("Extracting TF-IDF features...")

    vectorizer_settings = {
        'max_features': 15000,
        'min_df': 5,
        'max_df': 0.8,
        'ngram_range': (1, 2),
        'sublinear_tf': True,
    }
    tfidf = TfidfVectorizer(**vectorizer_settings)

    # The vectorizer is fitted on the training split only; the second split
    # is projected with the already-fitted vocabulary.
    text_column = 'cleaned_content'
    X_train = tfidf.fit_transform(df_train[text_column])
    X_test = tfidf.transform(df_test[text_column])

    print(f"Train features shape: {X_train.shape}")
    print(f"Test features shape: {X_test.shape}")

    return X_train, X_test, tfidf

In [None]:
# Hold out 20% of the rows for validation using MultilabelStratifiedShuffleSplit
print("Splitting data...")
X = df['cleaned_content']
y = df[TECHNIQUES].values

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, validation) index pair
train_idx, val_idx = next(msss.split(X, y))
train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

Splitting data...
Train set size: 3057
Validation set size: 765


In [None]:
!pip install -U xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.1.4
    Uninstalling xgboost-2.1.4:
      Successfully uninstalled xgboost-2.1.4
Successfully installed xgboost-3.0.0


In [None]:
# Vectorize both splits, then pull out the matching label matrices
X_train, X_val, tfidf_vectorizer = extract_features(train_df, val_df)
y_train, y_val = (split[TECHNIQUES].values for split in (train_df, val_df))

# SVM baseline model
def train_svm_model(X_train, y_train):
    """Fit a multi-output classifier built from class-weighted LinearSVC.

    Args:
        X_train: Training feature matrix.
        y_train: Training label matrix passed to MultiOutputClassifier.fit.

    Returns:
        The fitted MultiOutputClassifier.
    """
    print("Training SVM model...")

    # LinearSVC is the per-label estimator wrapped by MultiOutputClassifier
    base_estimator = LinearSVC(
        C=1.0,
        class_weight='balanced',
        max_iter=10000,
        random_state=42,
    )
    classifier = MultiOutputClassifier(base_estimator)
    classifier.fit(X_train, y_train)
    return classifier

# XGBoost baseline model
def train_xgboost_model(X_train, y_train):
    """Fit a multi-output classifier built from XGBClassifier.

    Args:
        X_train: Training feature matrix.
        y_train: Training label matrix passed to MultiOutputClassifier.fit.

    Returns:
        The fitted MultiOutputClassifier.
    """
    print("Training XGBoost model...")

    booster_settings = {
        'n_estimators': 200,
        'learning_rate': 0.1,
        'max_depth': 5,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'binary:logistic',
        'random_state': 42,
        'eval_metric': 'logloss',
    }
    classifier = MultiOutputClassifier(xgb.XGBClassifier(**booster_settings))
    classifier.fit(X_train, y_train)
    return classifier


Extracting TF-IDF features...
Train features shape: (3057, 9245)
Test features shape: (765, 9245)


In [None]:
# Train the SVM model on the TF-IDF training features; the fitted estimator is
# kept in `svm_model` for the evaluation, saving and prediction cells.
svm_model = train_svm_model(X_train, y_train)

Training SVM model...


In [None]:
# Evaluate SVM model on the validation split
print("Evaluating SVM model...")
svm_preds = svm_model.predict(X_val)
# zero_division=0 keeps the scores identical to the default ("warn" also
# substitutes 0) but stops the repeated undefined-metric warnings that are
# raised when a label or sample has no predicted positives.
svm_f1_macro = f1_score(y_val, svm_preds, average='macro', zero_division=0)
print(f"SVM Macro F1 Score: {svm_f1_macro:.4f}")
print("\nSVM Classification Report:")
print(classification_report(y_val, svm_preds, target_names=TECHNIQUES, zero_division=0))

Evaluating SVM model...
SVM Macro F1 Score: 0.3307

SVM Classification Report:
                         precision    recall  f1-score   support

              straw_man       0.20      0.04      0.06        28
         appeal_to_fear       0.47      0.33      0.39        60
                    fud       0.47      0.43      0.45        77
              bandwagon       0.40      0.06      0.11        31
           whataboutism       0.15      0.06      0.09        32
        loaded_language       0.69      0.69      0.69       395
glittering_generalities       0.58      0.53      0.55        97
               euphoria       0.36      0.34      0.35        92
         cherry_picking       0.38      0.44      0.41       102
                 cliche       0.23      0.19      0.21        93

              micro avg       0.53      0.47      0.50      1007
              macro avg       0.39      0.31      0.33      1007
           weighted avg       0.51      0.47      0.48      1007
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Train the XGBoost model on the same TF-IDF training features as the SVM; the
# fitted estimator is kept in `xgb_model` for the cells that follow.
xgb_model = train_xgboost_model(X_train, y_train)

Training XGBoost model...


In [None]:
# Evaluate XGBoost model on the validation split
print("Evaluating XGBoost model...")
xgb_preds = xgb_model.predict(X_val)
# zero_division=0 keeps the scores identical to the default ("warn" also
# substitutes 0) but stops the repeated undefined-metric warnings that are
# raised when a label or sample has no predicted positives.
xgb_f1_macro = f1_score(y_val, xgb_preds, average='macro', zero_division=0)
print(f"XGBoost Macro F1 Score: {xgb_f1_macro:.4f}")
print("\nXGBoost Classification Report:")
print(classification_report(y_val, xgb_preds, target_names=TECHNIQUES, zero_division=0))

Evaluating XGBoost model...
XGBoost Macro F1 Score: 0.2081

XGBoost Classification Report:
                         precision    recall  f1-score   support

              straw_man       0.00      0.00      0.00        28
         appeal_to_fear       0.40      0.07      0.11        60
                    fud       0.54      0.17      0.26        77
              bandwagon       0.00      0.00      0.00        31
           whataboutism       0.00      0.00      0.00        32
        loaded_language       0.66      0.66      0.66       395
glittering_generalities       0.63      0.38      0.47        97
               euphoria       0.60      0.16      0.26        92
         cherry_picking       0.42      0.21      0.28       102
                 cliche       0.18      0.02      0.04        93

              micro avg       0.61      0.35      0.45      1007
              macro avg       0.34      0.17      0.21      1007
           weighted avg       0.50      0.35      0.39      1007

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Persist both classifiers and the fitted vectorizer under SAVE_PATH
print("Saving models...")
import pickle

artifacts = {
    "svm_model.pkl": svm_model,
    "xgb_model.pkl": xgb_model,
    "tfidf_vectorizer.pkl": tfidf_vectorizer,
}
for artifact_name, artifact in artifacts.items():
    with open(os.path.join(SAVE_PATH, artifact_name), "wb") as f:
        pickle.dump(artifact, f)

Saving models...


In [None]:
# Predictions on test data
def predict_on_test(model, vectorizer, test_file_path, output_file_path):
    """Predict technique labels for a CSV of test rows and write a submission.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        test_file_path: CSV to read; must provide 'id' and 'content' columns.
        output_file_path: Where the submission CSV is written (no index column).

    Returns:
        pandas.DataFrame: The submission frame - 'id' followed by one column
        per entry of TECHNIQUES.
    """
    print(f"Loading test data from {test_file_path}...")
    test_df = pd.read_csv(test_file_path)

    def _clean_row(row):
        # Falls back to 'uk' when the row has no 'lang' entry
        return clean_text(row['content'], row.get('lang', 'uk'))

    test_df['cleaned_content'] = test_df.apply(_clean_row, axis=1)

    # Project the cleaned text with the already-fitted vectorizer
    X_test = vectorizer.transform(test_df['cleaned_content'])

    print("Making predictions...")
    predictions = model.predict(X_test)

    # Column i of the prediction matrix belongs to TECHNIQUES[i]
    columns = {'id': test_df['id']}
    for col_idx, technique in enumerate(TECHNIQUES):
        columns[technique] = predictions[:, col_idx]
    submission_df = pd.DataFrame(columns)

    submission_df.to_csv(output_file_path, index=False)
    print(f"Predictions saved to {output_file_path}")

    return submission_df

# Both baselines are scored on the same test file
test_csv_path = "/content/test (1).csv"

# Predict using SVM
svm_submission_path = os.path.join(SAVE_PATH, "svm_submission.csv")
predict_on_test(svm_model, tfidf_vectorizer, test_csv_path, svm_submission_path)

# Predict using XGBoost (left as the final expression so the cell displays it)
xgb_submission_path = os.path.join(SAVE_PATH, "xgb_submission.csv")
predict_on_test(xgb_model, tfidf_vectorizer, test_csv_path, xgb_submission_path)



Loading test data from /content/test (1).csv...
Making predictions...
Predictions saved to ./baseline_models/svm_submission.csv
Loading test data from /content/test (1).csv...
Making predictions...
Predictions saved to ./baseline_models/xgb_submission.csv


Unnamed: 0,id,straw_man,appeal_to_fear,fud,bandwagon,whataboutism,loaded_language,glittering_generalities,euphoria,cherry_picking,cliche
0,521cd2e8-dd9f-42c4-98ba-c0c8890ff1ba,0,0,0,0,0,1,0,0,0,0
1,9b2a61e4-d14e-4ff7-b304-e73d720319bf,0,0,0,0,0,0,0,0,0,0
2,f0f1c236-80a8-4d25-b30c-a420a39be632,0,0,0,0,0,1,0,0,0,0
3,31ea05ba-2c2b-4b84-aba7-f3cf6841b204,0,0,0,0,0,0,0,0,0,0
4,a79e13ec-6d9a-40b5-b54c-7f4f743a7525,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5730,e8e22b6d-0068-4afb-b606-4a1baa8a8d4c,0,0,1,0,0,1,0,0,1,0
5731,8b1d69b4-69ce-4e40-b4ba-dd2f370a8b6f,0,0,1,0,0,1,0,0,1,0
5732,c2246217-3358-4f61-bda8-e2ec21aed5b2,0,0,0,0,0,1,0,0,0,0
5733,45aa63c4-2248-4a0e-8f66-f3d23b6828ed,0,0,0,0,0,1,0,0,0,0


In [None]:
# If solution file exists, evaluate on test set
def evaluate_test_predictions(solution_file, prediction_file):
    """Score a submission CSV against a solution CSV with macro F1.

    When both files carry an 'id' column holding the same unique ids in a
    different order, the predictions are reordered to match the solution
    before scoring. If the ids cannot be matched that way, a warning is
    printed and rows are compared by position.

    Args:
        solution_file: Path to the ground-truth CSV; must contain every
            column named in TECHNIQUES.
        prediction_file: Path to the submission CSV with the same columns.

    Returns:
        float | None: The macro F1 score, or None when ``solution_file`` does
        not exist.
    """
    if not os.path.exists(solution_file):
        print(f"Solution file {solution_file} not found. Cannot evaluate test predictions.")
        return None

    print(f"Evaluating predictions against {solution_file}...")
    solution_df = pd.read_csv(solution_file)
    prediction_df = pd.read_csv(prediction_file)

    # Positional comparison is only valid when both files list rows in the
    # same order, so realign on 'id' whenever that can be done unambiguously.
    has_ids = 'id' in solution_df.columns and 'id' in prediction_df.columns
    if has_ids and not solution_df['id'].equals(prediction_df['id']):
        same_ids = (
            solution_df['id'].is_unique
            and prediction_df['id'].is_unique
            and set(solution_df['id']) == set(prediction_df['id'])
        )
        if same_ids:
            prediction_df = (
                prediction_df.set_index('id')
                .loc[solution_df['id']]
                .reset_index()
            )
        else:
            print("Warning: 'id' columns of solution and prediction files do not match; "
                  "comparing rows by position.")

    y_true = solution_df[TECHNIQUES]
    y_pred = prediction_df[TECHNIQUES]

    # zero_division=0 yields the same scores as the default but without the
    # undefined-metric warnings for labels/samples lacking positives.
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    print(f"Test Macro F1 Score: {macro_f1:.4f}")

    # Detailed report
    print("\nClassification Report on Test Data:")
    print(classification_report(
        y_true,
        y_pred,
        target_names=TECHNIQUES,
        zero_division=0
    ))

    return macro_f1

# Evaluate if solution file exists.
# A single path is used for both the existence check and the evaluation calls;
# previously the check looked at "solution.csv" relative to the working
# directory while the evaluation read "/content/solution.csv".
solution_file = "/content/solution.csv"
if os.path.exists(solution_file):
    print("\nEvaluating SVM model on test data:")
    svm_test_f1 = evaluate_test_predictions(solution_file, svm_submission_path)

    print("\nEvaluating XGBoost model on test data:")
    xgb_test_f1 = evaluate_test_predictions(solution_file, xgb_submission_path)

    # Compare models; a test score of None is rendered as 'N/A'
    print("\nModel Comparison:")
    print(f"SVM Validation F1: {svm_f1_macro:.4f}, Test F1: {'N/A' if svm_test_f1 is None else f'{svm_test_f1:.4f}'}")
    print(f"XGBoost Validation F1: {xgb_f1_macro:.4f}, Test F1: {'N/A' if xgb_test_f1 is None else f'{xgb_test_f1:.4f}'}")
else:
    print(f"Solution file {solution_file} not found. Evaluation on test data not possible.")

print("Baseline modeling complete!")



Evaluating SVM model on test data:
Evaluating predictions against /content/solution.csv...
Test Macro F1 Score: 0.3060

Classification Report on Test Data:
                         precision    recall  f1-score   support

              straw_man       0.18      0.06      0.09       207
         appeal_to_fear       0.27      0.18      0.22       449
                    fud       0.40      0.35      0.37       576
              bandwagon       0.25      0.06      0.09       236
           whataboutism       0.19      0.07      0.10       235
        loaded_language       0.67      0.67      0.67      2959
glittering_generalities       0.61      0.51      0.55       723
               euphoria       0.44      0.36      0.39       695
         cherry_picking       0.38      0.39      0.39       768
                 cliche       0.22      0.16      0.19       695

              micro avg       0.52      0.44      0.48      7543
              macro avg       0.36      0.28      0.31      7543

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
