Task 1 - Method 1 - BERT Classifier Technique 

In [49]:
!pip install transformers
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import os


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [50]:
def remove_tags(text):
    return text.replace("<BOS>", "").replace("<EOS>", "").strip()


train_df = pd.read_csv('propaganda_train.tsv',
                       delimiter='\t', names=['label', 'sentence'])
test_df = pd.read_csv('propaganda_val.tsv', delimiter='\t',
                      names=['label', 'sentence'])

train_df['sentence'] = train_df['sentence'].apply(remove_tags)
test_df['sentence'] = test_df['sentence'].apply(remove_tags)

train_df['label'] = train_df['label'].apply(
    lambda x: 0 if x == 'not_propaganda' else 1)
test_df['label'] = test_df['label'].apply(
    lambda x: 0 if x == 'not_propaganda' else 1)

train_df, val_df = train_test_split(
    train_df, test_size=0.1, random_state=42)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [51]:
class PropagandaDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }

In [52]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

train_dataset = PropagandaDataset(
    train_df['sentence'], train_df['label'], tokenizer, max_length)
val_dataset = PropagandaDataset(
    val_df['sentence'], val_df['label'], tokenizer, max_length)
test_dataset = PropagandaDataset(
    test_df['sentence'], test_df['label'], tokenizer, max_length)

batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
epochs = 3
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [53]:
def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
    return total_loss / len(data_loader)


In [54]:
def eval_epoch(model, data_loader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(data_loader)


def get_classification_metrics(model, data_loader, device):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            predictions.extend(preds.tolist())
            true_labels.extend(labels.tolist())

    print(classification_report(true_labels, predictions))


In [55]:
def main():
    mode = input(
        "Enter 'train' to train a new model, or 'load' to load a pre-trained model: ").strip().lower()

    while mode not in ['train', 'load']:
        mode = input(
            "Enter 'train' to train a new model, or 'load' to load a pre-trained model: ").strip().lower()

    if mode == 'train':
        # Train and validate the model
        best_val_loss = float('inf')
        for epoch in range(epochs):
            print(f"Epoch {epoch + 1}/{epochs}")
            train_loss = train_epoch(
                model, train_loader, optimizer, device, scheduler)
            val_loss = eval_epoch(model, val_loader, device)
            print(
                f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), 'bert_task1_model.bin')
                print("Saved best model")

    else:
        # Load the pre-trained model
        model_path = input("Enter the path to the pre-trained model: ").strip()
        model.load_state_dict(torch.load(model_path))
        print("Loaded pre-trained model")

    # Calculate and print classification metrics
    get_classification_metrics(model, test_loader, device)


if __name__ == '__main__':
    main()


Enter 'train' to train a new model, or 'load' to load a pre-trained model: train
Epoch 1/3


100%|██████████| 136/136 [00:49<00:00,  2.77it/s]
100%|██████████| 16/16 [00:01<00:00,  8.55it/s]


Train Loss: 0.5765, Validation Loss: 0.4241
Saved best model
Epoch 2/3


100%|██████████| 136/136 [00:45<00:00,  3.02it/s]
100%|██████████| 16/16 [00:01<00:00,  8.86it/s]


Train Loss: 0.2687, Validation Loss: 0.4410
Epoch 3/3


100%|██████████| 136/136 [00:45<00:00,  2.97it/s]
100%|██████████| 16/16 [00:01<00:00,  8.50it/s]


Train Loss: 0.1104, Validation Loss: 0.5962


100%|██████████| 37/37 [00:04<00:00,  8.54it/s]

              precision    recall  f1-score   support

           0       0.76      0.81      0.79       301
           1       0.78      0.73      0.76       280

    accuracy                           0.77       581
   macro avg       0.77      0.77      0.77       581
weighted avg       0.77      0.77      0.77       581






Task 1, Method 2 - word2vec technique

In [56]:
import os
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [57]:
def remove_tags(text):
    return text.replace("<BOS>", "").replace("<EOS>", "").strip()


train_df = pd.read_csv('propaganda_train.tsv',
                       delimiter='\t', names=['label', 'sentence'])
test_df = pd.read_csv('propaganda_val.tsv',
                      delimiter='\t', names=['label', 'sentence'])

train_df['sentence'] = train_df['sentence'].apply(remove_tags)
test_df['sentence'] = test_df['sentence'].apply(remove_tags)

train_df['label_idx'] = train_df['label'].apply(
    lambda x: 1 if x != 'not_propaganda' else 0)
test_df['label_idx'] = test_df['label'].apply(
    lambda x: 1 if x != 'not_propaganda' else 0)



In [58]:
def get_sentence_vector(sentence, model):
    words = sentence.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)


In [59]:
word2vec_model = Word2Vec(
    sentences=train_df['sentence'], vector_size=100, window=5, min_count=1, workers=4)

train_embeddings = train_df['sentence'].apply(
    lambda s: get_sentence_vector(s, word2vec_model))
test_embeddings = test_df['sentence'].apply(
    lambda s: get_sentence_vector(s, word2vec_model))

X_train, y_train = np.vstack(
    train_embeddings.values), train_df['label_idx'].values
X_test, y_test = np.vstack(test_embeddings.values), test_df['label_idx'].values

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

y_pred_train = classifier.predict(X_train)
y_pred_test = classifier.predict(X_test)




In [60]:
print("Word2Vec + Linear Regression")
print("Training performance:")
print(classification_report(y_train, y_pred_train))
print("Test performance:")
print(classification_report(y_test, y_pred_test))


Word2Vec + Linear Regression
Training performance:
              precision    recall  f1-score   support

           0       0.70      0.94      0.80      1191
           1       0.91      0.60      0.73      1224

    accuracy                           0.77      2415
   macro avg       0.81      0.77      0.76      2415
weighted avg       0.81      0.77      0.76      2415

Test performance:
              precision    recall  f1-score   support

           0       0.71      0.95      0.82       301
           1       0.92      0.59      0.72       280

    accuracy                           0.78       581
   macro avg       0.82      0.77      0.77       581
weighted avg       0.81      0.78      0.77       581



Task 2, Method 1 

In [61]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report


In [62]:
def extract_tags_and_context(text):
    start = text.find("<BOS>")
    end = text.find("<EOS>") + len("<EOS>")
    return text[:start] + text[start+len("<BOS>"):end-len("<EOS>")] + text[end:]


train_df = pd.read_csv('propaganda_train.tsv',
                       delimiter='\t', names=['label', 'sentence'])
test_df = pd.read_csv('propaganda_val.tsv',
                      delimiter='\t', names=['label', 'sentence'])

train_df['snippet_with_context'] = train_df['sentence'].apply(
    extract_tags_and_context)
test_df['snippet_with_context'] = test_df['sentence'].apply(
    extract_tags_and_context)

label_to_idx = {label: idx for idx,
                label in enumerate(train_df['label'].unique())}
idx_to_label = {idx: label for label, idx in label_to_idx.items()}
train_df['label_idx'] = train_df['label'].map(label_to_idx)
test_df['label_idx'] = test_df['label'].map(label_to_idx)


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [63]:
train_encodings = tokenizer(
    train_df['snippet_with_context'].to_list(),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

test_encodings = tokenizer(
    test_df['snippet_with_context'].to_list(),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

In [64]:
class PropagandaDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = PropagandaDataset(
    train_encodings, train_df['label_idx'].values)
test_dataset = PropagandaDataset(test_encodings, test_df['label_idx'].values)

In [65]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(label_to_idx))

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)


trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss,Validation Loss
50,2.2454,2.195591
100,1.7903,1.811183
150,1.9311,1.728102
200,1.676,1.639129
250,1.6396,1.538205
300,1.613,1.522162
350,1.4727,1.448468
400,1.5092,1.556169
450,1.6892,1.517917
500,1.4703,1.360403


  item = {key: torch.tensor(val[idx])


TrainOutput(global_step=906, training_loss=1.420048457897262, metrics={'train_runtime': 961.5783, 'train_samples_per_second': 7.534, 'train_steps_per_second': 0.942, 'total_flos': 1906376518932480.0, 'train_loss': 1.420048457897262, 'epoch': 3.0})

In [66]:
raw_pred, _, _ = trainer.predict(test_dataset)
y_pred = np.argmax(raw_pred, axis=1)


print("Test performance:")
print(classification_report(
    test_df['label_idx'].values, y_pred, target_names=label_to_idx.keys()))

  item = {key: torch.tensor(val[idx])


Test performance:
                           precision    recall  f1-score   support

                    label       0.00      0.00      0.00         1
           not_propaganda       0.65      0.96      0.77       301
              flag_waving       0.41      0.69      0.51        39
          loaded_language       0.20      0.03      0.05        37
                    doubt       0.33      0.03      0.05        38
    name_calling,labeling       0.25      0.16      0.20        31
 appeal_to_fear_prejudice       0.58      0.16      0.25        43
               repetition       0.00      0.00      0.00        32
causal_oversimplification       0.00      0.00      0.00        31
exaggeration,minimisation       0.30      0.29      0.29        28

                 accuracy                           0.58       581
                macro avg       0.27      0.23      0.21       581
             weighted avg       0.47      0.58      0.48       581



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Task 2, Method 2 - Regression Technique

In [67]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [68]:
def extract_tags_and_context(text):
    start = text.find("<BOS>")
    end = text.find("<EOS>") + len("<EOS>")
    return text[:start] + text[start+len("<BOS>"):end-len("<EOS>")] + text[end:]


train_df = pd.read_csv('propaganda_train.tsv',
                       delimiter='\t', names=['label', 'sentence'])
test_df = pd.read_csv('propaganda_val.tsv', delimiter='\t',
                      names=['label', 'sentence'])

train_df['snippet_with_context'] = train_df['sentence'].apply(
    extract_tags_and_context)
test_df['snippet_with_context'] = test_df['sentence'].apply(
    extract_tags_and_context)


In [69]:
label_to_idx = {label: idx for idx,
                label in enumerate(train_df['label'].unique())}
idx_to_label = {idx: label for label, idx in label_to_idx.items()}
train_df['label_idx'] = train_df['label'].map(label_to_idx)
test_df['label_idx'] = test_df['label'].map(label_to_idx)


vectorizer = TfidfVectorizer()
vectorizer.fit(train_df['snippet_with_context'])


X_train = vectorizer.transform(train_df['snippet_with_context'])
y_train = train_df['label_idx'].values
X_test = vectorizer.transform(test_df['snippet_with_context'])
y_test = test_df['label_idx'].values


classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)


y_pred_train = classifier.predict(X_train)
y_pred_test = classifier.predict(X_test)

In [70]:
print("Training performance:")
print(classification_report(y_train, y_pred_train))

print("Test performance:")
print(classification_report(y_test, y_pred_test))


Training performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.54      1.00      0.70      1191
           2       0.85      0.42      0.56       148
           3       0.82      0.06      0.11       154
           4       1.00      0.04      0.08       144
           5       0.68      0.10      0.17       157
           6       0.97      0.19      0.31       151
           7       1.00      0.06      0.12       147
           8       0.88      0.23      0.36       158
           9       0.85      0.18      0.29       164

    accuracy                           0.57      2415
   macro avg       0.76      0.23      0.27      2415
weighted avg       0.71      0.57      0.47      2415

Test performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.54      0.99      0.70       301
           2       0.90      0.23     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
