<a href="https://colab.research.google.com/github/Togotogo98/NLP_tasks/blob/main/NLP_TASK_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#-----------------------------IMPORTS------------------------------------------#

import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import download

#for word2vec embedding
import gensim.downloader as api

#for implementing word2vec neural network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.utils import to_categorical

#for data preparation and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

#for bert approach
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim

# Fetch the NLTK resources used by the preprocessing cells. 'punkt_tab' is the
# tokenizer data that word_tokenize looks up on newer NLTK releases; on older
# releases that do not know the package, download() just reports it and moves on.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    download(_nltk_resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
#-------------------------------MOUNT THE DATA---------------------------------#

# Colab-only step: attach Google Drive so the dataset files become readable.
from google.colab import drive

drive.mount('/content/drive')

# Both TSV splits sit in the same Drive folder.
_DATASET_DIR = "/content/drive/My Drive/ANLP Assignment/propaganda_dataset_v2-1/propaganda_dataset_v2"
TRAIN_DATA_TSV = _DATASET_DIR + "/propaganda_train.tsv"
VAL_DATA_TSV = _DATASET_DIR + "/propaganda_val.tsv"

Mounted at /content/drive


In [None]:
#-----------------------------WORD2VEC DOWNLOAD--------------------------------#

# Pretrained vectors fetched through gensim's downloader.
# Expect the download to take roughly 10-12 minutes.
_WORD2VEC_NAME = "word2vec-google-news-300"
WORD_VECTORS = api.load(_WORD2VEC_NAME)



In [None]:
#Multi-class labels for mapping:

# Technique name -> integer class id. The position in this tuple IS the class id,
# and the dict keeps that order (it is later used to name report rows).
_TECHNIQUES = (
    'flag_waving',
    'appeal_to_fear_prejudice',
    'causal_oversimplification',
    'doubt',
    'exaggeration,minimisation',
    'loaded_language',
    'name_calling,labeling',
    'repetition',
)
label_mapping = {technique: class_id for class_id, technique in enumerate(_TECHNIQUES)}

# **TASK - 2**

**TASK 2 APPROACH 1**

For the first approach, Word2Vec word embeddings are used as input to a neural network.

In [None]:
#------------------------------LOAD DATA---------------------------------------#
# Read both tab-separated splits with identical parser settings.
train_data, val_data = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in (TRAIN_DATA_TSV, VAL_DATA_TSV)
)

In [None]:
# Task 2 is the multi-class problem: keep only the rows that carry a propaganda
# technique and attach the integer class id from label_mapping.
train_data = train_data[train_data['label'] != 'not_propaganda'].copy()
val_data = val_data[val_data['label'] != 'not_propaganda'].copy()

# .map turns any label that is missing from label_mapping into NaN, so an
# unexpected label is reported here instead of surviving as a string in the
# target column (which is what .replace would do).
for _frame in (train_data, val_data):
    _class_ids = _frame['label'].map(label_mapping)
    if _class_ids.isna().any():
        _unknown = _frame.loc[_class_ids.isna(), 'label'].unique().tolist()
        raise ValueError(f"Labels without an entry in label_mapping: {_unknown}")
    _frame['m_label'] = _class_ids.astype('int64')

#test print
train_data.head()

Unnamed: 0,label,tagged_in_context,m_label
2,flag_waving,The Obama administration misled the <BOS> Amer...,0
5,loaded_language,"Hitler <BOS> annihilated <EOS> 400,000 Germans...",5
8,doubt,"As noted above, at this point literally every ...",3
10,"name_calling,labeling",His account was suspended for violating Twitte...,6
12,appeal_to_fear_prejudice,A couple of seemingly unrelated events this pa...,1


In [None]:
# Extract and tokenize span of text:

def extract_span_and_tokenize(text):
  """Return the word tokens of the span marked by <BOS> ... <EOS> in `text`.

  The text is lower-cased, the first <bos>...<eos> span is extracted, every
  character that is neither a word character nor whitespace is removed, and the
  remainder is split with NLTK's word_tokenize.

  Raises:
    ValueError: if `text` contains no <BOS> ... <EOS> pair.
  """
  # Non-greedy so the span stops at the FIRST <eos>; DOTALL so a span that
  # contains a line break is still matched.
  match = re.search(r'<bos>(.*?)<eos>', text.lower(), flags=re.DOTALL)
  if match is None:
    raise ValueError(f"No <BOS> ... <EOS> span found in: {text!r}")

  span = re.sub(r"[^\w\s]", '', match.group(1))
  return word_tokenize(span)

# Add a 'tokens' column (token list of the marked span) to both splits.
for _frame in (train_data, val_data):
  _frame['tokens'] = _frame['tagged_in_context'].apply(extract_span_and_tokenize)

#test print
train_data.head()

Unnamed: 0,label,tagged_in_context,m_label,tokens
2,flag_waving,The Obama administration misled the <BOS> Amer...,0,"[american, people]"
5,loaded_language,"Hitler <BOS> annihilated <EOS> 400,000 Germans...",5,[annihilated]
8,doubt,"As noted above, at this point literally every ...",3,"[socalled, evidence]"
10,"name_calling,labeling",His account was suspended for violating Twitte...,6,"[hateful, conduct]"
12,appeal_to_fear_prejudice,A couple of seemingly unrelated events this pa...,1,"[point, to, irans, positioning, itself, for, m..."


In [None]:
#-----------------------VECTORIZE USING WORD_VECTORS---------------------------#

def vectorize(tokens, word_vectors=None, dim=None):
  """Average the embedding vectors of the known words in `tokens`.

  Args:
    tokens: iterable of word strings.
    word_vectors: any mapping-like object supporting `word in obj` and
      `obj[word]`. Defaults to the notebook-level WORD_VECTORS.
    dim: length of the returned vector. Defaults to `word_vectors.vector_size`
      when that attribute exists, otherwise 300.

  Returns:
    A float64 numpy array of length `dim`: the mean of the vectors of all
    tokens found in `word_vectors`, or all zeros when none is found.
  """
  if word_vectors is None:
    word_vectors = WORD_VECTORS
  if dim is None:
    dim = getattr(word_vectors, 'vector_size', 300)

  total = np.zeros(dim)
  hits = 0
  for word in tokens:
    # Out-of-vocabulary words are skipped and do not count towards the mean.
    if word in word_vectors:
      total += word_vectors[word]
      hits += 1

  if hits:
    total /= hits
  return total

# Add a 'vectorized' column (one averaged embedding per row) to both splits.
for _frame in (train_data, val_data):
  _frame['vectorized'] = _frame['tokens'].apply(vectorize)

#test print
train_data.head()

Unnamed: 0,label,tagged_in_context,m_label,tokens,vectorized
2,flag_waving,The Obama administration misled the <BOS> Amer...,0,"[american, people]","[0.03857421875, -0.00128173828125, 0.140441894..."
5,loaded_language,"Hitler <BOS> annihilated <EOS> 400,000 Germans...",5,[annihilated],"[0.07470703125, 0.25, 0.48828125, -0.106445312..."
8,doubt,"As noted above, at this point literally every ...",3,"[socalled, evidence]","[-0.12451171875, 0.091796875, 0.090087890625, ..."
10,"name_calling,labeling",His account was suspended for violating Twitte...,6,"[hateful, conduct]","[-0.046875, 0.00994873046875, 0.151123046875, ..."
12,appeal_to_fear_prejudice,A couple of seemingly unrelated events this pa...,1,"[point, to, irans, positioning, itself, for, m...","[0.0011930465698242188, 0.08247756958007812, -..."


In [None]:
# Build model-ready arrays: hold out 20% of the training file as a test split
# and keep the separate validation file as it is.
num_classes = 8
train_set, test_set = train_test_split(train_data, test_size=0.2, random_state=42)


def _features_and_onehot(frame):
    """Return (stacked feature matrix, one-hot label matrix) for a prepared frame."""
    features = np.stack(frame['vectorized'].values)
    targets = to_categorical(frame['m_label'].values, num_classes=num_classes)
    return features, targets


x_train, y_train = _features_and_onehot(train_set)
x_test, y_test = _features_and_onehot(test_set)
x_val, y_val = _features_and_onehot(val_data)

In [None]:
#----------------------------NEURAL NETWORK------------------------------------#
# Feed-forward classifier over the averaged embeddings. The input width and the
# number of output units are taken from the prepared arrays so they cannot drift
# apart from the data; an explicit Input replaces the `input_dim` argument on
# the first Dense layer.
model = Sequential([
    tf.keras.Input(shape=(x_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
#-----------------------TRAINING AND EVALUATION--------------------------------#

# Fit for 30 epochs, scoring the validation arrays after every epoch.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=30,
    batch_size=32,
)

# Score the trained network once more on the validation arrays
# (x_test / y_test are not used in this cell).
final_loss, final_accuracy = model.evaluate(x_val, y_val)

print("\nFinal Evaluation Results on Validation set:\n")
print(f"Validation loss: {final_loss:.2f},\nValidation accuracy: {final_accuracy:.2f}")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

Final Evaluation Results on Validation set:

Validation loss: 1.54,
Validation accuracy: 0.52


In [None]:
# Predict on the held-out test split and reduce both the predicted
# probabilities and the one-hot targets to class ids.
y_pred = model.predict(x_test)

y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Pass the class ids explicitly, ordered to match the names, so the report is
# still produced (and rows stay correctly named) when a class happens to be
# absent from both the targets and the predictions of this split.
_class_names = sorted(label_mapping, key=label_mapping.get)
_class_ids = [label_mapping[name] for name in _class_names]

print("\n---------------------------Performance Report------------------------\n")
print(classification_report(y_test_classes, y_pred_classes, labels=_class_ids, target_names=_class_names))


---------------------------Performance Report------------------------

                           precision    recall  f1-score   support

              flag_waving       0.59      0.79      0.68        24
 appeal_to_fear_prejudice       0.58      0.52      0.55        27
causal_oversimplification       0.43      0.53      0.48        30
                    doubt       0.44      0.41      0.43        29
exaggeration,minimisation       0.56      0.62      0.59        37
          loaded_language       0.44      0.38      0.41        32
    name_calling,labeling       0.52      0.47      0.49        34
               repetition       0.42      0.34      0.38        32

                 accuracy                           0.50       245
                macro avg       0.50      0.51      0.50       245
             weighted avg       0.50      0.50      0.50       245



**TASK 2 APPROACH 2**

For the second approach, a neural network classifier is built on top of the pretrained BERT model ('bert-base-uncased').

*Ref*: *The following code is adapted from Lab10Solution.ipynb, with the code structure changed to suit the task requirements.*

In [None]:
# Use the GPU when one is visible (in Colab: switch the runtime to T4 to avoid
# RAM crashes); otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Run the IMPORTS and MOUNT DATA cells at the top before running this cell.
# Load fresh copies of the train and validation files for the BERT approach:
train_data_bert = pd.read_csv(TRAIN_DATA_TSV, sep='\t')
val_data_bert = pd.read_csv(VAL_DATA_TSV, sep='\t')

# Remove all 'not_propaganda' labelled rows since only propaganda data is needed
train_data_bert = train_data_bert[train_data_bert['label'] != 'not_propaganda'].copy()
val_data_bert = val_data_bert[val_data_bert['label'] != 'not_propaganda'].copy()

# .map turns any label that is missing from label_mapping into NaN, so an
# unexpected label is reported here instead of surviving as a string in the
# target column (which is what .replace would do).
for _frame in (train_data_bert, val_data_bert):
    _class_ids = _frame['label'].map(label_mapping)
    if _class_ids.isna().any():
        _unknown = _frame.loc[_class_ids.isna(), 'label'].unique().tolist()
        raise ValueError(f"Labels without an entry in label_mapping: {_unknown}")
    _frame['m_label'] = _class_ids.astype('int64')

#test print
train_data_bert.head()

Unnamed: 0,label,tagged_in_context,m_label
2,flag_waving,The Obama administration misled the <BOS> Amer...,0
5,loaded_language,"Hitler <BOS> annihilated <EOS> 400,000 Germans...",5
8,doubt,"As noted above, at this point literally every ...",3
10,"name_calling,labeling",His account was suspended for violating Twitte...,6
12,appeal_to_fear_prejudice,A couple of seemingly unrelated events this pa...,1


In [None]:
# Extract span of text:

def extract_span(text):
  """Return the cleaned text of the span marked by <BOS> ... <EOS> in `text`.

  The text is lower-cased, the first <bos>...<eos> span is extracted and every
  character that is neither a word character nor whitespace is removed.
  Surrounding whitespace inside the markers is kept.

  Raises:
    ValueError: if `text` contains no <BOS> ... <EOS> pair.
  """
  # Non-greedy so the span stops at the FIRST <eos>; DOTALL so a span that
  # contains a line break is still matched.
  match = re.search(r'<bos>(.*?)<eos>', text.lower(), flags=re.DOTALL)
  if match is None:
    raise ValueError(f"No <BOS> ... <EOS> span found in: {text!r}")
  return re.sub(r"[^\w\s]", '', match.group(1))

# Add a 'sentence' column (cleaned text of the marked span) to both splits.
for _frame in (train_data_bert, val_data_bert):
  _frame['sentence'] = _frame['tagged_in_context'].apply(extract_span)

#test print
train_data_bert.head()

Unnamed: 0,label,tagged_in_context,m_label,sentence
2,flag_waving,The Obama administration misled the <BOS> Amer...,0,american people
5,loaded_language,"Hitler <BOS> annihilated <EOS> 400,000 Germans...",5,annihilated
8,doubt,"As noted above, at this point literally every ...",3,socalled evidence
10,"name_calling,labeling",His account was suspended for violating Twitte...,6,hateful conduct
12,appeal_to_fear_prejudice,A couple of seemingly unrelated events this pa...,1,point to irans positioning itself for more ag...


In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(_BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
#----------------------------------DATASET-------------------------------------#

class PropagandaDataset(Dataset):
    """Torch dataset that tokenizes one span per item.

    Args:
        data: frame-like object with a 'sentence' column (text) and an
            'm_label' column (integer class id).
        tokenizer: callable tokenizer; it is invoked as
            tokenizer(text, max_length=..., padding=..., truncation=...,
            return_tensors='pt') and must return 'input_ids' and
            'attention_mask' tensors with a leading batch dimension of 1.
        max_len: every item is padded / truncated to exactly this many tokens.
            NOTE(review): the default pads each span to 512 positions; a much
            smaller value is probably enough for short spans - confirm against
            the span lengths in the data before lowering it.
    """

    def __init__(self, data, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.text = data['sentence'].tolist()
        self.labels = data['m_label'].tolist()
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        # Call the tokenizer directly instead of the older encode_plus entry point.
        encoding = self.tokenizer(
            str(self.text[idx]).lower(),
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Drop the batch dimension of size 1; the DataLoader adds its own.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [None]:
# Split the training file 80/20 into train / test, then wrap every split in a
# Dataset and a DataLoader.
train_data_bert_80, test_data_bert_20 = train_test_split(train_data_bert, test_size=0.2, random_state=42)

train_dataset = PropagandaDataset(train_data_bert_80, tokenizer)
test_dataset = PropagandaDataset(test_data_bert_20, tokenizer)
val_dataset = PropagandaDataset(val_data_bert, tokenizer)

# Only the training loader is shuffled. The evaluation loaders keep a fixed
# order so that the reported loss (a mean over per-batch losses, including a
# possibly smaller last batch) is the same on every run.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [None]:
#-------------------------------CLASSIFIER-------------------------------------#

class PropagandaClassifier(nn.Module):
    """Classification head on top of a BERT-style encoder.

    Args:
        bert_model: encoder module. It must expose `config.hidden_size` and,
            when called with `return_dict=False`, return a tuple whose second
            element is the pooled output of shape (batch, hidden_size).
        num_classes: number of output logits.

    The constructor does not place anything on a device; call `.to(device)` on
    the instance, which moves the encoder and the head together.
    """

    def __init__(self, bert_model, num_classes=8):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 128)
        self.leakyrelu = nn.LeakyReLU(0.1)
        self.fc2 = nn.Linear(128, 32)
        self.out = nn.Linear(32, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits of shape (batch, num_classes)."""
        # Only the pooled output is used; the per-token states are discarded.
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        x = self.dropout(self.leakyrelu(self.fc1(pooled_output)))
        x = self.dropout(self.leakyrelu(self.fc2(x)))
        # No softmax here: the notebook trains with nn.CrossEntropyLoss on raw logits.
        return self.out(x)

In [None]:
#----------------------------TRAINING FUNCTION---------------------------------#

def _run_epoch(model, loader, loss_fn, device, optimizer=None):
    """Run one pass over `loader`; update weights only when `optimizer` is given.

    Returns:
        (mean of the per-batch losses, fraction of correctly predicted items).
        Both are NaN when the loader yields no batches.
    """
    loss_sum = 0.0
    num_correct = 0
    num_items = 0
    num_batches = 0

    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        if optimizer is not None:
            optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        if optimizer is not None:
            loss.backward()
            optimizer.step()

        loss_sum += loss.item()
        _, preds = torch.max(logits, dim=1)
        num_correct += (preds == labels).sum().item()
        num_items += labels.size(0)
        num_batches += 1

    mean_loss = loss_sum / num_batches if num_batches else float('nan')
    accuracy = num_correct / num_items if num_items else float('nan')
    return mean_loss, accuracy


def train_model(model, train_loader, val_loader, optimizer, loss_fn, device, epochs=3):
    """Train `model` and score it on `val_loader` after every epoch.

    Args:
        model: module called as model(input_ids, attention_mask) -> logits.
        train_loader, val_loader: iterables of batches; each batch is a dict
            with 'input_ids', 'attention_mask' and 'labels' tensors.
        optimizer: optimizer over the model parameters.
        loss_fn: callable loss_fn(logits, labels) -> scalar tensor.
        device: device the batch tensors are moved to.
        epochs: number of passes over `train_loader`.

    Returns:
        A list with one dict per epoch, holding 'epoch', 'train_loss',
        'train_accuracy', 'val_loss' and 'val_accuracy'. The same numbers are
        also printed. The model is left in eval mode.
    """
    history = []
    for epoch in range(epochs):
        model.train()
        average_train_loss, train_accuracy = _run_epoch(model, train_loader, loss_fn, device, optimizer)
        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {average_train_loss}, Train Accuracy: {train_accuracy:.4f}")

        # Evaluation on validation data: no dropout, no gradient tracking.
        model.eval()
        with torch.no_grad():
            average_val_loss, val_accuracy = _run_epoch(model, val_loader, loss_fn, device)
        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {average_val_loss}, Validation Accuracy: {val_accuracy:.4f}")

        history.append({
            'epoch': epoch + 1,
            'train_loss': average_train_loss,
            'train_accuracy': train_accuracy,
            'val_loss': average_val_loss,
            'val_accuracy': val_accuracy,
        })
    return history

In [None]:
#----------------------------EVALUATE FUNCTION---------------------------------#

def evaluate_model(model, data_loader, loss_fn, device):
    """Score `model` on `data_loader` and print a per-class report.

    Args:
        model: module called as model(input_ids, attention_mask) -> logits.
        data_loader: iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask' and 'labels' tensors.
        loss_fn: callable loss_fn(logits, labels) -> scalar tensor.
        device: device the batch tensors are moved to.

    Returns:
        {'loss': mean of the per-batch losses, 'accuracy': fraction correct}.
        Both values are NaN when the loader yields no batches. The same numbers
        and a classification report are printed.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask)
            total_loss += loss_fn(logits, labels).item()
            num_batches += 1

            _, preds = torch.max(logits, dim=1)
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())

    num_correct = sum(pred == true for pred, true in zip(all_preds, all_labels))
    average_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = num_correct / len(all_labels) if all_labels else float('nan')
    print(f"Test Loss: {average_loss:.4f}, Test Accuracy: {accuracy:.4f}")

    # Pass the class ids explicitly, ordered to match the names, so the report
    # is still produced when a class is absent from both targets and predictions.
    class_names = sorted(label_mapping, key=label_mapping.get)
    class_ids = [label_mapping[name] for name in class_names]

    # Classification report:
    print("\nClassification Report:\n")
    print(classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names, zero_division=0))

    return {'loss': average_loss, 'accuracy': accuracy}


In [None]:
# Optional housekeeping: clear PyTorch's CUDA cache. Run this cell if CUDA
# reports memory-allocation problems before (re)starting training.
torch.cuda.empty_cache()

In [None]:
#-----------------------TRAINING THE CLASSIFIER--------------------------------#
# Wrap the pretrained encoder in the classification head and move the whole
# module to the active device.
bert_classifier = PropagandaClassifier(bert_model)
bert_classifier = bert_classifier.to(device)

# Loss on raw logits; Adam over every parameter (encoder and head).
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(bert_classifier.parameters(), lr=2e-5)

train_model(bert_classifier, train_loader, val_loader, optimizer, loss_fn, device, epochs=3)

Epoch 1/3, Train Loss: 2.0131572285476995, Train Accuracy: 0.2045
Epoch 1/3, Validation Loss: 1.953461710044316, Validation Accuracy: 0.2115
Epoch 2/3, Train Loss: 1.9121048859187535, Train Accuracy: 0.2638
Epoch 2/3, Validation Loss: 1.8407691257340568, Validation Accuracy: 0.2545
Epoch 3/3, Train Loss: 1.7856709086165137, Train Accuracy: 0.3119
Epoch 3/3, Validation Loss: 1.7312098247664316, Validation Accuracy: 0.2796


In [None]:
# Score the fine-tuned classifier on the held-out 20% test split, using the
# same cross-entropy criterion that was created for training.
evaluate_model(bert_classifier, test_loader, loss_fn, device)

Test Loss: 1.7762, Test Accuracy: 0.2816

Classification Report:

                           precision    recall  f1-score   support

              flag_waving       0.23      0.75      0.35        24
 appeal_to_fear_prejudice       0.00      0.00      0.00        27
causal_oversimplification       0.38      0.87      0.53        30
                    doubt       0.00      0.00      0.00        29
exaggeration,minimisation       0.00      0.00      0.00        37
          loaded_language       0.21      0.09      0.13        32
    name_calling,labeling       0.00      0.00      0.00        34
               repetition       0.26      0.69      0.38        32

                 accuracy                           0.28       245
                macro avg       0.14      0.30      0.17       245
             weighted avg       0.13      0.28      0.17       245

