<a href="https://colab.research.google.com/github/Togotogo98/NLP_tasks/blob/main/NLP_TASK_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#-----------------------------IMPORTS------------------------------------------#

import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import download

#for word2vec embedding
import gensim.downloader as api

#for implementing word2vec neural network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

#for data preparation and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

#for bert approach
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim

# NLTK resources required by the Word2Vec preprocessing cells:
#   punkt / punkt_tab -> word_tokenize. Recent NLTK releases look up 'punkt_tab'
#                        and raise LookupError when only the legacy 'punkt'
#                        package is installed, so both are requested.
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    download(nltk_resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
#-------------------------------MOUNT THE DATA---------------------------------#

# Colab-only helper: exposes the user's Google Drive inside the runtime filesystem.
from google.colab import drive
# Mount point used by every path below; Colab asks for authorisation on first use.
drive.mount('/content/drive')

#TSV file paths
# Training split (later divided 80/20 into train/test) and the separate validation split.
TRAIN_DATA_TSV = "/content/drive/My Drive/ANLP Assignment/propaganda_dataset_v2-1/propaganda_dataset_v2/propaganda_train.tsv"
VAL_DATA_TSV = "/content/drive/My Drive/ANLP Assignment/propaganda_dataset_v2-1/propaganda_dataset_v2/propaganda_val.tsv"

Mounted at /content/drive


In [None]:
#-----------------------------WORD2VEC DOWNLOAD--------------------------------#

# Load the pretrained "word2vec-google-news-300" embeddings through gensim's downloader API.
# The result is used as a word -> vector lookup by the vectorize() cell.
# NOTE(review): presumably a large download that gensim caches after the first run — confirm.
WORD_VECTORS = api.load("word2vec-google-news-300")



# **TASK - 1**

**TASK 1 APPROACH 1**

For the first approach, each sentence is represented by the average of its Word2Vec word embeddings, which is fed to a feed-forward neural network with two hidden layers.

In [None]:
#------------------------------LOAD DATA---------------------------------------#
# Both splits are tab-separated files; share the reader options between them.
tsv_options = {'sep': '\t'}
train_data = pd.read_csv(TRAIN_DATA_TSV, **tsv_options)
val_data = pd.read_csv(VAL_DATA_TSV, **tsv_options)

In [None]:
# Adding binary labels for Task 1.
# 0 - Not Propaganda
# 1 - Propaganda

# Collapse every label other than 'not_propaganda' into the positive class (1)
# on both splits.
for split_frame in (train_data, val_data):
    split_frame['bi_label'] = np.where(split_frame['label'] == 'not_propaganda', 0, 1)

# Preview the first rows (shown as the cell output).
train_data.head(5)

Unnamed: 0,label,tagged_in_context,bi_label
0,not_propaganda,"No, <BOS> he <EOS> will not be confirmed.",0
1,not_propaganda,This declassification effort <BOS> won’t make ...,0
2,flag_waving,The Obama administration misled the <BOS> Amer...,1
3,not_propaganda,“It looks like we’re capturing the demise of t...,0
4,not_propaganda,"<BOS> Location: Westerville, Ohio <EOS>",0


In [None]:
# Tokenize for Word2Vec:

# Built once and shared by every call: previously the stop-word set and the
# lemmatizer were re-created for each sentence (i.e. once per DataFrame row).
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
# Matches the lower-cased span markers and any character that is neither a
# word character nor whitespace.
_TAG_OR_PUNCT_RE = re.compile(r"<bos>|<eos>|[^\w\s]")


def tokenize(text):
    """Turn one raw sentence into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased, the <BOS>/<EOS> markers and all punctuation are
    removed, the remainder is split with ``word_tokenize``, English stop words
    are dropped and each surviving word is lemmatized.

    Args:
        text: The raw sentence. Anything that is not a ``str`` (for example a
            NaN produced by an empty TSV cell) yields an empty token list
            instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        list[str]: The processed tokens, in sentence order.
    """
    if not isinstance(text, str):
        return []

    cleaned = _TAG_OR_PUNCT_RE.sub('', text.lower())
    # The stop-word test runs on the surface form, before lemmatization.
    return [
        _LEMMATIZER.lemmatize(word)
        for word in word_tokenize(cleaned)
        if word not in _STOP_WORDS
    ]

# Tokenize the tagged sentences of both splits (training first, then validation).
for split_frame in (train_data, val_data):
    split_frame['tokens'] = split_frame['tagged_in_context'].apply(tokenize)

In [None]:
#-----------------------VECTORIZE USING WORD_VECTORS---------------------------#

def vectorize(tokens, word_vectors=None, dim=None):
    """Average the embedding vectors of ``tokens`` into one sentence vector.

    Tokens missing from the embedding table are skipped. When no token is
    known (or ``tokens`` is empty) the all-zero vector is returned.

    Args:
        tokens: Iterable of word strings.
        word_vectors: Mapping-like object supporting ``word in word_vectors``
            and ``word_vectors[word]``. Defaults to the notebook-level
            ``WORD_VECTORS``, which preserves the original one-argument call.
        dim: Length of the returned vector. Defaults to
            ``word_vectors.vector_size`` when that attribute exists and to 300
            (the value previously hard-coded) otherwise.

    Returns:
        numpy.ndarray: float64 vector of shape ``(dim,)``.
    """
    if word_vectors is None:
        word_vectors = WORD_VECTORS
    if dim is None:
        dim = getattr(word_vectors, 'vector_size', 300)

    # Accumulate in float64 (np.zeros default), exactly as the original did.
    total = np.zeros(dim)
    known = 0

    for word in tokens:
        if word in word_vectors:
            total += word_vectors[word]
            known += 1

    # Guard against division by zero when nothing was found.
    if known:
        total /= known

    return total

# Attach one averaged embedding per sentence to each split.
for split_frame in (train_data, val_data):
    split_frame['vectorized'] = split_frame['tokens'].apply(vectorize)

# Preview the enriched training frame (shown as the cell output).
train_data.head(5)

Unnamed: 0,label,tagged_in_context,bi_label,tokens,vectorized
0,not_propaganda,"No, <BOS> he <EOS> will not be confirmed.",0,[confirmed],"[-0.0050048828125, -0.2041015625, -0.044433593..."
1,not_propaganda,This declassification effort <BOS> won’t make ...,0,"[declassification, effort, wont, make, thing, ...","[0.044994354248046875, 0.0304107666015625, 0.0..."
2,flag_waving,The Obama administration misled the <BOS> Amer...,1,"[obama, administration, misled, american, peop...","[0.054951985677083336, 0.062123616536458336, 0..."
3,not_propaganda,“It looks like we’re capturing the demise of t...,0,"[look, like, capturing, demise, dark, vortex, ...","[-0.0194244384765625, 0.08739356994628907, 0.0..."
4,not_propaganda,"<BOS> Location: Westerville, Ohio <EOS>",0,"[location, westerville, ohio]","[0.081787109375, -0.0255126953125, 0.055297851..."


In [None]:
# Train and val data preparation for training and evaluation:

def _features_and_labels(frame):
    """Return ``(x, y)``: the stacked 'vectorized' rows and the 'bi_label' values."""
    return np.stack(frame['vectorized'].values), frame['bi_label'].values


# Hold out 20% of the training file as a test set; the fixed seed keeps the
# split identical between runs.
train_set, test_set = train_test_split(train_data, test_size=0.2, random_state=42)

x_train, y_train = _features_and_labels(train_set)
x_test, y_test = _features_and_labels(test_set)
x_val, y_val = _features_and_labels(val_data)

In [None]:
#----------------------------NEURAL NETWORK------------------------------------#
# Seed Python, NumPy and TensorFlow so that weight initialisation, batch
# shuffling and dropout masks repeat between runs.
tf.keras.utils.set_random_seed(42)

# Feed-forward binary classifier over the 300-dimensional averaged embeddings.
# The input shape is declared with an explicit Input object: passing
# `input_dim` to the first Dense layer is the legacy form and triggers a
# warning in current Keras releases.
model = Sequential([
    tf.keras.Input(shape=(300,)),
    Dense(128, activation='relu'),
    Dropout(0.1),
    Dense(64, activation='relu'),
    Dropout(0.1),
    # Single sigmoid unit: output is read as the probability of class 1.
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
#-----------------------TRAINING AND EVALUATION--------------------------------#

# Fit on the 80% training split; the validation split is only monitored.
fit_options = dict(epochs=10, validation_data=(x_val, y_val), batch_size=32)
history = model.fit(x_train, y_train, **fit_options)

# Evaluate the model on the final validation dataset
val_scores = model.evaluate(x_val, y_val)
final_loss, final_accuracy = val_scores

print("\nFinal Evaluation Results on Validation set:\n")
print(f"Validation loss: {final_loss:.2f},\nValidation accuracy: {final_accuracy:.2f}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Final Evaluation Results on Validation set:

Validation loss: 0.65,
Validation accuracy: 0.72


In [None]:
# Predict on test data
# Sigmoid outputs above 0.5 are read as the positive (propaganda) class.
predicted_prob = model.predict(x_test)
y_pred = (predicted_prob > 0.5).astype(int).flatten()

print("\n-------------------Performance Report------------------\n")
class_names = ['Non propaganda', 'propaganda']
print(classification_report(y_test, y_pred, target_names=class_names))


-------------------Performance Report------------------

                precision    recall  f1-score   support

Non propaganda       0.75      0.71      0.73       241
    propaganda       0.73      0.76      0.74       242

      accuracy                           0.74       483
     macro avg       0.74      0.74      0.74       483
  weighted avg       0.74      0.74      0.74       483



**TASK 1 APPROACH 2**

For the second approach, a classifier is built on top of the pretrained BERT model ('bert-base-uncased') and fine-tuned on the task.

*Ref*: *The following code is adapted from Lab10Solution.ipynb; its structure has been changed to suit the task requirements.*

In [None]:
# Prefer the GPU when one is available (in Colab, switch the runtime to a T4
# GPU to avoid RAM crashes); otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
# Run the IMPORTS and MOUNT DATA cells at the top before running this cell.

def _load_split_for_bert(tsv_path):
    """Read one TSV split and add the columns the BERT pipeline consumes.

    'sentence' is 'tagged_in_context' with the <BOS>/<EOS> markers removed;
    'bi_label' is 0 for 'not_propaganda' and 1 for every other label.
    """
    frame = pd.read_csv(tsv_path, sep='\t')
    frame['sentence'] = frame['tagged_in_context'].replace({'<BOS>': '', '<EOS>': ''}, regex=True)
    frame['bi_label'] = np.where(frame['label'] == 'not_propaganda', 0, 1)
    return frame


# Fresh copies of both splits, independent of the Word2Vec frames above.
train_data_bert = _load_split_for_bert(TRAIN_DATA_TSV)
val_data_bert = _load_split_for_bert(VAL_DATA_TSV)

# Preview the first rows (shown as the cell output).
train_data_bert.head(5)

Unnamed: 0,label,tagged_in_context,sentence,bi_label
0,not_propaganda,"No, <BOS> he <EOS> will not be confirmed.","No, he will not be confirmed.",0
1,not_propaganda,This declassification effort <BOS> won’t make ...,This declassification effort won’t make thing...,0
2,flag_waving,The Obama administration misled the <BOS> Amer...,The Obama administration misled the American ...,1
3,not_propaganda,“It looks like we’re capturing the demise of t...,“It looks like we’re capturing the demise of t...,0
4,not_propaganda,"<BOS> Location: Westerville, Ohio <EOS>","Location: Westerville, Ohio",0


In [None]:
# Bert
# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
#----------------------------------DATASET-------------------------------------#

class PropagandaDataset(Dataset):
    """Map-style dataset that tokenizes sentences on access.

    Args:
        data: Table-like object whose ``'sentence'`` and ``'bi_label'``
            columns support ``.tolist()`` (e.g. a pandas DataFrame).
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments and
            returning ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Every item is padded / truncated to exactly this many tokens.
    """

    def __init__(self, data, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.text = data['sentence'].tolist()
        self.labels = data['bi_label'].tolist()
        self.max_len = max_len

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return the encoded sentence and its label as a dict of tensors.

        Keys: ``'input_ids'`` and ``'attention_mask'`` (1-D, length
        ``max_len``) and ``'labels'`` (0-D ``torch.long``).
        """
        text = str(self.text[idx])
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which takes the same arguments used here.
        encoding = self.tokenizer(
            text.lower(),
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' yields a leading batch axis of size 1; drop it
            # so the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [None]:
# Prepare Data into Dataset and DataLoader for training and evaluation:
train_data_bert_80, test_data_bert_20 = train_test_split(train_data_bert, test_size=0.2, random_state=42)


def _longest_token_count(frames, tokenizer, cap=512):
    """Return the largest encoded length (special tokens included) over all sentences.

    Sentences are prepared the same way PropagandaDataset prepares them
    (``str(...)`` then ``.lower()``) and are truncated at ``cap`` tokens, so
    the result never exceeds ``cap``.
    """
    longest = 1
    for frame in frames:
        for sentence in frame['sentence'].tolist():
            token_ids = tokenizer.encode(str(sentence).lower(), truncation=True, max_length=cap)
            longest = max(longest, len(token_ids))
    return longest


# PropagandaDataset pads every item to `max_len`. With its default of 512 each
# forward pass processes 512 positions per sentence regardless of how short
# the sentence is. Padding to the longest sentence actually present (still
# capped at 512) truncates exactly the same sentences and leaves only masked
# padding out, while making each batch much cheaper.
BERT_MAX_LEN = _longest_token_count((train_data_bert, val_data_bert), tokenizer)

train_dataset = PropagandaDataset(train_data_bert_80, tokenizer, max_len=BERT_MAX_LEN)
test_dataset = PropagandaDataset(test_data_bert_20, tokenizer, max_len=BERT_MAX_LEN)
val_dataset = PropagandaDataset(val_data_bert, tokenizer, max_len=BERT_MAX_LEN)

# Only the training loader is shuffled; evaluation metrics do not depend on
# batch order, so the test and validation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [None]:
#-------------------------------CLASSIFIER-------------------------------------#

class PropagandaClassifier(nn.Module):
    """Linear classification head on top of a BERT-style encoder.

    Args:
        bert_model: Encoder module exposing ``config.hidden_size`` and
            returning ``(sequence_output, pooled_output, ...)`` when called
            with ``return_dict=False``.
        num_classes: Number of output logits (default 2, as before).
        dropout: Dropout probability applied to the pooled output
            (default 0.5, as before).

    The module no longer reads the notebook-level ``device`` variable in its
    constructor; move the whole classifier with ``.to(device)`` after
    construction, which also moves the wrapped encoder.
    """

    def __init__(self, bert_model, num_classes=2, dropout=0.5):
        super().__init__()
        self.bert = bert_model
        self.drop = nn.Dropout(p=dropout)
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits of shape ``(batch, num_classes)``."""
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        # Index 1 of the tuple is the pooled sentence representation.
        pooled_output = encoder_outputs[1]
        return self.out(self.drop(pooled_output))

In [None]:
#----------------------------TRAINING FUNCTION---------------------------------#

def _run_epoch(model, data_loader, loss_fn, device, optimizer=None):
    """Make one pass over ``data_loader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch; without one it is put in eval mode and gradients are
    disabled. ``mean_loss`` is the average of the per-batch losses. Both
    values are NaN when the loader yields no batches, instead of raising
    ZeroDivisionError.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    batch_count = 0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(training):
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            if training:
                optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            batch_count += 1

            # Predicted class = index of the largest logit.
            _, preds = torch.max(logits, dim=1)
            correct += (preds == labels).sum().item()
            seen += labels.size(0)

    mean_loss = loss_sum / batch_count if batch_count else float('nan')
    accuracy = correct / seen if seen else float('nan')
    return mean_loss, accuracy


def train_model(model, train_loader, val_loader, optimizer, loss_fn, device, epochs=3):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        train_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors) used for updates.
        val_loader: Iterable of batches of the same form, used for
            evaluation only.
        optimizer: Optimizer over the model parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        device: Device every batch is moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        list[dict]: One entry per epoch with keys ``'train_loss'``,
        ``'train_accuracy'``, ``'val_loss'`` and ``'val_accuracy'``. The same
        figures are printed as before; earlier versions returned ``None``.
    """
    history = []
    for epoch in range(epochs):
        # Train loss and accuracy for the current epoch:
        average_train_loss, train_accuracy = _run_epoch(model, train_loader, loss_fn, device, optimizer)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {average_train_loss}, Train Accuracy: {train_accuracy:.4f}")

        # Evaluation on val_data (val_loader):
        average_val_loss, val_accuracy = _run_epoch(model, val_loader, loss_fn, device)
        print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {average_val_loss}, Validation Accuracy: {val_accuracy:.4f}")

        history.append({
            'train_loss': average_train_loss,
            'train_accuracy': train_accuracy,
            'val_loss': average_val_loss,
            'val_accuracy': val_accuracy,
        })
    return history


In [None]:
#----------------------------EVALUATE FUNCTION---------------------------------#

def evaluate_model(model, data_loader, loss_fn, device):
    """Score ``model`` on ``data_loader`` and print loss, accuracy and a class report.

    The model is switched to eval mode and run without gradient tracking.
    The printed loss is the mean of the per-batch losses; the accuracy is the
    fraction of samples whose highest logit matches the label. Nothing is
    returned.
    """
    model.eval()
    loss_sum = 0
    predicted = []
    expected = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask)
            loss_sum += loss_fn(logits, labels).item()

            # Index of the largest logit is the predicted class.
            predicted.extend(logits.argmax(dim=1).tolist())
            expected.extend(labels.tolist())

    average_loss = loss_sum / len(data_loader)
    hits = sum(1 for guess, truth in zip(predicted, expected) if guess == truth)
    accuracy = hits / len(expected)
    print(f"Test Loss: {average_loss:.4f}, Test Accuracy: {accuracy:.4f}")

    # Classification report:
    print("\n-------------------Performance Report------------------\n")
    print(classification_report(expected, predicted, target_names=['Not Propaganda', 'Propaganda']))


In [None]:
# Run this cell if CUDA memory-allocation errors appear before (re)starting training.
# It asks PyTorch to release the cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
#-----------------------TRAINING THE CLASSIFIER--------------------------------#
# Define classifier:
# Wrap the pretrained encoder in the classification head, then move the whole
# module to the selected device.
bert_classifier = PropagandaClassifier(bert_model)
bert_classifier = bert_classifier.to(device)

# Fine-tuning setup: cross-entropy over the class logits, Adam with lr=2e-5.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(bert_classifier.parameters(), lr=2e-5)

train_model(bert_classifier, train_loader, val_loader, optimizer, loss_fn, device, epochs=3)

KeyboardInterrupt: 

In [None]:
# Evaluate the bert_classifier on test set:
# A fresh cross-entropy criterion is used for the held-out test split.
test_loss_fn = nn.CrossEntropyLoss()
evaluate_model(model=bert_classifier, data_loader=test_loader, loss_fn=test_loss_fn, device=device)