## **Mount Drive**

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
cd /content/gdrive/MyDrive/VIT/Tamil Argumentation

/content/gdrive/MyDrive/VIT/Tamil Argumentation


## **Install**

In [3]:
pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.9 MB/s[0m eta [36m0:00:

## **Import Libraries**

In [5]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from copy import deepcopy

from sklearn import metrics
from sklearn.model_selection import KFold

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer,AutoModel

## **Import Dataset**

In [6]:
df = pd.read_excel("/content/gdrive/MyDrive/VIT/Tamil Argumentation/Twitter Comment Dataset.xlsx")

In [7]:
df.head()

Unnamed: 0,S No,Tweet,Date of Tweet,Topic,Parent Tweet,Language,Quality,Stance,Argument,Comment,Responding to Tone,Discussing Writer Characteristics,Remark,Relevancy
0,1,"Bro imagine today is Friday , big star movie i...",2018-05-22,Jalikattu,"And tamil people, jalikattu maadu for money an...",ENGLISH,Med,Undetermined,0,1,0,0,0,Relevant
1,2,Dei unnoda akkarai TN mela not on others and w...,2018-05-22,Jalikattu,"And tamil people, jalikattu maadu for money an...",ENGLISH,Med,Against,0,1,0,1,0,Relevant
2,3,En ninga ivara matum mention panuringa naraiya...,2018-05-22,Jalikattu,"And tamil people, jalikattu maadu for money an...",CODE-MIXED,Med,For,0,1,0,0,0,Relevant
3,4,What is happening in Thoothukudi is totally no...,2018-05-22,Jalikattu,"And tamil people, jalikattu maadu for money an...",ENGLISH,High,Against,1,1,0,0,0,Relevant
4,5,Ungaluku Sterlite protest prachanaya illa Bala...,2018-05-22,Jalikattu,"And tamil people, jalikattu maadu for money an...",CODE-MIXED,Med,Undetermined,0,0,1,0,0,Relevant


In [8]:
df.columns

Index(['S No', 'Tweet', 'Date of Tweet', 'Topic', 'Parent Tweet', 'Language',
       'Quality', 'Stance', 'Argument', 'Comment', 'Responding to Tone',
       'Discussing Writer Characteristics', 'Remark', 'Relevancy'],
      dtype='object')

## **Load Text and Labels**

In [54]:
text = df["Tweet"].to_numpy()

Quality_label = df["Quality"].to_numpy()
Argument_label = df["Argument"].to_numpy()
Comment_label = df["Comment"].to_numpy()
Writer_label = df["Discussing Writer Characteristics"].to_numpy()
Tone_label = df["Responding to Tone"].to_numpy()
Remark_label = df["Remark"].to_numpy()
Relevancy_label = df["Relevancy"].to_numpy()

## **Label Encoding**

In [53]:
encode_dict_quality = {
    "High": np.array([1, 0, 0]),
    "Med": np.array([0, 1, 0]),
    "Low": np.array([0, 0, 1]),
}

encode_dict = {
    0: np.array([1, 0]),
    1: np.array([0, 1]),
}

encode_dict_relevancy = {
    "Relevant": np.array([1, 0]),
    "Irrelevant": np.array([0, 1]),
}

In [55]:
Quality_label = np.array([encode_dict_quality[label] for label in Quality_label])
Argument_label = np.array([encode_dict[label] for label in Argument_label])
Comment_label = np.array([encode_dict[label] for label in Comment_label])
Writer_label = np.array([encode_dict[label] for label in Writer_label])
Tone_label = np.array([encode_dict[label] for label in Tone_label])
Remark_label = np.array([encode_dict[label] for label in Remark_label])
Relevancy_label = np.array([encode_dict_relevancy[label] for label in Relevancy_label])

## **Pre-Config for mBERT**

In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

MAX_LEN = np.max([len(x) for x in text])
MAX_LEN = np.min([MAX_LEN, 510])

BATCH_SIZE = 32
LEARNING_RATE = 1e-1

In [15]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

## **Build Dataset for mBERT**

In [16]:
class ModelDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_len):
        self.max_len = max_len
        self.text = X
        self.tokenizer = tokenizer
        self.targets = y

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long).to(device),
            'mask': torch.tensor(mask, dtype=torch.long).to(device),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).to(device),
            'targets': torch.tensor(self.targets[index], dtype=torch.float).to(device)
        }

## **Build Model**

In [25]:
class CustomModel(nn.Module):

    def __init__(self):
        super(CustomModel, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-multilingual-cased')

        for param in self.bert.parameters():
            param.requires_grad = False

        self.out_layer = nn.Linear(768, 2)

    def forward(self, ids, mask, token_type_ids):
        _, features = self.bert(
            ids, token_type_ids=token_type_ids,
            attention_mask=mask, return_dict=False
        )

        output = self.out_layer(features)

        return output

## **Train Model**

In [28]:
def train(epoch, model, train_loader, loss_fn, optimizer):

    model.train()

    for batch in tqdm(train_loader):

        optimizer.zero_grad()

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)

        loss.backward()
        optimizer.step()


    print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')

In [21]:
def validation(data_loader, model):

    model.eval()
    targets = []
    outputs = []

    with torch.no_grad():

        for batch in data_loader:

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
            batch_targets = batch['targets'].to(device, dtype = torch.float)

            batch_outputs = model(ids, mask, token_type_ids)

            targets.extend(batch_targets.cpu().numpy().tolist())
            outputs.extend(batch_outputs.cpu().numpy().tolist())

    return outputs, targets

In [58]:
kf = KFold(n_splits=5)

model_targets = []
model_labels = []

model = CustomModel().to(device)

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

for train_index, test_index in kf.split(text):

    text_train, text_test = text[train_index], text[test_index]
    labels_train, labels_test = Writer_label[train_index], Writer_label[test_index]

    train_data = ModelDataset(text_train, labels_train, tokenizer, MAX_LEN)
    test_data = ModelDataset(text_test, labels_test, tokenizer, MAX_LEN)

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

    best_score = -np.inf
    best_weights = None

    EPOCHS = 10

    torch.save(model.state_dict(),"/content/gdrive/MyDrive/VIT/Tamil Argumentation/twitter_model_writer.pth")

    for epoch in range(EPOCHS):

        train(epoch, model, train_loader, loss_fn, optimizer)
        outputs, targets = validation(test_loader, model)

        targets = [np.argmax(x) for x in targets]
        outputs = [np.argmax(x) for x in outputs]

        score = metrics.f1_score(targets, outputs, average='weighted')

        if score > best_score:
            best_score = score
            best_weights = deepcopy(model.state_dict())

    model.load_state_dict(best_weights)
    torch.save(model.state_dict(),"/content/gdrive/MyDrive/VIT/Tamil Argumentation/twitter_model_writer.pth")

    outputs, targets = validation(test_loader, model)

    targets = [np.argmax(x) for x in targets]
    outputs = [np.argmax(x) for x in outputs]

    model_targets.extend(targets)
    model_labels.extend(outputs)

100%|██████████| 34/34 [00:19<00:00,  1.70it/s]


Epoch: 1, Loss:  0.48440903425216675


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 2, Loss:  0.41226726770401


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 3, Loss:  1.3366432189941406


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 4, Loss:  0.5270129442214966


100%|██████████| 34/34 [00:19<00:00,  1.72it/s]


Epoch: 5, Loss:  0.6998328566551208


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 6, Loss:  0.3987298905849457


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 7, Loss:  1.1304773092269897


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 8, Loss:  1.2748146057128906


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 9, Loss:  0.27446645498275757


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 10, Loss:  0.7894580364227295


100%|██████████| 34/34 [00:19<00:00,  1.70it/s]


Epoch: 1, Loss:  0.6382333636283875


100%|██████████| 34/34 [00:20<00:00,  1.68it/s]


Epoch: 2, Loss:  0.6566433906555176


100%|██████████| 34/34 [00:19<00:00,  1.72it/s]


Epoch: 3, Loss:  0.7445432543754578


100%|██████████| 34/34 [00:19<00:00,  1.72it/s]


Epoch: 4, Loss:  0.1872003972530365


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 5, Loss:  1.8044631481170654


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 6, Loss:  1.4837462902069092


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 7, Loss:  0.3318694829940796


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 8, Loss:  0.47818523645401


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 9, Loss:  0.756565511226654


100%|██████████| 34/34 [00:19<00:00,  1.72it/s]


Epoch: 10, Loss:  1.0160120725631714


100%|██████████| 34/34 [00:19<00:00,  1.70it/s]


Epoch: 1, Loss:  0.4627341628074646


100%|██████████| 34/34 [00:20<00:00,  1.69it/s]


Epoch: 2, Loss:  0.43949711322784424


100%|██████████| 34/34 [00:19<00:00,  1.72it/s]


Epoch: 3, Loss:  0.5389676094055176


100%|██████████| 34/34 [00:19<00:00,  1.72it/s]


Epoch: 4, Loss:  0.7606317400932312


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 5, Loss:  0.35104840993881226


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 6, Loss:  0.6267610788345337


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 7, Loss:  0.7543174624443054


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 8, Loss:  0.47292423248291016


100%|██████████| 34/34 [00:19<00:00,  1.72it/s]


Epoch: 9, Loss:  0.5912003517150879


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 10, Loss:  0.561236560344696


100%|██████████| 34/34 [00:19<00:00,  1.70it/s]


Epoch: 1, Loss:  0.7887446284294128


100%|██████████| 34/34 [00:20<00:00,  1.69it/s]


Epoch: 2, Loss:  1.433197259902954


100%|██████████| 34/34 [00:19<00:00,  1.72it/s]


Epoch: 3, Loss:  0.34140318632125854


100%|██████████| 34/34 [00:19<00:00,  1.72it/s]


Epoch: 4, Loss:  0.7105922698974609


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 5, Loss:  0.7140242457389832


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 6, Loss:  0.4517383575439453


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 7, Loss:  0.36017364263534546


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 8, Loss:  0.374699205160141


100%|██████████| 34/34 [00:19<00:00,  1.72it/s]


Epoch: 9, Loss:  0.7941216230392456


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 10, Loss:  0.4274430274963379


100%|██████████| 34/34 [00:19<00:00,  1.70it/s]


Epoch: 1, Loss:  0.7922083139419556


100%|██████████| 34/34 [00:20<00:00,  1.70it/s]


Epoch: 2, Loss:  0.39517641067504883


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 3, Loss:  0.5465056300163269


100%|██████████| 34/34 [00:19<00:00,  1.72it/s]


Epoch: 4, Loss:  0.4337256848812103


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 5, Loss:  0.9886152148246765


100%|██████████| 34/34 [00:19<00:00,  1.70it/s]


Epoch: 6, Loss:  0.8762394785881042


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 7, Loss:  1.044708013534546


100%|██████████| 34/34 [00:19<00:00,  1.70it/s]


Epoch: 8, Loss:  0.635799765586853


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 9, Loss:  0.5076146125793457


100%|██████████| 34/34 [00:19<00:00,  1.71it/s]


Epoch: 10, Loss:  0.7606154680252075


In [59]:
accuracy = metrics.accuracy_score(model_targets, model_labels)
f1_score_w_avg = metrics.f1_score(model_targets, model_labels, average='weighted')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Weighted) = {f1_score_w_avg}")

print(metrics.classification_report(model_targets, model_labels))

Accuracy Score = 0.7637037037037037
F1 Score (Weighted) = 0.7407216317715691
              precision    recall  f1-score   support

           0       0.79      0.92      0.85       990
           1       0.60      0.35      0.44       360

    accuracy                           0.76      1350
   macro avg       0.70      0.63      0.64      1350
weighted avg       0.74      0.76      0.74      1350

