## **Mount Drive**

In [3]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [4]:
cd /content/gdrive/MyDrive/VIT/Tamil Argumentation

/content/gdrive/MyDrive/VIT/Tamil Argumentation


## **Install**

In [85]:
pip install transformers nltk



In [86]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## **Import Libraries**

In [87]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from copy import deepcopy

from sklearn import metrics
from sklearn.model_selection import KFold

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer,AutoModel,BertTokenizer

from nltk.tokenize import word_tokenize

## **Import Dataset**

In [7]:
df = pd.read_excel("/content/gdrive/MyDrive/VIT/Tamil Argumentation/Twitter Comment Dataset.xlsx")

In [8]:
df.head()

Unnamed: 0,S No,Tweet,Date of Tweet,Topic,Parent Tweet,Language,Quality,Stance,Argument,Comment,Responding to Tone,Discussing Writer Characteristics,Remark,Relevancy
0,1,"Bro imagine today is Friday , big star movie i...",2018-05-22,Jalikattu,"And tamil people, jalikattu maadu for money an...",ENGLISH,Med,Undetermined,0,1,0,0,0,Relevant
1,2,Dei unnoda akkarai TN mela not on others and w...,2018-05-22,Jalikattu,"And tamil people, jalikattu maadu for money an...",ENGLISH,Med,Against,0,1,0,1,0,Relevant
2,3,En ninga ivara matum mention panuringa naraiya...,2018-05-22,Jalikattu,"And tamil people, jalikattu maadu for money an...",CODE-MIXED,Med,For,0,1,0,0,0,Relevant
3,4,What is happening in Thoothukudi is totally no...,2018-05-22,Jalikattu,"And tamil people, jalikattu maadu for money an...",ENGLISH,High,Against,1,1,0,0,0,Relevant
4,5,Ungaluku Sterlite protest prachanaya illa Bala...,2018-05-22,Jalikattu,"And tamil people, jalikattu maadu for money an...",CODE-MIXED,Med,Undetermined,0,0,1,0,0,Relevant


In [9]:
df.columns

Index(['S No', 'Tweet', 'Date of Tweet', 'Topic', 'Parent Tweet', 'Language',
       'Quality', 'Stance', 'Argument', 'Comment', 'Responding to Tone',
       'Discussing Writer Characteristics', 'Remark', 'Relevancy'],
      dtype='object')

## **Load Text and Labels**

In [181]:
text = df["Tweet"].to_numpy()
pt = df["Parent Tweet"].to_numpy()
topic = df["Topic"].to_numpy()

Stance_label = df["Stance"].to_numpy()
Quality_label = df["Quality"].to_numpy()
Argument_label = df["Argument"].to_numpy()
Comment_label = df["Comment"].to_numpy()
Writer_label = df["Discussing Writer Characteristics"].to_numpy()
Tone_label = df["Responding to Tone"].to_numpy()
Remark_label = df["Remark"].to_numpy()
Relevancy_label = df["Relevancy"].to_numpy()

## **Label Encoding**

In [182]:
encode_dict_quality = {
    "High": np.array([1, 0, 0]),
    "Med": np.array([0, 1, 0]),
    "Low": np.array([0, 0, 1]),
}

encode_dict_stance = {
    "For": np.array([1, 0, 0]),
    "Against": np.array([0, 1, 0]),
    "Undetermined": np.array([0, 0, 1]),
}

encode_dict = {
    1: np.array([1, 0]),
    0: np.array([0, 1]),
}

encode_dict_relevancy = {
    "Relevant": np.array([1, 0]),
    "Irrelevant": np.array([0, 1]),
}

In [183]:
Quality_label = np.array([encode_dict_quality[label] for label in Quality_label])
Argument_label = np.array([encode_dict[label] for label in Argument_label])
Comment_label = np.array([encode_dict[label] for label in Comment_label])
Writer_label = np.array([encode_dict[label] for label in Writer_label])
Tone_label = np.array([encode_dict[label] for label in Tone_label])
Remark_label = np.array([encode_dict[label] for label in Remark_label])
Relevancy_label = np.array([encode_dict_relevancy[label] for label in Relevancy_label])
Stance_label = np.array([encode_dict_stance[label] for label in Stance_label])

## **Pre-Config for mBERT**

In [191]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

MAX_LEN = np.max([(len(x)+len(y)+1) for x,y in zip(text,pt)])
MAX_LEN = np.min([MAX_LEN, 510])

BATCH_SIZE = 32
LEARNING_RATE = 1e-1

In [27]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

## **Build Dataset for mBERT**

In [192]:
class ModelDataset(Dataset):
    def __init__(self, X, PT, topic, y, tokenizer, max_len):
        self.max_len = max_len
        self.text = X
        self.topic = topic
        self.pt = PT
        self.tokenizer = tokenizer
        self.targets = y

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        text = self.text[index]
        pt = self.pt[index]
        topic = self.topic[index]

        concat_text = f"{topic} {pt}"

        inputs = self.tokenizer(
            concat_text,text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True)

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long).to(device),
            'mask': torch.tensor(mask, dtype=torch.long).to(device),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).to(device),
            'targets': torch.tensor(self.targets[index], dtype=torch.float).to(device),
            'text_length': len(word_tokenize(text)),
            'pt_length': len(word_tokenize(pt)),
            'topic_length': len(word_tokenize(topic)),
            'topic': pt
        }

## **Build Model**

In [193]:
class CustomModel(nn.Module):

    def __init__(self):
        super(CustomModel, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-multilingual-cased')

        for param in self.bert.parameters():
            param.requires_grad = False

        self.out_layer = nn.Linear(768, 3)

    def forward(self, ids, mask, token_type_ids):
        _, features = self.bert(
            ids, token_type_ids=token_type_ids,
            attention_mask=mask, return_dict=False
        )

        output = self.out_layer(features)

        return output

## **Train Model**

In [187]:
def train(epoch, model, train_loader, loss_fn, optimizer):

    model.train()

    for batch in tqdm(train_loader):

        optimizer.zero_grad()

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)

        loss.backward()
        optimizer.step()


    print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')

In [188]:
def validation(data_loader, model):

    model.eval()
    targets = []
    outputs = []

    with torch.no_grad():

        for batch in data_loader:

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
            batch_targets = batch['targets'].to(device, dtype = torch.float)

            batch_outputs = model(ids, mask, token_type_ids)

            targets.extend(batch_targets.cpu().numpy().tolist())
            outputs.extend(batch_outputs.cpu().numpy().tolist())

    return outputs, targets

# Model initialization

In [189]:
kf = KFold(n_splits=5)

model_targets = []
model_labels = []

model = CustomModel().to(device)

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
for train_index, test_index in kf.split(text):

    text_train, text_test = text[train_index], text[test_index]
    pt_train, pt_test = pt[train_index], pt[test_index]
    topic_train, topic_test = topic[train_index], topic[test_index]
    labels_train, labels_test = Stance_label[train_index], Stance_label[test_index]

    train_data = ModelDataset(text_train, pt_train, topic_train, labels_train, tokenizer, MAX_LEN)
    test_data = ModelDataset(text_test, pt_test, topic_test, labels_test, tokenizer, MAX_LEN)

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

    best_score = -np.inf
    best_weights = None

    EPOCHS = 10

    torch.save(model.state_dict(),"/content/gdrive/MyDrive/VIT/Tamil Argumentation/twitter_model_stance.pth")

    for epoch in range(EPOCHS):

        train(epoch, model, train_loader, loss_fn, optimizer)
        outputs, targets = validation(test_loader, model)

        print(outputs)
        print(" ")
        print(targets)

        targets = [np.argmax(x) for x in targets]
        outputs = [np.argmax(x) for x in outputs]

        score = metrics.f1_score(targets, outputs, average='weighted')

        if score > best_score:
            best_score = score
            best_weights = deepcopy(model.state_dict())

    model.load_state_dict(best_weights)
    torch.save(model.state_dict(),"/content/gdrive/MyDrive/VIT/Tamil Argumentation/twitter_model_stance.pth")

    outputs, targets = validation(test_loader, model)

    targets = [np.argmax(x) for x in targets]
    outputs = [np.argmax(x) for x in outputs]

    model_targets.extend(targets)
    model_labels.extend(outputs)

 12%|█▏        | 4/34 [00:06<00:39,  1.33s/it]

In [None]:
accuracy = metrics.accuracy_score(model_targets, model_labels)
f1_score_w_avg = metrics.f1_score(model_targets, model_labels, average='weighted')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Weighted) = {f1_score_w_avg}")

print(metrics.classification_report(model_targets, model_labels))

Accuracy Score = 0.6718518518518518
F1 Score (Weighted) = 0.660670534429505
              precision    recall  f1-score   support

           0       0.57      0.37      0.44       197
           1       0.73      0.81      0.77       849
           2       0.53      0.48      0.50       304

    accuracy                           0.67      1350
   macro avg       0.61      0.55      0.57      1350
weighted avg       0.66      0.67      0.66      1350



# Tokenization testing

In [174]:
print(tokenizer.decode([ 16938,
          112,   188, 45476, 10114, 23763, 10531,   119,   119,   119, 10678,
        10114, 94992, 10219, 10111, 23763, 10105, 11561, 40414,   119, 14453,
        44096,   189, 11337, 10678, 10114, 21852, 10479, 10301, 22489, 11426,
          102]))

don't dare to say this... come to merina and say the same thing. Surely u will come to know who are Tamil people [SEP]


In [172]:
print(len(train_data[0]["token_type_ids"]))
print(len(train_data[0]["targets"]))
print(len(train_data[0]["ids"]))
print(len(train_data[0]["mask"]))

print(train_data[0]["text_length"])
print(train_data[0]["pt_length"])
print(train_data[0]["topic_length"])

print(train_data[0]["token_type_ids"])
print(train_data[0]["ids"])


626
2
626
626
26
31
1
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0