## **Mount Drive**

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
cd /content/gdrive/MyDrive/VIT/Tamil Argumentation

/content/gdrive/MyDrive/VIT/Tamil Argumentation


## **Install**

In [3]:
pip install transformers nltk scikit-learn

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.7 MB/s[0m eta [36m0:00:0

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## **Import Libraries**

In [5]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from copy import deepcopy

from sklearn import metrics
from sklearn.model_selection import KFold

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer,AutoModel,BertTokenizer

from nltk.tokenize import word_tokenize

from sklearn.utils import resample, shuffle

## **Import Dataset**

In [6]:
df = pd.read_excel("/content/gdrive/MyDrive/VIT/Tamil Argumentation/Dataset/Twitter/Twitter Comment Dataset.xlsx")

In [7]:
df.head()

Unnamed: 0,S No,Tweet,Date of Tweet,Topic,Parent Tweet,Language,Quality,Stance,Argument,Comment,Responding to Tone,Discussing Writer Characteristics,Remark,Relevancy
0,1,"Bro imagine today is Friday , big star movie i...",2018-05-22,Jalikattu - Ban or Allow,"And tamil people, jalikattu maadu for money an...",ENGLISH,Med,Undetermined,0,1,0,0,0,Relevant
1,2,Dei unnoda akkarai TN mela not on others and w...,2018-05-22,Jalikattu - Ban or Allow,"And tamil people, jalikattu maadu for money an...",ENGLISH,Med,Against,0,1,0,1,0,Relevant
2,3,En ninga ivara matum mention panuringa naraiya...,2018-05-22,Jalikattu - Ban or Allow,"And tamil people, jalikattu maadu for money an...",CODE-MIXED,Med,For,0,1,0,0,0,Relevant
3,4,What is happening in Thoothukudi is totally no...,2018-05-22,Jalikattu - Ban or Allow,"And tamil people, jalikattu maadu for money an...",ENGLISH,High,Against,1,1,0,0,0,Relevant
4,5,Ungaluku Sterlite protest prachanaya illa Bala...,2018-05-22,Jalikattu - Ban or Allow,"And tamil people, jalikattu maadu for money an...",CODE-MIXED,Med,Undetermined,0,0,1,0,0,Relevant


In [30]:
df.columns

Index(['S No', 'Tweet', 'Date of Tweet', 'Topic', 'Parent Tweet', 'Language',
       'Quality', 'Stance', 'Argument', 'Comment', 'Responding to Tone',
       'Discussing Writer Characteristics', 'Remark', 'Relevancy'],
      dtype='object')

## **Load Text and Labels**

In [7]:
text = df["Tweet"].to_numpy()
pt = df["Parent Tweet"].to_numpy()
topic = df["Topic"].to_numpy()

Language_label = df["Language"].to_numpy()
Stance_label = df["Stance"].to_numpy()
Quality_label = df["Quality"].to_numpy()
Argument_label = df["Argument"].to_numpy()
Comment_label = df["Comment"].to_numpy()
Writer_label = df["Discussing Writer Characteristics"].to_numpy()
Tone_label = df["Responding to Tone"].to_numpy()
Remark_label = df["Remark"].to_numpy()
Relevancy_label = df["Relevancy"].to_numpy()

## **Label Encoding**

In [8]:
encode_dict_quality = {
    "High": np.array([1, 0, 0]),
    "Med": np.array([0, 1, 0]),
    "Low": np.array([0, 0, 1]),
}

encode_dict_language = {
    "ENGLISH": np.array([1, 0, 0]),
    "TAMIL": np.array([0, 1, 0]),
    "CODE-MIXED": np.array([0, 0, 1]),
}

encode_dict_stance = {
    "For": np.array([1, 0, 0]),
    "Against": np.array([0, 1, 0]),
    "Undetermined": np.array([0, 0, 1]),
}

encode_dict = {
    0: np.array([1, 0]),
    1: np.array([0, 1]),
}

encode_dict_relevancy = {
    "Relevant": np.array([1, 0]),
    "Irrelevant": np.array([0, 1]),
}

In [9]:
Language_label = np.array([encode_dict_language[label] for label in Language_label])
Quality_label = np.array([encode_dict_quality[label] for label in Quality_label])
Argument_label = np.array([encode_dict[label] for label in Argument_label])
Comment_label = np.array([encode_dict[label] for label in Comment_label])
Writer_label = np.array([encode_dict[label] for label in Writer_label])
Tone_label = np.array([encode_dict[label] for label in Tone_label])
Remark_label = np.array([encode_dict[label] for label in Remark_label])
Relevancy_label = np.array([encode_dict_relevancy[label] for label in Relevancy_label])
Stance_label = np.array([encode_dict_stance[label] for label in Stance_label])

# Resampling

## Quality

In [16]:
High_tweets = df[df["Quality"]=="High"]["Tweet"]
Med_tweets = df[df["Quality"]=="Med"]["Tweet"]
Low_tweets = df[df["Quality"]=="Low"]["Tweet"]

upsampled_high_tweets = resample(High_tweets,replace=True,n_samples = 2*len(High_tweets))
upsampled_low_tweets = resample(Low_tweets,replace=True,n_samples = 2*len(Low_tweets))

quality_tweets_concat = np.concatenate((upsampled_high_tweets,upsampled_low_tweets,Med_tweets))
quality_labels = np.concatenate((["High"]*len(upsampled_high_tweets),["Low"]*len(upsampled_low_tweets),["Med"]*len(Med_tweets)))

In [17]:
quality_df = pd.DataFrame({"Tweet":quality_tweets_concat,"Quality":quality_labels})
quality_df = shuffle(quality_df)

quality_df.head()

Unnamed: 0,Tweet,Quality
1100,I don't want to educate you about #jallikattu ...,Med
1754,U r purely DMK supporter..... Here l know... W...,Med
1352,ஆதாரம் காட்டு.,Med
1844,Yes sure… he ‘demanded’ and pm agreed!,Med
677,செத்துடு.,Low


In [19]:
text = quality_df["Tweet"].to_numpy()
Quality_label = quality_df["Quality"].to_numpy()

Quality_label = np.array([encode_dict_quality[label] for label in Quality_label])

## Arguments


In [10]:
argument_tweets = df[df["Argument"]==1]["Tweet"]
not_argument_tweets = df[df["Argument"]==0]["Tweet"]
upsampled_argument_tweets = resample(argument_tweets,replace=True,n_samples = 2*len(argument_tweets))

argument_tweets_concat = np.concatenate((upsampled_argument_tweets,not_argument_tweets))
argument_labels = np.concatenate(([1]*len(upsampled_argument_tweets),[0]*len(not_argument_tweets)))

In [11]:
argument_df = pd.DataFrame({"Tweet":argument_tweets_concat,"Argument":argument_labels})
argument_df = shuffle(argument_df)

argument_df.head()

Unnamed: 0,Tweet,Argument
1218,Elei Neutral daaw.. Elei,0
1224,As usual you do not want to thank Modi vaccine...,0
384,Tamilians are protesting peacefully but if SC ...,1
649,shut up u idiot,0
1393,என்னடா மதுவிலக்கு அமலில் இருக்குற குஜராத்துக்க...,0


In [12]:
text = argument_df["Tweet"].to_numpy()
Argument_label = argument_df["Argument"].to_numpy()

Argument_label = np.array([encode_dict[label] for label in Argument_label])

## Relevancy

In [None]:
relevant_tweets = df[df["Relevancy"]=="Relevant"]["Tweet"]
irrelevant_tweets = df[df["Relevancy"]=="Irrelevant"]["Tweet"]
upsampled_irrelevant_tweets = resample(irrelevant_tweets,replace=True,n_samples = 600)

relevant_tweets_concat = np.concatenate((upsampled_irrelevant_tweets,relevant_tweets))
relevant_labels = np.concatenate((["Irrelevant"]*len(upsampled_irrelevant_tweets),["Relevant"]*len(relevant_tweets)))

In [None]:
relevant_df = pd.DataFrame({"Tweet":relevant_tweets_concat,"Relevancy":relevant_labels})
relevant_df = shuffle(relevant_df)

relevant_df.head()

Unnamed: 0,Tweet,Relevancy
44,सिर्फ हरामजादे ही #jallikattu का विरोध कर सकते...,Irrelevant
452,Sir\nPlease submit a report of \n2020-2021 - H...,Irrelevant
1221,@USER_49,Relevant
93,"துட்டு வாங்கிட்டு தானே, பெண்களோடு அனுப்பி வச்ச...",Irrelevant
1645,"Yeah ,but need to reduce",Relevant


In [None]:
text = relevant_df["Tweet"].to_numpy()
Relevant_label = relevant_df["Relevancy"].to_numpy()

Relevant_label = np.array([encode_dict_relevancy[label] for label in Relevant_label])

## **Pre-Config for mBERT**

In [13]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

MAX_LEN = np.max([(len(x)+len(y)+1) for x,y in zip(text,pt)])
MAX_LEN = np.min([MAX_LEN, 510])

BATCH_SIZE = 32
LEARNING_RATE = 1e-1

In [14]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

## **Build Dataset for mBERT**

In [15]:
class ModelDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_len):
        self.max_len = max_len
        self.text = X
        self.tokenizer = tokenizer
        self.targets = y

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        text = self.text[index]

        inputs = self.tokenizer(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True)

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long).to(device),
            'mask': torch.tensor(mask, dtype=torch.long).to(device),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).to(device),
            'targets': torch.tensor(self.targets[index], dtype=torch.float).to(device),
        }

## **Build Model**

In [16]:
class CustomModel(nn.Module):

    def __init__(self):
        super(CustomModel, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-multilingual-cased')

        for param in self.bert.parameters():
            param.requires_grad = False

        self.out_layer = nn.Linear(768, 2)

    def forward(self, ids, mask, token_type_ids):
        _, features = self.bert(
            ids, token_type_ids=token_type_ids,
            attention_mask=mask, return_dict=False
        )

        output = self.out_layer(features)

        return output

## **Train Model**

In [17]:
def train(epoch, model, train_loader, loss_fn, optimizer):

    model.train()

    for batch in tqdm(train_loader):

        optimizer.zero_grad()

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)

        loss.backward()
        optimizer.step()


    print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')

In [18]:
def validation(data_loader, model):

    model.eval()
    targets = []
    outputs = []

    with torch.no_grad():

        for batch in data_loader:

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
            batch_targets = batch['targets'].to(device, dtype = torch.float)

            batch_outputs = model(ids, mask, token_type_ids)

            targets.extend(batch_targets.cpu().numpy().tolist())
            outputs.extend(batch_outputs.cpu().numpy().tolist())

    return outputs, targets

# Model initialization

In [19]:
kf = KFold(n_splits=5)

model_targets = []
model_labels = []

model = CustomModel().to(device)

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [None]:
ksi = 1

for train_index, test_index in kf.split(text):

    text_train, text_test = text[train_index], text[test_index]
    labels_train, labels_test = Argument_label[train_index], Argument_label[test_index]

    train_data = ModelDataset(text_train, labels_train, tokenizer, MAX_LEN)
    test_data = ModelDataset(text_test, labels_test, tokenizer, MAX_LEN)

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

    best_score = -np.inf
    best_weights = None

    EPOCHS = 30

    print(f"Split   ---->    {ksi}")
    ksi+=1

    torch.save(model.state_dict(),"/content/gdrive/MyDrive/VIT/Tamil Argumentation/models/twitter_argument.pth")

    for epoch in range(EPOCHS):

        train(epoch, model, train_loader, loss_fn, optimizer)
        outputs, targets = validation(test_loader, model)

        print(outputs)
        print(" ")
        print(targets)

        targets = [np.argmax(x) for x in targets]
        outputs = [np.argmax(x) for x in outputs]

        score = metrics.f1_score(targets, outputs, average='weighted')

        if score > best_score:
            best_score = score
            best_weights = deepcopy(model.state_dict())

    model.load_state_dict(best_weights)

    torch.save(model.state_dict(),"/content/gdrive/MyDrive/VIT/Tamil Argumentation/models/twitter_argument.pth")

    outputs, targets = validation(test_loader, model)

    targets = [np.argmax(x) for x in targets]
    outputs = [np.argmax(x) for x in outputs]

    model_targets.extend(targets)
    model_labels.extend(outputs)

Split   ---->    1


100%|██████████| 39/39 [59:03<00:00, 90.87s/it]


Epoch: 1, Loss:  0.5091083645820618


In [28]:
accuracy = metrics.accuracy_score(model_targets, model_labels)
f1_score_w_avg = metrics.f1_score(model_targets, model_labels, average='weighted')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Weighted) = {f1_score_w_avg}")

print(metrics.classification_report(model_targets, model_labels))

Accuracy Score = 0.6769313884386818
F1 Score (Weighted) = 0.6745961682381202
              precision    recall  f1-score   support

           0       0.74      0.53      0.62       394
           1       0.66      0.73      0.70       849
           2       0.67      0.70      0.68       608

    accuracy                           0.68      1851
   macro avg       0.69      0.65      0.66      1851
weighted avg       0.68      0.68      0.67      1851



# Tokenization testing

In [None]:
print(tokenizer.decode([ 101, 28248, 35732, 22044,   118, 21631, 10345, 11101, 16602,   119,
        10159, 48502, 10107, 10108,   146, 16994, 10806,   117, 10231, 87150,
        22525, 22489, 11309, 10161, 21528, 10114, 63376, 11915, 10135,   108,
        10201, 35732, 22044,   123, 31081, 10169, 22528, 10114, 52824, 10123,
        11345,   119, 12882, 14796, 10944,   189, 11419, 51511, 51354, 10188,
        11049, 11309, 10161,   136,   102, 16938,   112,   188, 45476, 10114,
        23763, 10531,   119,   119,   119, 10678, 10114, 94992, 10219, 10111,
        23763, 10105, 11561, 40414,   119, 14453, 44096,   189, 11337, 10678,
        10114, 21852, 10479, 10301, 22489, 11426,   102]))

[CLS] Jalikattu - Ban or Allow. Lakhs of Idiots, uneducated Tamil ppl want to lift ban on # jalikattu 2play with animals to hurt them. So how can u expect gud from such ppl? [SEP] don't dare to say this... come to merina and say the same thing. Surely u will come to know who are Tamil people [SEP]


In [None]:
print(len(train_data[0]["token_type_ids"]))
print(len(train_data[0]["targets"]))
print(len(train_data[0]["ids"]))
print(len(train_data[0]["mask"]))

#print(train_data[0]["text_length"])
#print(train_data[0]["pt_length"])
#print(train_data[0]["topic_length"])

print(train_data[0]["token_type_ids"])
print(train_data[0]["ids"])


510
2
510
510
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 