## **Mount Drive**

In [None]:
# Colab-only setup: mount Google Drive at /content/gdrive so files stored
# there are reachable through the local filesystem.
# NOTE(review): the dataset is later read from /kaggle/input and weights are
# written to /kaggle/working, so this cell may be a leftover from a Colab
# version of the notebook — confirm whether it is still needed.
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
cd /content/gdrive/MyDrive/VIT/Tamil Argumentation

## **Install**

In [1]:
pip install transformers nltk scikit-learn openpyxl sacremoses

Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895241 sha256=9ccdbdfa7857a1e97d27c8fff4989a15308734f330ff285f34e5d6cffd9586bf
  Stored in directory: /root/.cache/pip/wheels/00/24/97/a2ea5324f36bc626e1ea0267f33db6aa80d157ee977e9e42fb
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.53
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Fetch NLTK's 'punkt' package into the local nltk_data directory.
# Presumably required by `word_tokenize`, which is imported in the import
# cell — verify that it is still used before keeping this download.
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## **Import Libraries**

In [3]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from copy import deepcopy

from sklearn import metrics
from sklearn.model_selection import KFold

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer,AutoModel,BertTokenizer

from nltk.tokenize import word_tokenize

from sklearn.utils import resample, shuffle

## **Import Dataset**

In [4]:
# Load the training sheet into a DataFrame.
# NOTE(review): the path is an absolute Kaggle input location, so the notebook
# only runs where that dataset is attached — consider a configurable data dir.
df = pd.read_excel("/kaggle/input/merged-data/merged_data_train.xlsx")

In [5]:
# Preview the first rows to sanity-check how the sheet was parsed.
df.head()

Unnamed: 0,Sno,Dataset,Topic,Tweet and Comments,Language,Quality,Stance wrt topic,Stance wrt content,Argument,Comment,Responding to Tone,Discussing Writer Characteristics,Remark,Relevancy
0,1,Youtube,Free bus commute for women. boon or bane,"Nalla thittam than but, give it to people who ...",Code-Mixed,Med,For,Undetermined,1,1,0,0,0,Relevant
1,2,Twitter,Covid vaccine - Boon or bane,Shameless,Code-Mixed,Low,Undetermined,Against,0,0,0,0,1,Relevant
2,3,Twitter,Alcohol - Ban or Allow,This is the way Dravidian stocks to Data Analy...,English,Med,Undetermined,Against,0,1,0,0,0,Relevant
3,4,Twitter,Covid vaccine - Boon or bane,அட மானங் கெட்டவனே. பல லட்சம் பேரை கொன்றவனுக்க...,Tamil,Med,Undetermined,Against,0,1,1,1,1,Relevant
4,5,Twitter,Free bus commute for women. boon or bane,Superb,Code-Mixed,Med,For,For,0,0,0,0,1,Relevant


In [6]:
# List the column names; later cells select columns by these exact strings.
df.columns

Index(['Sno', 'Dataset', 'Topic', 'Tweet and Comments', 'Language', 'Quality',
       'Stance wrt topic', 'Stance wrt content', 'Argument', 'Comment',
       'Responding to Tone', 'Discussing Writer Characteristics', 'Remark',
       'Relevancy'],
      dtype='object')

## **Load Text and Labels**

In [7]:
# Raw model inputs and metadata, one entry per row of `df`.
text = df["Tweet and Comments"].to_numpy()
dt = df["Dataset"].to_numpy()
topic = df["Topic"].to_numpy()

# One array of raw (not yet encoded) labels per annotation column.
# Each stance variable reads the column it is named after: the previous
# version had the two stance columns swapped.
Language_label = df["Language"].to_numpy()
Stance_topic = df["Stance wrt topic"].to_numpy()
Stance_content = df["Stance wrt content"].to_numpy()
Quality_label = df["Quality"].to_numpy()
Argument_label = df["Argument"].to_numpy()
Comment_label = df["Comment"].to_numpy()
Writer_label = df["Discussing Writer Characteristics"].to_numpy()
Tone_label = df["Responding to Tone"].to_numpy()
Remark_label = df["Remark"].to_numpy()
Relevancy_label = df["Relevancy"].to_numpy()

## **Label Encoding**

In [8]:
def _one_hot(position, width):
    """Return an integer vector of length `width` with a single 1 at `position`."""
    vector = [0] * width
    vector[position] = 1
    return np.array(vector)


# Quality: three classes. The key "Med " (with a trailing space) maps to the
# same vector as "Med" — presumably to absorb an inconsistent spelling in the
# raw sheet; verify against the data.
encode_dict_quality = {
    "High": _one_hot(0, 3),
    "Med": _one_hot(1, 3),
    "Med ": _one_hot(1, 3),
    "Low": _one_hot(2, 3),
}

# Language: three classes, in the order English / Tamil / Code-Mixed.
encode_dict_language = {
    name: _one_hot(slot, 3)
    for slot, name in enumerate(("English", "Tamil", "Code-Mixed"))
}

# Stance: three classes, in the order For / Against / Undetermined.
encode_dict_stance = {
    name: _one_hot(slot, 3)
    for slot, name in enumerate(("For", "Against", "Undetermined"))
}

# Generic binary flag stored as 0/1 in the sheet.
encode_dict = {flag: _one_hot(flag, 2) for flag in (0, 1)}

# Relevancy: two classes, in the order Relevant / Irrelevant.
encode_dict_relevancy = {
    name: _one_hot(slot, 2)
    for slot, name in enumerate(("Relevant", "Irrelevant"))
}

In [9]:
def _encode(values, table):
    """Look up every entry of `values` in `table` and stack the vectors into one array."""
    return np.array([table[value] for value in values])


# Each name is rebound from its raw labels to the encoded matrix. Because of
# that rebinding, re-running this cell without re-running the loading cell
# fails: the names then hold arrays instead of raw labels.
Language_label = _encode(Language_label, encode_dict_language)
Quality_label = _encode(Quality_label, encode_dict_quality)
Argument_label = _encode(Argument_label, encode_dict)
Comment_label = _encode(Comment_label, encode_dict)
Writer_label = _encode(Writer_label, encode_dict)
Tone_label = _encode(Tone_label, encode_dict)
Remark_label = _encode(Remark_label, encode_dict)
Relevancy_label = _encode(Relevancy_label, encode_dict_relevancy)
Stance_topic = _encode(Stance_topic, encode_dict_stance)
Stance_content = _encode(Stance_content, encode_dict_stance)

# Resampling

## Quality

In [None]:
# Oversample the minority quality classes: "High" and "Low" texts are drawn
# with replacement until each class is twice its original size, while "Med"
# is kept unchanged.
# The text column of `df` is "Tweet and Comments" (there is no "Tweet"
# column). Labels are stripped first so the "Med " spelling that the encoding
# table also accepts is counted as "Med".
quality_stripped = df["Quality"].str.strip()

High_tweets = df.loc[quality_stripped == "High", "Tweet and Comments"]
Med_tweets = df.loc[quality_stripped == "Med", "Tweet and Comments"]
Low_tweets = df.loc[quality_stripped == "Low", "Tweet and Comments"]

# A fixed seed makes the drawn sample repeatable across kernel restarts.
upsampled_high_tweets = resample(High_tweets, replace=True, n_samples=2 * len(High_tweets), random_state=42)
upsampled_low_tweets = resample(Low_tweets, replace=True, n_samples=2 * len(Low_tweets), random_state=42)

# Texts and labels are concatenated in the same class order so they stay aligned.
quality_tweets_concat = np.concatenate((upsampled_high_tweets, upsampled_low_tweets, Med_tweets))
quality_labels = np.concatenate((["High"] * len(upsampled_high_tweets), ["Low"] * len(upsampled_low_tweets), ["Med"] * len(Med_tweets)))

In [None]:
# Pair every resampled text with its quality label, then shuffle the rows so
# the classes are interleaved. The fixed seed keeps the row order repeatable.
quality_df = pd.DataFrame({"Tweet": quality_tweets_concat, "Quality": quality_labels})
quality_df = shuffle(quality_df, random_state=42)

quality_df.head()

In [None]:
# Replace the model inputs and the quality targets with the resampled set.
# NOTE(review): `text` is rebound here, so label arrays built from `df` for
# the other tasks no longer match `text` in length or order.
# NOTE(review): rows were duplicated before any train/test split, so copies of
# one text can end up on both sides of a later K-fold split — confirm intended.
text = quality_df["Tweet"].to_numpy()
Quality_label = np.array(
    [encode_dict_quality[name] for name in quality_df["Quality"].to_numpy()]
)

## Comments

In [None]:
# Oversample the texts that are NOT comments (Comment == 0) to twice their
# original count; comment texts (Comment == 1) are kept unchanged.
# The text column of `df` is "Tweet and Comments" (there is no "Tweet" column).
comment_tweets = df.loc[df["Comment"] == 1, "Tweet and Comments"]
not_comment_tweets = df.loc[df["Comment"] == 0, "Tweet and Comments"]

# A fixed seed makes the drawn sample repeatable across kernel restarts.
upsampled_not_comment_tweets = resample(not_comment_tweets, replace=True, n_samples=2 * len(not_comment_tweets), random_state=42)

# Texts and labels are concatenated in the same class order so they stay aligned.
comment_tweets_concat = np.concatenate((upsampled_not_comment_tweets, comment_tweets))
comment_labels = np.concatenate(([0] * len(upsampled_not_comment_tweets), [1] * len(comment_tweets)))

In [None]:
# Pair every resampled text with its 0/1 comment flag, then shuffle the rows
# so the two classes are interleaved. The fixed seed keeps the order repeatable.
comment_df = pd.DataFrame({"Tweet": comment_tweets_concat, "Comment": comment_labels})
comment_df = shuffle(comment_df, random_state=42)

comment_df.head()

In [None]:
# Replace the model inputs and the comment targets with the resampled set.
# NOTE(review): `text` is rebound here, so label arrays built from `df` for
# the other tasks no longer match `text` in length or order.
text = comment_df["Tweet"].to_numpy()
Comment_label = np.array(
    [encode_dict[flag] for flag in comment_df["Comment"].to_numpy()]
)

## Arguments


In [None]:
# Oversample the texts that ARE arguments (Argument == 1) to twice their
# original count; non-argument texts (Argument == 0) are kept unchanged.
# The text column of `df` is "Tweet and Comments" (there is no "Tweet" column).
argument_tweets = df.loc[df["Argument"] == 1, "Tweet and Comments"]
not_argument_tweets = df.loc[df["Argument"] == 0, "Tweet and Comments"]
# A fixed seed makes the drawn sample repeatable across kernel restarts.
upsampled_argument_tweets = resample(argument_tweets, replace=True, n_samples=2 * len(argument_tweets), random_state=42)

# Texts and labels are concatenated in the same class order so they stay aligned.
argument_tweets_concat = np.concatenate((upsampled_argument_tweets, not_argument_tweets))
argument_labels = np.concatenate(([1] * len(upsampled_argument_tweets), [0] * len(not_argument_tweets)))

In [None]:
# Pair every resampled text with its 0/1 argument flag, then shuffle the rows
# so the two classes are interleaved. The fixed seed keeps the order repeatable.
argument_df = pd.DataFrame({"Tweet": argument_tweets_concat, "Argument": argument_labels})
argument_df = shuffle(argument_df, random_state=42)

argument_df.head()

In [None]:
# Replace the model inputs and the argument targets with the resampled set.
# NOTE(review): `text` is rebound here, so label arrays built from `df` for
# the other tasks no longer match `text` in length or order.
text = argument_df["Tweet"].to_numpy()
Argument_label = np.array(
    [encode_dict[flag] for flag in argument_df["Argument"].to_numpy()]
)

## Relevancy

In [None]:
# Oversample the "Irrelevant" texts to twice their original count; "Relevant"
# texts are kept unchanged.
# The text column of `df` is "Tweet and Comments" (there is no "Tweet" column).
relevant_tweets = df.loc[df["Relevancy"] == "Relevant", "Tweet and Comments"]
irrelevant_tweets = df.loc[df["Relevancy"] == "Irrelevant", "Tweet and Comments"]
# A fixed seed makes the drawn sample repeatable across kernel restarts.
upsampled_irrelevant_tweets = resample(irrelevant_tweets, replace=True, n_samples=2 * len(irrelevant_tweets), random_state=42)

# Texts and labels are concatenated in the same class order so they stay aligned.
relevant_tweets_concat = np.concatenate((upsampled_irrelevant_tweets, relevant_tweets))
relevant_labels = np.concatenate((["Irrelevant"] * len(upsampled_irrelevant_tweets), ["Relevant"] * len(relevant_tweets)))

In [None]:
# Pair every resampled text with its relevancy label, then shuffle the rows so
# the two classes are interleaved. The fixed seed keeps the order repeatable.
relevant_df = pd.DataFrame({"Tweet": relevant_tweets_concat, "Relevancy": relevant_labels})
relevant_df = shuffle(relevant_df, random_state=42)

relevant_df.head()

In [None]:
# Show (rows, columns) of the resampled relevancy frame as a size check.
relevant_df.shape

In [None]:
# Replace the model inputs and the relevancy targets with the resampled set.
# NOTE(review): `text` is rebound here, so label arrays built from `df` for
# the other tasks no longer match `text` in length or order.
text = relevant_df["Tweet"].to_numpy()
Relevancy_label = np.array(
    [encode_dict_relevancy[name] for name in relevant_df["Relevancy"].to_numpy()]
)

## **Pre-Config for XLM (xlm-mlm-100-1280)**

In [10]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Sequence length: the largest len() over the entries of `text`, capped at 510.
# NOTE(review): len() of a string counts characters, not tokens — confirm that
# this is the intended measure for a tokenizer max_length.
MAX_LEN = np.min([np.max([len(entry) for entry in text]), 510])

BATCH_SIZE = 32
# NOTE(review): 1e-1 is an unusually large step size for AdamW — confirm.
LEARNING_RATE = 1e-1

print(MAX_LEN)
print(device)

510
cuda


In [11]:
# Load the tokenizer published with the 'xlm-mlm-100-1280' checkpoint, the
# same checkpoint name that CustomModel passes to AutoModel.from_pretrained.
tokenizer = AutoTokenizer.from_pretrained('xlm-mlm-100-1280')

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/41.9k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/5.72M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/2.97M [00:00<?, ?B/s]

## **Build Dataset for XLM (xlm-mlm-100-1280)**

In [12]:
class ModelDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        X: Indexable collection of raw texts.
        y: Indexable collection of targets aligned with ``X``.
        tokenizer: Callable that accepts the keyword arguments used in
            ``__getitem__`` and returns a mapping with the keys
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length every sequence is truncated / padded to.

    Items are returned as CPU tensors. The training and validation loops
    already move each batch to the target device, so moving single items here
    was redundant. Returning CPU tensors also keeps the dataset usable with
    ``DataLoader`` worker processes and pinned memory.
    """

    def __init__(self, X, y, tokenizer, max_len):
        self.max_len = max_len
        self.text = X
        self.tokenizer = tokenizer
        self.targets = y

    def __len__(self):
        """Return the number of samples (taken from the targets)."""
        return len(self.targets)

    def __getitem__(self, index):
        """Tokenize the text at ``index`` and return it with its target.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (``torch.long``)
            and ``targets`` (``torch.float``), all on the CPU.
        """
        inputs = self.tokenizer(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',  # pad to max_len so items stack into a batch
            return_token_type_ids=True)

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float),
        }

## **Build Model**

In [18]:
class CustomModel(nn.Module):
    """Frozen pretrained encoder followed by a single trainable linear layer.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        max_len: Sequence length the inputs are padded to. The linear layer
            is sized for exactly this length, so it must match the length
            used by the dataset.
        num_labels: Number of output logits.

    The defaults reproduce the previous hard-coded setup: checkpoint
    'xlm-mlm-100-1280' and 510 * 1280 = 652800 input features mapped to
    3 logits.
    """

    def __init__(self, model_name='xlm-mlm-100-1280', max_len=510, num_labels=3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)

        # Freeze the encoder: only `out_layer` receives gradient updates.
        for param in self.bert.parameters():
            param.requires_grad = False

        # Size the classifier from the encoder's own hidden width instead of
        # a literal, so the two cannot drift apart.
        hidden_size = self.bert.config.hidden_size
        self.out_layer = nn.Linear(max_len * hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw logits of shape (batch, num_labels).

        Args:
            ids: Token ids, one row per sample.
            mask: Attention mask aligned with ``ids``.
            token_type_ids: Segment ids aligned with ``ids``.
        """
        features = self.bert(
            ids, token_type_ids=token_type_ids,
            attention_mask=mask, return_dict=False
        )

        # features[0] is assumed to be the per-token hidden states with the
        # batch on dim 0 — confirm against the encoder's documentation.
        # Flattening from dim 1 keeps one row per sample. The old
        # `view(-1, 652800)` could silently regroup values across samples
        # whenever the sequence length differed from 510.
        flat = features[0].flatten(start_dim=1)
        output = self.out_layer(flat)

        return output

## **Train Model**

In [14]:
def train(epoch, model, train_loader, loss_fn, optimizer):
    """Run one optimisation pass over ``train_loader`` and report the mean loss.

    Args:
        epoch: Zero-based epoch index; printed as ``epoch + 1``.
        model: Module called as ``model(ids, mask, token_type_ids)``.
        train_loader: Iterable of batches; each batch is a mapping with the
            keys ``ids``, ``mask``, ``token_type_ids`` and ``targets``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        The loss averaged over all batches of this pass, or ``None`` when the
        loader yielded no batch. The previous version printed only the loss
        of the final batch and failed on an empty loader.
    """
    model.train()

    running_loss = 0.0
    num_batches = 0

    for batch in tqdm(train_loader):

        optimizer.zero_grad()

        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        print(f'Epoch: {epoch + 1}, no batches to train on')
        return None

    mean_loss = running_loss / num_batches
    print(f'Epoch: {epoch + 1}, Loss:  {mean_loss}')
    return mean_loss

In [15]:
def validation(data_loader, model):
    """Run ``model`` over ``data_loader`` without gradients and collect results.

    Args:
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``ids``, ``mask``, ``token_type_ids`` and ``targets``.
        model: Module called as ``model(ids, mask, token_type_ids)``.

    Returns:
        ``(outputs, targets)``: two lists of per-sample lists, in loader
        order — the raw model outputs first, the targets second.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in data_loader:
            # The three integer inputs, in the positional order the model expects.
            model_inputs = [
                batch[key].to(device, dtype=torch.long)
                for key in ('ids', 'mask', 'token_type_ids')
            ]
            gold = batch['targets'].to(device, dtype=torch.float)

            raw = model(*model_inputs)

            expected += gold.cpu().numpy().tolist()
            predicted += raw.cpu().numpy().tolist()

    return predicted, expected

# Model initialization

In [16]:
# Five-way split. No `shuffle` argument is given, so the folds follow
# KFold's default behaviour.
kf = KFold(n_splits=5)

# Accumulators for the targets and predictions collected from every fold.
model_targets, model_labels = [], []

# A single model / loss / optimizer triple is created here.
model = CustomModel()
model = model.to(device)

# NOTE(review): BCEWithLogitsLoss scores each logit as an independent binary
# decision against the one-hot targets — confirm this is preferred over a
# softmax cross-entropy for these single-label tasks.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(lr=LEARNING_RATE, params=model.parameters())

Downloading pytorch_model.bin:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

In [17]:
ksi = 1
EPOCHS = 10

# Every fold has to start from the same untrained state. Without this reset
# the model carried its weights from fold to fold, so each later fold was
# evaluated on rows it had already been trained on in an earlier fold.
# The snapshot is taken when this cell runs: re-create `model` before
# re-running the cell, otherwise the "initial" state is an already trained one.
initial_weights = deepcopy(model.state_dict())

for train_index, test_index in kf.split(text):

    text_train, text_test = text[train_index], text[test_index]
    labels_train, labels_test = Quality_label[train_index], Quality_label[test_index]

    train_data = ModelDataset(text_train, labels_train, tokenizer, MAX_LEN)
    test_data = ModelDataset(text_test, labels_test, tokenizer, MAX_LEN)

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

    # Fresh weights and fresh optimizer state for this fold.
    model.load_state_dict(initial_weights)
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

    best_score = -np.inf
    best_weights = None

    print(f"Split   ---->    {ksi}")
    ksi += 1

    # The checkpoint that used to be written here, before training, only
    # rewrote the weights already on disk. With the per-fold reset it would
    # overwrite the last good checkpoint with untrained weights, so it is gone.

    for epoch in range(EPOCHS):

        train(epoch, model, train_loader, loss_fn, optimizer)
        outputs, targets = validation(test_loader, model)

        # One-hot targets and raw logits -> class indices.
        targets = [np.argmax(x) for x in targets]
        outputs = [np.argmax(x) for x in outputs]

        score = metrics.f1_score(targets, outputs, average='weighted')

        # Keep the weights of the best epoch on this fold.
        # NOTE(review): the epoch is selected on the same fold that is
        # reported below, which makes the reported score optimistic.
        if score > best_score:
            best_score = score
            best_weights = deepcopy(model.state_dict())

    # best_weights stays None when no epoch ran (EPOCHS == 0).
    if best_weights is not None:
        model.load_state_dict(best_weights)

    # The same file is overwritten by every fold; it ends up holding the
    # best weights of the last fold.
    torch.save(model.state_dict(), "/kaggle/working/merged_quality_mlm.pth")

    outputs, targets = validation(test_loader, model)

    targets = [np.argmax(x) for x in targets]
    outputs = [np.argmax(x) for x in outputs]

    accuracy = metrics.accuracy_score(targets, outputs)
    print("Accuracy : ", accuracy)

    model_targets.extend(targets)
    model_labels.extend(outputs)

Split   ---->    1


 98%|█████████▊| 65/66 [02:08<00:01,  1.97s/it]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x204000 and 652800x3)

In [25]:
# Scores pooled over the predictions collected from all folds.
# NOTE(review): this cell's execution count is far from that of the
# cross-validation cell, whose saved output ends in a RuntimeError — the
# numbers recorded below may come from a different run; re-run to confirm.
accuracy = metrics.accuracy_score(model_targets, model_labels)
print(f"Accuracy Score = {accuracy}")

f1_score_w_avg = metrics.f1_score(model_targets, model_labels, average='weighted')
print(f"F1 Score (Weighted) = {f1_score_w_avg}")

# Per-class precision / recall / F1 table.
print(metrics.classification_report(model_targets, model_labels))

Accuracy Score = 0.669345579793341
F1 Score (Weighted) = 0.6414028336463341
              precision    recall  f1-score   support

           0       0.44      0.51      0.47       414
           1       0.74      0.83      0.79      1782
           2       0.40      0.13      0.19       417

    accuracy                           0.67      2613
   macro avg       0.53      0.49      0.48      2613
weighted avg       0.64      0.67      0.64      2613



# Tokenization testing

In [None]:
# Decode a hard-coded list of token ids back to text with the current tokenizer.
# NOTE(review): the ids 101 and 102 look like BERT's [CLS] / [SEP] markers, so
# this list may have been produced by a BERT tokenizer rather than by
# 'xlm-mlm-100-1280' — verify before trusting the decoded text.
print(tokenizer.decode([ 101, 28248, 35732, 22044,   118, 21631, 10345, 11101, 16602,   119,
        10159, 48502, 10107, 10108,   146, 16994, 10806,   117, 10231, 87150,
        22525, 22489, 11309, 10161, 21528, 10114, 63376, 11915, 10135,   108,
        10201, 35732, 22044,   123, 31081, 10169, 22528, 10114, 52824, 10123,
        11345,   119, 12882, 14796, 10944,   189, 11419, 51511, 51354, 10188,
        11049, 11309, 10161,   136,   102, 16938,   112,   188, 45476, 10114,
        23763, 10531,   119,   119,   119, 10678, 10114, 94992, 10219, 10111,
        23763, 10105, 11561, 40414,   119, 14453, 44096,   189, 11337, 10678,
        10114, 21852, 10479, 10301, 22489, 11426,   102]))

In [None]:
# Inspect the first item of `train_data` (the dataset left over from the last
# cross-validation fold): print the length of each returned field.
# Every `train_data[0]` access tokenizes the sample again.
print(len(train_data[0]["token_type_ids"]))
print(len(train_data[0]["targets"]))
print(len(train_data[0]["ids"]))
print(len(train_data[0]["mask"]))

# Disabled checks for keys that ModelDataset.__getitem__ does not return.
#print(train_data[0]["text_length"])
#print(train_data[0]["pt_length"])
#print(train_data[0]["topic_length"])

# Show the raw segment ids and token ids of the same item.
print(train_data[0]["token_type_ids"])
print(train_data[0]["ids"])
