In [1]:
from google.colab import drive
drive.mount('/content/drive')


ModuleNotFoundError: No module named 'google.colab'

In [2]:
import pandas as pd
import numpy as np

In [3]:
train_en = pd.read_json('/kaggle/input/icprdata/ICPR/train-en.json')
val_en = pd.read_json('/kaggle/input/icprdata/ICPR/val-en.json')


In [4]:
print(val_en.iloc[9])

index                                                          9
claims         [{'index': 0, 'start': 6, 'end': 14, 'terms': ...
text_tokens    [Wait, !, Did, I, just, hear, Hancock, backtra...
Name: 9, dtype: object


In [5]:

print(train_en.iloc[9])

index                                                        509
claims         [{'index': 0, 'start': 26, 'end': 33, 'terms':...
text_tokens    [COVID, 19, mortality, Stats, have, been, hamm...
Name: 9, dtype: object


In [6]:
print(train_en.iloc[9]['text_tokens'][26:33])

['imprisonment', 'to', 'prove', 'living', 'in', 'abject', 'fear']


In [7]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizerFast

# Load the tokenizer
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base", add_prefix_space=True)

def assign_labels(example):
    claims = example['claims']
    tokens = example['text_tokens']
    labels = np.zeros(len(tokens))
    for claim in claims:
        start = claim['start']
        end = claim['end']
        for i in range(start, end):
            labels[i] = 1
    return labels

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        else:
            new_labels.append(labels[word_id])
    return new_labels

def preprocess(train_set, tokenizer):
    new_labels = []
    encodings = tokenizer(train_set['text_tokens'].tolist(), is_split_into_words=True, padding=False)
    for i, example in train_set.iterrows():
        labels = assign_labels(example)
        word_ids = encodings.word_ids(batch_index=i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    return encodings, new_labels

class CustomDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.encodings, self.labels = preprocess(dataset, tokenizer)
        self.input_ids = self.encodings['input_ids']
        self.attention_mask = self.encodings['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_ids[idx], dtype=torch.int64)
        attention_mask = torch.tensor(self.attention_mask[idx], dtype=torch.int64)
        labels = torch.tensor(self.labels[idx], dtype=torch.int64)
        return input_ids, attention_mask, labels

def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    attention_mask = [item[1] for item in batch]
    labels = [item[2] for item in batch]

    max_length = max(len(ids) for ids in input_ids)

    padded_input_ids = torch.stack([torch.nn.functional.pad(ids, (0, max_length - len(ids)), value=tokenizer.pad_token_id) for ids in input_ids])
    padded_attention_mask = torch.stack([torch.nn.functional.pad(mask, (0, max_length - len(mask)), value=0) for mask in attention_mask])
    padded_labels = torch.stack([torch.nn.functional.pad(label, (0, max_length - len(label)), value=-100) for label in labels])

    return padded_input_ids, padded_attention_mask, padded_labels




tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [8]:
train_dataset = CustomDataset(train_en, tokenizer)
val_dataset = CustomDataset(val_en, tokenizer)

BATCH_SIZE = 16

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn) 

In [9]:
next(iter(train_dataloader))

(tensor([[     0,    468,  16350,  ...,     31,   2525,      2],
         [     0,   1401,     25,  ...,      1,      1,      1],
         [     0,  88046,    450,  ...,      1,      1,      1],
         ...,
         [     0,   1374,   1062,  ...,      1,      1,      1],
         [     0,  45764,     92,  ...,      1,      1,      1],
         [     0, 103993,  43975,  ...,      1,      1,      1]]),
 tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[-100,    0,    0,  ...,    0,    0, -100],
         [-100,    1,    1,  ..., -100, -100, -100],
         [-100,    0,    0,  ..., -100, -100, -100],
         ...,
         [-100,    0,    0,  ..., -100, -100, -100],
         [-100,    1,    1,  ..., -100, -100, -100],
         [-100,    0,    0,  ..., -100, -100, -100]]))

In [10]:
len(train_dataloader) , len(val_dataloader)


(375, 32)

In [11]:
import torch
import torch.nn as nn
from transformers import XLMRobertaModel, BertPreTrainedModel

class BaseModel1(BertPreTrainedModel):
    def __init__(self, conf):
        super(BaseModel1, self).__init__(conf)
        self.roberta = XLMRobertaModel.from_pretrained('xlm-roberta-large', output_hidden_states=True, add_pooling_layer=False)
        self.high_dropout = torch.nn.Dropout(0.3)
        self.classifier = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 2)
        )
        
        for module in self.classifier:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, ids, attention_mask):
        outputs = self.roberta(ids, attention_mask=attention_mask)
        out = outputs.last_hidden_state
        
        out = torch.mean(torch.stack([
            self.classifier(self.high_dropout(out))
            for _ in range(8)
        ], dim=0), dim=0)
        

        out = self.softmax(out)
        
        return out


In [12]:
import torch
import torch.nn.functional as F
loss_function = nn.CrossEntropyLoss(ignore_index=-100)

def smooth_labels(labels, smoothing=0.25):
    num_classes = labels.size(1)
    smooth_labels = labels * (1 - smoothing) + (smoothing / num_classes)
    mask = labels == -100
    smooth_labels[mask] = labels[mask]
    return smooth_labels

def cross_entropy_with_label_smoothing(y_true, y_pred, smoothing=0.25):
    y_true_smoothed = smooth_labels(y_true, smoothing)
    y_true_smoothed = y_true_smoothed.view(-1)
    
    log_probs = torch.log(y_pred)
    log_probs = torch.clamp(log_probs , min = -1e10)

    log_probs1 =log_probs[:,:,1].view(-1)
    log_probs2 =log_probs[:,:,0].view(-1)

    valid_mask = y_true != -100
    valid_mask =  valid_mask.view(-1)

    loss1 = -(y_true_smoothed[valid_mask] * log_probs1[valid_mask]).sum() / valid_mask.sum()
    loss2 = -((1 - y_true_smoothed[valid_mask]) * log_probs2[valid_mask]).sum() / valid_mask.sum()
 
    
    return loss1 + loss2

In [13]:
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import torch.nn.functional as F
import torch.nn as nn

def accuracy_score(preds, labels):
    mask = labels != -100
    preds = preds[mask]
    labels = labels[mask]
    return (preds == labels).sum().item() / len(labels)

def train_model(model, train_dataloader, val_dataloader, device, epochs=3):
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=1e-5)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    loss_function = nn.CrossEntropyLoss(ignore_index=-100)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        correct_predictions = 0.0  

        print(f"Epoch {epoch+1}/{epochs}")

        train_progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs}, Training')

        for batch in train_progress_bar:
            ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(ids, attention_mask)

            loss =  cross_entropy_with_label_smoothing(labels.float(),outputs.float())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()

            _, preds = torch.max(outputs, dim=2)
            correct_predictions += accuracy_score(preds, labels) * ids.size(0)  # Update correct_predictions

            train_progress_bar.set_postfix({'Training Loss': train_loss / (train_progress_bar.n + 1), 'Training Acc': correct_predictions / len(train_dataloader.dataset)})
     
        avg_train_loss = train_loss / len(train_dataloader)
        train_accuracy = correct_predictions / len(train_dataloader.dataset)  # Calculate accuracy over entire dataset

        print(f"Training loss: {avg_train_loss}, Training accuracy: {train_accuracy}")

        model.eval()
        val_loss = 0.0
        correct_predictions = 0.0  

        val_progress_bar = tqdm(val_dataloader, desc=f'Epoch {epoch+1}/{epochs}, Validation')

        with torch.no_grad():
            for batch in val_progress_bar:
                ids = batch[0].to(device)
                attention_mask = batch[1].to(device)
                labels = batch[2].to(device)

                outputs = model(ids, attention_mask)
                loss =  cross_entropy_with_label_smoothing(labels.float(),outputs.float())

                val_loss += loss.item()

                _, preds = torch.max(outputs, dim=2)
                correct_predictions += accuracy_score(preds, labels) * ids.size(0)  # Update correct_predictions

                val_progress_bar.set_postfix({'Validation Loss': val_loss / (val_progress_bar.n + 1), 'Validation Acc': correct_predictions / len(val_dataloader.dataset)})

        avg_val_loss = val_loss / len(val_dataloader)
        val_accuracy = correct_predictions / len(val_dataloader.dataset)  # Calculate accuracy over entire dataset

        print(f"Validation loss: {avg_val_loss}, Validation accuracy: {val_accuracy}")


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import XLMRobertaModel, BertPreTrainedModel, XLMRobertaConfig

device = 'cuda' if torch.cuda.is_available() else 'cpu'

configuration = XLMRobertaConfig()

model = BaseModel1(configuration).to(device)


config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [15]:
train_model(model, train_dataloader, val_dataloader, device, epochs=5)



Epoch 1/5


Epoch 1/5, Training: 100%|██████████| 375/375 [03:30<00:00,  1.78it/s, Training Loss=0.427, Training Acc=0.773] 


Training loss: 0.4270086154937744, Training accuracy: 0.7732111416937256


Epoch 1/5, Validation: 100%|██████████| 32/32 [00:05<00:00,  5.89it/s, Validation Loss=0.404, Validation Acc=0.799]


Validation loss: 0.40431526210159063, Validation accuracy: 0.7993676802094607
Epoch 2/5


Epoch 2/5, Training: 100%|██████████| 375/375 [03:29<00:00,  1.79it/s, Training Loss=0.381, Training Acc=0.824]


Training loss: 0.3809818849563599, Training accuracy: 0.8238940780399886


Epoch 2/5, Validation: 100%|██████████| 32/32 [00:05<00:00,  5.88it/s, Validation Loss=0.412, Validation Acc=0.807]


Validation loss: 0.4120252002030611, Validation accuracy: 0.807085424854929
Epoch 3/5


Epoch 3/5, Training: 100%|██████████| 375/375 [03:29<00:00,  1.79it/s, Training Loss=0.358, Training Acc=0.851]


Training loss: 0.35825894383589424, Training accuracy: 0.8506386861599843


Epoch 3/5, Validation: 100%|██████████| 32/32 [00:05<00:00,  5.92it/s, Validation Loss=0.423, Validation Acc=0.811]


Validation loss: 0.422676682472229, Validation accuracy: 0.8112979356239557
Epoch 4/5


Epoch 4/5, Training: 100%|██████████| 375/375 [03:29<00:00,  1.79it/s, Training Loss=0.336, Training Acc=0.874]


Training loss: 0.33603576521078743, Training accuracy: 0.8739802586841703


Epoch 4/5, Validation: 100%|██████████| 32/32 [00:05<00:00,  5.91it/s, Validation Loss=0.434, Validation Acc=0.81] 


Validation loss: 0.4339686958119273, Validation accuracy: 0.8102193251091466
Epoch 5/5


Epoch 5/5, Training: 100%|██████████| 375/375 [03:29<00:00,  1.79it/s, Training Loss=0.317, Training Acc=0.892]


Training loss: 0.31723575095335643, Training accuracy: 0.8923869514917406


Epoch 5/5, Validation: 100%|██████████| 32/32 [00:05<00:00,  5.92it/s, Validation Loss=0.428, Validation Acc=0.811]

Validation loss: 0.42769539076834917, Validation accuracy: 0.8114371350824616





In [16]:
# PATH = "model__0.pt"

# torch.save(model.state_dict(),PATH)

## Taking Prediction for the Val Set

In [17]:
BATCH_SIZE = 1
val_dataloader = DataLoader(val_dataset, batch_size = BATCH_SIZE , shuffle = False)

In [18]:
predictions = []
pred = None
with torch.no_grad():
        for batch in tqdm(val_dataloader):
                ids = batch[0].to(device)
                attention_mask = batch[1].to(device)
                labels = batch[2].to(device)

                outputs = model(ids, attention_mask)
                pred = F.softmax(outputs, dim=-1)
                predictions.append(pred[:,:,1].tolist())



100%|██████████| 500/500 [00:09<00:00, 50.47it/s]


In [19]:
reshaped_predictions = []
for prediction in predictions:
    for i in prediction:
        reshaped_predictions.append(i)
predictions = reshaped_predictions

In [20]:
predictions = np.array(predictions)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (500,) + inhomogeneous part.

## Post Processing

In [21]:
def preprocess(train_set):
  encoding = tokenizer(train_set['text_tokens'].tolist(), is_split_into_words = True, padding = True, return_tensors ='pt',)
  new_labels = []
  word_ids_list = []
  for i,example in train_set.iterrows():
    labels = assign_labels(example)
    word_ids = encoding[i].word_ids
    new_labels.append(allign_labels_with_tokens(labels,word_ids))
    word_ids_list.append( word_ids)
  padded_labels = pad_labels(new_labels)
  encoding['targets'] = padded_labels
  encoding['word_ids'] = word_ids_list
  return encoding


In [22]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizerFast

# Load the tokenizer
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base", add_prefix_space=True)

def assign_labels(example):
    claims = example['claims']
    tokens = example['text_tokens']
    labels = np.zeros(len(tokens))
    for claim in claims:
        start = claim['start']
        end = claim['end']
        for i in range(start, end):
            labels[i] = 1
    return labels

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        else:
            new_labels.append(labels[word_id])
    return new_labels

def preprocess(train_set, tokenizer):
    new_labels = []
    word_ids_list = []
    encodings = tokenizer(train_set['text_tokens'].tolist(), is_split_into_words=True, padding=False)
    for i, example in train_set.iterrows():
        labels = assign_labels(example)
        word_ids = encodings.word_ids(batch_index=i)
        word_ids_list.append(word_ids)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    return encodings, new_labels ,word_ids_list

class CustomDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.encodings, self.labels = preprocess(dataset, tokenizer)
        self.input_ids = self.encodings['input_ids']
        self.attention_mask = self.encodings['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_ids[idx], dtype=torch.int64)
        attention_mask = torch.tensor(self.attention_mask[idx], dtype=torch.int64)
        labels = torch.tensor(self.labels[idx], dtype=torch.int64)
        return input_ids, attention_mask, labels

def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    attention_mask = [item[1] for item in batch]
    labels = [item[2] for item in batch]

    max_length = max(len(ids) for ids in input_ids)

    padded_input_ids = torch.stack([torch.nn.functional.pad(ids, (0, max_length - len(ids)), value=tokenizer.pad_token_id) for ids in input_ids])
    padded_attention_mask = torch.stack([torch.nn.functional.pad(mask, (0, max_length - len(mask)), value=0) for mask in attention_mask])
    padded_labels = torch.stack([torch.nn.functional.pad(label, (0, max_length - len(label)), value=-100) for label in labels])

    return padded_input_ids, padded_attention_mask, padded_labels




In [23]:
input_ids = preprocess(val_en,tokenizer)[0]['input_ids']
targets =  preprocess(val_en,tokenizer)[1]
word_ids =  preprocess(val_en,tokenizer)[2]

In [24]:
len(word_ids[0]) , len(targets[0]) , len(word_ids[0]), len(prediction[0])

(51, 51, 51, 43)

In [25]:
def create_final_probability_list(predictions, word_ids):
    final_probabilities = []

    for row_idx in range(len(predictions)):
        sum_dict = {}
        count_dict = {}

        for token_idx in range(len(predictions[row_idx])):
            word_id = word_ids[row_idx][token_idx]
            if word_id is not None:
                if word_id not in sum_dict:
                    sum_dict[word_id] = 0.0
                    count_dict[word_id] = 0
                sum_dict[word_id] += predictions[row_idx][token_idx]
                count_dict[word_id] += 1

        avg_predictions = {}
        for word_id in sum_dict:
            avg_predictions[word_id] = sum_dict[word_id] / count_dict[word_id]

        final_probabilities.append(avg_predictions)

    return final_probabilities

In [26]:
probablity_list = np.array(create_final_probability_list(predictions, word_ids))

#for missing wordIDs problem with XLMR tokeniser

def adjust_probabilities(text_tokens_list, probability_list):
    adjusted_probabilities = []

    for i in range(len(text_tokens_list)):
        num_tokens = len(text_tokens_list[i])
        probabilities = probability_list[i]

        adjusted_prob = {word_id: 0.0 for word_id in range(num_tokens)}

        for word_id, prob in probabilities.items():
            adjusted_prob[word_id] = prob

        adjusted_probabilities.append(adjusted_prob)

    return adjusted_probabilities

adjusted_probabilities = adjust_probabilities(val_en['text_tokens'], probablity_list)

In [27]:
threshold = 0.5
final_submission = []

for dictionary in adjusted_probabilities:
    y = [1 if value > threshold else 0 for value in dictionary.values()]
    final_submission.append(y)

In [111]:
word_ids[0]

[None,
 0,
 1,
 2,
 2,
 3,
 4,
 4,
 5,
 6,
 6,
 7,
 8,
 9,
 9,
 10,
 11,
 11,
 12,
 13,
 14,
 15,
 15,
 16,
 17,
 17,
 18,
 19,
 20,
 20,
 21,
 22,
 23,
 24,
 24,
 25,
 25,
 25,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 None]

In [110]:
probablity_list[0]

{0: 0.002788395853713155,
 1: 0.004873983561992645,
 2: 0.003113170270808041,
 3: 0.30393317341804504,
 4: 0.8318581581115723,
 5: 0.8045937418937683,
 6: 0.12299303989857435,
 7: 0.13307200372219086,
 8: 0.11679500341415405,
 9: 0.011267980560660362,
 10: 0.014554060995578766,
 11: 0.02543263416737318,
 12: 0.008038471452891827,
 13: 0.009613219648599625,
 14: 0.045567646622657776,
 15: 0.05991574563086033,
 16: 0.06538075953722,
 17: 0.05968308262526989,
 18: 0.0770503580570221,
 19: 0.1137475073337555,
 20: 0.10871043428778648,
 21: 0.16789878904819489,
 22: 0.14478625357151031,
 23: 0.14571265876293182,
 24: 0.0025689737376524135,
 25: 0.0006162291877747824,
 26: 0.0004327175447542686}

In [28]:
import json

output_file = '11_xlmr-25Multisample.json'

with open(output_file, 'w') as f:
    json.dump(final_submission, f)

print(f"JSON file '{output_file}' has been created.")

JSON file '11_xlmr-25Multisample.json' has been created.
