# Import needed libraries

In [1]:
!pip install sacremoses sentencepiece

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [2]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

from tqdm import tqdm

from torch.utils.tensorboard import SummaryWriter

import os

2024-07-15 02:20:04.419735: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-15 02:20:04.419850: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-15 02:20:04.514528: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Device agnostic code

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set default device
torch.set_default_device(device)
print(f"Default device set to: {device}")

Default device set to: cuda


In [4]:
train_df = pd.read_csv("/kaggle/input/contradictory-my-dear-watson/train.csv")
train_df.head(5)

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [5]:
train_df.drop(columns=["id", "lang_abv", "language"], inplace=True)

In [6]:
train_df.sample(5)

Unnamed: 0,premise,hypothesis,label
4571,It is that prospect that may bring Republicans...,Republicans have always defended such inaccura...,1
351,it sure will well good to talk to,Let's talk again soon.,1
10034,Die Endtheorien identifizieren Epochen mit cha...,Die Theorien identifizieren das Alter anhand d...,1
125,"ähm wie denkst du, dass die Zeitung in Colorad...","Glauben Sie, dass die lokalen Zeitungen nicht ...",1
1441,"You see, he said sadly, ""you have no instincts.""",He said that I had no willpower.,2


In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
class CustomDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer: object, max_length: int):
        self.dataset = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        premise = self.dataset["premise"].iloc[idx]
        hypothesis = self.dataset["hypothesis"].iloc[idx]
        
        token_dict = self.tokenizer.encode_plus(premise, hypothesis, padding=True, truncation=True, max_length=self.max_length, return_tensors="pt")
    
        return {
            "input_ids": token_dict["input_ids"].flatten(),
            "token_type_ids": token_dict["token_type_ids"].flatten(),
            "attention_mask": token_dict["attention_mask"].flatten(),
            "label": torch.tensor(self.dataset["label"].iloc[idx], dtype=torch.long)
        }
    
    @staticmethod
    def collate_fn(batch):
        batch_input_ids = [batch_item["input_ids"] for batch_item in batch]
        batch_type_ids = [batch_item["token_type_ids"] for batch_item in batch]
        batch_attention_masks = [batch_item["attention_mask"] for batch_item in batch]
        
        batch_input_ids = pad_sequence(batch_input_ids, batch_first=True, padding_value=0)
        batch_type_ids  = pad_sequence(batch_type_ids, batch_first=True, padding_value=0) # will get ignored by the attention mask when going through the model; very important
        batch_attention_masks = pad_sequence(batch_attention_masks, batch_first=True, padding_value=0)
        
        return {
            "input_ids": batch_input_ids,
            "token_type_ids": batch_type_ids,
            "attention_mask": batch_attention_masks,
            "labels": torch.stack([batch_item["label"] for batch_item in batch], dim=0)
        }
        

In [9]:
class CustomDatasetSubmission(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer: object, max_length: int):
        self.dataset = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        premise = self.dataset["premise"].iloc[idx]
        hypothesis = self.dataset["hypothesis"].iloc[idx]
        
        token_dict = self.tokenizer.encode_plus(premise, hypothesis, padding=True, truncation=True, max_length=self.max_length, return_tensors="pt")
    
        return {
            "input_ids": token_dict["input_ids"].flatten(),
            "token_type_ids": token_dict["token_type_ids"].flatten(),
            "attention_mask": token_dict["attention_mask"].flatten(),
            "id": self.dataset["id"].iloc[idx]
        }
    
    @staticmethod
    def collate_fn(batch):
        batch_input_ids = [batch_item["input_ids"] for batch_item in batch]
        batch_type_ids = [batch_item["token_type_ids"] for batch_item in batch]
        batch_attention_masks = [batch_item["attention_mask"] for batch_item in batch]
        
        batch_input_ids = pad_sequence(batch_input_ids, batch_first=True, padding_value=0)
        batch_type_ids  = pad_sequence(batch_type_ids, batch_first=True, padding_value=0) # will get ignored by the attention mask when going through the model; very important
        batch_attention_masks = pad_sequence(batch_attention_masks, batch_first=True, padding_value=0)
        
        return {
            "input_ids": batch_input_ids,
            "token_type_ids": batch_type_ids,
            "attention_mask": batch_attention_masks,
            "ids": [batch_item["id"] for batch_item in batch]
        }
 

In [10]:
max_length = max(max([len(premise) for premise in train_df["premise"]]), max([len(hypothesis) for hypothesis in train_df["hypothesis"]]))
train_split, val_split = train_test_split(train_df, test_size=0.25, shuffle=True)
train_dataset = CustomDataset(df=train_split, tokenizer=tokenizer, max_length=max_length)
val_dataset = CustomDataset(df=val_split, tokenizer=tokenizer, max_length=max_length)

In [11]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True, generator=torch.Generator(device=device), collate_fn=train_dataset.collate_fn)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=4, shuffle=True, generator=torch.Generator(device=device), collate_fn=val_dataset.collate_fn)

In [12]:
next(iter(val_dataloader))

{'input_ids': tensor([[  101,  1159, 29737, 29723, 18199,  1159, 29731, 18199, 29733, 29155,
           1158, 24824, 29727, 29739, 29733, 29723, 18199,  1169, 29734, 18199,
           1174, 29730,  2012,  2696,  1173, 29735, 29728, 29731, 29723, 29732,
          18199, 29727, 14608, 29728, 29720, 14608, 16177, 29723,  1159, 16177,
          14608,  1170, 29735, 29732, 24824, 16177, 18199, 29726, 29730,  1159,
          29732, 29721, 29730, 29733, 29734, 14608, 29733, 18199, 29730,  1173,
          29734, 29730, 16177,  1170, 29732, 29730, 29726, 14608, 29734, 14608,
          29732, 29726, 29734, 18199, 29726, 29730,  1174, 26789,  1164, 14608,
          29734, 14608, 29727, 29730, 29721, 29730,  1173, 29734, 29730, 29737,
          29739, 16177,  1010,  1155, 29727, 29727, 14608,  1169,  8026, 14804,
           2378,  1155, 29731, 29730, 29736, 14608, 29733, 18199, 29733, 29723,
           1167, 14608,  1159, 29721, 29726, 14608, 29734, 14608, 29727, 29723,
          18199, 29738, 297

In [13]:
bert_model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'google-bert/bert-large-uncased')

Downloading: "https://github.com/huggingface/pytorch-transformers/zipball/main" to /root/.cache/torch/hub/main.zip


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [14]:
class Classifier(nn.Module):
    def __init__(self, n_inputs, hidden_size, n_classes, dropout):
        super(Classifier, self).__init__()
        self.linear1 = nn.Linear(in_features=n_inputs, out_features=hidden_size)
        self.linear2 = nn.Linear(in_features=hidden_size, out_features=n_classes)
        self.dropout = nn.Dropout(dropout)

        self.act_fn = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.act_fn(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x


In [15]:
class BertClassifier(nn.Module):
    def __init__(self, bert_model, classifier):
        super(BertClassifier, self).__init__()
        self.bert_model = bert_model
        self.classifier = classifier

    def forward(self, x: torch.Tensor, token_type_ids: torch.Tensor, attention_mask: torch.Tensor = None) -> torch.Tensor:
        x = self.bert_model(x, attention_mask=attention_mask, token_type_ids=token_type_ids)
        x = x.last_hidden_state[:, 0, :]
        x = self.classifier(x)
        return x

In [16]:
classifier = Classifier(n_inputs=1024, n_classes=3, hidden_size=768, dropout=0.35)
model = BertClassifier(bert_model=bert_model, classifier=classifier)
bert_model.config.hidden_dropout_prob = 0.35
bert_model.config

BertConfig {
  "_name_or_path": "google-bert/bert-large-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.35,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.42.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [17]:
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()

In [18]:
def train_model(model, dataloader, val_dataloader, loss_fn, optimizer, scheduler, epochs):
    writer = SummaryWriter()
    writer.add_graph(model, (torch.zeros(size=(32, 32), dtype=torch.long), torch.zeros(size=(32, 32), dtype=torch.long)))
    for epoch in range(epochs):
        model.eval()
        total_inputs = 0
        total_correct = 0

        with torch.inference_mode():
            for batch in tqdm(val_dataloader):
                logits = model(batch["input_ids"], attention_mask=batch["attention_mask"], token_type_ids=batch["token_type_ids"])
                percents = torch.softmax(logits, dim=1)
                preds = torch.argmax(percents, dim=1)

                total_correct += (preds==batch["labels"]).sum().item()
                total_inputs += batch["labels"].view(-1).shape[0]

            print(f"{total_correct} out of {total_inputs}")
            print(f"acc of {total_correct/total_inputs*100}%")

            writer.add_scalar("Val acc", total_correct/total_inputs*100, epoch)

        model.train()
        for batch_idx, batch in tqdm(enumerate(dataloader)):
            logits = model(batch["input_ids"], attention_mask=batch["attention_mask"], token_type_ids=batch["token_type_ids"])
            loss = loss_fn(logits, batch["labels"])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (batch_idx+1) % 40 == 0:
                print(f"loss for batch {batch_idx+1} --> {loss} at epoch {epoch}")

            writer.add_scalar("Loss", loss, batch_idx)

        scheduler.step()

    writer.close()

In [19]:
scheduler = StepLR(gamma=0.5, step_size=1, optimizer=optimizer)
train_model(model=model, dataloader=train_dataloader, val_dataloader=val_dataloader, loss_fn=loss_fn, optimizer=optimizer, scheduler=scheduler, epochs=3)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 758/758 [01:08<00:00, 11.14it/s]


1018 out of 3030
acc of 33.5973597359736%


40it [00:29,  1.17it/s]

loss for batch 40 --> 0.9956830143928528 at epoch 0


80it [00:56,  1.48it/s]

loss for batch 80 --> 1.1042290925979614 at epoch 0


120it [01:24,  1.37it/s]

loss for batch 120 --> 0.970927894115448 at epoch 0


160it [01:52,  1.49it/s]

loss for batch 160 --> 1.0279371738433838 at epoch 0


200it [02:19,  1.63it/s]

loss for batch 200 --> 1.1026151180267334 at epoch 0


240it [02:48,  1.13it/s]

loss for batch 240 --> 0.9894375801086426 at epoch 0


280it [03:19,  1.25it/s]

loss for batch 280 --> 1.0310730934143066 at epoch 0


320it [03:48,  1.48it/s]

loss for batch 320 --> 1.0537317991256714 at epoch 0


360it [04:16,  1.58it/s]

loss for batch 360 --> 0.6272004842758179 at epoch 0


400it [04:45,  1.77it/s]

loss for batch 400 --> 0.812451958656311 at epoch 0


440it [05:15,  1.35it/s]

loss for batch 440 --> 0.8343583345413208 at epoch 0


480it [05:43,  1.74it/s]

loss for batch 480 --> 0.9245427250862122 at epoch 0


520it [06:12,  1.78it/s]

loss for batch 520 --> 0.9552139639854431 at epoch 0


560it [06:42,  1.19it/s]

loss for batch 560 --> 0.8317797780036926 at epoch 0


600it [07:10,  1.79it/s]

loss for batch 600 --> 0.9986124038696289 at epoch 0


640it [07:39,  1.41it/s]

loss for batch 640 --> 0.8551827073097229 at epoch 0


680it [08:07,  1.70it/s]

loss for batch 680 --> 0.6772903800010681 at epoch 0


720it [08:37,  1.48it/s]

loss for batch 720 --> 0.9736149311065674 at epoch 0


760it [09:08,  1.10it/s]

loss for batch 760 --> 1.0864001512527466 at epoch 0


800it [09:37,  1.29it/s]

loss for batch 800 --> 0.7208545804023743 at epoch 0


840it [10:05,  1.45it/s]

loss for batch 840 --> 0.9044332504272461 at epoch 0


880it [10:34,  1.84it/s]

loss for batch 880 --> 0.5807430744171143 at epoch 0


920it [11:02,  1.78it/s]

loss for batch 920 --> 0.6015552282333374 at epoch 0


960it [11:30,  1.38it/s]

loss for batch 960 --> 1.0609254837036133 at epoch 0


1000it [11:59,  1.52it/s]

loss for batch 1000 --> 0.825131893157959 at epoch 0


1040it [12:28,  1.66it/s]

loss for batch 1040 --> 0.5045881271362305 at epoch 0


1080it [12:58,  1.70it/s]

loss for batch 1080 --> 1.1042442321777344 at epoch 0


1120it [13:27,  1.30it/s]

loss for batch 1120 --> 0.9168157577514648 at epoch 0


1137it [13:40,  1.39it/s]
100%|██████████| 758/758 [01:13<00:00, 10.26it/s]


1814 out of 3030
acc of 59.86798679867987%


40it [00:29,  1.49it/s]

loss for batch 40 --> 1.0039572715759277 at epoch 1


80it [00:58,  1.51it/s]

loss for batch 80 --> 0.49586743116378784 at epoch 1


120it [01:27,  1.26it/s]

loss for batch 120 --> 0.4452100396156311 at epoch 1


160it [01:54,  1.47it/s]

loss for batch 160 --> 0.9273083209991455 at epoch 1


200it [02:23,  1.42it/s]

loss for batch 200 --> 0.8402706384658813 at epoch 1


240it [02:51,  1.49it/s]

loss for batch 240 --> 0.48026368021965027 at epoch 1


280it [03:19,  1.27it/s]

loss for batch 280 --> 0.7173271179199219 at epoch 1


320it [03:48,  1.76it/s]

loss for batch 320 --> 0.890231728553772 at epoch 1


360it [04:18,  1.28it/s]

loss for batch 360 --> 0.6262438893318176 at epoch 1


400it [04:43,  1.54it/s]

loss for batch 400 --> 0.8341893553733826 at epoch 1


440it [05:13,  1.47it/s]

loss for batch 440 --> 0.46418026089668274 at epoch 1


480it [05:40,  1.82it/s]

loss for batch 480 --> 0.7094260454177856 at epoch 1


520it [06:10,  1.43it/s]

loss for batch 520 --> 1.2022100687026978 at epoch 1


560it [06:39,  1.15it/s]

loss for batch 560 --> 0.7691429853439331 at epoch 1


600it [07:06,  1.50it/s]

loss for batch 600 --> 0.4340836703777313 at epoch 1


640it [07:34,  1.31it/s]

loss for batch 640 --> 1.1267638206481934 at epoch 1


680it [08:03,  1.47it/s]

loss for batch 680 --> 0.45323342084884644 at epoch 1


720it [08:30,  1.42it/s]

loss for batch 720 --> 0.9644420146942139 at epoch 1


760it [08:59,  1.43it/s]

loss for batch 760 --> 0.6568692326545715 at epoch 1


800it [09:27,  1.35it/s]

loss for batch 800 --> 0.938772976398468 at epoch 1


840it [09:58,  1.26it/s]

loss for batch 840 --> 0.7216342687606812 at epoch 1


880it [10:26,  1.11it/s]

loss for batch 880 --> 0.6449624300003052 at epoch 1


920it [10:55,  1.30it/s]

loss for batch 920 --> 0.5198278427124023 at epoch 1


960it [11:26,  1.19it/s]

loss for batch 960 --> 0.5282492637634277 at epoch 1


1000it [11:59,  1.13it/s]

loss for batch 1000 --> 0.9665766954421997 at epoch 1


1040it [12:28,  1.52it/s]

loss for batch 1040 --> 0.9803767800331116 at epoch 1


1080it [12:59,  1.27it/s]

loss for batch 1080 --> 0.9441114664077759 at epoch 1


1120it [13:28,  1.44it/s]

loss for batch 1120 --> 1.012452244758606 at epoch 1


1137it [13:38,  1.39it/s]
100%|██████████| 758/758 [01:14<00:00, 10.15it/s]


1907 out of 3030
acc of 62.93729372937293%


40it [00:27,  1.48it/s]

loss for batch 40 --> 0.4693128764629364 at epoch 2


80it [00:56,  1.03s/it]

loss for batch 80 --> 0.44614094495773315 at epoch 2


120it [01:24,  1.40it/s]

loss for batch 120 --> 0.3921858072280884 at epoch 2


160it [01:55,  1.35it/s]

loss for batch 160 --> 0.6634893417358398 at epoch 2


200it [02:24,  1.34it/s]

loss for batch 200 --> 0.23800566792488098 at epoch 2


240it [02:51,  1.42it/s]

loss for batch 240 --> 0.4145723581314087 at epoch 2


280it [03:20,  1.49it/s]

loss for batch 280 --> 0.6914445757865906 at epoch 2


320it [03:52,  1.29it/s]

loss for batch 320 --> 0.7240576148033142 at epoch 2


360it [04:21,  1.41it/s]

loss for batch 360 --> 0.6963423490524292 at epoch 2


400it [04:51,  1.45it/s]

loss for batch 400 --> 0.48045217990875244 at epoch 2


440it [05:20,  1.35it/s]

loss for batch 440 --> 0.5638783574104309 at epoch 2


480it [05:47,  1.30it/s]

loss for batch 480 --> 0.463367223739624 at epoch 2


520it [06:17,  1.38it/s]

loss for batch 520 --> 0.9598110318183899 at epoch 2


560it [06:46,  1.57it/s]

loss for batch 560 --> 0.49293240904808044 at epoch 2


600it [07:15,  1.20it/s]

loss for batch 600 --> 0.5166592001914978 at epoch 2


640it [07:44,  1.52it/s]

loss for batch 640 --> 0.25701695680618286 at epoch 2


680it [08:12,  1.42it/s]

loss for batch 680 --> 0.5492500066757202 at epoch 2


720it [08:41,  1.50it/s]

loss for batch 720 --> 0.5498210191726685 at epoch 2


760it [09:08,  1.38it/s]

loss for batch 760 --> 0.670006275177002 at epoch 2


800it [09:38,  1.31it/s]

loss for batch 800 --> 0.5101931095123291 at epoch 2


840it [10:04,  1.72it/s]

loss for batch 840 --> 0.2934879660606384 at epoch 2


880it [10:33,  1.38it/s]

loss for batch 880 --> 0.32918938994407654 at epoch 2


920it [11:02,  1.49it/s]

loss for batch 920 --> 0.24953801929950714 at epoch 2


960it [11:33,  1.22it/s]

loss for batch 960 --> 0.6236779093742371 at epoch 2


1000it [12:02,  1.18it/s]

loss for batch 1000 --> 0.18219737708568573 at epoch 2


1040it [12:31,  1.28it/s]

loss for batch 1040 --> 0.49757689237594604 at epoch 2


1080it [12:59,  1.12it/s]

loss for batch 1080 --> 0.4389224648475647 at epoch 2


1120it [13:27,  1.72it/s]

loss for batch 1120 --> 0.1273920238018036 at epoch 2


1137it [13:39,  1.39it/s]


In [20]:
submission_csv = pd.read_csv("/kaggle/input/contradictory-my-dear-watson/test.csv")
submission_csv.drop(columns=["lang_abv", "language"])

Unnamed: 0,id,premise,hypothesis
0,c6d58c3f69,بکس، کیسی، راہیل، یسعیاہ، کیلی، کیلی، اور کولم...,"کیسی کے لئے کوئی یادگار نہیں ہوگا, کولمین ہائی..."
1,cefcc82292,هذا هو ما تم نصحنا به.,عندما يتم إخبارهم بما يجب عليهم فعله ، فشلت ال...
2,e98005252c,et cela est en grande partie dû au fait que le...,Les mères se droguent.
3,58518c10ba,与城市及其他公民及社区组织代表就IMA的艺术发展进行对话&amp,IMA与其他组织合作，因为它们都依靠共享资金。
4,c32b0d16df,Она все еще была там.,"Мы думали, что она ушла, однако, она осталась."
...,...,...,...
5190,5f90dd59b0,نیند نے وعدہ کیا کہ موٹل نے سوال میں تحقیق کی.,نیمیتھ کو موٹل کی تفتیش کے لئے معاوضہ دیا جارہ...
5191,f357a04e86,The rock has a soft texture and can be bough...,The rock is harder than most types of rock.
5192,1f0ea92118,她目前的存在，并考虑到他与沃佛斯顿争执的本质，那是尴尬的。,她在与Wolverstone的打斗结束后才在场的事实被看作是很尴尬的。
5193,0407b48afb,isn't it i can remember i've only been here ei...,I could see downtown Dallas from where I lived...


In [21]:
submission_dataset = CustomDatasetSubmission(df=submission_csv.drop(columns=["lang_abv", "language"]), max_length=max_length, tokenizer=tokenizer)

In [22]:
submission_dataloder = DataLoader(dataset=submission_dataset, batch_size=2, shuffle=False, generator=torch.Generator(device=device), collate_fn=submission_dataset.collate_fn)

In [23]:
final_output = {
    "id": [],
    "prediction": []
}

model.eval()
with torch.inference_mode():
    for batch in tqdm(submission_dataloder):
        logits = model(batch["input_ids"], attention_mask=batch["attention_mask"], token_type_ids=batch["token_type_ids"])
        percents = torch.softmax(logits, dim=1)
        preds = torch.argmax(percents, dim=1)
        for idx, prediction in zip(batch["ids"], preds.view(-1).tolist()):
            final_output["id"].append(idx)
            final_output["prediction"].append(prediction)


100%|██████████| 2598/2598 [01:54<00:00, 22.62it/s]


In [24]:
final_output_df = pd.DataFrame(final_output)
final_output_df.to_csv("./submission.csv", index=False)

In [25]:
final_output_df

Unnamed: 0,id,prediction
0,c6d58c3f69,1
1,cefcc82292,1
2,e98005252c,0
3,58518c10ba,1
4,c32b0d16df,2
...,...,...
5190,5f90dd59b0,1
5191,f357a04e86,1
5192,1f0ea92118,1
5193,0407b48afb,0
