<a href="https://www.kaggle.com/code/evelynartoria/solving-disaster-tweets-dataset-with-mlp-pytorch?scriptVersionId=188262232" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

!kaggle competitions download -c nlp-getting-started

!unzip -x ./nlp-getting-started.zip

In [1]:
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
generator = torch.Generator(device=device)
torch.set_default_device(device)
print(f"default device set to {device}")

default device set to cuda


In [3]:
bert_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
sample_text = ["asfognofgnsdfg", "fgdofgnigifnfdpsfmgdpmgsp"]
print(bert_tokenizer(sample_text, add_special_tokens=True, max_length=512, truncation=True, padding=True))

{'input_ids': [[101, 2004, 14876, 26745, 2546, 16206, 16150, 2546, 2290, 102, 0, 0, 0, 0, 0, 0, 0], [101, 1042, 2290, 3527, 2546, 29076, 5856, 2546, 2078, 2546, 18927, 22747, 24798, 18927, 24798, 13102, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [5]:
train_dataset = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
train_dataset.sample(10)

Unnamed: 0,id,keyword,location,text,target
1789,2568,crash,"21.462446,-158.022017",The Next Financial Crash. 'The Writing is on t...,0
5500,7849,quarantined,"Cumming, GA",Reddit's new content policy goes into effect m...,0
738,1069,bleeding,"Basketball City, USA",@burberryant bleeding on the brain don't know ...,0
6472,9257,sunk,Beacon Hills,'Blaaaaaaa' he said as he sunk his face into y...,0
1339,1937,burning%20buildings,,@foxnewsvideo @AIIAmericanGirI @ANHQDC So ... ...,1
6066,8665,sinkhole,Êwagger!ÌominicanÌ÷,#LoMasVisto THOUSANDS OF HIPSTERS FEARED LOST:...,1
3274,4699,epicentre,,[Question] Is anybody else having this problem...,0
4298,6104,hellfire,"Denver, Colorado",(Also I dont think sewing thought a leather be...,0
5882,8401,sandstorm,hkXfYMhEx,Watch This Airport Get Swallowed Up By A Sands...,1
911,1318,bloody,,Meet the bloody RS5 http://t.co/RVczMimfVx,0


In [6]:
train_dataset.drop(columns=["location", "keyword"], inplace=True)

In [7]:
bert_tokenizer.pad_token_id

0

In [8]:
class TweetsDataset(Dataset):
    def __init__(self, dataset: pd.DataFrame, tokenizer: object, max_length: int):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        inputs = self.dataset["text"].iloc[idx]
        labels = self.dataset["target"].iloc[idx]
        encoded_dict = self.tokenizer(inputs, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt")

        return {
            "input_ids": encoded_dict["input_ids"].flatten(),
            "type_ids": encoded_dict["token_type_ids"].flatten(),
            "attention_mask": encoded_dict["attention_mask"].flatten(),
            "label": torch.tensor(labels, dtype=torch.long)
        }
        
    @staticmethod
    def collate_fn(batch):
        batch_input_ids = [item["input_ids"] for item in batch]
        batch_type_ids = [item["type_ids"] for item in batch]
        batch_masks = [item["attention_mask"] for item in batch]
        labels = [item["label"] for item in batch]


        batch_input_ids = pad_sequence(batch_input_ids, batch_first=True, padding_value=0)
        batch_type_ids = pad_sequence(batch_type_ids, batch_first=True, padding_value=0)
        batch_masks = pad_sequence(batch_masks, batch_first=True, padding_value=0)

        return {
            "input_ids": batch_input_ids,
            "type_ids": batch_type_ids,
            "attention_mask": batch_masks,
            "labels": torch.stack(labels)
        }



In [9]:
class TweetsDatasetTest(Dataset):
    def __init__(self, dataset: pd.DataFrame, tokenizer: object, max_length: int):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        inputs = self.dataset["text"].iloc[idx]
        encoded_dict = self.tokenizer(inputs, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt")

        return {
            "input_ids": encoded_dict["input_ids"].flatten(),
            "type_ids": encoded_dict["token_type_ids"].flatten(),
            "attention_mask": encoded_dict["attention_mask"].flatten(),
            "id": self.dataset["id"].iloc[idx]
        }
        
    @staticmethod
    def collate_fn(batch):
        batch_input_ids = [item["input_ids"] for item in batch]
        batch_type_ids = [item["type_ids"] for item in batch]
        batch_masks = [item["attention_mask"] for item in batch]
        ids = [item["id"] for item in batch]


        batch_input_ids = pad_sequence(batch_input_ids, batch_first=True, padding_value=0)
        batch_type_ids = pad_sequence(batch_type_ids, batch_first=True, padding_value=0)
        batch_masks = pad_sequence(batch_masks, batch_first=True, padding_value=0)

        return {
            "input_ids": batch_input_ids,
            "type_ids": batch_type_ids,
            "attention_mask": batch_masks,
            "ids": ids
        }


In [10]:
max_length = max([len(text) for text in train_dataset["text"]])
print(max_length)
loaded_dataset = TweetsDataset(dataset=train_dataset, tokenizer=bert_tokenizer, max_length=max_length)

157


In [11]:
train_dataloader = DataLoader(dataset=loaded_dataset, batch_size=2, shuffle=True, generator=generator, collate_fn=loaded_dataset.collate_fn)
next(iter(train_dataloader))

{'input_ids': tensor([[  101,  1045, 16393, 14588,  2000,  1996,  1052,  1012,  1051,  1012,
           1052,  1012,  1041,  1012,  1998,  1996,  5255,  3121,  1997,  8680,
           2103,  1012,  1029,  1029,  1029,  1029,  1029,  1029,   102],
         [  101, 21318,  3367, 13465,  7474,  1999,  1019,  8117,  1013,  1013,
           5882,  5877,  2226,  8299,  1024,  1013,  1013,  1056,  1012,  2522,
           1013,  1021,  4160, 26952, 17914,  6784,  2581,  2615,   102]],
        device='cuda:0'),
 'type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1]], device='cu

In [12]:
class MLP(nn.Module):
    def __init__(self, vocab_size, n_embd, context_size, n_classes, dropout):
        super(MLP, self).__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # (B, T, C)
        self.position_embedding_table = nn.Embedding(context_size, n_embd) # (T, C)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(in_features=n_embd, out_features=8*8)
        self.act_fn = nn.ReLU()
        self.lm_head = nn.Linear(in_features=8*8, out_features=n_classes) # (B, T, C) @ (B, C, n_classes) --> (B, C, n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T = x.shape

        positions = torch.arange(start=0, end=T, step=1)
        x = self.token_embedding_table(x) + self.position_embedding_table(positions)
        x = self.dropout(x)
        x = self.linear(x) # (B, T, C) @ (B, C, 8*8) --> (B, T, 8*8)
        x = self.act_fn(x)
        x, _ = torch.max(x, dim=1) # max across the T dimensions
        x = self.lm_head(x) # (B, T, C) @ (B, C, n_classes) --> (B, T, n_classes)
        return x 

In [13]:
n_embd = 256
context_size = max_length
mlp = MLP(
    vocab_size=bert_tokenizer.vocab_size,
    n_embd=n_embd, context_size=context_size, 
    n_classes=1, dropout=0.45)

In [14]:
train_dataloader = DataLoader(dataset=loaded_dataset, batch_size=32, shuffle=True, generator=generator, collate_fn=loaded_dataset.collate_fn)

In [15]:
mlp.eval()
with torch.inference_mode():
    batch = next(iter(train_dataloader))
    logits = mlp(batch["input_ids"])
    test_loss_fn = nn.BCEWithLogitsLoss()
    loss = test_loss_fn(logits.view(-1), batch["labels"].to(torch.float32))
    print(logits.shape) # B, n_classes
    print(loss)

torch.Size([32, 1])
tensor(0.9840, device='cuda:0')


In [16]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    #writer = SummaryWriter()
    #writer.add_graph(model, torch.zeros(size=(32, context_size), dtype=torch.long)) # (B, T)
    for epoch in range(epochs):
        for batch_idx, batch in tqdm(enumerate(dataloader)):
            logits = model(batch["input_ids"]) # B, n_clases
            loss = loss_fn(logits.view(-1), batch["labels"].to(torch.float32))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #writer.add_scalar("Loss", loss.item(), batch_idx)

            if (batch_idx+1) % 100 == 0:
                print(f"loss for {batch_idx+1} --> {loss} at epoch {epoch+1}")
                probs = torch.sigmoid(logits)
                pred = (probs > 0.5).int()
                acc = (torch.sum((pred.view(-1) == batch["labels"])).item() / batch["labels"].shape[0]) * 100
                print(f"{acc}% accuracy for batch {batch_idx+1} at {epoch+1}")

    #writer.close()

In [17]:
optimizer = torch.optim.AdamW(params=mlp.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()
train_model(model=mlp, dataloader=train_dataloader, loss_fn=loss_fn, optimizer=optimizer, epochs=100)

106it [00:02, 48.70it/s]

loss for 100 --> 0.6196631193161011 at epoch 1
68.75% accuracy for batch 100 at 1


204it [00:04, 45.47it/s]

loss for 200 --> 0.7330230474472046 at epoch 1
56.25% accuracy for batch 200 at 1


238it [00:05, 46.65it/s]
110it [00:02, 49.96it/s]

loss for 100 --> 0.4076136350631714 at epoch 2
84.375% accuracy for batch 100 at 2


206it [00:04, 48.56it/s]

loss for 200 --> 0.5025712251663208 at epoch 2
68.75% accuracy for batch 200 at 2


238it [00:04, 49.23it/s]
109it [00:02, 49.31it/s]

loss for 100 --> 0.4620068073272705 at epoch 3
81.25% accuracy for batch 100 at 3


207it [00:04, 49.16it/s]

loss for 200 --> 0.3456932008266449 at epoch 3
90.625% accuracy for batch 200 at 3


238it [00:04, 49.11it/s]
109it [00:02, 50.03it/s]

loss for 100 --> 0.36549824476242065 at epoch 4
84.375% accuracy for batch 100 at 4


205it [00:04, 49.73it/s]

loss for 200 --> 0.4805631637573242 at epoch 4
78.125% accuracy for batch 200 at 4


238it [00:04, 49.14it/s]
107it [00:02, 49.45it/s]

loss for 100 --> 0.19871824979782104 at epoch 5
96.875% accuracy for batch 100 at 5


206it [00:04, 48.92it/s]

loss for 200 --> 0.231065571308136 at epoch 5
96.875% accuracy for batch 200 at 5


238it [00:04, 48.24it/s]
105it [00:02, 48.34it/s]

loss for 100 --> 0.10290949791669846 at epoch 6
100.0% accuracy for batch 100 at 6


208it [00:04, 49.04it/s]

loss for 200 --> 0.1512489765882492 at epoch 6
100.0% accuracy for batch 200 at 6


238it [00:04, 48.18it/s]
105it [00:02, 48.64it/s]

loss for 100 --> 0.11844538152217865 at epoch 7
96.875% accuracy for batch 100 at 7


205it [00:04, 48.37it/s]

loss for 200 --> 0.18894442915916443 at epoch 7
96.875% accuracy for batch 200 at 7


238it [00:04, 48.07it/s]
105it [00:02, 48.60it/s]

loss for 100 --> 0.04380862042307854 at epoch 8
100.0% accuracy for batch 100 at 8


205it [00:04, 48.79it/s]

loss for 200 --> 0.03131725266575813 at epoch 8
100.0% accuracy for batch 200 at 8


238it [00:04, 48.47it/s]
106it [00:02, 47.91it/s]

loss for 100 --> 0.03827829658985138 at epoch 9
100.0% accuracy for batch 100 at 9


206it [00:04, 47.98it/s]

loss for 200 --> 0.029578473418951035 at epoch 9
100.0% accuracy for batch 200 at 9


238it [00:04, 48.39it/s]
107it [00:02, 49.11it/s]

loss for 100 --> 0.07603678107261658 at epoch 10
96.875% accuracy for batch 100 at 10


205it [00:04, 48.65it/s]

loss for 200 --> 0.02070838212966919 at epoch 10
100.0% accuracy for batch 200 at 10


238it [00:04, 48.26it/s]
106it [00:02, 46.62it/s]

loss for 100 --> 0.016328580677509308 at epoch 11
100.0% accuracy for batch 100 at 11


207it [00:04, 48.93it/s]

loss for 200 --> 0.02422419935464859 at epoch 11
100.0% accuracy for batch 200 at 11


238it [00:05, 47.27it/s]
110it [00:02, 50.24it/s]

loss for 100 --> 0.2028253823518753 at epoch 12
93.75% accuracy for batch 100 at 12


205it [00:04, 49.23it/s]

loss for 200 --> 0.01227628905326128 at epoch 12
100.0% accuracy for batch 200 at 12


238it [00:04, 48.98it/s]
107it [00:02, 48.77it/s]

loss for 100 --> 0.008143090642988682 at epoch 13
100.0% accuracy for batch 100 at 13


208it [00:04, 49.84it/s]

loss for 200 --> 0.010427107103168964 at epoch 13
100.0% accuracy for batch 200 at 13


238it [00:04, 49.11it/s]
109it [00:02, 48.98it/s]

loss for 100 --> 0.06794191151857376 at epoch 14
96.875% accuracy for batch 100 at 14


206it [00:04, 48.96it/s]

loss for 200 --> 0.011565780267119408 at epoch 14
100.0% accuracy for batch 200 at 14


238it [00:04, 49.47it/s]
108it [00:02, 49.82it/s]

loss for 100 --> 0.012561476789414883 at epoch 15
100.0% accuracy for batch 100 at 15


207it [00:04, 50.22it/s]

loss for 200 --> 0.03196942061185837 at epoch 15
100.0% accuracy for batch 200 at 15


238it [00:04, 49.71it/s]
106it [00:02, 49.58it/s]

loss for 100 --> 0.031048484146595 at epoch 16
96.875% accuracy for batch 100 at 16


208it [00:04, 49.81it/s]

loss for 200 --> 0.007397493347525597 at epoch 16
100.0% accuracy for batch 200 at 16


238it [00:04, 49.63it/s]
107it [00:02, 49.66it/s]

loss for 100 --> 0.009031446650624275 at epoch 17
100.0% accuracy for batch 100 at 17


207it [00:04, 49.43it/s]

loss for 200 --> 0.007884522899985313 at epoch 17
100.0% accuracy for batch 200 at 17


238it [00:04, 49.37it/s]
106it [00:02, 46.25it/s]

loss for 100 --> 0.007877523079514503 at epoch 18
100.0% accuracy for batch 100 at 18


207it [00:04, 49.11it/s]

loss for 200 --> 0.0023291553370654583 at epoch 18
100.0% accuracy for batch 200 at 18


238it [00:04, 48.74it/s]
109it [00:02, 49.54it/s]

loss for 100 --> 0.003243488259613514 at epoch 19
100.0% accuracy for batch 100 at 19


210it [00:04, 49.42it/s]

loss for 200 --> 0.003973012324422598 at epoch 19
100.0% accuracy for batch 200 at 19


238it [00:04, 49.50it/s]
109it [00:02, 49.49it/s]

loss for 100 --> 0.1432047188282013 at epoch 20
96.875% accuracy for batch 100 at 20


205it [00:04, 49.82it/s]

loss for 200 --> 0.23729687929153442 at epoch 20
96.875% accuracy for batch 200 at 20


238it [00:04, 49.59it/s]
108it [00:02, 49.48it/s]

loss for 100 --> 0.0053978050127625465 at epoch 21
100.0% accuracy for batch 100 at 21


207it [00:04, 49.32it/s]

loss for 200 --> 0.0017738027963787317 at epoch 21
100.0% accuracy for batch 200 at 21


238it [00:04, 49.73it/s]
106it [00:02, 47.62it/s]

loss for 100 --> 0.018939757719635963 at epoch 22
100.0% accuracy for batch 100 at 22


205it [00:04, 49.44it/s]

loss for 200 --> 0.16280891001224518 at epoch 22
96.875% accuracy for batch 200 at 22


238it [00:04, 49.40it/s]
110it [00:02, 50.04it/s]

loss for 100 --> 0.002422731602564454 at epoch 23
100.0% accuracy for batch 100 at 23


205it [00:04, 49.66it/s]

loss for 200 --> 0.011178381741046906 at epoch 23
100.0% accuracy for batch 200 at 23


238it [00:04, 49.89it/s]
108it [00:02, 49.10it/s]

loss for 100 --> 0.01014401763677597 at epoch 24
100.0% accuracy for batch 100 at 24


205it [00:04, 50.16it/s]

loss for 200 --> 0.001527271349914372 at epoch 24
100.0% accuracy for batch 200 at 24


238it [00:04, 48.55it/s]
109it [00:02, 50.11it/s]

loss for 100 --> 0.0031660578679293394 at epoch 25
100.0% accuracy for batch 100 at 25


210it [00:04, 50.22it/s]

loss for 200 --> 0.003650715108960867 at epoch 25
100.0% accuracy for batch 200 at 25


238it [00:04, 49.87it/s]
109it [00:02, 49.96it/s]

loss for 100 --> 0.004006360657513142 at epoch 26
100.0% accuracy for batch 100 at 26


207it [00:04, 49.67it/s]

loss for 200 --> 0.002559222746640444 at epoch 26
100.0% accuracy for batch 200 at 26


238it [00:04, 49.60it/s]
108it [00:02, 50.36it/s]

loss for 100 --> 0.011457357555627823 at epoch 27
100.0% accuracy for batch 100 at 27


210it [00:04, 50.39it/s]

loss for 200 --> 0.0037385113537311554 at epoch 27
100.0% accuracy for batch 200 at 27


238it [00:04, 50.35it/s]
108it [00:02, 50.06it/s]

loss for 100 --> 0.0012234740424901247 at epoch 28
100.0% accuracy for batch 100 at 28


207it [00:04, 50.28it/s]

loss for 200 --> 0.002580087399110198 at epoch 28
100.0% accuracy for batch 200 at 28


238it [00:04, 49.96it/s]
109it [00:02, 50.23it/s]

loss for 100 --> 0.002118742326274514 at epoch 29
100.0% accuracy for batch 100 at 29


205it [00:04, 50.91it/s]

loss for 200 --> 0.0013083029771223664 at epoch 29
100.0% accuracy for batch 200 at 29


238it [00:04, 50.41it/s]
105it [00:02, 50.13it/s]

loss for 100 --> 0.0024348448496311903 at epoch 30
100.0% accuracy for batch 100 at 30


205it [00:04, 49.66it/s]

loss for 200 --> 0.0017428901046514511 at epoch 30
100.0% accuracy for batch 200 at 30


238it [00:04, 49.77it/s]
107it [00:02, 50.45it/s]

loss for 100 --> 0.0048635550774633884 at epoch 31
100.0% accuracy for batch 100 at 31


208it [00:04, 49.76it/s]

loss for 200 --> 0.0008248725207522511 at epoch 31
100.0% accuracy for batch 200 at 31


238it [00:04, 49.48it/s]
105it [00:02, 50.29it/s]

loss for 100 --> 0.0017065935535356402 at epoch 32
100.0% accuracy for batch 100 at 32


209it [00:04, 48.18it/s]

loss for 200 --> 0.0011307941749691963 at epoch 32
100.0% accuracy for batch 200 at 32


238it [00:04, 49.36it/s]
105it [00:02, 50.21it/s]

loss for 100 --> 0.0007572636241093278 at epoch 33
100.0% accuracy for batch 100 at 33


206it [00:04, 50.34it/s]

loss for 200 --> 0.002235831692814827 at epoch 33
100.0% accuracy for batch 200 at 33


238it [00:04, 50.12it/s]
106it [00:02, 50.02it/s]

loss for 100 --> 0.000561692169867456 at epoch 34
100.0% accuracy for batch 100 at 34


208it [00:04, 50.90it/s]

loss for 200 --> 0.17769958078861237 at epoch 34
96.875% accuracy for batch 200 at 34


238it [00:04, 50.09it/s]
108it [00:02, 50.75it/s]

loss for 100 --> 0.0017125962767750025 at epoch 35
100.0% accuracy for batch 100 at 35


210it [00:04, 50.95it/s]

loss for 200 --> 0.0012594436993822455 at epoch 35
100.0% accuracy for batch 200 at 35


238it [00:04, 50.63it/s]
106it [00:02, 50.31it/s]

loss for 100 --> 0.0016498027835041285 at epoch 36
100.0% accuracy for batch 100 at 36


205it [00:04, 48.60it/s]

loss for 200 --> 0.2968498468399048 at epoch 36
96.875% accuracy for batch 200 at 36


238it [00:04, 49.50it/s]
107it [00:02, 50.12it/s]

loss for 100 --> 0.0011158420238643885 at epoch 37
100.0% accuracy for batch 100 at 37


209it [00:04, 50.37it/s]

loss for 200 --> 0.0007388542871922255 at epoch 37
100.0% accuracy for batch 200 at 37


238it [00:04, 50.01it/s]
109it [00:02, 49.98it/s]

loss for 100 --> 0.000789744546636939 at epoch 38
100.0% accuracy for batch 100 at 38


209it [00:04, 50.02it/s]

loss for 200 --> 0.02250836417078972 at epoch 38
100.0% accuracy for batch 200 at 38


238it [00:04, 49.35it/s]
105it [00:02, 49.23it/s]

loss for 100 --> 0.0019964929670095444 at epoch 39
100.0% accuracy for batch 100 at 39


206it [00:04, 50.43it/s]

loss for 200 --> 0.004224894568324089 at epoch 39
100.0% accuracy for batch 200 at 39


238it [00:04, 49.48it/s]
107it [00:02, 50.44it/s]

loss for 100 --> 0.0019998629577457905 at epoch 40
100.0% accuracy for batch 100 at 40


207it [00:04, 49.58it/s]

loss for 200 --> 0.0008808790007606149 at epoch 40
100.0% accuracy for batch 200 at 40


238it [00:04, 49.60it/s]
109it [00:02, 49.17it/s]

loss for 100 --> 0.00727866031229496 at epoch 41
100.0% accuracy for batch 100 at 41


210it [00:04, 49.75it/s]

loss for 200 --> 0.0018666083924472332 at epoch 41
100.0% accuracy for batch 200 at 41


238it [00:04, 49.01it/s]
106it [00:02, 49.67it/s]

loss for 100 --> 0.10938652604818344 at epoch 42
96.875% accuracy for batch 100 at 42


207it [00:04, 50.04it/s]

loss for 200 --> 0.0005938064423389733 at epoch 42
100.0% accuracy for batch 200 at 42


238it [00:04, 49.29it/s]
106it [00:02, 49.42it/s]

loss for 100 --> 0.056961387395858765 at epoch 43
96.875% accuracy for batch 100 at 43


207it [00:04, 49.76it/s]

loss for 200 --> 0.008090727031230927 at epoch 43
100.0% accuracy for batch 200 at 43


238it [00:04, 49.68it/s]
109it [00:02, 49.77it/s]

loss for 100 --> 0.00337290414609015 at epoch 44
100.0% accuracy for batch 100 at 44


209it [00:04, 49.74it/s]

loss for 200 --> 0.0006573415594175458 at epoch 44
100.0% accuracy for batch 200 at 44


238it [00:04, 49.04it/s]
108it [00:02, 49.82it/s]

loss for 100 --> 0.0012551688123494387 at epoch 45
100.0% accuracy for batch 100 at 45


206it [00:04, 50.14it/s]

loss for 200 --> 0.0010237484239041805 at epoch 45
100.0% accuracy for batch 200 at 45


238it [00:04, 49.61it/s]
108it [00:02, 49.18it/s]

loss for 100 --> 0.0005310550914146006 at epoch 46
100.0% accuracy for batch 100 at 46


209it [00:04, 50.37it/s]

loss for 200 --> 0.0013530475553125143 at epoch 46
100.0% accuracy for batch 200 at 46


238it [00:04, 49.92it/s]
107it [00:02, 47.19it/s]

loss for 100 --> 0.0006308340234681964 at epoch 47
100.0% accuracy for batch 100 at 47


207it [00:04, 50.11it/s]

loss for 200 --> 0.0012474701507017016 at epoch 47
100.0% accuracy for batch 200 at 47


238it [00:04, 49.70it/s]
105it [00:02, 49.16it/s]

loss for 100 --> 0.0016230730107054114 at epoch 48
100.0% accuracy for batch 100 at 48


205it [00:04, 49.96it/s]

loss for 200 --> 0.000816433341242373 at epoch 48
100.0% accuracy for batch 200 at 48


238it [00:04, 49.86it/s]
106it [00:02, 50.59it/s]

loss for 100 --> 0.00030893372604623437 at epoch 49
100.0% accuracy for batch 100 at 49


208it [00:04, 50.48it/s]

loss for 200 --> 0.0053189839236438274 at epoch 49
100.0% accuracy for batch 200 at 49


238it [00:04, 49.96it/s]
108it [00:02, 50.11it/s]

loss for 100 --> 0.0009526519570499659 at epoch 50
100.0% accuracy for batch 100 at 50


206it [00:04, 49.37it/s]

loss for 200 --> 0.0014621877344325185 at epoch 50
100.0% accuracy for batch 200 at 50


238it [00:04, 49.76it/s]
110it [00:02, 49.99it/s]

loss for 100 --> 0.03237089142203331 at epoch 51
96.875% accuracy for batch 100 at 51


205it [00:04, 49.55it/s]

loss for 200 --> 0.008840985596179962 at epoch 51
100.0% accuracy for batch 200 at 51


238it [00:04, 48.79it/s]
106it [00:02, 49.65it/s]

loss for 100 --> 0.0031066020019352436 at epoch 52
100.0% accuracy for batch 100 at 52


206it [00:04, 48.01it/s]

loss for 200 --> 0.001080155256204307 at epoch 52
100.0% accuracy for batch 200 at 52


238it [00:04, 49.14it/s]
105it [00:02, 49.64it/s]

loss for 100 --> 0.0006939896265976131 at epoch 53
100.0% accuracy for batch 100 at 53


206it [00:04, 50.03it/s]

loss for 200 --> 0.013158302754163742 at epoch 53
100.0% accuracy for batch 200 at 53


238it [00:04, 49.80it/s]
109it [00:02, 49.99it/s]

loss for 100 --> 0.0009096712456084788 at epoch 54
100.0% accuracy for batch 100 at 54


209it [00:04, 50.12it/s]

loss for 200 --> 0.0009391712956130505 at epoch 54
100.0% accuracy for batch 200 at 54


238it [00:04, 49.60it/s]
109it [00:02, 50.35it/s]

loss for 100 --> 0.0008367944392375648 at epoch 55
100.0% accuracy for batch 100 at 55


205it [00:04, 49.17it/s]

loss for 200 --> 0.0008687726804055274 at epoch 55
100.0% accuracy for batch 200 at 55


238it [00:04, 49.48it/s]
107it [00:02, 49.81it/s]

loss for 100 --> 0.0008604664471931756 at epoch 56
100.0% accuracy for batch 100 at 56


208it [00:04, 50.08it/s]

loss for 200 --> 0.0018661043141037226 at epoch 56
100.0% accuracy for batch 200 at 56


238it [00:04, 50.38it/s]
105it [00:02, 50.64it/s]

loss for 100 --> 0.0004456276074051857 at epoch 57
100.0% accuracy for batch 100 at 57


207it [00:04, 48.66it/s]

loss for 200 --> 0.0015173343708738685 at epoch 57
100.0% accuracy for batch 200 at 57


238it [00:04, 49.64it/s]
106it [00:02, 49.73it/s]

loss for 100 --> 0.0009332338813692331 at epoch 58
100.0% accuracy for batch 100 at 58


206it [00:04, 50.88it/s]

loss for 200 --> 0.00038276807754300535 at epoch 58
100.0% accuracy for batch 200 at 58


238it [00:04, 49.97it/s]
107it [00:02, 50.46it/s]

loss for 100 --> 0.0030839815735816956 at epoch 59
100.0% accuracy for batch 100 at 59


209it [00:04, 50.44it/s]

loss for 200 --> 0.01924067549407482 at epoch 59
100.0% accuracy for batch 200 at 59


238it [00:04, 50.33it/s]
108it [00:02, 50.73it/s]

loss for 100 --> 0.006970221642404795 at epoch 60
100.0% accuracy for batch 100 at 60


210it [00:04, 50.62it/s]

loss for 200 --> 0.0009826929308474064 at epoch 60
100.0% accuracy for batch 200 at 60


238it [00:04, 50.66it/s]
108it [00:02, 50.56it/s]

loss for 100 --> 0.00040008421638049185 at epoch 61
100.0% accuracy for batch 100 at 61


208it [00:04, 51.02it/s]

loss for 200 --> 0.001095032086595893 at epoch 61
100.0% accuracy for batch 200 at 61


238it [00:04, 50.86it/s]
108it [00:02, 50.32it/s]

loss for 100 --> 0.00364696211181581 at epoch 62
100.0% accuracy for batch 100 at 62


210it [00:04, 50.34it/s]

loss for 200 --> 0.0013871390838176012 at epoch 62
100.0% accuracy for batch 200 at 62


238it [00:04, 50.54it/s]
108it [00:02, 50.11it/s]

loss for 100 --> 0.0016069868579506874 at epoch 63
100.0% accuracy for batch 100 at 63


210it [00:04, 50.16it/s]

loss for 200 --> 0.00343328807502985 at epoch 63
100.0% accuracy for batch 200 at 63


238it [00:04, 50.28it/s]
106it [00:02, 49.59it/s]

loss for 100 --> 0.0007061504293233156 at epoch 64
100.0% accuracy for batch 100 at 64


208it [00:04, 50.07it/s]

loss for 200 --> 0.002454636851325631 at epoch 64
100.0% accuracy for batch 200 at 64


238it [00:04, 49.16it/s]
109it [00:02, 50.19it/s]

loss for 100 --> 0.0009237498743459582 at epoch 65
100.0% accuracy for batch 100 at 65


205it [00:04, 50.46it/s]

loss for 200 --> 0.009777945466339588 at epoch 65
100.0% accuracy for batch 200 at 65


238it [00:04, 50.21it/s]
109it [00:02, 50.04it/s]

loss for 100 --> 0.0005401649978011847 at epoch 66
100.0% accuracy for batch 100 at 66


205it [00:04, 50.45it/s]

loss for 200 --> 0.0030968368519097567 at epoch 66
100.0% accuracy for batch 200 at 66


238it [00:04, 50.23it/s]
108it [00:02, 50.87it/s]

loss for 100 --> 0.000549503427464515 at epoch 67
100.0% accuracy for batch 100 at 67


209it [00:04, 49.98it/s]

loss for 200 --> 0.0006923886248841882 at epoch 67
100.0% accuracy for batch 200 at 67


238it [00:04, 50.11it/s]
106it [00:02, 49.34it/s]

loss for 100 --> 0.0006730768363922834 at epoch 68
100.0% accuracy for batch 100 at 68


208it [00:04, 50.17it/s]

loss for 200 --> 0.0009641444776207209 at epoch 68
100.0% accuracy for batch 200 at 68


238it [00:04, 50.16it/s]
108it [00:02, 49.84it/s]

loss for 100 --> 0.0008702694904059172 at epoch 69
100.0% accuracy for batch 100 at 69


209it [00:04, 50.46it/s]

loss for 200 --> 0.0006447415798902512 at epoch 69
100.0% accuracy for batch 200 at 69


238it [00:04, 50.15it/s]
108it [00:02, 50.45it/s]

loss for 100 --> 0.0005048208986409009 at epoch 70
100.0% accuracy for batch 100 at 70


207it [00:04, 47.02it/s]

loss for 200 --> 0.0004911139840260148 at epoch 70
100.0% accuracy for batch 200 at 70


238it [00:04, 49.30it/s]
106it [00:02, 50.14it/s]

loss for 100 --> 0.0010118086356669664 at epoch 71
100.0% accuracy for batch 100 at 71


207it [00:04, 49.90it/s]

loss for 200 --> 0.0025056274607777596 at epoch 71
100.0% accuracy for batch 200 at 71


238it [00:04, 49.91it/s]
109it [00:02, 50.20it/s]

loss for 100 --> 0.012499356642365456 at epoch 72
100.0% accuracy for batch 100 at 72


209it [00:04, 50.14it/s]

loss for 200 --> 0.0018420531414449215 at epoch 72
100.0% accuracy for batch 200 at 72


238it [00:04, 49.73it/s]
110it [00:02, 49.92it/s]

loss for 100 --> 0.001054535387083888 at epoch 73
100.0% accuracy for batch 100 at 73


209it [00:04, 50.11it/s]

loss for 200 --> 0.0007307026535272598 at epoch 73
100.0% accuracy for batch 200 at 73


238it [00:04, 50.22it/s]
110it [00:02, 50.39it/s]

loss for 100 --> 0.048881128430366516 at epoch 74
96.875% accuracy for batch 100 at 74


205it [00:04, 50.01it/s]

loss for 200 --> 0.0024084860924631357 at epoch 74
100.0% accuracy for batch 200 at 74


238it [00:04, 50.06it/s]
105it [00:02, 49.78it/s]

loss for 100 --> 0.11167696863412857 at epoch 75
96.875% accuracy for batch 100 at 75


210it [00:04, 50.44it/s]

loss for 200 --> 0.0011864593252539635 at epoch 75
100.0% accuracy for batch 200 at 75


238it [00:04, 50.07it/s]
106it [00:02, 50.40it/s]

loss for 100 --> 0.0014487005537375808 at epoch 76
100.0% accuracy for batch 100 at 76


208it [00:04, 49.92it/s]

loss for 200 --> 0.0006407696637324989 at epoch 76
100.0% accuracy for batch 200 at 76


238it [00:04, 49.48it/s]
106it [00:02, 47.79it/s]

loss for 100 --> 0.0013146899873390794 at epoch 77
100.0% accuracy for batch 100 at 77


207it [00:04, 49.39it/s]

loss for 200 --> 0.0021737487986683846 at epoch 77
100.0% accuracy for batch 200 at 77


238it [00:04, 49.14it/s]
108it [00:02, 50.70it/s]

loss for 100 --> 0.001248287153430283 at epoch 78
100.0% accuracy for batch 100 at 78


204it [00:04, 47.51it/s]

loss for 200 --> 0.001539253513328731 at epoch 78
100.0% accuracy for batch 200 at 78


238it [00:04, 50.13it/s]
109it [00:02, 49.98it/s]

loss for 100 --> 0.001410324708558619 at epoch 79
100.0% accuracy for batch 100 at 79


206it [00:04, 50.53it/s]

loss for 200 --> 0.0006746791186742485 at epoch 79
100.0% accuracy for batch 200 at 79


238it [00:04, 50.20it/s]
106it [00:02, 49.90it/s]

loss for 100 --> 0.0006206037942320108 at epoch 80
100.0% accuracy for batch 100 at 80


210it [00:04, 50.43it/s]

loss for 200 --> 0.0008765977108851075 at epoch 80
100.0% accuracy for batch 200 at 80


238it [00:04, 50.00it/s]
105it [00:02, 50.04it/s]

loss for 100 --> 0.06691267341375351 at epoch 81
96.875% accuracy for batch 100 at 81


208it [00:04, 49.33it/s]

loss for 200 --> 0.0021606681402772665 at epoch 81
100.0% accuracy for batch 200 at 81


238it [00:04, 50.00it/s]
106it [00:02, 50.13it/s]

loss for 100 --> 0.0005065504228696227 at epoch 82
100.0% accuracy for batch 100 at 82


208it [00:04, 50.80it/s]

loss for 200 --> 0.1388929784297943 at epoch 82
96.875% accuracy for batch 200 at 82


238it [00:04, 50.14it/s]
107it [00:02, 48.69it/s]

loss for 100 --> 0.0010167040163651109 at epoch 83
100.0% accuracy for batch 100 at 83


209it [00:04, 46.46it/s]

loss for 200 --> 0.0010418868623673916 at epoch 83
100.0% accuracy for batch 200 at 83


238it [00:04, 49.03it/s]
107it [00:02, 48.95it/s]

loss for 100 --> 0.0009686416015028954 at epoch 84
100.0% accuracy for batch 100 at 84


207it [00:04, 49.85it/s]

loss for 200 --> 0.0007077198242768645 at epoch 84
100.0% accuracy for batch 200 at 84


238it [00:04, 49.62it/s]
106it [00:02, 49.52it/s]

loss for 100 --> 0.007212840486317873 at epoch 85
100.0% accuracy for batch 100 at 85


207it [00:04, 49.49it/s]

loss for 200 --> 0.0005043350392952561 at epoch 85
100.0% accuracy for batch 200 at 85


238it [00:04, 49.52it/s]
109it [00:02, 50.73it/s]

loss for 100 --> 0.0008194638649001718 at epoch 86
100.0% accuracy for batch 100 at 86


205it [00:04, 49.96it/s]

loss for 200 --> 0.004668733105063438 at epoch 86
100.0% accuracy for batch 200 at 86


238it [00:04, 50.14it/s]
109it [00:02, 49.46it/s]

loss for 100 --> 0.14892591536045074 at epoch 87
96.875% accuracy for batch 100 at 87


208it [00:04, 50.85it/s]

loss for 200 --> 0.06822920590639114 at epoch 87
96.875% accuracy for batch 200 at 87


238it [00:04, 49.79it/s]
105it [00:02, 50.01it/s]

loss for 100 --> 0.13564567267894745 at epoch 88
93.75% accuracy for batch 100 at 88


207it [00:04, 49.97it/s]

loss for 200 --> 0.001761976396664977 at epoch 88
100.0% accuracy for batch 200 at 88


238it [00:04, 50.23it/s]
106it [00:02, 50.32it/s]

loss for 100 --> 0.0004654246149584651 at epoch 89
100.0% accuracy for batch 100 at 89


207it [00:04, 50.33it/s]

loss for 200 --> 0.005596381612122059 at epoch 89
100.0% accuracy for batch 200 at 89


238it [00:04, 50.09it/s]
106it [00:02, 46.40it/s]

loss for 100 --> 0.0002679687167983502 at epoch 90
100.0% accuracy for batch 100 at 90


206it [00:04, 50.06it/s]

loss for 200 --> 0.0009088858496397734 at epoch 90
100.0% accuracy for batch 200 at 90


238it [00:04, 49.22it/s]
107it [00:02, 48.60it/s]

loss for 100 --> 0.0006331373006105423 at epoch 91
100.0% accuracy for batch 100 at 91


210it [00:04, 50.43it/s]

loss for 200 --> 0.0005244031199254096 at epoch 91
100.0% accuracy for batch 200 at 91


238it [00:04, 49.85it/s]
108it [00:02, 50.21it/s]

loss for 100 --> 0.000381566263968125 at epoch 92
100.0% accuracy for batch 100 at 92


207it [00:04, 49.57it/s]

loss for 200 --> 0.001006389269605279 at epoch 92
100.0% accuracy for batch 200 at 92


238it [00:04, 49.74it/s]
106it [00:02, 50.43it/s]

loss for 100 --> 0.0007129001896828413 at epoch 93
100.0% accuracy for batch 100 at 93


209it [00:04, 49.52it/s]

loss for 200 --> 0.0017046405700966716 at epoch 93
100.0% accuracy for batch 200 at 93


238it [00:04, 50.11it/s]
107it [00:02, 50.17it/s]

loss for 100 --> 0.00048711360432207584 at epoch 94
100.0% accuracy for batch 100 at 94


209it [00:04, 50.29it/s]

loss for 200 --> 0.08278699219226837 at epoch 94
96.875% accuracy for batch 200 at 94


238it [00:04, 50.36it/s]
106it [00:02, 49.74it/s]

loss for 100 --> 0.0008018853259272873 at epoch 95
100.0% accuracy for batch 100 at 95


206it [00:04, 50.25it/s]

loss for 200 --> 0.0006954338168725371 at epoch 95
100.0% accuracy for batch 200 at 95


238it [00:04, 49.56it/s]
105it [00:02, 49.75it/s]

loss for 100 --> 0.0004483535885810852 at epoch 96
100.0% accuracy for batch 100 at 96


206it [00:04, 50.03it/s]

loss for 200 --> 0.0009593100985512137 at epoch 96
100.0% accuracy for batch 200 at 96


238it [00:04, 49.51it/s]
107it [00:02, 50.03it/s]

loss for 100 --> 0.0016642496921122074 at epoch 97
100.0% accuracy for batch 100 at 97


209it [00:04, 49.79it/s]

loss for 200 --> 0.0005726965609937906 at epoch 97
100.0% accuracy for batch 200 at 97


238it [00:04, 49.50it/s]
108it [00:02, 50.03it/s]

loss for 100 --> 0.0024320255033671856 at epoch 98
100.0% accuracy for batch 100 at 98


208it [00:04, 49.54it/s]

loss for 200 --> 0.001289269421249628 at epoch 98
100.0% accuracy for batch 200 at 98


238it [00:04, 49.99it/s]
109it [00:02, 50.18it/s]

loss for 100 --> 0.0010118023492395878 at epoch 99
100.0% accuracy for batch 100 at 99


208it [00:04, 49.31it/s]

loss for 200 --> 0.00037531182169914246 at epoch 99
100.0% accuracy for batch 200 at 99


238it [00:04, 49.74it/s]
110it [00:02, 50.25it/s]

loss for 100 --> 0.0005499250837601721 at epoch 100
100.0% accuracy for batch 100 at 100


210it [00:04, 50.65it/s]

loss for 200 --> 0.007628720719367266 at epoch 100
100.0% accuracy for batch 200 at 100


238it [00:04, 49.76it/s]


In [18]:
test_dataset = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
test_dataset.head(5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [19]:
test_dataset.drop(columns=["keyword", "location"], inplace=True)
test_dataset.head(5)

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [20]:
loaded_test_dataset = TweetsDatasetTest(dataset=test_dataset, tokenizer=bert_tokenizer, max_length=max_length)
test_dataloader = DataLoader(dataset=loaded_test_dataset, batch_size=32, generator=generator, collate_fn=loaded_test_dataset.collate_fn, shuffle=False)

In [21]:
mlp.eval()
with torch.inference_mode():
    predictions = []
    ids = []
    for batch in test_dataloader:
        ids += batch["ids"]
        batch_size = batch["input_ids"].shape[0]
        logits = mlp(batch["input_ids"])
        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).int()
        predictions.append(preds.view(-1))

    predictions = torch.cat([preds_tensor for preds_tensor in predictions], dim=0).tolist()
    print(len(predictions))
    print(len(ids))
    print(len(test_dataset))

    df = pd.DataFrame({
        "id": ids,
        "target": predictions
    })
    
df.to_csv("/kaggle/working/submission.csv", index=False)

3263
3263
3263
