<a href="https://www.kaggle.com/code/evelynartoria/solving-disaster-tweets-dataset-with-mlp-pytorch?scriptVersionId=188261938" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

!kaggle competitions download -c nlp-getting-started

!unzip -x ./nlp-getting-started.zip

In [1]:
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
generator = torch.Generator(device=device)
torch.set_default_device(device)
print(f"default device set to {device}")

default device set to cuda


In [3]:
bert_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
sample_text = ["asfognofgnsdfg", "fgdofgnigifnfdpsfmgdpmgsp"]
print(bert_tokenizer(sample_text, add_special_tokens=True, max_length=512, truncation=True, padding=True))

{'input_ids': [[101, 2004, 14876, 26745, 2546, 16206, 16150, 2546, 2290, 102, 0, 0, 0, 0, 0, 0, 0], [101, 1042, 2290, 3527, 2546, 29076, 5856, 2546, 2078, 2546, 18927, 22747, 24798, 18927, 24798, 13102, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [5]:
train_dataset = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
train_dataset.sample(10)

Unnamed: 0,id,keyword,location,text,target
7173,10278,war%20zone,,@RobertONeill31 Getting hit by a foul ball whi...,0
4318,6132,hellfire,,The Prophet (peace be upon him) said 'Save you...,1
4137,5884,hailstorm,"Calgary, AB, Canada",600 passengers abandoned at LRT station during...,1
5910,8440,sandstorm,,Now playing: Darude - Sandstorm - radio edit h...,0
5324,7602,pandemonium,Dallas Fort-Worth,Pandemonium In Aba As Woman Delivers Baby With...,1
3825,5443,first%20responders,A Hoop Somewhere,Incase of accident the first responders would ...,0
505,730,attacked,"Livingston, IL U.S.A.",Christian Attacked by Muslims at the Temple Mo...,1
6818,9765,trapped,10 Steps Ahead. Cloud 9,Bomb head? Explosive decisions dat produced mo...,1
5720,8163,rescuers,,VIDEO: 'We're picking up bodies from water': R...,1
2068,2969,dead,,I just watched emmerdale nd I don't know who m...,1


In [6]:
train_dataset.drop(columns=["location", "keyword"], inplace=True)

In [7]:
bert_tokenizer.pad_token_id

0

In [8]:
class TweetsDataset(Dataset):
    def __init__(self, dataset: pd.DataFrame, tokenizer: object, max_length: int):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        inputs = self.dataset["text"].iloc[idx]
        labels = self.dataset["target"].iloc[idx]
        encoded_dict = self.tokenizer(inputs, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt")

        return {
            "input_ids": encoded_dict["input_ids"].flatten(),
            "type_ids": encoded_dict["token_type_ids"].flatten(),
            "attention_mask": encoded_dict["attention_mask"].flatten(),
            "label": torch.tensor(labels, dtype=torch.long)
        }
        
    @staticmethod
    def collate_fn(batch):
        batch_input_ids = [item["input_ids"] for item in batch]
        batch_type_ids = [item["type_ids"] for item in batch]
        batch_masks = [item["attention_mask"] for item in batch]
        labels = [item["label"] for item in batch]


        batch_input_ids = pad_sequence(batch_input_ids, batch_first=True, padding_value=0)
        batch_type_ids = pad_sequence(batch_type_ids, batch_first=True, padding_value=0)
        batch_masks = pad_sequence(batch_masks, batch_first=True, padding_value=0)

        return {
            "input_ids": batch_input_ids,
            "type_ids": batch_type_ids,
            "attention_mask": batch_masks,
            "labels": torch.stack(labels)
        }



In [9]:
class TweetsDatasetTest(Dataset):
    def __init__(self, dataset: pd.DataFrame, tokenizer: object, max_length: int):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        inputs = self.dataset["text"].iloc[idx]
        encoded_dict = self.tokenizer(inputs, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt")

        return {
            "input_ids": encoded_dict["input_ids"].flatten(),
            "type_ids": encoded_dict["token_type_ids"].flatten(),
            "attention_mask": encoded_dict["attention_mask"].flatten(),
            "id": self.dataset["id"].iloc[idx]
        }
        
    @staticmethod
    def collate_fn(batch):
        batch_input_ids = [item["input_ids"] for item in batch]
        batch_type_ids = [item["type_ids"] for item in batch]
        batch_masks = [item["attention_mask"] for item in batch]
        ids = [item["id"] for item in batch]


        batch_input_ids = pad_sequence(batch_input_ids, batch_first=True, padding_value=0)
        batch_type_ids = pad_sequence(batch_type_ids, batch_first=True, padding_value=0)
        batch_masks = pad_sequence(batch_masks, batch_first=True, padding_value=0)

        return {
            "input_ids": batch_input_ids,
            "type_ids": batch_type_ids,
            "attention_mask": batch_masks,
            "ids": ids
        }


In [10]:
max_length = max([len(text) for text in train_dataset["text"]])
print(max_length)
loaded_dataset = TweetsDataset(dataset=train_dataset, tokenizer=bert_tokenizer, max_length=max_length)

157


In [11]:
train_dataloader = DataLoader(dataset=loaded_dataset, batch_size=2, shuffle=True, generator=generator, collate_fn=loaded_dataset.collate_fn)
next(iter(train_dataloader))

{'input_ids': tensor([[  101,  1045, 16393, 14588,  2000,  1996,  1052,  1012,  1051,  1012,
           1052,  1012,  1041,  1012,  1998,  1996,  5255,  3121,  1997,  8680,
           2103,  1012,  1029,  1029,  1029,  1029,  1029,  1029,   102],
         [  101, 21318,  3367, 13465,  7474,  1999,  1019,  8117,  1013,  1013,
           5882,  5877,  2226,  8299,  1024,  1013,  1013,  1056,  1012,  2522,
           1013,  1021,  4160, 26952, 17914,  6784,  2581,  2615,   102]],
        device='cuda:0'),
 'type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1]], device='cu

In [12]:
class MLP(nn.Module):
    def __init__(self, vocab_size, n_embd, context_size, n_classes, dropout):
        super(MLP, self).__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # (B, T, C)
        self.position_embedding_table = nn.Embedding(context_size, n_embd) # (T, C)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(in_features=n_embd, out_features=8*8)
        self.act_fn = nn.ReLU()
        self.lm_head = nn.Linear(in_features=8*8, out_features=n_classes) # (B, T, C) @ (B, C, n_classes) --> (B, C, n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T = x.shape

        positions = torch.arange(start=0, end=T, step=1)
        x = self.token_embedding_table(x) + self.position_embedding_table(positions)
        x = self.dropout(x)
        x = self.linear(x) # (B, T, C) @ (B, C, 8*8) --> (B, T, 8*8)
        x = self.act_fn(x)
        x, _ = torch.max(x, dim=1) # max across the T dimensions
        x = self.lm_head(x) # (B, T, C) @ (B, C, n_classes) --> (B, T, n_classes)
        return x 

In [13]:
n_embd = 256
context_size = max_length
mlp = MLP(
    vocab_size=bert_tokenizer.vocab_size,
    n_embd=n_embd, context_size=context_size, 
    n_classes=1, dropout=0.25)

In [14]:
train_dataloader = DataLoader(dataset=loaded_dataset, batch_size=32, shuffle=True, generator=generator, collate_fn=loaded_dataset.collate_fn)

In [15]:
mlp.eval()
with torch.inference_mode():
    batch = next(iter(train_dataloader))
    logits = mlp(batch["input_ids"])
    test_loss_fn = nn.BCEWithLogitsLoss()
    loss = test_loss_fn(logits.view(-1), batch["labels"].to(torch.float32))
    print(logits.shape) # B, n_classes
    print(loss)

torch.Size([32, 1])
tensor(0.7082, device='cuda:0')


In [16]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    #writer = SummaryWriter()
    #writer.add_graph(model, torch.zeros(size=(32, context_size), dtype=torch.long)) # (B, T)
    for epoch in range(epochs):
        for batch_idx, batch in tqdm(enumerate(dataloader)):
            logits = model(batch["input_ids"]) # B, n_clases
            loss = loss_fn(logits.view(-1), batch["labels"].to(torch.float32))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #writer.add_scalar("Loss", loss.item(), batch_idx)

            if (batch_idx+1) % 100 == 0:
                print(f"loss for {batch_idx+1} --> {loss} at epoch {epoch+1}")
                probs = torch.sigmoid(logits)
                pred = (probs > 0.5).int()
                acc = (torch.sum((pred.view(-1) == batch["labels"])).item() / batch["labels"].shape[0]) * 100
                print(f"{acc}% accuracy for batch {batch_idx+1} at {epoch+1}")

    #writer.close()

In [17]:
optimizer = torch.optim.AdamW(params=mlp.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()
train_model(model=mlp, dataloader=train_dataloader, loss_fn=loss_fn, optimizer=optimizer, epochs=10)

104it [00:02, 50.30it/s]

loss for 100 --> 0.6785638928413391 at epoch 1
56.25% accuracy for batch 100 at 1


206it [00:04, 50.68it/s]

loss for 200 --> 0.7051863670349121 at epoch 1
56.25% accuracy for batch 200 at 1


238it [00:04, 48.80it/s]
105it [00:02, 50.41it/s]

loss for 100 --> 0.44932687282562256 at epoch 2
84.375% accuracy for batch 100 at 2


206it [00:04, 49.99it/s]

loss for 200 --> 0.49375200271606445 at epoch 2
75.0% accuracy for batch 200 at 2


238it [00:04, 50.11it/s]
105it [00:02, 51.02it/s]

loss for 100 --> 0.36015939712524414 at epoch 3
90.625% accuracy for batch 100 at 3


207it [00:04, 50.23it/s]

loss for 200 --> 0.31558001041412354 at epoch 3
90.625% accuracy for batch 200 at 3


238it [00:04, 50.52it/s]
107it [00:02, 48.81it/s]

loss for 100 --> 0.30766162276268005 at epoch 4
90.625% accuracy for batch 100 at 4


204it [00:04, 48.97it/s]

loss for 200 --> 0.44296398758888245 at epoch 4
81.25% accuracy for batch 200 at 4


238it [00:04, 49.94it/s]
107it [00:02, 50.85it/s]

loss for 100 --> 0.2066626399755478 at epoch 5
96.875% accuracy for batch 100 at 5


208it [00:04, 50.32it/s]

loss for 200 --> 0.32338500022888184 at epoch 5
87.5% accuracy for batch 200 at 5


238it [00:04, 50.32it/s]
107it [00:02, 49.37it/s]

loss for 100 --> 0.1269143521785736 at epoch 6
96.875% accuracy for batch 100 at 6


209it [00:04, 50.44it/s]

loss for 200 --> 0.15329962968826294 at epoch 6
96.875% accuracy for batch 200 at 6


238it [00:04, 50.22it/s]
105it [00:02, 49.86it/s]

loss for 100 --> 0.11843366920948029 at epoch 7
96.875% accuracy for batch 100 at 7


210it [00:04, 50.42it/s]

loss for 200 --> 0.18423151969909668 at epoch 7
96.875% accuracy for batch 200 at 7


238it [00:04, 49.69it/s]
108it [00:02, 50.84it/s]

loss for 100 --> 0.04749920219182968 at epoch 8
100.0% accuracy for batch 100 at 8


209it [00:04, 50.98it/s]

loss for 200 --> 0.03166554495692253 at epoch 8
100.0% accuracy for batch 200 at 8


238it [00:04, 50.73it/s]
105it [00:02, 51.14it/s]

loss for 100 --> 0.02747242897748947 at epoch 9
100.0% accuracy for batch 100 at 9


207it [00:04, 50.65it/s]

loss for 200 --> 0.042046088725328445 at epoch 9
100.0% accuracy for batch 200 at 9


238it [00:04, 50.73it/s]
108it [00:02, 50.95it/s]

loss for 100 --> 0.0636216402053833 at epoch 10
96.875% accuracy for batch 100 at 10


206it [00:04, 50.29it/s]

loss for 200 --> 0.015554878860712051 at epoch 10
100.0% accuracy for batch 200 at 10


238it [00:04, 50.35it/s]


In [18]:
test_dataset = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
test_dataset.head(5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [19]:
test_dataset.drop(columns=["keyword", "location"], inplace=True)
test_dataset.head(5)

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [20]:
loaded_test_dataset = TweetsDatasetTest(dataset=test_dataset, tokenizer=bert_tokenizer, max_length=max_length)
test_dataloader = DataLoader(dataset=loaded_test_dataset, batch_size=32, generator=generator, collate_fn=loaded_test_dataset.collate_fn, shuffle=False)

In [21]:
mlp.eval()
with torch.inference_mode():
    predictions = []
    ids = []
    for batch in test_dataloader:
        ids += batch["ids"]
        batch_size = batch["input_ids"].shape[0]
        logits = mlp(batch["input_ids"])
        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).int()
        predictions.append(preds.view(-1))

    predictions = torch.cat([preds_tensor for preds_tensor in predictions], dim=0).tolist()
    print(len(predictions))
    print(len(ids))
    print(len(test_dataset))

    df = pd.DataFrame({
        "id": ids,
        "target": predictions
    })
    
df.to_csv("/kaggle/working/submission.csv", index=False)

3263
3263
3263
