<a href="https://www.kaggle.com/code/evelynartoria/nlp-pytorch-custom-dataset-class-disaster-tweets?scriptVersionId=188253046" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


### This is a custom dataset class, for use with a DataLoader, for the disaster tweets competition

In [2]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer

In [3]:
# Read the competition's training CSV into a DataFrame and preview its
# first rows (``head()`` shows 5 rows by default).
train_dataset = pd.read_csv(
    filepath_or_buffer="/kaggle/input/nlp-getting-started/train.csv"
)
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Drop the columns that the dataset class below does not read.
# Reassigning (instead of ``inplace=True``) together with ``errors="ignore"``
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
train_dataset = train_dataset.drop(columns=["keyword", "location"], errors="ignore")

In [5]:
# Load the tokenizer that belongs to the "bert-base-cased" checkpoint
# (the files are downloaded on first use, see the progress bars below).
bert_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-cased"
)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [6]:
class TweetsDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Items are tokenized on access and returned *unpadded* as 1-D tensors;
    pass :meth:`collate_fn` to the ``DataLoader`` so each batch is padded to
    the length of its longest item.

    Args:
        dataset: Frame with a ``"text"`` column and, for labelled data, a
            ``"target"`` column. When ``"target"`` is absent (e.g. an
            unlabelled test split) the returned items simply have no
            ``"label"`` entry.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=False, truncation=True,
            max_length=..., return_tensors="pt")`` that returns a mapping
            with ``"input_ids"``, ``"token_type_ids"`` and
            ``"attention_mask"`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length`` for
            truncation.
    """

    def __init__(self, dataset: pd.DataFrame, tokenizer: object, max_length: int):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx``.

        Returns:
            dict with 1-D tensors ``"input_ids"``, ``"type_ids"`` and
            ``"attention_mask"``, plus a scalar ``torch.long`` ``"label"``
            when the frame has a ``"target"`` column.
        """
        text = self.dataset["text"].iloc[idx]
        # Only a single text is tokenized here, so "pad to the longest in the
        # batch" would be a no-op; padding is done per batch in collate_fn.
        encoded = self.tokenizer(
            text,
            padding=False,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a leading batch dimension; flatten it away.
        item = {
            "input_ids": encoded["input_ids"].flatten(),
            "type_ids": encoded["token_type_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
        }

        # Labels are optional so the same class can wrap an unlabelled split.
        if "target" in self.dataset.columns:
            label = self.dataset["target"].iloc[idx]
            item["label"] = torch.tensor(label, dtype=torch.long)

        return item

    @staticmethod
    def collate_fn(batch):
        """Pad a list of items to a common length and stack them.

        Args:
            batch: List of dicts as produced by ``__getitem__``.

        Returns:
            dict with ``(batch, longest_item)`` tensors ``"input_ids"``,
            ``"type_ids"`` and ``"attention_mask"``; ``"label"`` (shape
            ``(batch,)``) is included when the items carry labels.
        """
        # All three sequences are right-padded with 0. For input_ids this
        # presumably has to match the tokenizer's pad token id (it does for
        # BERT vocabularies) -- verify before using a different tokenizer.
        collated = {
            key: pad_sequence(
                [item[key] for item in batch],
                batch_first=True,
                padding_value=0,
            )
            for key in ("input_ids", "type_ids", "attention_mask")
        }

        # Items from an unlabelled frame have no "label" entry.
        if "label" in batch[0]:
            collated["label"] = torch.stack([item["label"] for item in batch])

        return collated

In [7]:
# Wrap the training frame in the custom dataset (512 is forwarded to the
# tokenizer as max_length), then display one sample by position as a check.
loaded_dataset = TweetsDataset(train_dataset, bert_tokenizer, max_length=512)
loaded_dataset[1956]

{'input_ids': tensor([  101,   137,   161,  3048,  2249, 17540,  1284,  1444,  1292,  3546,
          1107,  1103,   185,  7409, 19814,  1219,  1103, 15202,  2955,  1122,
          1156,  1494,   102]),
 'type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'label': tensor(1)}

In [8]:
# Batch the dataset with the class's padding collate function.
# The shuffling generator is seeded so the batch order (and therefore the
# batch displayed below) is the same on every Restart & Run All.
train_dataloader = DataLoader(
    dataset=loaded_dataset,
    batch_size=2,
    shuffle=True,
    generator=torch.Generator(device="cpu").manual_seed(42),
    collate_fn=loaded_dataset.collate_fn,
)
next(iter(train_dataloader))

{'input_ids': tensor([[  101,   137,   161,  3048,  2249, 17540,  1284,  1444,  1292,  3546,
           1107,  1103,   185,  7409, 19814,  1219,  1103, 15202,  2955,  1122,
           1156,  1494,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0],
         [  101,  7037, 17818,  4231,  1106,   140,  8031,  1186,  5431,  2197,
           8413,   131,   120,   120,   189,   119,  1884,   120,   142,  1527,
           2137,  1964,  2240,  1580,  1964,  2591,  3190,  5301,  8413,   131,
            120,   120,   189,   119,  1884,   120,   147,  1183,  1658, 20201,
           1604,  1182,  3663,  1324,  1403,   102]]),
 'type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0