In [1]:
# Notebook display helpers, imported from the public `IPython.display` module.
# `display` used to be imported from the private `IPython.core.display` path,
# which has long been deprecated and is not available in recent IPython releases,
# so that import could break this first cell on a fresh environment.
from IPython.display import HTML, IFrame, display

# Widen the notebook page to 95% of the browser window.
# NOTE(review): the `.container` selector targets the classic Notebook layout and
# probably has no effect in JupyterLab / Notebook 7 - confirm the intended front end.
display(HTML("<style>.container { width:95% !important; }</style>"))

# CoronaTweetsDataset usage

Examples of how to use the `CoronaTweetsDataset` class.

## Imports

In [2]:
import sys
from pathlib import Path

# Make the "src" folder importable so the project's Dataset class can be loaded.
# The folder is resolved relative to the parent of the current working directory,
# so the notebook is expected to run from a directory that sits next to "src".
src_path = str(Path.cwd().parents[0].joinpath("src"))

# Only register the folder when it is missing, so re-running this cell
# never adds a duplicate entry to the import search path.
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

In [3]:
import pandas as pd
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from models.corona_tweet_dataset import CoronaTweetsDataset

### Instantiate a CoronaTweetsDataset object

We need a pandas DataFrame containing the dataset, a Hugging Face BERT tokenizer, and an integer giving the maximum sequence length.

In [4]:
# Load the processed CSV (presumably the training split, judging by the file name)
# into a DataFrame. The path is relative, so it depends on the working directory.
df = pd.read_csv('../data/processed/df_train.csv', sep=',')
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Maximum sequence length handed to the dataset; the sample output further down
# shows token ids padded with zeros up to this length.
# NOTE(review): whether longer texts are truncated to this length is decided
# inside CoronaTweetsDataset - confirm there.
MAX_SEQ_LEN = 256  # tunable; 256 is an arbitrary choice here

In [5]:
# Build the dataset from the three inputs prepared above, passed positionally:
# the DataFrame, the tokenizer, and the maximum sequence length.
train_dataset = CoronaTweetsDataset(df, tokenizer, MAX_SEQ_LEN)

We can now get a sample directly by indexing the dataset.

In [6]:
# Index the dataset to inspect its first sample (shown as the cell's output).
# NOTE(review): the displayed `input_ids` tensor is wrapped in an extra pair of
# brackets, i.e. it has a leading dimension of size 1 - presumably the tokenizer
# output is returned without squeezing; confirm in CoronaTweetsDataset.
train_dataset[0]

{'input_ids': tensor([[  101,  5254,  5254,  5254, 24471,  2140,   102,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

We get a dictionary with `input_ids`, `attention_mask`, `token_type_ids`, and `labels` tensors. Note that the token ids are padded with zeros in order to reach `MAX_SEQ_LEN`.

## Dataloader

During the training loop we can use a `DataLoader` to manage batches of samples.

In [7]:
# Number of samples the loader groups into every batch.
BATCH_SIZE = 32

# Batch the training dataset. shuffle=True randomises the sample order, and
# num_workers=0 keeps data loading in the main process (no worker subprocesses).
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

We can then iterate over the `DataLoader` to access our data in batches:

In [10]:
# Preview the structure of a single batch: print the first one the loader yields
# and stop immediately, so the whole dataset is not iterated.
# NOTE(review): in the printed output every sample keeps an extra singleton
# dimension (e.g. labels appear as [[[0, 1, 0]], ...]) - confirm the model
# expects that shape or squeeze it before training.
for batch in train_dataloader:
    print(batch)
    break

{'input_ids': tensor([[[ 101, 3342, 2122,  ...,    0,    0,    0]],

        [[ 101, 2866, 1055,  ...,    0,    0,    0]],

        [[ 101, 2034, 5095,  ...,    0,    0,    0]],

        ...,

        [[ 101, 4067, 2017,  ...,    0,    0,    0]],

        [[ 101, 1037, 2158,  ...,    0,    0,    0]],

        [[ 101, 1996, 2878,  ...,    0,    0,    0]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        ...,

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]]]), 'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]],

        ...,

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]]]), 'labels': tensor([[[0, 1, 0]],

        [[0, 0, 1]],

        [[0, 0, 1]],

        [[0, 1, 0]],

        [[1, 0, 0]],

        [[1