In [1]:
import numpy as np
import pandas as pd
import torch
import transformers

In [2]:
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
pd.set_option('display.max_colwidth', 4000)
DEBUG = False # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [3]:
tok = T5Tokenizer.from_pretrained('t5-base') # large = out of memory on optim.step()
mdl = T5ForConditionalGeneration.from_pretrained('t5-base').to('cuda')

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-base and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
train_df = pd.read_csv('tweets_data/disaster_response_messages_training.csv')[['message', 'related']]
if DEBUG:
    train_df = train_df.sample(frac=0.1)
valid_df = pd.read_csv('tweets_data/disaster_response_messages_validation.csv')[['message', 'related']]
if DEBUG:
    valid_df = valid_df.sample(frac=0.1)
test_df = pd.read_csv('tweets_data/disaster_response_messages_test.csv')[['message', 'related']]
if DEBUG:
    test_df = test_df.sample(frac=0.1)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
train_df['related'].value_counts()

1    15795
0     5083
2      168
Name: related, dtype: int64

In [6]:
#valid_df[valid_df['related']==1]

In [7]:
prefix = 'is_disaster: '
suffix = ' </s>'

train_df['message'] = train_df['message'].apply(lambda x: prefix + x + suffix)
valid_df['message'] = valid_df['message'].apply(lambda x: prefix + x + suffix)
test_df['message'] = test_df['message'].apply(lambda x: prefix + x + suffix)

In [8]:
train_ls = train_df['message'].tolist()
train_vals = train_df['related'].apply(lambda x: str(x) + suffix).tolist()
valid_ls = valid_df['message'].tolist()
valid_vals = valid_df['related'].apply(lambda x: str(x) + suffix).tolist()
test_ls = test_df['message'].tolist()
test_vals = test_df['related'].apply(lambda x: str(x) + suffix).tolist()

In [9]:
class TweetsDataset(Dataset):
    
    def __init__(self, input_ids, attention_mask, decoder_ids, decoder_attention_mask):
        assert input_ids.shape == attention_mask.shape
        assert decoder_ids.shape == decoder_attention_mask.shape
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.decoder_ids = decoder_ids
        self.decoder_attention_mask = decoder_attention_mask
    
    def __getitem__(self, index):
        return (self.input_ids[index], self.attention_mask[index], self.decoder_ids[index], self.decoder_attention_mask[index])
    
    def __len__(self):
        return len(self.input_ids)

In [10]:
train_dict = tok(train_ls, max_length=512, truncation=True, padding=True, return_tensors='pt')
train_label_dict = tok(train_vals, return_tensors='pt', padding=True, truncation=True)

train_input_ids = train_dict['input_ids']
train_attention_mask = train_dict['attention_mask']
train_decoder_ids = train_label_dict['input_ids']
train_decoder_attention_mask = train_label_dict['attention_mask']

train_dataset = TweetsDataset(train_input_ids, train_attention_mask, train_decoder_ids, train_decoder_attention_mask)
train_dl = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [11]:
%%time

optim = torch.optim.Adam(mdl.parameters())

for epoch in [1]:
    for input_ids, attention_mask, decoder_ids, decoder_attention_mask in train_dl:
        #print(input_ids.shape, attention_mask.shape)
        optim.zero_grad()
        # delete attention mask? attention_mask=attention_mask.to('cuda'), 
        res = mdl(input_ids.to('cuda'), labels=decoder_ids.to('cuda')) #decoder_input_ids=decoder_ids.to('cuda'), decoder_attention_mask=decoder_attention_mask.to('cuda'))
        loss = res[0]
        loss.backward()
        optim.step()
        #break;
    print(f'End of epoch {epoch}')

End of epoch 1
CPU times: user 58min 31s, sys: 4min 59s, total: 1h 3min 31s
Wall time: 22min 21s


In [12]:
#train_df['message'].apply(lambda x: len(x)).describe()

In [13]:
valid_dict = tok(valid_ls, max_length=512, truncation=True, padding=True, return_tensors='pt')
valid_label_dict = tok(valid_vals, return_tensors='pt', padding=True, truncation=True)

valid_input_ids = valid_dict['input_ids']
valid_attention_mask = valid_dict['attention_mask']
valid_decoder_ids = valid_label_dict['input_ids']
valid_decoder_attention_mask = valid_label_dict['attention_mask']

valid_dataset = TweetsDataset(valid_input_ids, valid_attention_mask, valid_decoder_ids, valid_decoder_attention_mask)
valid_dl = DataLoader(valid_dataset, batch_size=8, shuffle=True)

In [21]:
mdl.eval()

y = []
y_pred = []

with torch.no_grad():
    for input_ids, attention_mask, correct_labels, _ in valid_dl:
        res = mdl.generate(input_ids=input_ids.to('cuda'), attention_mask=attention_mask.to('cuda'))
        for elem, lbl in zip(res, correct_labels):
            y_pred.append(int(tok.decode(elem)))
            y.append(int(tok.decode(lbl)))
        print(res.shape)

torch.Size([8, 3])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 3])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 3])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 3])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 3])
torch.Size([8, 4])
torch.Size([8, 3])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 3])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 3])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 4])
torch.Size([8, 3])
torch.Size([

In [15]:
tok.decode(res[0])

'0'

In [18]:
#mdl.forward?
#torch.save(mdl.state_dict(), 'models/mdl_first_try.pkl--')

In [25]:
y = np.array(y)
y_pred = np.array(y_pred)

In [28]:
pd.DataFrame(y==y_pred).value_counts()

True     2137
False     436
dtype: int64