In [5]:
from datasets import load_dataset
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, TensorDataset
torch.cuda.empty_cache()

In [2]:
# Load the TweetEval dataset
tweet_eval_dataset = load_dataset('tweet_eval',"emoji")

# Accessing different splits
train_dataset = tweet_eval_dataset['train']
test_dataset = tweet_eval_dataset['test']
validation_dataset = tweet_eval_dataset['validation']

In [3]:
import pandas as pd
import re

# Concatenate datasets together
all_data = pd.concat([train_dataset.to_pandas(), validation_dataset.to_pandas(), test_dataset.to_pandas()])

# Drop all labels
all_data = all_data.drop(columns=['label'])
all_data['text'] = all_data['text']

def clean_text(text):
    # 1. Remove all characters except punctuation and English characters
    text = re.sub(r'[^a-zA-Z\s.,!?]', '', text)
    # 2. Remove all space at the beginning of the sentence
    text = text.lstrip()
    # 3. Remove all extra space
    text = re.sub(r'\s+', ' ', text)
    return text

all_data = all_data.head(2000)
texts = all_data['text'].tolist()


texts = [clean_text(text) for text in texts]

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [7]:
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-small')


# Tokenization
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
input_ids = inputs['input_ids']

# DataLoader
batch_size = 128
dataset = TensorDataset(input_ids)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Training
epochs = 500
model.train()
for epoch in range(epochs):
    total_loss = 0
    for i, batch in enumerate(loader):
        optimizer.zero_grad()
        batch_input_ids = batch[0].to(device)
        outputs = model(input_ids=batch_input_ids, labels=batch_input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(loader)
    print(f'Epoch: {epoch + 1}, Loss: {avg_loss}')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch: 1, Loss: 2.4507443755865097
Epoch: 2, Loss: 2.172430247068405
Epoch: 3, Loss: 1.9878387451171875
Epoch: 4, Loss: 1.8234905824065208
Epoch: 5, Loss: 1.6832695826888084
Epoch: 6, Loss: 1.5397050827741623
Epoch: 7, Loss: 1.3902331590652466
Epoch: 8, Loss: 1.2563733607530594
Epoch: 9, Loss: 1.176120601594448
Epoch: 10, Loss: 1.0549101531505585
Epoch: 11, Loss: 0.9549500122666359
Epoch: 12, Loss: 0.8542642258107662
Epoch: 13, Loss: 0.7640678025782108
Epoch: 14, Loss: 0.6828804537653923
Epoch: 15, Loss: 0.6154415309429169
Epoch: 16, Loss: 0.5691859051585197
Epoch: 17, Loss: 0.5383298471570015
Epoch: 18, Loss: 0.506182448938489
Epoch: 19, Loss: 0.4697606787085533
Epoch: 20, Loss: 0.4358366262167692
Epoch: 21, Loss: 0.40659354254603386
Epoch: 22, Loss: 0.3912338316440582
Epoch: 23, Loss: 0.36151413433253765
Epoch: 24, Loss: 0.3417316656559706
Epoch: 25, Loss: 0.30991453118622303
Epoch: 26, Loss: 0.2843216471374035
Epoch: 27, Loss: 0.2661282438784838
Epoch: 28, Loss: 0.2521718069911003
E

In [8]:
# save model
model.save_pretrained('t5-small-emoji')

# Save the encoder
torch.save(model.encoder.state_dict(), 'encoder_model.pth')

# Save the decoder
torch.save(model.decoder.state_dict(), 'decoder_model.pth')

In [14]:
# LOAD MODEL
# Load the encoder
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.encoder.load_state_dict(torch.load('encoder_model.pth'))
model.decoder.load_state_dict(torch.load('decoder_model.pth'))

model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [16]:
model.eval()

# Choose a text from your dataset
text = texts[0]

# Tokenize the text and obtain output from model
input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

output = model.generate(input_ids, max_length = input_ids.shape[1])

# Decode the generated text
decoded_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(text)
print("---------------------")
print(decoded_text)

Sunday afternoon walking through Venice in the sun with user Abbot Kinney, Venice
---------------------
Sonntag Nachmittag Spa Spa Spa Spa Spa Spa Spa Spa Spa Spa Spa Spa Spa Spa


In [1]:
text = texts[2]
encoder_inputs_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
decoder_inputs_ids = encoder_inputs_ids

# Get the encoder's last hidden state (latent vector)
encoder_outputs = model.encoder(input_ids=encoder_inputs_ids)
latent_vector = encoder_outputs.last_hidden_state

# play around
random_tensor = torch.randn(1, 10, 512).to(device)
encoder_outputs.last_hidden_state = random_tensor

# decoder_outputs = model.decoder(input_ids=decoder_inputs_ids, encoder_hidden_states=latent_vector)

outputs = model(decoder_input_ids = encoder_inputs_ids, encoder_outputs=encoder_outputs)
logits = outputs.logits
predicted_token_ids = torch.argmax(logits, dim=-1)

# And use the tokenizer to convert these token IDs back into text
predicted_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)

print(predicted_text)

NameError: name 'texts' is not defined

In [22]:
encoder_outputs.last_hidden_state.shape

torch.Size([1, 20, 512])