In [20]:
# https://cs.stanford.edu/~zxie/textgen.pdf
# https://www.tensorflow.org/text/tutorials/transformer#set_up_the_tokenizer
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd

import torch

from torch.utils.data import Dataset, DataLoader
from datasets import Dataset, load_dataset


from transformers import GPT2LMHeadModel, \
                        TextDataset, \
                        DataCollatorForLanguageModeling, \
                        Trainer, \
                        TrainingArguments,\
                        GPT2Tokenizer,\
                        GPT2Config,\
                        BertTokenizerFast,\
                        BertModel
                        
            

from tokenizers import BertWordPieceTokenizer

import boto3

# Pick the compute device once: CUDA when a GPU is visible, CPU otherwise.
# (The GPU branch used to announce a GPU but still selected the CPU.)
if torch.cuda.is_available():
    device = torch.device('cuda')
    print("GPU!!!!!!!!!!!!!!!!")
else:
    device = torch.device('cpu')
    print("CPU :(")

GPU!!!!!!!!!!!!!!!!


In [21]:
# Load the tab-separated Turkish/English sentence pairs (the file has no header
# row, so column names are supplied here).
# Only the first N_ROWS pairs are used; `nrows` stops parsing there instead of
# reading the whole file and slicing the result afterwards.
N_ROWS = 5
df = pd.read_csv("./data.csv", delimiter="\t", names=['turkish', 'english'], nrows=N_ROWS)

In [22]:
# Sanity-check the load: column labels, a preview of the first rows, then the
# (rows, columns) shape — printed in that order.
for summary in (df.columns, df.head(), df.shape):
    print(summary)

Index(['turkish', 'english'], dtype='object')
                                             turkish  \
0  emekli üyeler kongre'nin şu sıralar çete savaş...   
1  entellektüellik , klas , asalet veya hikaye il...   
2  hangisi olduğunu tahmin edebildiniz mi ? şirke...   
3  pek uzak yerlere seyahat edemez veya belli bir...   
4                                 heyecanlanmıştım .   

                                             english  
0  retiring members nowadays say that it 's becom...  
1  no sophistication , no class , no dignity , no...  
2                     did you guess it ? companies .  
3  you ca n't travel very far or venture too far ...  
4                                    i was excited .  
(5, 2)


In [23]:
# Load the pretrained multilingual (cased) BERT tokenizer.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

# Smoke test: encode one sentence, special tokens included. This happens before
# the special-token settings below are touched.
sentence = "Hello, how are you?"
tokens = tokenizer.encode(sentence, add_special_tokens=True)

# Use '[PAD]' as the end-of-sequence token, and pad with that same token.
tokenizer.eos_token = '[PAD]'
tokenizer.pad_token = tokenizer.eos_token

# Show the raw ids, then each id decoded back to text, one per line.
print(tokens)
for token in tokens:
    print(f"{tokenizer.decode(token)} |")

# Persist the tokenizer files locally.
# NOTE(review): the directory is named "bert-base-uncased" although the
# checkpoint loaded above is the multilingual *cased* one — confirm the path is
# intentional before anything else relies on it.
tokenizer.save_pretrained('./bert-base-uncased')

[101, 31178, 117, 14796, 10301, 13028, 136, 102]
[CLS] |
Hello |
, |
how |
are |
you |
? |
[SEP] |


('./bert-base-uncased/tokenizer_config.json',
 './bert-base-uncased/special_tokens_map.json',
 './bert-base-uncased/vocab.txt',
 './bert-base-uncased/added_tokens.json',
 './bert-base-uncased/tokenizer.json')

1024


In [25]:
# Tokenize both language columns with identical settings.
def _encode_fixed_length(text):
    """Encode one sentence with the notebook's `tokenizer`.

    Special tokens are added, the sequence is truncated/padded to exactly 768
    positions, and the result is returned with PyTorch tensors.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        padding='max_length',
        max_length=768,
        truncation=True,
        return_tensors='pt',
    )


df['english_tokens'] = df['english'].apply(_encode_fixed_length)
df['turkish_tokens'] = df['turkish'].apply(_encode_fixed_length)

# Inspect the first English encoding.
print(df['english_tokens'].values[0])

{'input_ids': tensor([[   101,  75933,  12464,  11858, 101615,  10107,  23763,  10189,  10271,
            112,    187,  13461,  11850,  16330,  74393,    119,    102,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              

In [26]:
# Pretrained multilingual BERT. `encoder` is a direct handle on the model's
# `.encoder` submodule, so calling it does not go through `bert_model` itself.
bert_model = BertModel.from_pretrained(
    "bert-base-multilingual-cased",
)
encoder = bert_model.encoder

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [61]:
#encodes the tokens
#TODO: fix the attention mask (it is not passed to the encoder yet)
def _run_encoder(tokens):
    """Pass one tokenized sentence through `encoder` and return its first output.

    ``tokens['input_ids']`` is cast to float32 and given a leading axis before
    the call, exactly as the cell did before. The forward pass runs under
    ``torch.no_grad()`` so the tensors stored in the DataFrame do not each keep
    an autograd graph through BERT alive; the only optimizer built in this
    notebook is over the GPT-2 parameters, so those gradients were never used.

    NOTE(review): the encoder receives raw token ids cast to float rather than
    embedded tokens, so a row of 768 ids is presumably being treated as a single
    768-wide hidden vector. Confirm whether
    ``bert_model(input_ids=..., attention_mask=...)`` was intended instead.
    """
    encoder_input = tokens['input_ids'].to(torch.float32).unsqueeze(0)
    with torch.no_grad():
        return encoder(encoder_input)[0]


df['english_encoded'] = df['english_tokens'].apply(lambda x: _run_encoder(x).to(torch.float32))
# NOTE(review): `.long()` truncates the floating-point encoder output to
# integers; if these are meant to be label ids, the tokenizer's `input_ids` look
# like the intended source — confirm before training on them.
df['turkish_encoded'] = df['turkish_tokens'].apply(lambda x: _run_encoder(x).long())

#add the attention masks (each gets an extra leading axis)
df['attention_masks'] = df['english_tokens'].apply(lambda x: x['attention_mask'].unsqueeze(0))
df['decoder_attention_masks'] = df['turkish_tokens'].apply(lambda x: x['attention_mask'].unsqueeze(0))

In [62]:
# Pull each prepared column out of the frame as a plain Python list, one entry
# per row: English side first, then the Turkish side, then the two masks.
input_ids = df['english_encoded'].tolist()
target_ids = df['turkish_encoded'].tolist()
attention_masks = df['attention_masks'].tolist()
decoder_attention_masks = df['decoder_attention_masks'].tolist()

# Dataset wrapper around the four parallel lists built for training.
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over four index-aligned sequences.

    Mind the ordering: the constructor takes
    ``(input_ids, attention_masks, decoder_attention_masks, target_ids)``,
    whereas indexing yields
    ``(input_ids, attention_masks, target_ids, decoder_attention_masks)``.
    """

    def __init__(self, input_ids, attention_masks, decoder_attention_masks, target_ids):
        """Keep references to the four sequences; nothing is copied."""
        self.input_ids, self.target_ids = input_ids, target_ids
        self.attention_masks, self.decoder_attention_masks = (
            attention_masks,
            decoder_attention_masks,
        )

    def __len__(self):
        """Number of examples, taken from the length of ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return entry ``idx`` of every sequence as a 4-tuple."""
        return (
            self.input_ids[idx],
            self.attention_masks[idx],
            self.target_ids[idx],
            self.decoder_attention_masks[idx],
        )

# Build the dataset from the four lists prepared above. Keyword arguments are
# used because the constructor's parameter order differs from the order in which
# items are returned, which makes positional calls easy to get wrong.
# The targets are now the Turkish-side `target_ids`; previously `input_ids` was
# passed a second time, so `target_ids` was built but never used.
dataset = CustomDataset(
    input_ids=input_ids,
    attention_masks=attention_masks,
    decoder_attention_masks=decoder_attention_masks,
    target_ids=target_ids,
)
print(dataset[0])
# One example per batch, reshuffled on every pass.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

(tensor([[[-1.0371e+00,  3.9515e-01, -6.5867e-01, -7.2396e-02,  1.6084e+00,
          -5.9844e-01, -7.1354e-01, -2.3577e-02,  7.6560e-01,  7.0373e-01,
          -5.9151e-01,  5.9707e-01,  3.3662e-01,  2.8603e-01,  2.6011e-01,
          -1.7486e-01, -5.4137e-01, -4.3599e-02,  8.5187e-01,  1.3756e+00,
           1.8264e+00,  4.1690e-01, -4.3771e-01, -1.1941e-01, -6.6531e-01,
          -4.7859e-02, -2.0685e-01, -1.0543e+00, -6.8715e-01,  1.4992e-01,
           2.4614e-01,  3.4564e-01, -3.0112e-01, -4.2113e-02, -2.6306e-02,
           8.2188e-01,  1.9872e-01,  7.0809e-02,  5.1735e-01,  1.7551e-01,
           4.2995e-01,  5.8777e-01,  9.4538e-01,  2.2546e-01, -5.3248e-01,
          -1.1268e+00,  3.6727e-01, -8.3205e-01, -1.7676e-01, -1.7672e-01,
           2.2317e-03, -2.4613e-01,  7.3480e-01, -5.2762e-01,  3.1814e-01,
           4.5334e-01,  1.5647e-02, -3.5377e-01,  4.6430e-01,  4.4255e-01,
          -4.8431e-01, -1.2384e-01, -5.5225e-01, -3.6179e-01, -3.3160e-01,
          -4.0192e-01,  

In [63]:
# Language-modelling collator built around the notebook's tokenizer, with
# masked-LM mode switched off.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [65]:
# Pretrained GPT-2 with its language-modelling head.
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

# Print three config fields, in order: n_positions, vocab_size, n_embd.
# NOTE(review): the BERT tokenizer used earlier in this notebook emits ids such
# as 101615, which is above the vocab_size printed here (50257). Labels taken
# from that tokenizer would be out of range for this head unless the embeddings
# are resized or a GPT-2 tokenizer is used for targets — confirm the plan.
for field in ("n_positions", "vocab_size", "n_embd"):
    print(getattr(gpt2_model.config, field))

1024
50257
768


In [51]:
# Hugging Face training configuration.
training_args = TrainingArguments(
    # Output location; existing contents may be overwritten.
    overwrite_output_dir=True,
    output_dir="./output",
    # A single epoch with a per-device batch size of one.
    per_device_train_batch_size=1,
    num_train_epochs=1,
    # Checkpointing: save interval of 1000 steps, total limit of 3.
    save_total_limit=3,
    save_steps=1000,
)


In [68]:
def train(model, dataloader, optimizer, device):
    """Run one pass over `dataloader`, taking one optimizer step per batch.

    Args:
        model: module called as ``model(inputs_embeds=..., labels=...)`` whose
            output exposes ``.loss``. It is moved to `device` and put in
            training mode.
        dataloader: iterable of indexable batches. The element at position 1 of
            each batch is used as the labels.
        optimizer: optimizer over the model's parameters.
        device: device on which the model, the labels and the placeholder
            inputs are placed.

    Returns:
        None. The loss of every batch is printed.

    NOTE(review): `CustomDataset.__getitem__` in this notebook yields
    ``(input_ids, attention_masks, target_ids, decoder_attention_masks)``, so
    with that dataset position 1 is an attention mask, not the targets. The
    layout read here is left unchanged so existing callers keep working —
    reconcile the two before trusting the loss.
    NOTE(review): the inputs are random placeholder embeddings; the encoded
    source sentences in the batch are not consumed yet.
    """
    # The model has to live on the same device as the tensors fed to it.
    model.to(device)
    model.train()

    # Width of the placeholder embeddings: the model's own embedding size when
    # its config advertises one, otherwise the previously hard-coded 768.
    embed_dim = getattr(getattr(model, "config", None), "n_embd", 768)

    for batch in dataloader:
        target_ids = batch[1].to(device)

        optimizer.zero_grad()

        # Placeholder inputs shaped (batch, sequence, embed_dim), sized from the
        # labels instead of a fixed (1, 768, 768) and created directly on
        # `device` rather than always on the CPU.
        input_embeddings = torch.randn(
            target_ids.shape[0], target_ids.shape[-1], embed_dim, device=device
        )

        # Forward pass; the model computes the loss from the labels itself.
        outputs = model(inputs_embeds=input_embeddings, labels=target_ids)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Print training information
        print('Batch Loss:', loss.item())
    
# Train the GPT-2 model on the CPU with Adam (learning rate 1e-3).
device = torch.device("cpu")
model = gpt2_model
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
train(model, dataloader, optimizer, device)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
