In [None]:
import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

role = get_execution_role()
region = sagemaker_session.boto_session.region_name

In [41]:
# https://cs.stanford.edu/~zxie/textgen.pdf
# https://www.tensorflow.org/text/tutorials/transformer#set_up_the_tokenizer
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd

import torch

from torch.utils.data import Dataset, DataLoader
from datasets import Dataset, load_dataset


from transformers import GPT2LMHeadModel, \
                        TextDataset, \
                        DataCollatorForLanguageModeling, \
                        Trainer, \
                        TrainingArguments,\
                        GPT2Tokenizer,\
                        GPT2Config

from tokenizers import ByteLevelBPETokenizer

import boto3

if torch.cuda.is_available():
    device = torch.device('cuda')
    print("GPU!!!!!!!!!!!!!!!!")
else:
    device = torch.device('cpu')
    print("CPU :(")

GPU!!!!!!!!!!!!!!!!


In [42]:
#load the dataset
#NOTE THAT SINCE THIS CORPUS IS ONLY IN LOWERCASE, YOU NEED TO FEED THE DATA AS LOWERCASES OR YOU WILL NOT GET *ACCURATE* TRANSLATIONS
client = boto3.client('s3')
df = pd.read_csv("./Data/data.csv", delimiter = "\t", names=['turkish','english'])
df = df[:100000]


In [43]:
#verify that the dataset has been loaded

print(df.columns)
print(df.head())
print(df.shape)

Index(['turkish', 'english'], dtype='object')
                                             turkish  \
0  emekli üyeler kongre'nin şu sıralar çete savaş...   
1  entellektüellik , klas , asalet veya hikaye il...   
2  hangisi olduğunu tahmin edebildiniz mi ? şirke...   
3  pek uzak yerlere seyahat edemez veya belli bir...   
4                                 heyecanlanmıştım .   

                                             english  
0  retiring members nowadays say that it 's becom...  
1  no sophistication , no class , no dignity , no...  
2                     did you guess it ? companies .  
3  you ca n't travel very far or venture too far ...  
4                                    i was excited .  
(100000, 2)


In [44]:
%%time
#train the tokenizer

#check if the tokenizer has been trained
if os.path.exists("./tokenizer/vocab.json") and os.path.exists("./tokenizer/merges.txt"):
    print("Tokenizer already trained")
    tokenizer = ByteLevelBPETokenizer.from_file("./tokenizer/vocab.json", "./tokenizer/merges.txt")
else:
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.unk_token = "<unk>"
    tokenizer.pad_token = "<pad>"
    tokenizer.enable_padding(length=256, pad_token="<pad>", direction="right")
    tokenizer.enable_truncation(max_length=256)

    # append the turkish to the english and add the special tokens
    df['turkish'] = df['turkish'].apply(lambda x: x.lower())
    df['english'] = df['english'].apply(lambda x: x.lower())

    # Train the tokenizer on the dataset for english and turkish

    en_tr = df['turkish'].tolist() + df['english'].tolist()
    tokenizer.train_from_iterator(en_tr, vocab_size=52_000, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ], show_progress=True)
    #create the tokenizer directory
    os.makedirs("./tokenizer", exist_ok=True)
    #save the tokenizer
    tokenizer.save_model("./tokenizer")

print(tokenizer.encode("merhaba dünya"))

#save model 




Encoding(num_tokens=256, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
CPU times: user 1min 12s, sys: 14.4 s, total: 1min 27s
Wall time: 14.2 s


In [57]:
tokenizer = GPT2Tokenizer.from_pretrained("./tokenizer")

tokenizer.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
})

#verify that the tokenizer works with an example and make the special tokens are added to the example
print(tokenizer.encode("hello world"))
print(tokenizer.decode([35600]))
lengths = []

######################################################################
#TODO: ADD <NUM> TOKEN SO THAT THE MODEL CAN KEEP THE NUMBERS THE SAME
######################################################################

[35600, 773]
hello


In [47]:
# # Tokenize the dataset
# df = df[:100]
# def tokenize_text(text):
#     # add the eos token as well
#     text = text + "</s>"
#     # add the bos token as well
#     text = "<s>" + text
#     return tokenizer.encode_plus(text, 
#                             add_special_tokens=True, 
#                             max_length=256,
#                             padding="max_length",
#                             truncation=True,
#                             )

# Tokenize the dataset

# df['turkish_tokens'] = df['turkish'].apply(lambda x: tokenize_text(x))
# df['english_tokens'] = df['english'].apply(lambda x: tokenize_text(x))

In [48]:
# lengths = []

# for sample in df['turkish_tokens']:
#     try:
#         lengths.append(len(sample['input_ids']))
#     except:
#         print(sample)
#         break 
# plt.hist(lengths, np.linspace(0, 500, 101))
# plt.ylim(plt.ylim())
# max_length = max(lengths)
# plt.plot([max_length, max_length], plt.ylim())
# plt.title(f'Maximum tokens per example: {max_length}');

KeyError: 'turkish_tokens'

In [62]:
#load the dataset from the csv file
#dataset = load_dataset('csv', data_files="./Data/data.csv", delimiter = "\t", names=['turkish','english'], split='train[:1000]')

#create python dictionary with english to turkish from df
data = []
for index, row in df.iterrows():
    data.append({"input": row['english'], "target": row['turkish']})

tokenized_data = tokenizer(
    [example["input"] for example in data],
    [example["target"] for example in data],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",  # Return PyTorch tensors
)

print(tokenized_data.keys())
print(tokenized_data["input_ids"].shape)

class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data["input_ids"]
        self.attention_mask = tokenized_data["attention_mask"]
        self.labels = tokenized_data["target"]

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

    def __len__(self):
        return len(self.input_ids)

# Create the TranslationDataset instance
dataset = TranslationDataset(tokenized_data)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)

# dataset = Dataset.from_dict({"input_ids": english_tokens["input_ids"], "labels": turkish_tokens["input_ids"]})

dict_keys(['input_ids', 'attention_mask'])


KeyError: 'target'

In [51]:
#config for GPT2
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    unk_token_id=tokenizer.unk_token_id,
    mask_token_id=tokenizer.mask_token_id,
    # n_positions=256,
    # n_ctx=256,
    # n_embd=768,
    # n_layer=12,
    # n_head=12,
    # n_inner=3072,
    # activation_function="gelu",
    # resid_pdrop=0.1,
    # embd_pdrop=0.1,
    # attn_pdrop=0.1,
)

#initialize the model
model = GPT2LMHeadModel(config)
print(model.num_parameters())
model.save_pretrained("./model")

125778432


In [52]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [53]:
training_args = TrainingArguments(
    output_dir="./gpt2", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=1, # number of training epochs
    per_device_train_batch_size=8, # batch size for training
    per_device_eval_batch_size=16,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=10_000, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=data_loader,
)

In [54]:
%%time
trainer.train()

for batch in train_dataloader:
    inputs, targets = batch

trainer.save_model("./gpt2")



  0%|          | 0/1 [00:00<?, ?it/s]

ValueError: type of input_ids unknown: <class 'str'>. Should be one of a python, numpy, pytorch or tensorflow object.

In [None]:
#translate a sentence
input_ids = tokenizer.encode("hello world", return_tensors="pt")
input_ids = input_ids.to("cuda")
output = model.generate(input_ids, max_length=256, num_beams=5, early_stopping=True)
print(tokenizer.decode(output[0]))

tensor([[  76,  294,  358, 1443]], device='cuda:0')
hello world
tensor([[  76,  294,  358, 1443,  266,    2,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1],
        [  76,  294,  358, 1443,  329,  266,    2,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1],
        [  76,  294,  358, 1443,  266,    2,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1],
        [  76,  294,  358, 1443,  268,  380,  303,   88,  327,   84,   84,  277,
          268,  337,  265,  275,  303,   81,  277,  317,  272,  393,  338,   73,
          310,  393,  299,  291,   81,   8