In [3]:
import os
import sagemaker
from sagemaker import get_execution_role

# A single SageMaker session is created here and reused for the lookups below.
sagemaker_session = sagemaker.Session()

# Pass the existing session to get_execution_role so the SDK does not have to
# construct a second Session internally just to resolve the role.
role = get_execution_role(sagemaker_session=sagemaker_session)
# Region is read from the boto session that backs the SageMaker session.
region = sagemaker_session.boto_session.region_name

In [1]:
# https://cs.stanford.edu/~zxie/textgen.pdf
# https://www.tensorflow.org/text/tutorials/transformer#set_up_the_tokenizer
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import torch

from torch.utils.data import Dataset, DataLoader
from datasets import Dataset


from transformers import GPT2LMHeadModel, \
                        TextDataset, \
                        DataCollatorForLanguageModeling, \
                        Trainer, \
                        TrainingArguments,\
                        GPT2Tokenizer,\
                        GPT2Config

from tokenizers import ByteLevelBPETokenizer

import boto3

# Pick the compute device once: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Announce which device was selected.
print("GPU!!!!!!!!!!!!!!!!" if device.type == 'cuda' else "CPU :(")

2023-06-15 20:54:48.064920: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


GPU!!!!!!!!!!!!!!!!


In [2]:
# Load the parallel Turkish/English corpus.
# NOTE: the corpus is lowercase only, so any text fed to the model must be
# lowercased as well or the translations will not be accurate.
client = boto3.client('s3')  # NOTE(review): not referenced by any visible cell - confirm it is still needed

# Tab-separated file; the two columns are named explicitly here.
# nrows=1000 stops the parser after the first 1000 rows, so `df` is a
# standalone frame instead of a slice of the full corpus. Later column
# assignments on it therefore do not trigger SettingWithCopyWarning, and the
# rest of the file is never parsed.
df = pd.read_csv("./Data/data.csv", delimiter="\t", names=['turkish', 'english'], nrows=1000)

In [3]:
# Sanity check of the loaded frame: column names, the first rows, then (rows, columns).
# One print call with a newline separator emits the same three outputs in order.
print(df.columns, df.head(), df.shape, sep="\n")

Index(['turkish', 'english'], dtype='object')
                                             turkish  \
0  emekli üyeler kongre'nin şu sıralar çete savaş...   
1  entellektüellik , klas , asalet veya hikaye il...   
2  hangisi olduğunu tahmin edebildiniz mi ? şirke...   
3  pek uzak yerlere seyahat edemez veya belli bir...   
4                                 heyecanlanmıştım .   

                                             english  
0  retiring members nowadays say that it 's becom...  
1  no sophistication , no class , no dignity , no...  
2                     did you guess it ? companies .  
3  you ca n't travel very far or venture too far ...  
4                                    i was excited .  
(1000, 2)


In [4]:
%%time
# Train a byte-level BPE vocabulary on the corpus, then reload it as a GPT-2 tokenizer.

# Lowercase both language columns before training.
df['turkish'] = df['turkish'].str.lower()
df['english'] = df['english'].str.lower()

# One shared vocabulary is learned over both languages: the training list is
# the Turkish sentences followed by the English sentences.
en_tr = df['turkish'].tolist() + df['english'].tolist()

# This object is only used to learn and save the vocabulary and merges.
# Padding/truncation are intentionally not configured on it: it is never used
# for encoding, and the tokenizer is re-created from the saved files below, so
# such settings would have no effect on the tokenizer used downstream.
bpe_trainer = ByteLevelBPETokenizer()
bpe_trainer.train_from_iterator(en_tr, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
], show_progress=True)

# Save the trained model. The output directory is created first so the save
# does not fail on a fresh checkout where ./tokenizer does not exist yet.
os.makedirs("./tokenizer", exist_ok=True)
bpe_trainer.save_model("./tokenizer")

# Reload the saved files as a GPT-2 tokenizer and register the special tokens
# that were reserved during training.
tokenizer = GPT2Tokenizer.from_pretrained("./tokenizer")
tokenizer.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
})

# Smoke test: encode a short sentence and show the resulting ids.
print(tokenizer.encode("test if this is working"))

######################################################################
#TODO: ADD <NUM> TOKEN SO THAT THE MODEL CAN KEEP THE NUMBERS THE SAME
######################################################################




[2538, 609, 412, 358, 1684]
CPU times: user 3.87 s, sys: 1.82 s, total: 5.69 s
Wall time: 988 ms


In [5]:
# Tokenize the dataset
def tokenize_text(text, max_length=256):
    """Encode one sentence into token ids with the notebook's GPT-2 tokenizer.

    Args:
        text: Sentence to encode.
        max_length: Maximum number of ids to return; longer inputs are
            truncated. The default of 256 matches the ``n_positions=256``
            used for the GPT-2 config in this notebook, so encoded
            sentences cannot exceed the model's position range.

    Returns:
        The token ids returned by ``tokenizer.encode`` for ``text``.
    """
    return tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
    )

# Add a `<language>_tokens` column holding the encoded form of each sentence.
for lang in ('turkish', 'english'):
    df[f'{lang}_tokens'] = df[lang].apply(tokenize_text)

# Show the ids of the first Turkish sentence as a quick check.
print(df.loc[0, 'turkish_tokens'])

[4811, 321, 3614, 328, 695, 75, 297, 11, 3116, 955, 6449, 5402, 1321, 1934, 6927, 4603, 303, 267]


In [6]:
# Distribution of tokenized Turkish sentence lengths.
# Every row is measured; a row whose tokens have no length now raises here
# instead of being printed and silently cutting the histogram short.
lengths = df['turkish_tokens'].map(len).tolist()
max_length = max(lengths)

# 101 evenly spaced edges from 0 to 500 -> 100 bins of width 5.
plt.hist(lengths, np.linspace(0, 500, 101))
# Re-apply the current y-limits before drawing the marker line; the line below
# is drawn across exactly that range.
plt.ylim(plt.ylim())
# Vertical marker at the longest example.
plt.plot([max_length, max_length], plt.ylim())
plt.xlabel('Tokens per example')
plt.ylabel('Number of examples')
plt.title(f'Maximum tokens per example: {max_length}');

NameError: name 'tr_tokens' is not defined

In [None]:
# Config for a GPT-2 model trained from scratch on the custom tokenizer.
config = GPT2Config(
    # len(tokenizer) counts the base vocabulary plus any added tokens, whereas
    # tokenizer.vocab_size reports only the base vocabulary. Sizing the model
    # with len() keeps every id the tokenizer can emit inside the embedding
    # matrix.
    vocab_size=len(tokenizer),
    # Special-token ids are taken from the tokenizer so model and tokenizer agree.
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    unk_token_id=tokenizer.unk_token_id,
    mask_token_id=tokenizer.mask_token_id,
    # Maximum sequence length the model can attend over.
    n_positions=256,
    n_ctx=256,
    # Transformer size: hidden width, depth, attention heads, feed-forward width.
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=3072,
    activation_function="gelu",
    # Dropout on residual, embedding and attention paths.
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
)

# Initialize the model with random weights from the config and report its parameter count.
model = GPT2LMHeadModel(config)
print(model.num_parameters())

90837504


In [None]:
%%time

# Build a dataset from the two token-id columns of the frame.
# `Dataset` resolves to `datasets.Dataset` here: that import comes after the
# `torch.utils.data` import of the same name and shadows it.
token_columns = ['turkish_tokens', 'english_tokens']
dataset = Dataset.from_pandas(df[token_columns])
print(dataset[0])

# Collator for language modelling with masked-LM behaviour switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


ArrowInvalid: ("Could not convert {'input_ids': [4811, 321, 3614, 328, 695, 75, 297, 11, 3116, 955, 6449, 5402, 1321, 1934, 6927, 4603, 303, 267], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} with type BatchEncoding: did not recognize Python value type when inferring an Arrow data type", 'Conversion failed for column turkish_tokens with type object')