In [1]:
import os
import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

role = get_execution_role()
region = sagemaker_session.boto_session.region_name

In [6]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

import pandas as pd

import tensorflow_text as text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

import boto3

In [7]:
#load the dataset
#NOTE THAT SINCE THIS CORPUS IS ONLY IN LOWERCASE, YOU NEED TO FEED THE DATA AS LOWERCASES OR YOU WILL NOT GET *ACCURATE* TRANSLATIONS
client = boto3.client('s3')
df = pd.read_csv(f"s3://sagemaker-eu-west-1-755811905719/output_1686666402/part-00000-238d78a7-0c67-4db3-8634-9e919ca12ce8-c000.csv", delimiter = "\t")


In [8]:
#verify that the dataset has been loaded
print(df.head())
print(df.shape)
print(df['_c0'][0])
print(df['_c1'][0])
print(type(df['_c0'][0]))

                                                 _c0  \
0  emekli üyeler kongre'nin şu sıralar çete savaş...   
1  entellektüellik , klas , asalet veya hikaye il...   
2  hangisi olduğunu tahmin edebildiniz mi ? şirke...   
3  pek uzak yerlere seyahat edemez veya belli bir...   
4                                 heyecanlanmıştım .   

                                                 _c1  
0  retiring members nowadays say that it 's becom...  
1  no sophistication , no class , no dignity , no...  
2                     did you guess it ? companies .  
3  you ca n't travel very far or venture too far ...  
4                                    i was excited .  
(50000, 2)
emekli üyeler kongre'nin şu sıralar çete savaşlarına döndüğünü söylüyorlar .
retiring members nowadays say that it 's become like gang warfare .
<class 'str'>


In [9]:
df.columns = ['turkish', 'english']

tokenizer = Tokenizer(oov_token="<UNK>")

# Fit the tokenizer on the texts
tokenizer.fit_on_texts(df['turkish'].tolist() + df['english'].tolist())

# Convert the turkish and english columns into sequences
turkish_sequences = tokenizer.texts_to_sequences(df['turkish'])
english_sequences = tokenizer.texts_to_sequences(df['english'])

# Pad the sequences so that they are all the same length
turkish_padded = pad_sequences(turkish_sequences, padding='post')
english_padded = pad_sequences(english_sequences, padding='post')

df['turkish_tokenized'] = turkish_padded.tolist()
df['english_tokenized'] = english_padded.tolist()

######################################################################
#TODO: ADD <NUM> TOKEN SO THAT THE MODEL CAN KEEP THE NUMBERS THE SAME
######################################################################

In [10]:
#ensure that the tokenizing is working properly
print(df.head())
 

                                             turkish  \
0  emekli üyeler kongre'nin şu sıralar çete savaş...   
1  entellektüellik , klas , asalet veya hikaye il...   
2  hangisi olduğunu tahmin edebildiniz mi ? şirke...   
3  pek uzak yerlere seyahat edemez veya belli bir...   
4                                 heyecanlanmıştım .   

                                             english  \
0  retiring members nowadays say that it 's becom...   
1  no sophistication , no class , no dignity , no...   
2                     did you guess it ? companies .   
3  you ca n't travel very far or venture too far ...   
4                                    i was excited .   

                                   turkish_tokenized  \
0  [9254, 27916, 35956, 140, 10637, 17007, 54019,...   
1  [54020, 27917, 54021, 205, 891, 103, 4609, 219...   
2  [8227, 119, 872, 54022, 246, 2329, 0, 0, 0, 0,...   
3  [409, 1590, 2805, 2757, 9255, 205, 1164, 5, 11...   
4  [22803, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [11]:
import json

# Save the vocabulary as a JSON file
with open('vocabulary.json', 'w') as f:
    json.dump(tokenizer.word_index, f)

In [12]:
#create the training, validation and testing datasets
train = df.sample(frac=0.9, random_state=0)
test = df.drop(train.index)
validation = test.sample(frac=0.5, random_state=0)
test = test.drop(validation.index)

#create a pandas dataframe for each of the train, validation and test datasets
train_df = pd.DataFrame({
    'turkish': train['turkish_tokenized'].tolist(),
    'english': train['english_tokenized'].tolist()
})
validation_df = pd.DataFrame({
    'turkish': validation['turkish_tokenized'].tolist(),
    'english': validation['english_tokenized'].tolist()
})
test_df = pd.DataFrame({
    'turkish': test['turkish_tokenized'].tolist(),
    'english': test['english_tokenized'].tolist()
})


In [13]:
#test if the datasets work
print(train_df.shape)
print(test_df.shape)
print(validation_df.shape)
print(train_df.head())
print(test_df.head())
print(validation_df.head())

(45000, 2)
(2500, 2)
(2500, 2)
                                             turkish  \
0  [7242, 8754, 9, 471, 66639, 0, 0, 0, 0, 0, 0, ...   
1  [113, 74767, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2  [9, 107, 201, 4530, 46, 102003, 369, 9940, 241...   
3  [175, 3209, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...   
4  [253, 149, 50013, 754, 7084, 126, 34, 6734, 28...   

                                             english  
0  [11, 23, 7242, 118, 339, 3, 11, 23, 168, 53, 2...  
1  [110543, 110544, 15, 27, 8, 735, 254, 6, 636, ...  
2  [3, 96, 665, 579, 105, 142, 2, 2547, 2907, 6, ...  
3  [2, 221, 17, 3209, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [108, 403, 65, 180, 10, 14, 18, 50013, 2867, 1...  
                                             turkish  \
0  [98, 7784, 4208, 293, 54035, 7784, 10638, 2280...   
1  [35988, 88, 33, 120, 3813, 0, 0, 0, 0, 0, 0, 0...   
2  [157, 237, 12, 59, 12470, 6736, 19461, 0, 0, 0...   
3  [5, 27938, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
4  [9, 54106, 219, 348

In [14]:
# Load the pretrained GPT-2 model and tokenizer
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=None,  # Use file_path if you have text data in a file
    split=None,  # Specify the split (e.g., "train", "validation")
    text_column=['english_tokens', 'turkish_tokens'],  # Provide the tokenized text columns
    block_size=128  # Adjust the block size according to your needs
)

ImportError: 
GPT2LMHeadModel requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFGPT2LMHeadModel".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.
