In [2]:
import os
import sagemaker
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# s3 key prefix for the data
s3_prefix = 'samples/datasets/translations-tr-en'


print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")



sagemaker role arn: arn:aws:iam::184473660456:role/mod-6297809195fe4845-SageMakerExecutionRole-1I8SG2YZE93HH
sagemaker bucket: sagemaker-eu-west-1-184473660456
sagemaker session region: eu-west-1


In [3]:
%pip install datasets
from datasets import Dataset
%pip install transformers
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
import pandas as pd
import torch

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

read_bucket = 'sagemaker-translation-en-tr-data'

df = pd.read_csv(f"s3://{read_bucket}/data.csv", delimiter = '\t', names=['tr','en'], header = None)

df = df.dropna()

train_dataset = Dataset.from_pandas(df)

#split into train, test and validation sets
train_dataset = train_dataset.train_test_split(test_size=0.1, shuffle=True)
eval_dataset = train_dataset['test']
train_dataset = train_dataset['train']
test_dataset = eval_dataset.train_test_split(test_size=0.5, shuffle=True)
eval_dataset = test_dataset['train']
test_dataset = test_dataset['test']

print(len(train_dataset))
print(len(eval_dataset))
print(len(test_dataset))

def preprocess_function(examples):
    inputs = examples['en']
    targets = examples['tr']
    inputs = tokenizer(inputs, padding='max_length', truncation=True, max_length=1024, return_tensors='pt')
    targets = tokenizer(targets, padding='max_length', truncation=True, max_length=1024, return_tensors='pt')
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'decoder_input_ids': targets['input_ids'][:, :-1], 'decoder_attention_mask': targets['attention_mask'][:, :-1], 'labels': targets['input_ids'][:, 1:]}

train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)



# save train_dataset to s3
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
train_dataset.save_to_disk(training_input_path)

# save test_dataset to s3
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'
test_dataset.save_to_disk(test_input_path)

# save test_dataset to s3
eval_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/eval'
eval_dataset.save_to_disk(eval_input_path)

Collecting datasets
  Using cached datasets-2.13.0-py3-none-any.whl (485 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
Collecting aiohttp (from datasets)
  Using cached aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
Collecting huggingface-hub<1.0.0,>=0.11.0 (from datasets)
  Using cached huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp->datasets)
  Using cached multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
Collecting async-timeout<5.0,>=4.0.0a3 (from aiohttp->datasets)
  Using cached async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->datasets)
  Using cached yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (268 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->datasets)
  Using cached frozenlist-1.3.3-cp310-cp310-manylinux_2_5_x8

  from .autonotebook import tqdm as notebook_tqdm


Collecting transformers
  Using cached transformers-4.30.2-py3-none-any.whl (7.2 MB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2023.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting safetensors>=0.3.1 (from transformers)
  Using cached safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Installing collected packages: tokenizers, safetensors, regex, transformers
Successfully installed regex-2023.6.3 safetensors-0.3.1 tokenizers-0.13.3 transformers-4.30.2
[0mNote: you may need to restart the kernel to use updated packages.
172371
9576
9577


                                                                                                   

In [None]:
pytorch_estimator = torch(

In [4]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=model.config.pad_token_id
)

In [5]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

In [5]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

In [6]:
trainer.train()



Step,Training Loss
10,15.6962
20,15.7236
30,14.7729
40,13.6574
50,12.8895
60,12.1488
70,11.3271
80,10.4852
90,9.3716
100,8.0194


TrainOutput(global_step=1000, training_loss=1.7876022917032242, metrics={'train_runtime': 763.4771, 'train_samples_per_second': 2.62, 'train_steps_per_second': 1.31, 'total_flos': 1219472916480000.0, 'train_loss': 1.7876022917032242, 'epoch': 1.0})

In [11]:
# Define the input text
input_text = "This is a test sentence."

# Tokenize the input text using the BART tokenizer
input_ids = tokenizer.encode(input_text, return_tensors='pt').to('cuda')

# Generate the output text using the BART model
output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)

# Decode the output text using the BART tokenizer
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output_text)

 bir şekilde görüyoruz.
