In [2]:
from datasets import Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
import pandas as pd

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

read_bucket = 'sagemaker-translation-en-tr-data'

df = pd.read_csv(f"s3://{read_bucket}/data.csv", delimiter = '\t', names=['tr','en'], header = None)

df = df.dropna()
train_dataset = Dataset.from_pandas(df[:2000])
eval_dataset = Dataset.from_pandas(df[2000:3000])

def preprocess_function(examples):
    inputs = examples['en']
    targets = examples['tr']
    inputs = tokenizer(inputs, padding='max_length', truncation=True, max_length=1024, return_tensors='pt')
    targets = tokenizer(targets, padding='max_length', truncation=True, max_length=1024, return_tensors='pt')
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'decoder_input_ids': targets['input_ids'][:, :-1], 'decoder_attention_mask': targets['attention_mask'][:, :-1], 'labels': targets['input_ids'][:, 1:]}

train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)


  from .autonotebook import tqdm as notebook_tqdm
                                                                

In [3]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=model.config.pad_token_id
)

In [4]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

In [5]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

In [6]:
trainer.train()



Step,Training Loss
10,15.6962
20,15.7236
30,14.7729
40,13.6574
50,12.8895
60,12.1488
70,11.3271
80,10.4852
90,9.3716
100,8.0194


TrainOutput(global_step=1000, training_loss=1.7876022917032242, metrics={'train_runtime': 763.4771, 'train_samples_per_second': 2.62, 'train_steps_per_second': 1.31, 'total_flos': 1219472916480000.0, 'train_loss': 1.7876022917032242, 'epoch': 1.0})

In [11]:
# Define the input text
input_text = "This is a test sentence."

# Tokenize the input text using the BART tokenizer
input_ids = tokenizer.encode(input_text, return_tensors='pt').to('cuda')

# Generate the output text using the BART model
output_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)

# Decode the output text using the BART tokenizer
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output_text)

 bir şekilde görüyoruz.
