In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import pandas as pd

In [2]:
# Checkpoint identifier shared by the tokenizer and the language model.
model_name = 'gpt2'

# Instantiate the tokenizer first, then the LM-head model, from the same checkpoint.
tokenizer, model = (
    GPT2Tokenizer.from_pretrained(model_name),
    GPT2LMHeadModel.from_pretrained(model_name),
)

In [3]:
# Prepare your dataset
class MyDataset(Dataset):
    """Map-style dataset over an in-memory, indexable collection of examples.

    Items are handed back exactly as they were stored; no conversion or
    copying is performed by this class.
    """

    def __init__(self, texts):
        """Store a reference to ``texts`` (any object supporting ``len`` and indexing)."""
        self.texts = texts

    def __len__(self):
        """Return how many examples are stored."""
        sample_count = len(self.texts)
        return sample_count

    def __getitem__(self, idx):
        """Return the example stored under ``idx``."""
        sample = self.texts[idx]
        return sample

In [None]:
# Read the tab-separated parallel corpus (no header row; first column is
# named 'tr', second 'en'), discard rows with missing values, and keep at
# most the first 1000 remaining rows.
df = (
    pd.read_csv('./data.csv', encoding='utf-8', delimiter='\t', header=None, names=['tr', 'en'])
    .dropna()
    .iloc[:1000]
)

In [None]:
# Build the list of sentence-aligned text pairs used for fine-tuning.
# Each element is an (english, turkish) tuple -- English first, Turkish second.
# Zipping the two columns avoids materialising a row Series per iteration,
# which is what the df.iloc[i][...] lookup did.
texts = list(zip(df['en'], df['tr']))
print(texts[0])

In [None]:
# Tokenize every (english, turkish) pair into a single list of token ids.
# NOTE(review): the two sentences are passed as a text pair; whether the
# GPT-2 tokenizer puts any separator between them should be verified -- if it
# does not, the model sees no explicit boundary between source and target.

# Cap each example at the model's context window: ids beyond the number of
# position embeddings cannot be processed by the model during training.
max_input_len = min(tokenizer.model_max_length, model.config.n_positions)

tokenized_texts = [
    tokenizer.encode(source_text, target_text, truncation=True, max_length=max_input_len)
    for source_text, target_text in texts
]

In [None]:
# Wrap the tokenized pairs in the Dataset class so the Trainer can index them.
train_dataset = MyDataset(tokenized_texts)

# Settings for the fine-tuning run, collected in one place before being
# handed to TrainingArguments.
training_config = dict(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)
training_args = TrainingArguments(**training_config)

In [None]:
# Collator for causal language modeling (mlm=False disables masked-LM masking).
lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Assemble the Trainer from the model, run settings, training data and collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=lm_collator,
)

In [None]:
# Run the fine-tuning loop.
trainer.train()

# Save the fine-tuned model weights and config.
trainer.save_model('./model')
# The tokenizer was not handed to the Trainer, so save it explicitly next to
# the model; otherwise './model' cannot be loaded as a self-contained checkpoint.
tokenizer.save_pretrained('./model')

In [None]:
# Generate translations using the fine-tuned model

# After training the model may still be in training mode; switch to eval so
# dropout does not perturb the sampled outputs.
model.eval()

# Calling the tokenizer directly returns both input_ids and attention_mask.
# Tensors are moved to whatever device the model is on instead of assuming
# 'cuda', so this cell also works on CPU-only machines.
prompt_inputs = tokenizer(
    'This is an example sentence to translate',
    return_tensors='pt',
).to(model.device)

generated_text = model.generate(
    **prompt_inputs,
    max_length=50,
    num_return_sequences=5,
    no_repeat_ngram_size=2,
    repetition_penalty=1.5,
    top_p=0.92,
    temperature=1.0,
    do_sample=True,
    # GPT-2 defines no padding token; use EOS explicitly for padding
    # finished sequences when several are returned.
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the generated text
for i, text in enumerate(generated_text):
    print(f"Generated text {i+1}: {tokenizer.decode(text.tolist(), skip_special_tokens=True)}")