# Convert the Excel File to a Text File

In [None]:
import pandas as pd

# Load the dialogue spreadsheet; the 'astar' column holds the spoken lines.
df = pd.read_excel('rickandmorty.xlsx')

# Write one conversation per output line. Each conversation is built from four
# consecutive rows (the final group may be shorter), and the speaker tag is
# chosen from the parity of the row's index label: even -> Rick, odd -> Morty.
# NOTE(review): no explicit encoding is given, so the file is written in the
# platform default encoding -- confirm this matches how conversations.txt is
# read back later.
with open('conversations.txt', 'w') as out_file:
    for start in range(0, len(df), 4):
        turns = []
        for label, row in df[start:start + 4].iterrows():
            if label % 2 != 0:
                turns.append(f"<|Morty|> {row['astar']} ")
            else:
                turns.append(f"<|Rick|> {row['astar']} ")
        out_file.write("".join(turns) + "\n")

# Split the Text File into Training and Testing Sets

In [None]:
def _read_text_lines(path):
    """Return the lines of *path*, decoding as UTF-8 when possible.

    The outputs of ``split_file`` are always UTF-8, so UTF-8 is tried first.
    If the file is not valid UTF-8 (for example it was written with the
    platform default encoding on a non-UTF-8 locale), it is re-read with the
    platform default encoding instead of failing or producing mojibake.
    """
    try:
        with open(path, 'r', encoding="utf-8") as f:
            return f.readlines()
    except UnicodeDecodeError:
        with open(path, 'r') as f:
            return f.readlines()


def split_file(file_path, train_file, test_file, test_size=0.15):
    """Split a text file line-wise into a test file and a training file.

    The first ``int(len(lines) * test_size)`` lines are written to
    ``test_file`` and all remaining lines to ``train_file``. Line order is
    preserved and nothing is shuffled. Both output files are written as UTF-8.

    Args:
        file_path: Path of the text file to split.
        train_file: Path the training lines are written to.
        test_file: Path the test lines are written to.
        test_size: Fraction of the lines (counted from the top of the file)
            that goes to the test file.
    """
    lines = _read_text_lines(file_path)
    num_test_lines = int(len(lines) * test_size)
    with open(train_file, 'w', encoding="utf-8") as f:
        f.writelines(lines[num_test_lines:])
    with open(test_file, 'w', encoding="utf-8") as f:
        f.writelines(lines[:num_test_lines])

# Split conversations.txt using the default test_size of 0.15: the leading
# lines go to test_dataset.txt and the rest to train_dataset.txt.
split_file("conversations.txt", "train_dataset.txt", "test_dataset.txt")

# Get model and tokenizer

In [None]:
from transformers import AutoTokenizer
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is
# the documented replacement for decoder-only (GPT-2 style) checkpoints.
from transformers import AutoModelForCausalLM

# Pretrained Turkish GPT-2 weights and the matching tokenizer from the Hub.
model = AutoModelForCausalLM.from_pretrained("redrussianarmy/gpt2-turkish-cased")
tokenizer = AutoTokenizer.from_pretrained("redrussianarmy/gpt2-turkish-cased")

# Text files written by the train/test split step.
train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

# Set the Training Arguments

In [None]:
def load_dataset(train_path,test_path,tokenizer,block_size=32):
    """Build the training set, the evaluation set and the batch collator.

    Args:
        train_path: Path of the training text file.
        test_path: Path of the evaluation text file.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: ``block_size`` passed to both ``TextDataset`` objects.
            Defaults to 32, the value that was previously hard-coded.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    def _text_dataset(path):
        # Both splits must be built with identical settings.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=path,
            block_size=block_size)

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    # mlm=False: the collator is configured without masked-language-modeling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

# Build both datasets and the collator from the split text files.
train_dataset, test_dataset, data_collator = load_dataset(
    train_path, test_path, tokenizer
)

# Fine-tuning configuration.
# NOTE(review): max_steps=36 is set together with num_train_epochs=24; per the
# TrainingArguments documentation a positive max_steps takes precedence --
# confirm which limit is intended.
# NOTE(review): warmup_steps and save_steps (both 256) are larger than
# max_steps (36) -- confirm this is deliberate.
training_args = TrainingArguments(
    # Where outputs are written; existing content may be overwritten.
    output_dir="./gpt2-turkish",
    overwrite_output_dir=True,
    # Training length.
    num_train_epochs=24,
    max_steps=36,
    # Batch sizes per device.
    per_device_train_batch_size=24,
    per_device_eval_batch_size=48,
    # Optimizer and learning-rate warmup.
    optim="adamw_torch",
    warmup_steps=256,
    # Logging, evaluation and checkpoint cadence (in steps).
    logging_steps=6,
    evaluation_strategy="steps",
    save_steps=256,
    prediction_loss_only=False,
)

# Wire the model, configuration and data together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

# Get the Number of Tokens

In [None]:
# Read the whole training split back as a single string.
with open("train_dataset.txt", mode="r", encoding="utf-8") as train_file:
    content = train_file.read()

# Encode it in one pass and report how many token ids came out.
tokenizer_conversation = tokenizer.encode(content)
token_count = len(tokenizer_conversation)
print("Number of Tokens:", token_count)

# Train and save the Model

In [None]:
# Run fine-tuning with the settings in training_args, then save the model.
# No path is passed to save_model(), so the Trainer's default location is
# used -- presumably output_dir ("./gpt2-turkish"), which the generation
# pipeline loads later; confirm.
trainer.train()
trainer.save_model()

# Configuration of Model

In [None]:
# Print the model's configuration object for inspection.
config = model.config
print(config)

# Setup the Pipeline

In [None]:
from transformers import pipeline

# Text-generation pipeline: model weights come from the local ./gpt2-turkish
# directory, the tokenizer from the Hub checkpoint name.
# NOTE(review): pad_token_id=50256 is hard-coded -- confirm it matches the
# tokenizer's end-of-text id for this checkpoint.
chef = pipeline(
    'text-generation',
    model='./gpt2-turkish',
    tokenizer="redrussianarmy/gpt2-turkish-cased",
    pad_token_id=50256,
    max_new_tokens=64,
)

In [None]:
import warnings
# Install a blanket "ignore" filter so no Python warnings are shown during the
# interactive chat below.
warnings.filterwarnings("ignore")

def find_third_lt(s):
    """Return the index of the third "<" in *s*, or -1 if there are fewer than three."""
    # Lazily yield the position of every "<" and stop as soon as the third is seen.
    lt_positions = (pos for pos, char in enumerate(s) if char == "<")
    for ordinal, pos in enumerate(lt_positions, start=1):
        if ordinal == 3:
            return pos
    return -1



# Interactive demo: read five user messages (spoken as Morty) and print the
# model's reply as Rick for each one.

# Speaker tag the generated continuation is assumed to start with.
RICK_TAG = "<|Rick|> "

for _ in range(5):
    morty = input()
    prompt = f"Bu, gelişmiş bir yapay zeka olan Rick ve insan Morty arasındaki bir konuşmadır: <|Morty|> {morty}\n"
    rick = chef(prompt)[0]["generated_text"]

    # The generated text is assumed to echo the prompt and then open with
    # RICK_TAG, so the reply starts right after both. The fixed prompt frame
    # is 91 characters and the tag 9, so this equals the former hard-coded
    # offset of 100 + len(morty).
    reply_start = len(prompt) + len(RICK_TAG)

    # The reply ends at the third "<": the first belongs to the prompt's
    # <|Morty|> tag, the second to Rick's tag, the third opens the next turn.
    # NOTE(review): a "<" typed by the user shifts this count -- confirm
    # whether that needs handling.
    third_small = find_third_lt(rick)
    if third_small == -1:
        # No further tag was generated: keep the reply to the end instead of
        # slicing with -1, which silently dropped the last character.
        third_small = len(rick)

    print("Rick: " + rick[reply_start:third_small])