In [3]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

# Load your data from CSV
df = pd.read_csv("converted_data.csv")

# Assuming "Singlish" is the column name where your text data resides
singlish_data = df["Singlish"].tolist()

# Preprocess the Singlish data by removing occurrences of "[Unknown]" and empty sentences
singlish_data = [sentence.replace("[Unkown]", "").strip() for sentence in singlish_data if sentence.strip()]

# Tokenize the data
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Add a pad token
tokenized_data = []

for sentence in singlish_data:
    tokenized_sentence = tokenizer(sentence, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
    tokenized_data.append(tokenized_sentence.input_ids)

# Concatenate tokenized sequences into a single tensor
tokenized_data = torch.cat(tokenized_data, dim=0)

# Train the model
model = GPT2LMHeadModel.from_pretrained("gpt2")
training_args = TrainingArguments(
    output_dir="./lm_model",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_data,  # Pass tokenized data directly
)

trainer.train()

# Save the model
model.save_pretrained("./lm_model")

print("Training completed and model saved successfully.")


RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 512 but got size 0 for tensor number 1191 in the list.

In [None]:
# Load the trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./lm_model")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set the device to CPU if you don't have a GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Set the model to evaluation mode
model.eval()

# Prompt for generation
prompt = "kohomada bn ubata?"

# Tokenize the prompt
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

# Generate text based on the prompt
output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

# Decode the generated output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Text:")
print(generated_text)