<a href="https://colab.research.google.com/github/Tulipsa-Mallick/Task4_GPT_Finetuned/blob/main/gpt2_fine_tuning_colab_valid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🤖 GPT-2 Fine-Tuning on Custom Text with Hugging Face (Colab Setup)

In [None]:
!pip install transformers datasets

In [None]:

from google.colab import files
# Ask the user for the training text file. A later cell takes the first key of
# `uploaded` as the file name and opens that name from disk.
# NOTE(review): presumably files.upload() also writes the file into the working
# directory — confirm against the Colab documentation.
uploaded = files.upload()


In [None]:

from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Get uploaded file name. Only the first uploaded file is used; fail with a
# clear message if the upload was cancelled or returned nothing.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and choose a text file.")
file_name = list(uploaded.keys())[0]

# Read lines from the uploaded file
with open(file_name, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Convert lines into a dataset format: one {"text": ...} record per non-blank
# line, with surrounding whitespace removed (each line is stripped only once).
stripped_lines = (line.strip() for line in lines)
data = [{"text": text} for text in stripped_lines if text]
if not data:
    # An empty dataset would only surface later as an obscure training failure.
    raise ValueError(f"{file_name!r} contains no non-blank lines to train on.")
dataset = Dataset.from_list(data)

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably required because the tokenization step pads every
# example to a fixed length and the stock GPT-2 tokenizer defines no pad token —
# confirm against the tokenizer documentation.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(example):
    """Encode the "text" field of `example` with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer call returns for that input.
    """
    encode_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(example["text"], **encode_kwargs)

# Apply the encoder across the dataset in batched mode.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
)

# Model and data collator setup
model = GPT2LMHeadModel.from_pretrained("gpt2")
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # masked-language-modelling mode switched off
)


In [None]:
import os
# Environment switch for turning off Weights & Biases logging. Kept for older
# transformers releases; the `report_to="none"` argument below is the
# documented way to disable logging integrations.
os.environ["WANDB_DISABLED"] = "true"

from transformers import TrainingArguments

# Training configuration: 3 epochs, batch size 2 per device, a checkpoint every
# 10 steps (only the most recent one kept), and a log entry every 5 steps.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=5,
    # Must be the string "none" to disable every reporting integration.
    # Python `None` does not disable reporting: it falls back to the library
    # default, which in transformers 4.x is all installed integrations.
    report_to="none",
)

# Wire the model, the tokenized dataset and the collator into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()



In [None]:

# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be loaded by name afterwards.
model.save_pretrained(save_directory="gpt2-finetuned")
tokenizer.save_pretrained(save_directory="gpt2-finetuned")


In [None]:

from transformers import pipeline
# Build a text-generation pipeline from the saved "gpt2-finetuned" directory,
# then sample one continuation of the prompt.
generator = pipeline(
    task="text-generation",
    model="gpt2-finetuned",
    tokenizer="gpt2-finetuned",
)
generator(
    "Once upon a time",
    max_length=50,
    num_return_sequences=1,
)


In [None]:
# Directory that the archive step zips up; write the model and tokenizer into it.
model_save_path = "./gpt2-finetuned"
model.save_pretrained(save_directory=model_save_path)
tokenizer.save_pretrained(save_directory=model_save_path)


In [None]:
import shutil
# Zip the contents of the saved model directory into "gpt2-finetuned.zip".
shutil.make_archive(
    base_name="gpt2-finetuned",
    format="zip",
    root_dir=model_save_path,
)


In [None]:
from google.colab import files
# Download the archive using a path relative to the working directory. The
# archive is created with the relative base name "gpt2-finetuned", so this
# resolves to the same file regardless of what the working directory is
# (the previous hard-coded "/content/..." path only worked when it was /content).
files.download("gpt2-finetuned.zip")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>