<a href="https://colab.research.google.com/github/Susrith45/Genie-Gan/blob/main/GAN_Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchvision torchaudio
!pip install transformers datasets


In [None]:
!pip install -U transformers datasets accelerate evaluate peft bitsandbytes sentencepiece


In [None]:
import os
import math
import inspect
import torch
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Transformers imports
from transformers import (
    GPT2TokenizerFast,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)

# ------------- Basic checks -------------
# Report the installed torch build and whether CUDA can be used.
print("torch version:", torch.__version__)
print("cuda available:", torch.cuda.is_available())

# Device string used later when moving the model and the prompt tensors.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ------------- Load data -------------
# Change path as needed. Must have a 'text' column.
csv_path = "my_dataset.csv"
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"{csv_path} not found. Upload your CSV with a 'text' column or change path.")

df = pd.read_csv(csv_path)
if "text" not in df.columns:
    raise ValueError("CSV must contain a 'text' column. Rename your text column or modify the code.")

# Drop missing cells BEFORE casting to str: astype(str) would otherwise turn
# NaN into the literal string "nan", which would then be used as training text.
# Whitespace-only rows are skipped as well; kept rows are left untouched.
raw_text = df["text"].dropna().astype(str)
texts = raw_text[raw_text.str.strip() != ""].tolist()
print(f"Loaded {len(texts)} rows from {csv_path}")

skipped_rows = len(df) - len(texts)
if skipped_rows:
    print(f"Skipped {skipped_rows} rows with missing or empty text")

# A train/validation split needs at least one row on each side.
if len(texts) < 2:
    raise ValueError("Need at least 2 non-empty rows in the 'text' column to create a train/val split.")

# ------------- Train / Val split -------------
# 5% of the rows are held out for validation; random_state fixes the shuffle.
train_texts, val_texts = train_test_split(texts, test_size=0.05, random_state=42)
print(f"Train size: {len(train_texts)}, Val size: {len(val_texts)}")

# Wrap the plain lists as datasets with a single "text" column.
train_ds = Dataset.from_dict({"text": train_texts})
val_ds   = Dataset.from_dict({"text": val_texts})

# ------------- Tokenizer & model -------------
MODEL_NAME = "gpt2"   # change to gpt2-medium etc. if you have more memory

# Load the tokenizer; when it defines no pad token, reuse the EOS token so
# that padding during tokenization does not fail.
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights and place the model on the selected device
# (used for later inference; Trainer handles the training device itself).
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME).to(device)

# ------------- Tokenization -------------
max_length = 128   # reduce if you get OOM; increase if you have more memory


def tokenize_batch(examples):
    """Tokenize the ``"text"`` entries of a batch.

    Every sequence is truncated and padded to the module-level ``max_length``
    (read at call time), so all returned sequences have the same length.

    Args:
        examples: Mapping with a ``"text"`` key holding the raw strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    tokenizer_options = dict(
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize both splits in batches with tokenize_batch. The raw "text" column
# is removed, so only the columns produced by the tokenizer remain.
print("Tokenizing train dataset...")
train_tok = train_ds.map(tokenize_batch, batched=True, remove_columns=["text"])
print("Tokenizing val dataset...")
val_tok = val_ds.map(tokenize_batch, batched=True, remove_columns=["text"])

# For causal LM, labels = input_ids
def add_labels(batch):
    """Attach a ``"labels"`` entry that duplicates the batch's ``"input_ids"``.

    The batch is modified in place and also returned.

    Args:
        batch: Mapping with an ``"input_ids"`` entry that supports ``.copy()``.

    Returns:
        The same ``batch`` object, now containing ``"labels"``.
    """
    token_ids = batch["input_ids"]
    batch["labels"] = token_ids.copy()
    return batch

# Add the "labels" column to both tokenized splits via add_labels.
train_tok = train_tok.map(add_labels, batched=True)
val_tok   = val_tok.map(add_labels, batched=True)

# Print both datasets for a quick visual check of what will be fed to the Trainer.
print(train_tok)
print(val_tok)

# ------------- Data collator -------------
# mlm=False selects the non-masked (causal) language-modeling mode.
# NOTE(review): this collator presumably builds its own labels per batch and
# ignores padding positions; because pad_token was set to eos_token above, EOS
# positions may be ignored in the loss as well — confirm against the
# transformers documentation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ------------- TrainingArguments (robust across versions) -------------
# Build a kwargs dict of common args (we will filter to only those supported by the installed TrainingArguments)
common_args = dict(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",   # mapped to the spelling the installed version accepts (see below)
    eval_steps=500,
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=200,
    fp16=torch.cuda.is_available(),   # mixed precision if GPU available
    push_to_hub=False,
    report_to="none", # Explicitly disable reporting to any platform
)

# Inspect TrainingArguments signature and pass only supported params to avoid TypeError
# on transformers versions that do not know a given argument.
sig = inspect.signature(TrainingArguments.__init__)
valid_params = set(sig.parameters.keys())

# Prepare filtered kwargs
filtered_args = {k: v for k, v in common_args.items() if k in valid_params}

# The evaluation-schedule argument has had several spellings over time:
#   * `eval_strategy`            - newer transformers releases
#   * `evaluation_strategy`      - earlier releases (deprecated, later removed)
#   * `evaluate_during_training` - very old releases (boolean flag)
# Without this mapping, a release that only knows `eval_strategy` would have the
# setting silently filtered out above and would never evaluate during training.
if "eval_strategy" in valid_params:
    # Prefer the current name; also avoids the deprecation warning on releases
    # that still accept both spellings.
    filtered_args.pop("evaluation_strategy", None)
    filtered_args["eval_strategy"] = common_args["evaluation_strategy"]
elif "evaluation_strategy" not in filtered_args and "evaluate_during_training" in valid_params:
    # Very old releases: boolean flag. `eval_steps` was already kept or dropped
    # by the signature filter above.
    filtered_args["evaluate_during_training"] = True

# Make silently filtered-out settings visible instead of hiding them.
eval_configured = any(
    key in filtered_args
    for key in ("eval_strategy", "evaluation_strategy", "evaluate_during_training")
)
dropped_args = sorted(
    key
    for key in common_args
    if key not in filtered_args and not (key == "evaluation_strategy" and eval_configured)
)
if dropped_args:
    print("WARNING: these arguments are not supported by the installed transformers and were ignored:", dropped_args)

print("TrainingArguments will be created with these keys:", sorted(filtered_args.keys()))
training_args = TrainingArguments(**filtered_args)

# ------------- Trainer -------------
# Wire together the model, the training configuration, both tokenized splits
# and the collator. Nothing is trained at construction time.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,   # split used by trainer.evaluate()
    data_collator=data_collator,
)

print("Trainer created. You can now run trainer.train() to start training.")
# If you want to start training immediately, uncomment the next line:
# trainer.train()

# ------------- Evaluate utilities -------------
def evaluate_and_print_ppl(tr):
    """Run ``tr.evaluate()``, print its result and the derived perplexity.

    Perplexity is ``exp(eval_loss)``. It is printed only when the result
    contains a non-None ``"eval_loss"``; an overflow in the exponential is
    reported with a message instead of raising.

    Args:
        tr: Object exposing an ``evaluate()`` method that returns a dict.

    Returns:
        The dict returned by ``tr.evaluate()``, unchanged.
    """
    metrics = tr.evaluate()
    print("Evaluation results:", metrics)

    eval_loss = metrics.get("eval_loss")
    if eval_loss is None:
        # No usable loss: nothing to derive a perplexity from.
        return metrics

    try:
        perplexity = math.exp(eval_loss)
        print(f"Perplexity: {perplexity:.2f}")
    except OverflowError:
        print("Perplexity could not be computed from eval_loss (overflow).")
    return metrics

# Example: (uncomment to evaluate now)
# evaluate_and_print_ppl(trainer)

# ------------- Inference helper -------------
def generate_samples(prompt="The acting in this movie was", num_return_sequences=2, max_new_tokens=60):
    """Sample continuations of ``prompt`` from the current module-level model.

    Uses the module-level ``model`` and ``tokenizer`` as they are at call time
    (nothing is reloaded from disk). Sampling uses temperature 0.9, top-k 50
    and top-p 0.95.

    Args:
        prompt: Text the model should continue.
        num_return_sequences: Number of independent samples to draw.
        max_new_tokens: Maximum number of tokens generated after the prompt.

    Returns:
        A list of decoded strings (prompt plus continuation), one per sample.
    """
    model.eval()

    # Keep the whole encoding, not just input_ids: the attention mask must be
    # passed explicitly because pad and EOS share one token id, so generate()
    # cannot infer the mask on its own.
    # Inputs are placed on the model's actual device rather than a separately
    # tracked device string, so the two cannot get out of sync.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    out = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.9,
        top_k=50,
        top_p=0.95,
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.eos_token_id,
    )
    return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in out]

In [None]:
os.environ["WANDB_DISABLED"] = "true"   # Disable wandb logging
# training_args is the same object that was handed to the Trainer, so this
# assignment is what the Trainer will see.
training_args.num_train_epochs = 1
# NOTE(review): this only rebinds the module-level max_length. train_tok and
# val_tok were already tokenized before this cell, so the data trained on here
# is unchanged unless the tokenization map calls are re-run — confirm whether
# this line is intended.
max_length = 64
trainer.train()


In [None]:
# Evaluate on the validation split and report perplexity = exp(eval_loss).
eval_results = trainer.evaluate()
print(eval_results)

import math

# Same guards as evaluate_and_print_ppl: skip a missing or None loss, and
# report an overflow in exp() instead of crashing the cell.
eval_loss = eval_results.get("eval_loss")
if eval_loss is not None:
    try:
        ppl = math.exp(eval_loss)
    except OverflowError:
        print("Perplexity could not be computed from eval_loss (overflow).")
    else:
        print(f"Perplexity: {ppl:.2f}")


In [None]:
# Save the model and the tokenizer into the same directory so both can be
# reloaded from that single path later.
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")
print("Model and tokenizer saved to ./gpt2-finetuned")


In [31]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Reload the fine-tuned model and tokenizer from the directory they were saved to.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained("./gpt2-finetuned").to(device)
tokenizer = AutoTokenizer.from_pretrained("./gpt2-finetuned")

prompt = "The acting in this movie was"
# Keep the full encoding so the attention mask can be passed to generate():
# pad and EOS share one token id here, so the mask cannot be inferred from
# input_ids alone.
encoded = tokenizer(prompt, return_tensors="pt").to(device)
input_ids = encoded["input_ids"]

outputs = model.generate(
    input_ids,
    attention_mask=encoded["attention_mask"],
    max_length=80,            # total length: prompt tokens + generated tokens
    temperature=0.9,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

# Print each sampled continuation, decoded back to text.
for i, output in enumerate(outputs):
    print(f"\n=== SAMPLE {i+1} ===")
    print(tokenizer.decode(output, skip_special_tokens=True))



=== SAMPLE 1 ===
The acting in this movie was made by Robert Rodriguez. The movie opens in theaters on June 21.

On June 17, Warner Bros. announced the release date for the second film, in which two characters, a former prostitute who was kidnapped and tortured and murdered, stand by a man who killed him and then went on to commit the crime with his own blood-soaked severed arm. The

=== SAMPLE 2 ===
The acting in this movie was just me, because I felt like I was the one who made it. There were times I would take the whole time, which really did make it a little easier on myself. It was fun."

In the short film, he's taking care of a younger son, Ryan, who's struggling in school.

"We're doing the same kind of

=== SAMPLE 3 ===
The acting in this movie was more than the acting in any other film I've ever seen."

When asked if she felt like she was being played by the same person from previous films, Darden said: "I think that's pretty good. In order to do that in this case, it's basic