In [None]:
import os

from huggingface_hub import login

# Never commit an access token inside the notebook. The token is read from the
# HF_TOKEN environment variable instead; when the variable is unset, `token` is
# None and `login` falls back to asking for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
import os

# Optional: disable the TensorFlow backend if you faced the Keras issue earlier.
# These variables are only consulted while `transformers` / `datasets` are being
# imported, so they must be set BEFORE the imports below to have any effect.
os.environ["TRANSFORMERS_NO_TF"] = "1"
# USE_TF is the backend switch the Hugging Face libraries check at import time;
# setdefault keeps any value the user has already exported.
os.environ.setdefault("USE_TF", "0")

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset

# ---------------------------
# Load model & tokenizer
# ---------------------------
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Tokenizer: padding reuses the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Model: the embedding matrix is sized to the tokenizer's vocabulary.
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(new_num_tokens=len(tokenizer))

# ---------------------------
# Load dataset
# ---------------------------
# Read the JSON-lines file through the generic "json" loader and keep its
# "train" split, which is what the rest of the notebook works on.
dataset = load_dataset(
    path="json",
    data_files="finetune_data.jsonl",
)
train_data = dataset["train"]

# ---------------------------
# Format function
# ---------------------------
def format(example, max_length=512):
    """Tokenize one prompt/completion record into fixed-length training features.

    The training text is ``prompt + EOS + completion + EOS``. Labels are a copy
    of ``input_ids`` in which every position that must not contribute to the
    loss is replaced by -100: the prompt (including its separator EOS) and all
    padding positions.

    Args:
        example: Mapping with string fields ``"prompt"`` and ``"completion"``.
        max_length: Length every sequence is truncated / padded to.

    Returns:
        The tokenizer output for the full text with an added ``"labels"`` list
        that always has the same length as ``"input_ids"``.
    """
    eos = tokenizer.eos_token
    prompt_text = example["prompt"] + eos
    # Close the completion with EOS: padding is masked out below, so this is the
    # only end-of-sequence target the model gets to learn when to stop.
    full_text = prompt_text + example["completion"] + eos

    tokenized = tokenizer(
        full_text, truncation=True, padding="max_length", max_length=max_length
    )
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    prompt_len = len(tokenizer(prompt_text)["input_ids"])
    # Real tokens start at the first attended position (non-zero only when the
    # tokenizer pads on the left). Clamp the prompt end to the sequence length:
    # a prompt longer than max_length would otherwise make a slice assignment
    # grow `labels` beyond `input_ids`.
    first_real = attention_mask.index(1) if 1 in attention_mask else len(attention_mask)
    prompt_end = min(first_real + prompt_len, len(input_ids))

    # Keep a target only for attended tokens after the prompt. Padding must be
    # masked explicitly because the pad token is the EOS token here.
    tokenized["labels"] = [
        token_id if (position >= prompt_end and attended) else -100
        for position, (token_id, attended) in enumerate(zip(input_ids, attention_mask))
    ]
    return tokenized

# Map + remove raw strings
# Tokenize every record with `format`; the original columns are dropped so only
# the features returned by `format` remain.
formatted_data = train_data.map(
    function=format,
    remove_columns=train_data.column_names,
)

# ---------------------------
# Dataset class
# ---------------------------
class PromptCompletionDataset(Dataset):
    """Torch dataset view over a sequence of already-tokenized records.

    Each record is a mapping from feature name to a list of integers; items are
    returned with every feature converted to an int64 tensor.
    """

    def __init__(self, data):
        # Any object supporting len() and integer indexing that yields mappings.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        tensors = {}
        for name, values in record.items():
            # Force int64 for every feature.
            tensors[name] = torch.tensor(values, dtype=torch.long)
        return tensors

train_dataset = PromptCompletionDataset(formatted_data)

# ---------------------------
# Training setup
# ---------------------------
training_args = TrainingArguments(
    output_dir="./tiny_output",
    per_device_train_batch_size=1,
    # 8 accumulated micro-batches of size 1 per optimizer step.
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    # fp16 mixed precision is only enabled when a CUDA device is present;
    # requesting it unconditionally makes the setup fail on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    learning_rate=1e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# ---------------------------
# Train
# ---------------------------
# Run the fine-tuning loop.
trainer.train()

# ---------------------------
# Save final model
# ---------------------------
# Weights and tokenizer files are written side by side into one directory.
model.save_pretrained(save_directory="fine_tuned_tinyllama")
tokenizer.save_pretrained(save_directory="fine_tuned_tinyllama")

In [5]:
import torch

def generate_completion(prompt, max_new_tokens=100, temperature=0.7):
    """Generate a continuation of ``prompt`` with the notebook-level model.

    Uses the global ``tokenizer`` and ``model`` that are bound at call time.

    Args:
        prompt: Text to condition the generation on.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of 0 (or below) switches to
            greedy decoding instead of sampling.

    Returns:
        The decoded newly generated text only (prompt excluded), stripped of
        surrounding whitespace.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = encoded["input_ids"]
    prompt_token_count = input_ids.shape[1]

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
    else:
        # Sampling with a non-positive temperature is invalid; decode greedily.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            # Pass the mask explicitly: pad and EOS share an id here, so it
            # cannot be inferred reliably from the token ids.
            attention_mask=encoded.get("attention_mask"),
            **generation_kwargs,
        )

    # Drop the prompt by token count. Slicing the decoded string by
    # len(prompt) is fragile because decoding does not always reproduce the
    # prompt text character for character.
    new_token_ids = output[0][prompt_token_count:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# The training cell saved the tokenizer next to the weights, so load both from
# the same directory to keep them consistent with what was trained.
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_tinyllama")

# Load the fine-tuned checkpoint exactly once. Building the model several times
# in a row repeats the full load and needs memory for a second copy while the
# previous one is still referenced. `ignore_mismatched_sizes` is not used: it
# would re-initialise any mismatched tensor instead of reporting a broken
# checkpoint.
model = AutoModelForCausalLM.from_pretrained("fine_tuned_tinyllama")
model.eval()  # inference only

# NOTE(review): training sequences were built as prompt + EOS + completion;
# this prompt is passed without the EOS separator - confirm which form is
# intended at inference time.
prompt = """Keywords: gravity, space, planets, attraction, falling

Write a dialogue-based story script:
"""

generated_story = generate_completion(prompt, max_new_tokens=300)
print(generated_story)



OSError: The paging file is too small for this operation to complete. (os error 1455)

# Evaluation

In [None]:
# Install required packages
!pip install transformers nltk --quiet

import math
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk import ngrams
from collections import Counter

# Load GPT-2 for Perplexity
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

# -------------------------------
# ✅ Perplexity
# -------------------------------
def compute_perplexity(text):
    """Return the perplexity of ``text`` under the notebook-level language model.

    Perplexity is ``exp(loss)``, where ``loss`` is what the global ``model``
    reports when the token ids of ``text`` are used as both inputs and labels.

    Args:
        text: The string to score.

    Returns:
        The perplexity as a float, or NaN when ``text`` tokenizes to fewer than
        two tokens (there is no next-token target to score in that case).
    """
    # truncation=True clips the input to the tokenizer's configured maximum
    # length so long texts do not exceed the model's context window.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    input_ids = inputs.input_ids

    if input_ids.shape[-1] < 2:
        return float("nan")

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
    return math.exp(loss.item())

# -------------------------------
# ✅ Distinct-n (1-gram, 2-gram)
# -------------------------------
def distinct_n(text, n):
    """Return the share of unique n-grams among all n-grams of ``text``.

    ``text`` is split on whitespace; when it yields no n-gram of size ``n``
    the result is 0.
    """
    words = text.split()
    grams = [gram for gram in ngrams(words, n)]
    if not grams:
        return 0
    unique_count = len(set(grams))
    return unique_count / len(grams)

# -------------------------------
# ✅ Keyword Coverage
# -------------------------------
def keyword_coverage(keywords, story):
    """Return the fraction of ``keywords`` that occur in ``story``.

    Matching is a case-insensitive substring test, so multi-word keywords are
    supported. An empty keyword collection gives 0.
    """
    haystack = story.lower()
    if not keywords:
        return 0
    hits = 0
    for keyword in keywords:
        if keyword.lower() in haystack:
            hits += 1
    return hits / len(keywords)

# -------------------------------
# 📊 Example Usage
# -------------------------------
generated_story = "The brave lion used a magic flute to calm the forest spirits."
keywords = ["brave", "magic flute", "forest", "spirits"]

print("\n--- Evaluation ---")

# Each entry is (label, metric function, positional arguments). The metrics are
# evaluated and printed one after another, in this order.
metrics = (
    ("Perplexity:", compute_perplexity, (generated_story,)),
    ("Distinct-1:", distinct_n, (generated_story, 1)),
    ("Distinct-2:", distinct_n, (generated_story, 2)),
    ("Keyword Coverage:", keyword_coverage, (keywords, generated_story)),
)
for label, metric_fn, metric_args in metrics:
    print(label, metric_fn(*metric_args))
