In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub without embedding a secret in the
# notebook (notebooks get shared and committed, and so would the token).
# The token is taken from the HF_TOKEN environment variable; when that is
# unset, `token=None` makes `login` ask for it interactively instead.
login(token=os.environ.get("HF_TOKEN"))

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os

# Backend-selection environment variables have to be in place BEFORE
# `transformers` is imported; setting them afterwards (as this cell used to do)
# has no effect on the already-imported library.
# NOTE(review): TRANSFORMERS_NO_TF is kept from the original, but it does not
# appear to be a variable transformers itself reads — USE_TF=0 is the switch
# its import utilities check. Confirm against the installed version.
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ.setdefault("USE_TF", "0")

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset

# ---------------------------
# Load model & tokenizer
# ---------------------------
model_name = "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set pad token (phi doesn't have one by default). Re-using EOS adds no new
# entry to the vocabulary.
tokenizer.pad_token = tokenizer.eos_token

# Only grow the embedding matrix when the tokenizer really has more entries
# than the model. An unconditional resize to len(tokenizer) would *shrink* the
# matrix for checkpoints whose embedding table is larger than the tokenizer
# vocabulary, changing the saved model's shape for no benefit.
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))

# ---------------------------
# Load dataset
# ---------------------------
# The JSONL location can be overridden with the FINETUNE_DATA environment
# variable so the notebook also runs on other machines; the default is the
# original local path.
data_file = os.environ.get(
    "FINETUNE_DATA",
    r"C:\Users\cl502_09\Desktop\NLP PROJ UZMA\StoryGeneration\finetune_data.jsonl",
)
dataset = load_dataset("json", data_files=data_file)
train_data = dataset["train"]

# ---------------------------
# Format function
# ---------------------------
def format(example, max_length=512):
    """Turn one prompt/completion record into fixed-length causal-LM features.

    The training text is ``prompt + EOS + completion + EOS``. The first EOS
    separates prompt from completion; the trailing EOS is the stop signal the
    model should learn to emit after the completion.

    Labels are a copy of ``input_ids`` in which every position that must not
    contribute to the loss is replaced by ``-100``:

    * the prompt tokens and the separator EOS, and
    * every padding position (the pad token is the EOS token here, so leaving
      them unmasked would train the model on long runs of padding).

    Args:
        example: mapping with string fields ``"prompt"`` and ``"completion"``.
        max_length: length every sequence is truncated/padded to.

    Returns:
        The tokenizer output for the full text with an added ``"labels"`` list
        that has exactly the same length as ``"input_ids"``.

    Note:
        Relies on the module-level ``tokenizer``. The name shadows the builtin
        ``format``; it is kept because the ``map`` call below refers to it.
    """
    eos = tokenizer.eos_token
    prompt_text = example["prompt"] + eos
    full_text = prompt_text + example["completion"] + eos

    tokenized = tokenizer(
        full_text, truncation=True, padding="max_length", max_length=max_length
    )

    # Number of leading real tokens that belong to the prompt (incl. separator).
    prompt_len = len(tokenizer(prompt_text)["input_ids"])

    labels = []
    real_seen = 0  # count of non-padding tokens visited so far
    for token_id, is_real in zip(tokenized["input_ids"], tokenized["attention_mask"]):
        if not is_real:
            # Padding: never part of the loss, whichever side it is on.
            labels.append(-100)
            continue
        # Walking the real tokens one by one also caps the prompt mask at the
        # sequence length, so an over-long prompt can no longer make `labels`
        # longer than `input_ids`.
        labels.append(token_id if real_seen >= prompt_len else -100)
        real_seen += 1

    tokenized["labels"] = labels
    return tokenized

# Tokenize every record and drop the raw text columns so that only the
# tokenizer outputs and the labels remain.
formatted_data = train_data.map(
    function=format,
    remove_columns=train_data.column_names,
)

# ---------------------------
# Dataset class
# ---------------------------
class PromptCompletionDataset(Dataset):
    """Map-style dataset that serves pre-tokenized records as ``torch.long`` tensors.

    ``data`` is any indexable collection whose items are mappings from feature
    name to a sequence of integers; it is stored as-is on ``self.data``.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Number of records in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return record ``idx`` with every field converted to an int64 tensor."""
        record = self.data[idx]
        tensors = {}
        for name, values in record.items():
            tensors[name] = torch.tensor(values, dtype=torch.long)
        return tensors

train_dataset = PromptCompletionDataset(formatted_data)

# ---------------------------
# Training setup
# ---------------------------
# The run recorded in this notebook stopped with "CUDA out of memory" on a
# 6 GiB GPU, so the configuration trades compute for memory:
#   * gradient_checkpointing recomputes activations during the backward pass
#     instead of keeping them all in memory;
#   * optim="adafactor" avoids the two full-size moment buffers per parameter
#     that the default AdamW optimizer allocates.
# NOTE(review): full fine-tuning of a model this size may still not fit in
# 6 GiB even with these settings. If it does not, a parameter-efficient method
# (e.g. LoRA) or a larger GPU is needed — that would add a dependency and is
# deliberately not done here.
training_args = TrainingArguments(
    output_dir="./phi_output",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 1 x 8 = 8 sequences per optimizer step
    num_train_epochs=15,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,  # keep only the two most recent checkpoints
    fp16=True,
    learning_rate=1e-5,
    gradient_checkpointing=True,
    optim="adafactor",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)




In [None]:
# ---------------------------
# Train
# ---------------------------
# Runs the fine-tuning loop configured on `trainer`.
trainer.train()

# ---------------------------
# Save final model
# ---------------------------
# Model weights and tokenizer files go to the same directory (relative to the
# notebook's working directory) so it can be reloaded with `from_pretrained`.
for artifact in (model, tokenizer):
    artifact.save_pretrained("fine_tuned_phi")

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: uzmapatil1110 (uzmapatil1110-symbiosis-institute-of-technology) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 12.62 GiB is allocated by PyTorch, and 11.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory holding the fine-tuned checkpoint (adjust if yours lives elsewhere).
model_path = r"D:\NLP Project\StoryGeneration3.0\fine_tuned_phi"

# Model and tokenizer are read from the same directory.
model, tokenizer = (
    AutoModelForCausalLM.from_pretrained(model_path),
    AutoTokenizer.from_pretrained(model_path),
)

# phi ships without a pad token, so reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Take keywords from user input
keywords = "Friendly Ghost, Scared girl, Graveyard, Grandma, Friends"

# Format the prompt
prompt = f"Keywords: {keywords}\n\nWrite a dialogue-based story script:\n"

# Tokenize the input.
# The training examples in this notebook are built as
# `prompt + EOS + completion`, so the same EOS separator is appended here;
# without it the model is asked to continue a context it was not tuned on.
# NOTE(review): assumes the checkpoint loaded above was produced with that
# training format — confirm if it came from a different script.
inputs = tokenizer(prompt + tokenizer.eos_token, return_tensors="pt").to(model.device)

# Generate output (nucleus sampling; results differ between runs because no
# random seed is set).
output = model.generate(
    **inputs,
    max_new_tokens=500,
    temperature=0.8,
    top_p=0.95,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

# Decode and print result. The decoded text contains the prompt followed by
# the generated continuation; special tokens (including the separator) are
# dropped.
story = tokenizer.decode(output[0], skip_special_tokens=True)
print("\nGenerated Story:\n")
print(story)