# Finetuing LLM model to generate children stories

#### Importing Libraries

In [1]:
from datasets import Dataset
from datetime import datetime
import pandas as pd
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training
)
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

  from .autonotebook import tqdm as notebook_tqdm


#### Check if CUDA is enabled

In [2]:
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    DEVICE = torch.device("cpu")
    print("No GPU detected, using CPU")

CUDA available: True
GPU detected: NVIDIA GeForce RTX 4080 SUPER
GPU memory: 16.69 GB


In [3]:
# Args
CSV_PATH = 'data/train.csv'
MODEL_DIR = 'models/'
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
STORIES_SAMPLE_SIZE = 100
TRAINING_ARGS_OUTPUT_DIR = "./training-args/mistral-stories-generator"


#### Loading Training dataset into pandas dataframe

In [4]:
# loading dataset to pandas dataframe
df = pd.read_csv(CSV_PATH)
print(f"Loaded {len(df)} stories")
df.columns = ['story']
print(df.head())


Loaded 2119719 stories
                                               story
0  One day, a little girl named Lily found a need...
1  Once upon a time, there was a little car named...
2  One day, a little fish named Fin was swimming ...
3  Once upon a time, in a land full of trees, the...
4  Once upon a time, there was a little girl name...


#### Preprocessing the dataset for training on Hugging Face model

In [5]:
# Creating samples
stories_sample = df.sample(n=STORIES_SAMPLE_SIZE, random_state=42)

In [6]:
# Classifying stories to genre and plots using rule based solution
genres = ["fantasy", "adventure", "fairy tale", "bedtime", "moral", "magical"]
plot_elements = [
    "forest, magic", "castle, dragon", "river, talking animals",
    "space, stars", "garden, fairies", "ocean, mermaids"
]

# Create training pairs
pairs = []
for i, story in enumerate(stories_sample['story']):
    genre = genres[i % len(genres)]
    plot = plot_elements[i % len(plot_elements)]

    pairs.append({
        "prompt": f"Write a bedtime story in the genre of {genre} with these elements: {plot}\\n\\n",
        "completion": story
    })

dataset_df = pd.DataFrame(pairs)

In [7]:
def format_for_training(examples):
    texts = []
    for prompt, completion in zip(examples['prompt'], examples['completion']):
        # Format: "<s>[INST] {prompt} [/INST] {completion} </s>"
        # This follows Mistral's instruction fine-tuning format
        text = f"<s>[INST] {prompt} [/INST] {completion} </s>"
        texts.append(text)
    return {'text': texts}

In [8]:
# Convert to HF Dataset
dataset = Dataset.from_pandas(dataset_df)
processed_dataset = dataset.map(
    format_for_training,
    batched=True,
    remove_columns=['prompt', 'completion']
)


Map: 100%|██████████| 100/100 [00:00<00:00, 44715.39 examples/s]


In [9]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [10]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=2048  # Adjust based on your stories' length
    )

In [11]:
tokenized_dataset = processed_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

Map: 100%|██████████| 100/100 [00:00<00:00, 2636.57 examples/s]


In [12]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

In [13]:
from transformers import BitsAndBytesConfig

In [14]:
# Create quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

In [15]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config
)


Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.92s/it]


In [16]:
# Configure LoRA
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

In [17]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()

trainable params: 13,631,488 || all params: 7,261,655,040 || trainable%: 0.1877


In [18]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [19]:
training_args = TrainingArguments(
    output_dir=TRAINING_ARGS_OUTPUT_DIR,
    eval_steps=100,
    eval_strategy='steps',
    save_strategy="steps",
    save_steps=100,
    learning_rate=2e-4,
    per_device_train_batch_size=1,  # Adjust based on GPU memory
    gradient_accumulation_steps=4,  # Increase this to simulate larger batch sizes
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=100,
    logging_steps=10,
    fp16=True,  # Use mixed precision
    load_best_model_at_end=True,
    report_to="none",  # Set to "wandb" if you want to use Weights & Biases
    remove_unused_columns=False
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [21]:
print("\nStarting training...")
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.



Starting training...


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


TrainOutput(global_step=66, training_loss=1.6191459785808215, metrics={'train_runtime': 524.8609, 'train_samples_per_second': 0.514, 'train_steps_per_second': 0.126, 'total_flos': 2.277130691936256e+16, 'train_loss': 1.6191459785808215, 'epoch': 2.888888888888889})

In [22]:
now = datetime.now()
formatted_timestamp = now.strftime("%Y-%m-%dT%H:%M:%S")

In [23]:
trainer.save_model(MODEL_DIR + f"mistral-stories-{STORIES_SAMPLE_SIZE}stories-{formatted_timestamp}")
print("\nModel saved")


Model saved


In [24]:
def generate_story(genre, plot, max_length=1000):
    # Prepare the prompt
    prompt = f"<s>[INST] Generate a children's story in the genre of {genre} with the following plot: {plot} [/INST]"

    # Encode the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    # Decode and return
    story = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Remove the prompt part from the output
    story = story.replace(prompt, "").strip()
    return story

In [25]:
test_genre = "adventure"
test_plot = "A young explorer discovers a magical map leading to a hidden island"
print(f"\nGenerating a test story in the genre '{test_genre}' with the plot: '{test_plot}'")
story = generate_story(test_genre, test_plot)
print("\nGenerated Story:")
print(story)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Generating a test story in the genre 'adventure' with the plot: 'A young explorer discovers a magical map leading to a hidden island'


  return fn(*args, **kwargs)



Generated Story:
Generate a children's story in the genre of adventure with the following plot: A young explorer discovers a magical map leading to a hidden island  Once upon a time, there was a little boy named Timmy. Timmy was three years old and loved to explore. One day, he was exploring the backyard when he found something shiny. It was a silver key! Timmy was so excited. He ran inside to tell his mom about the key. 

"Mommy, look what I found!" Timmy said.

"Oh, a shiny key! What does it do?" Timmy's mom asked.

"I don't know, but I want to find out!" Timmy said.

Timmy's mom smiled and said, "Let's see if we can find something it fits."

They looked around the house and found a small wooden box. The key fit perfectly into the box. Timmy was so proud of himself. He had found something special. 

From that day on, Timmy loved to explore and find new things. He always carried the silver key with him, just in case he found something special. 

The End. 

Timmy and his mom were so e