In [1]:

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, TrainerCallback
from peft import LoraConfig, TaskType, get_peft_model
import torch
import psutil
import numpy as np



2025-06-09 19:03:21.435321: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749495801.660749      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749495801.725468      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# !pip install trl

In [3]:
torch.cuda.set_per_process_memory_fraction(1.0, 0)  # Use maximum available memory
torch.cuda.memory_max_split_size_mb = 64  # Set the max split size to avoid fragmentation

In [4]:
def print_memory_footprint():
    # GPU memory usage
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.memory_allocated() / (1024 ** 3)  # Convert to GB
        gpu_memory_cached = torch.cuda.memory_reserved() / (1024 ** 3)  # Cached memory
        print(f"[GPU] Memory Allocated: {gpu_memory:.2f} GB, Cached: {gpu_memory_cached:.2f} GB")
    else:
        print("[GPU] No GPU detected.")

    # CPU memory usage
    memory = psutil.virtual_memory()
    used_memory_gb = memory.used / (1024 ** 3)  # Convert to GB
    total_memory_gb = memory.total / (1024 ** 3)
    print(f"[CPU] Memory Usage: {used_memory_gb:.2f} GB / {total_memory_gb:.2f} GB")

In [5]:
'''
print("First example of blended_skill_talk training set:")
print(dataset['train'][0])
'''

'\nprint("First example of blended_skill_talk training set:")\nprint(dataset[\'train\'][0])\n'

In [6]:
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token, so we use eos_token

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [7]:
# Load dataset and tokenizer
dataset = load_dataset("allenai/soda")



def tokenize_function(examples):
    # Concatenate dialog turns into a single string for language modeling
    texts = [" ".join(dialog) for dialog in examples["dialogue"]]
    return tokenizer(texts, truncation=True, max_length=512)

# Tokenize datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['head', 'relation', 'tail', 'literal', 'narrative', 'dialogue', 'speakers', 'PersonX', 'PersonY', 'PersonZ', 'original_index', 'split', 'head_answer', 'pmi_head_answer', 'relation_tail_answer', 'pmi_relation_tail_answer'])
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2500))
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(500))

README.md:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/689M [00:00<?, ?B/s]

valid.parquet:   0%|          | 0.00/82.9M [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/84.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1191582 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/146346 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/148968 [00:00<?, ? examples/s]

Map:   0%|          | 0/1191582 [00:00<?, ? examples/s]

Map:   0%|          | 0/146346 [00:00<?, ? examples/s]

Map:   0%|          | 0/148968 [00:00<?, ? examples/s]

In [8]:
print(dataset)
tokenized_datasets


DatasetDict({
    train: Dataset({
        features: ['head', 'relation', 'tail', 'literal', 'narrative', 'dialogue', 'speakers', 'PersonX', 'PersonY', 'PersonZ', 'original_index', 'split', 'head_answer', 'pmi_head_answer', 'relation_tail_answer', 'pmi_relation_tail_answer'],
        num_rows: 1191582
    })
    validation: Dataset({
        features: ['head', 'relation', 'tail', 'literal', 'narrative', 'dialogue', 'speakers', 'PersonX', 'PersonY', 'PersonZ', 'original_index', 'split', 'head_answer', 'pmi_head_answer', 'relation_tail_answer', 'pmi_relation_tail_answer'],
        num_rows: 146346
    })
    test: Dataset({
        features: ['head', 'relation', 'tail', 'literal', 'narrative', 'dialogue', 'speakers', 'PersonX', 'PersonY', 'PersonZ', 'original_index', 'split', 'head_answer', 'pmi_head_answer', 'relation_tail_answer', 'pmi_relation_tail_answer'],
        num_rows: 148968
    })
})


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1191582
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 146346
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 148968
    })
})

In [9]:
# LoRA configuration for causal language modeling
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.09,
    target_modules=["q_proj","v_proj","up_proj"],
)

# %%

In [10]:
# Dynamic device assignment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load pre-trained GPT-2 language model
# Load pre-trained GPT-2 model
#model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id).to(device)
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                                             torch_dtype=torch.bfloat16,).to(device)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Enable gradient checkpointing if you run into memory issues
#model.gradient_checkpointing_enable()

# %%
# Print the model's architecture to inspect the names of the modules
#print(model)

# %%
print_memory_footprint()

Using device: cuda


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

[GPU] Memory Allocated: 2.06 GB, Cached: 2.15 GB
[CPU] Memory Usage: 2.27 GB / 31.35 GB


In [11]:
# %%

# Data collator for language modeling (masks tokens for prediction)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) # mlm=False for causal LM

# Perplexity as metric
#perplexity_metric = evaluate.load("perplexity", module_type="metric")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    shift_logits = torch.tensor(logits[:, :-1, :])
    shift_labels = torch.tensor(labels[:, 1:])
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    ppl = torch.exp(loss).item()
    return {"perplexity": ppl}

# Training args
training_args = TrainingArguments(
    output_dir="./lama-dialog-finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=4,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=100,
    #evaluation_strategy="epoch",
    #save_strategy="epoch",
    report_to="none",
    remove_unused_columns=False,
    bf16=True,  # Enable bfloat16
    fp16=False,  # Disable fp16 to avoid conflicts
)
# %%
print_memory_footprint()

[GPU] Memory Allocated: 2.06 GB, Cached: 2.15 GB
[CPU] Memory Usage: 2.28 GB / 31.35 GB


In [12]:
class MemoryCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, **kwargs):
        print("\nMemory footprint after evaluation:")
        print_memory_footprint()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    #eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    #compute_metrics=compute_metrics,
    callbacks=[MemoryCallback()]
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [13]:
any(p.requires_grad for p in model.parameters())

True

In [14]:
trainer.train()

Step,Training Loss
100,0.7315
200,0.6828
300,0.6928
400,0.6429
500,0.6358
600,0.6287
700,0.6226
800,0.6199
900,0.6302
1000,0.6363


TrainOutput(global_step=1252, training_loss=0.6459459877623537, metrics={'train_runtime': 3269.2652, 'train_samples_per_second': 3.059, 'train_steps_per_second': 0.383, 'total_flos': 1.838161000562688e+16, 'train_loss': 0.6459459877623537, 'epoch': 4.0})

In [15]:
trainer.save_model('TinyLlama-new-1000')

In [16]:
print_memory_footprint()

[GPU] Memory Allocated: 2.09 GB, Cached: 10.98 GB
[CPU] Memory Usage: 2.91 GB / 31.35 GB


In [17]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smoothie = SmoothingFunction().method4
# Dynamic device assignment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# For generation, we'll load the model and use the `generate` method
model_for_generation = AutoModelForCausalLM.from_pretrained('TinyLlama-new-1000').to(device)
#model_for_generation = get_peft_model(model_for_generation, lora_config) # Ensure LoRA is applied

input_texts = [
    "So light is made up of colors?",
    "Why is the sky blue?",
    "How does gravity work?",
    "What causes rainbows?",
    "Why do we see lightning before thunder?",
    "How do plants make food?",
    "Why do we sleep?",
    "What is photosynthesis?",
    "How does the internet work?",
    "Why do birds migrate?"
]

reference_answers = [
    "Yes, light is made up of different colors. When they combine, they appear white. This can be separated with prisms, for example.",
    "The sky appears blue due to a phenomenon called Rayleigh scattering. Shorter blue wavelengths scatter more than red ones.",
    "Gravity is a force that attracts objects with mass toward each other. Earth pulls objects towards its center due to gravity.",
    "Rainbows occur when sunlight is both refracted and reflected inside raindrops, separating the light into different colors.",
    "Light travels faster than sound, so we see lightning before we hear thunder.",
    "Plants make food through photosynthesis, using sunlight, water, and carbon dioxide to produce energy and oxygen.",
    "We sleep to allow the body and brain to rest, recover, and process information from the day.",
    "Photosynthesis is the process plants use to convert sunlight into energy using chlorophyll, water, and carbon dioxide.",
    "The internet is a global network of computers that communicate via protocols like TCP/IP to exchange data.",
    "Birds migrate to find better food sources, breeding grounds, or climates that are more suitable during different seasons."
]

bleu_scores = []

for i in range(10):
    input_text = input_texts[i]
    reference = reference_answers[i]

    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    output = model_for_generation.generate(
        input_ids,
        max_length=150,
        num_return_sequences=1,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

    generated_response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    candidate = generated_response.split()
    reference_tokens = [reference.split()]

    score = sentence_bleu(reference_tokens, candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
    bleu_scores.append(score)

    print(f"\nPrompt {i+1}: {input_text}")
    print(f"Generated: {generated_response}")
    print(f"Reference: {reference}")
    print(f"BLEU-4 Score: {score:.4f}")

print("\nAverage BLEU-4 Score across all prompts: %.4f" % (sum(bleu_scores) / len(bleu_scores)))


Using device: cuda


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Prompt 1: So light is made up of colors?
Generated: So light is made up of colors? Yes, it is. Light is made up of different colors of light. So what does that mean? Well, when you look at a light source, like the sun, it's made up of different colors of light. The colors of light are called wavelengths. So when you look at the sun, it's made up of different colors of light? Yes, that's right. The colors of light are called wavelengths. So what does that mean? Well, when you look at the sun, it's made up of different colors of light. The colors of light are called wavelengths. So when you look at the sun, it'
Reference: Yes, light is made up of different colors. When they combine, they appear white. This can be separated with prisms, for example.
BLEU-4 Score: 0.0417

Prompt 2: Why is the sky blue?
Generated: Why is the sky blue? It's because of the sun. The sun is the reason why the sky is blue. But why is the sun blue? It's because of the color of the light that the sun emits. The l

# DPO

In [18]:
# import os
# os.environ["WANDB_API_KEY"] = "946e923f7717c88464dc01b43cdcb664b74b23b6"
# from trl import DPOConfig, DPOTrainer

# Sft_model = AutoModelForCausalLM.from_pretrained('TinyLlama-new-1000', torch_dtype=torch.bfloat16).to(device)
# tokenizer = AutoTokenizer.from_pretrained("TinyLlama-new-1000")
# train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
# train_dataset = train_dataset.select(range(500))


In [19]:
# Sft_model

In [20]:
# training_args = DPOConfig(
#     per_device_train_batch_size=1,  # Reduce batch size
#     remove_unused_columns=False,
#     max_length=150,  # Reduce sequence length if possible
#     max_prompt_length=256,
#     bf16 = True,  # To maintain effective batch size
#     dataloader_num_workers=0,
# )


# trainer = DPOTrainer(model=Sft_model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
# trainer.train()