In [8]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines

#from utilities import *
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
#from llama import BasicModelRunner



logger = logging.getLogger(__name__)
global_config = None

# Load the Lamini docs dataset

In [23]:
dataset_path = "lamini/lamini_docs"
use_hf = True

# Set up the model, training config, and tokenizer

In [22]:
model_name = "EleutherAI/pythia-70m"

In [24]:
training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}

In [134]:
from datasets import load_dataset

def tokenize_and_split_data(
    tokenizer,
    dataset_name="lamini/lamini_docs",
    max_length=512,
    test_size=0.1,
    seed=42,
):
    dataset = load_dataset(dataset_name)

    def tokenize_function(batch):
        # batch["question"] and batch["answer"] are lists
        texts = [
            "Question: " + q + "\nAnswer: " + a
            for q, a in zip(batch["question"], batch["answer"])
        ]

        tokens = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        # make labels match input_ids
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized = dataset.map(tokenize_function, batched=True)

    train_dataset = tokenized["train"]
    test_dataset = tokenized["test"]
    return train_dataset, test_dataset


In [135]:
from transformers import AutoTokenizer

model_name = "EleutherAI/pythia-70m"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

train_dataset, test_dataset = tokenize_and_split_data(tokenizer)

print(train_dataset)
print(test_dataset)


Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 140
})


In [154]:
from transformers import TrainingArguments

max_steps = 200   # or comment this out and just use num_train_epochs

training_args = TrainingArguments(
    output_dir="lamini_docs_200_steps",
    learning_rate=1e-5,
    num_train_epochs=1,
    max_steps=max_steps,           # you can remove this later if you want full epochs
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    disable_tqdm=False,
    logging_strategy="steps",
    logging_steps=10,
    eval_strategy="steps",         # <-- your version uses eval_strategy
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    warmup_steps=1,
    optim="adafactor",
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


In [155]:
from transformers import AutoModelForCausalLM, Trainer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model = AutoModelForCausalLM.from_pretrained(model_name)
base_model.to(device)

trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.do_grad_scaling = False  # optional

training_output = trainer.train()   


Step,Training Loss,Validation Loss
50,0.3962,0.396929
100,0.4075,0.381215
150,0.4702,0.371974
200,0.3447,0.369243


In [156]:
save_dir = f"{training_args.output_dir}/final"
trainer.save_model(save_dir)

finetuned_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_model.to(device)


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

In [157]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(model.device)

    generated_tokens = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_output_tokens,
    )

    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)


In [158]:
test_question = test_dataset[0]["question"]
prompt = f"Question: {test_question}\nAnswer:"   # matches training format

full_output = inference(prompt, finetuned_model, tokenizer)
print("RAW model output:")
print(full_output)

# Optional: extract only the answer part
if "Answer:" in full_output:
    answer_only = full_output.split("Answer:")[-1].strip()
else:
    answer_only = full_output

print("\nExtracted answer:")
print(answer_only)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


RAW model output:
Question: Can Lamini generate technical documentation or user manuals for software projects?
Answer: Yes, Lamini can generate technical documentation or user manuals for software projects.

Extracted answer:
Yes, Lamini can generate technical documentation or user manuals for software projects.
