# Lamini Docs Fine-Tuning
End-to-end example using a small FLAN-T5 model.

In [None]:
from datasets import load_dataset
# Load the 'lamini/lamini_docs' dataset by name
# (presumably resolved from the Hugging Face Hub — confirm).
lamini = load_dataset('lamini/lamini_docs')
# Bare expression: shows the loaded object's repr when it is the last line of a notebook cell.
lamini

In [None]:
import jsonlines
# Write a one-record sample in the question/answer schema so the rest of the
# notebook can run against a local JSONL file.
from pathlib import Path

rows = [
    {
        'question': '### Question:\nWhat is AI?\n\n### Answer:',
        'answer': 'Artificial intelligence is the field of building systems that can perform tasks requiring human intelligence.',
    },
]

# Single source of truth for the output location (used for both the write and
# the confirmation message).
sample_path = '../data/sample_lamini_docs_processed.jsonl'

# Opening a file for writing does not create missing parent directories, so a
# fresh checkout without ../data would fail with FileNotFoundError.
Path(sample_path).parent.mkdir(parents=True, exist_ok=True)

with jsonlines.open(sample_path, 'w') as w:
    w.write_all(rows)
print(f'Wrote {sample_path}')

In [None]:
from datasets import load_dataset
# Load the local JSONL sample through the generic 'json' builder.
# Later code indexes the result with ['train'], i.e. it is treated as a
# mapping of split name -> dataset.
train_ds = load_dataset('json', data_files='../data/sample_lamini_docs_processed.jsonl')
# Bare expression: shows the loaded object's repr when it is the last line of a notebook cell.
train_ds

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Checkpoint identifier shared by the tokenizer and the model so the two stay in sync.
model_name = 'google/flan-t5-small'
# Tokenizer matching the checkpoint; used below to encode questions and answers.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-to-sequence model loaded from the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
def preprocess(batch):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Reads ``batch['question']`` and ``batch['answer']`` and encodes them with
    the module-level ``tokenizer``:

    * questions are encoded as the model inputs (``truncation=True``,
      ``max_length=512``);
    * answers are encoded via ``text_target`` (``truncation=True``,
      ``max_length=128``) and their ``input_ids`` become the ``'labels'``.

    Returns the question encoding with the ``'labels'`` entry added.
    """
    encoded = tokenizer(batch['question'], max_length=512, truncation=True)
    encoded['labels'] = tokenizer(
        text_target=batch['answer'],
        max_length=128,
        truncation=True,
    )['input_ids']
    return encoded
# Tokenize the training split; the original columns are dropped so only the
# fields produced by `preprocess` remain.
tokenized = train_ds['train'].map(
    preprocess,
    batched=True,
    remove_columns=train_ds['train'].column_names,
)

# Collator that batches the tokenized examples for the seq2seq model.
collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

# Minimal training configuration: one epoch, batch size 8 per device,
# no experiment-tracker reporting.
args = Seq2SeqTrainingArguments(
    output_dir='../outputs/flan-t5-small',
    per_device_train_batch_size=8,
    num_train_epochs=1,
    report_to='none',
)

# NOTE(review): newer transformers releases deprecate the `tokenizer=` argument
# on trainers in favor of `processing_class=` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    data_collator=collator,
)

# Training is intentionally not started here.
print('Trainer ready (run trainer.train() to start)')