In [2]:
pip install txtinstruct langchain > /dev/null

In [3]:
from datasets import load_dataset

from txtinstruct.models import StatementGenerator

# Training split of the SQuAD dataset
dataset = load_dataset("squad", split="train")

# Create the statement-generator trainer; the training run itself is started
# when this object is called.
generator = StatementGenerator()



In [None]:
# Hyperparameters for the statement-generation fine-tune.
# 16 samples per device step x (128 // 16) accumulation steps.
# Note that the model is only trained for a fraction of an epoch for
# expediency. Under normal circumstances, num_train_epochs would be at least 3.
statement_training_args = dict(
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=128 // 16,
    num_train_epochs=0.01,
    logging_steps=100,
)

model, tokenizer = generator(
    "google/flan-t5-small",
    dataset,
    "sequence-sequence",
    **statement_training_args,
)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
from txtai.pipeline import Sequences

# Wrap the (model, tokenizer) pair returned by the generator in a Sequences pipeline
statements = Sequences((model, tokenizer))

# Try the pipeline on a hand-written context; the cell displays the result
example_prompt = """Generate a question using the context below.
### Context:
Hugging face is an open-source platform for hosting 
all kind of AI language models."""
statements(example_prompt)

In [None]:
from txtai.embeddings import Embeddings
from txtinstruct.data import DatasetBuilder

In [None]:
# Query templates: every entry carries a "{text}" placeholder
# (presumably filled in by DatasetBuilder; TODO confirm).
templates = [
    "Tell me about {text}",
    "Give an explanation on {text}",
    "Provide a quick summary on {text}",
    "Explain {text} in simple terms",
    "Describe {text}",
]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with a chunk size of 100 and an overlap of 20, both
# measured with len(). The chunk size is deliberately tiny, just to show.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

In [None]:
!wget https://www.gutenberg.org/cache/epub/1/pg1.txt

In [1]:
# Read the downloaded text with an explicit encoding instead of relying on the
# platform default. "utf-8-sig" decodes UTF-8 and also drops a leading
# byte-order mark if the file starts with one.
with open("pg1.txt", "r", encoding="utf-8-sig") as source_file:
    data = source_file.read()

SyntaxError: ignored

In [None]:
# Split the raw text into chunks, then wrap each chunk as a {'text': ...} record
txt_split = text_splitter.split_text(data)
txt_dict = [{'text': chunk} for chunk in txt_split]

In [None]:
# Build dataset
# DatasetBuilder receives a google/flan-t5-base Sequences pipeline, the
# `statements` pipeline and the query templates, in that order.
# NOTE(review): the upstream txtinstruct example appears to call the builder
# with (rows, size, path); confirm that omitting a size argument is valid here.
base_sequences = Sequences("google/flan-t5-base")
builder = DatasetBuilder(base_sequences, statements, templates)
builder(txt_dict, "data.json")

In [None]:
import json

from txtinstruct.models import Instructor

# Parse the generated dataset file back into Python objects.
# `data` is rebound here to the parsed JSON content.
with open("data.json", encoding="utf-8") as dataset_file:
    data = json.loads(dataset_file.read())

In [None]:
# Display the first record of the generated dataset
data[0]

{'context': "Machine learning (ML) is a field of inquiry devoted to understanding and building methods that 'learn', that is, methods that leverage data to improve performance on some set of tasks. It is seen as a part of artificial intelligence. Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, agriculture, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks. \nA subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers, but not all machine learning is statistical learning. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data 

In [None]:
# Instruction-tune model
# 8 samples per device step x (128 // 8) accumulation steps, for 3 epochs.
instruction_training_args = dict(
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=128 // 8,
    num_train_epochs=3,
    logging_steps=100,
)

instructor = Instructor()

# Rebinds `model` and `tokenizer` to the instruction-tuned pair
model, tokenizer = instructor(
    "google/flan-t5-small",
    data,
    "sequence-sequence",
    **instruction_training_args,
)

Downloading and preparing dataset generator/default to /root/.cache/huggingface/datasets/generator/default-96d6fbc7a3e2b7c7/0.0.0...


Generating train split: 0 examples [00:00, ? examples/s]

Dataset generator downloaded and prepared to /root/.cache/huggingface/datasets/generator/default-96d6fbc7a3e2b7c7/0.0.0. Subsequent calls will reuse this data.


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


In [None]:
from txtai.pipeline import Extractor

def prompt(text):
    """Build the question-answering prompt for a single question.

    Args:
        text: the question, interpolated after "Question: ".

    Returns:
        The instruction text followed by the question line and a trailing
        "Context: " label with nothing after it.
    """
    pieces = (
        "Answer the following question using only the context below. Give a detailed answer. ",
        "Say 'I don't have data on that' when the question can't be answered.\n",
        f"Question: {text}\n",
        "Context: ",
    )
    return "".join(pieces)


In [None]:
# NOTE(review): `embeddings` was never defined in this notebook, so this cell
# raised NameError on a fresh kernel. The recorded answers read like Wikipedia
# abstracts, so the prebuilt txtai Wikipedia index is assumed here. Confirm
# the intended index. The guard keeps an already-defined `embeddings` untouched.
if "embeddings" not in globals():
    embeddings = Embeddings()
    embeddings.load(provider="huggingface-hub", container="neuml/txtai-wikipedia")

# Baseline: google/flan-t5-small loaded by name, i.e. not the model tuned in
# this notebook
extractor = Extractor(
    embeddings,
    Sequences("google/flan-t5-small")
)

extractor([{
    "query": "Tell me about Linux",
    "question": prompt("Tell me about Linux")
}])

[{'answer': 'Linux'}]

In [None]:
# Same extractor setup, but backed by the (model, tokenizer) pair trained in
# this notebook. Rebinds `extractor`.
tuned_sequences = Sequences((model, tokenizer))
extractor = Extractor(embeddings, tuned_sequences)

linux_question = "Tell me about Linux"
extractor([
    {"query": linux_question, "question": prompt(linux_question)},
])

[{'answer': 'Linux (or ) is a family of open-source Unix-like operating systems based on the Linux kernel, an operating system kernel first released on September 17, 1991, by Linus Torvalds. Linux is typically packaged as a Linux distribution, which includes the kernel and supporting system software and libraries, many of which are provided by the GNU Project.'}]

In [None]:
# The same text is used as the search query and as the question in the prompt
adversarial_question = "Tell me about adversarial Machine Learning"
extractor([
    {"query": adversarial_question, "question": prompt(adversarial_question)},
])

[{'answer': 'Adversarial machine learning is the study of the attacks on machine learning algorithms, and of the defenses against such attacks'}]

In [None]:
# Use one string for both the search query and the prompt question. The
# original cell spelled them differently ("model" vs "Model"), unlike the
# other extractor cells, which pass the identical text to both.
llm_question = "What is a Large Language Model"
extractor([{
    "query": llm_question,
    "question": prompt(llm_question)
}])

[{'answer': "I don't have data on that"}]

In [None]:
!git clone https://github.com/lamini-ai/lamini.git

Cloning into 'lamini'...
remote: Enumerating objects: 193, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 193 (delta 44), reused 47 (delta 26), pack-reused 117[K
Receiving objects: 100% (193/193), 27.82 MiB | 16.32 MiB/s, done.
Resolving deltas: 100% (110/110), done.


In [None]:
!pip install llama-llm jsonlines > /dev/null

In [None]:
!python /content/lamini/generate_data.py