In [2]:
# !pip install transformers bitsandbytes accelerate datasets peft trl

In [3]:
from datasets import load_dataset

# Load the "Dobby091/koko" dataset; its records carry the fields
# `pdf_filename`, `question` and `answer` (see the sample printed further down).
dataset = load_dataset("Dobby091/koko")



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Peek at the first record of the test split to see which fields are available
print(dataset["test"][0])

{'pdf_filename': '228_6.pdf', 'question': 'What is the reproducibility range for chromium content at 1 to 5 percent?', 'answer': 'Reproducibility is ±0.120 percent.'}


In [5]:
from datasets import Dataset, DatasetDict

# Work on a small prefix of each split so the fine-tuning demo runs quickly.
# Raise these two values for a real training run.
N_TRAIN_SAMPLES = 15
N_TEST_SAMPLES = 2

# `min(...)` guards against requesting more rows than a split actually holds,
# which would otherwise make `select` fail with an out-of-range index.
train_dataset = dataset['train'].select(
    range(min(N_TRAIN_SAMPLES, len(dataset['train'])))
)
test_dataset = dataset['test'].select(
    range(min(N_TEST_SAMPLES, len(dataset['test'])))
)

# Create a new `DatasetDict` to store the selected samples
selected_dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

# Print the number of samples in each split
print(f"Number of train samples: {len(train_dataset)}")
print(f"Number of test samples: {len(test_dataset)}")

selected_dataset_dict

Number of train samples: 15
Number of test samples: 2


DatasetDict({
    train: Dataset({
        features: ['pdf_filename', 'question', 'answer'],
        num_rows: 15
    })
    test: Dataset({
        features: ['pdf_filename', 'question', 'answer'],
        num_rows: 2
    })
})

In [6]:
# !pip install llama-index
# !pip install llama-index-embeddings-huggingface

In [7]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [8]:
# Embedding model used by llama-index; any embedding model from the HF hub
# can be substituted (https://huggingface.co/spaces/mteb/leaderboard)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large") # alternative model

# No LLM is configured: this cell's output reports that a MockLLM is used instead.
Settings.llm = None
# Chunking parameters applied when documents are split for indexing.
# NOTE(review): the unit is presumably tokens -- confirm against the llama-index docs.
Settings.chunk_size = 256
Settings.chunk_overlap = 25



LLM is explicitly disabled. Using MockLLM.


In [9]:
# Read every file found in the local "pdf" directory (relative to the notebook's
# working directory) into a list of llama-index documents.
# TODO: record where the source PDFs can be obtained.
documents = SimpleDirectoryReader("pdf").load_data()

In [10]:
# some ad hoc document refinement: drop every document whose text contains
# one of these boilerplate markers
EXCLUDED_MARKERS = ("Member-only story", "The Data Entrepreneurs", " min read")

print(len(documents))

# Build a new list instead of calling `documents.remove(...)` while iterating:
# removing during iteration makes the loop skip the element that follows each
# removal, and a document matching two markers would be removed twice
# (raising ValueError on the second attempt).
documents = [
    doc
    for doc in documents
    if not any(marker in doc.text for marker in EXCLUDED_MARKERS)
]

print(len(documents))

32
32


In [16]:
# store docs into vector DB
index = VectorStoreIndex.from_documents(documents)
# set number of docs to retrieve
top_k = 3

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=top_k,
)
# assemble query engine; nodes scoring below the similarity cutoff are dropped
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
)
# query documents
query = "When was this standard adopted?"
response = query_engine.query(query)

# reformat response
# The similarity cutoff can leave fewer than `top_k` nodes, so iterate over the
# nodes that were actually returned instead of indexing 0..top_k-1 (which would
# raise IndexError whenever a node has been filtered out).
context = "Context:\n"
for source_node in response.source_nodes[:top_k]:
    context = context + source_node.text + "\n\n"

print(context)

Context:
IS t 
Indian Standard 228 ( Part 5 ) - 1987 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 6 DETERMINATION OF NICKEL BY 
DIMETHYLGLYOXIME ( GRAVIMETRIC ) METHOD 
( FOR NICKEL > 0’1 PERCENT) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 5 ) ( Third Revision ) was adopted by 
the Indian Standards Institution on 16 January 1987, after the draft 
finalized by the Methods of Chemical Analysis of Ferrous Metals 
Sectional Committee had been approved by the Strtictural and Metals 
Division Council. 
0.2 IS : 228, which was issued as a tentative standard in 1952 and 
revised in 1959, covered the chemial analysis of pig iron, cast iron and 
plain carbon and low alloy steels. For the convenience it was decided 
to publish a comprehensive series on chemical analysis of steels includ- 
ing high alloy steels.

IS : ‘228 ( Part 4 ) - 1987 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 4 DETERYlNATlON OF TOTAL CARBON 
BY GARVIMETRIC METHOD 
(FOR CARBON 

In [17]:
def create_prompt(sample):
    """Build the full instruction-tuning prompt string for one QA sample.

    Args:
        sample: Mapping with the keys ``pdf_filename``, ``question`` and
            ``answer``. If it also has a ``context`` key, that text is used as
            the context section; otherwise the notebook-level ``context``
            variable (built by the retrieval cell) is used.

    Returns:
        The prompt as a single string, wrapped in ``<s>`` / ``</s>`` markers.

    Side effects:
        Prints the generated prompt between separator lines.
    """
    bos_token = "<s>"
    eos_token = "</s>"

    # Prefer a per-sample context when the dataset provides one; fall back to
    # the shared notebook-level `context` so existing datasets keep working.
    context_text = sample["context"] if "context" in sample else context

    full_prompt = "".join([
        bos_token,
        "###Instruction:\n",
        "below context is from ",
        sample['pdf_filename'],
        ", answer the following questions based on the context given \n",
        "\n\n###context:\n" + context_text,
        "\n\n###question:\n" + sample['question'],
        "\n\n###answer:\n" + sample['answer'],
        eos_token,
    ])

    print("------------------")
    print(full_prompt)
    print("---------------------------------------------------------------------------------------------------------------")
    return full_prompt

In [18]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# bitsandbytes quantisation settings; passed as `quantization_config` when the
# base model is loaded further down.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the weights in 4-bit form
    bnb_4bit_quant_type="nf4",              # use the "nf4" 4-bit quantisation type
    bnb_4bit_use_double_quant=True,         # enable double quantisation
    bnb_4bit_compute_dtype=torch.bfloat16   # compute dtype is bfloat16
)

In [20]:
# Checkpoint identifier shared by the model and the tokenizer
model_path = "filipealmeida/Mistral-7B-Instruct-v0.1-sharded"

# Load the causal LM with the 4-bit quantisation settings defined above.
# NOTE(review): `use_cache=False` presumably turns off the generation
# key/value cache for training -- confirm; it also stays in effect for the
# `generate` call at the end of the notebook unless overridden there.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=nf4_config,
    use_cache=False
)

Loading checkpoint shards: 100%|██████████| 8/8 [00:03<00:00,  2.33it/s]


In [21]:
# Tokenizer matching the base model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right. The attribute is `padding_side`; the previous spelling
# `padding_size` only created an unused attribute and changed nothing.
tokenizer.padding_side = "right"



In [22]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter hyper-parameters used for parameter-efficient fine-tuning
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    # Must be spelled "CAUSAL_LM"; the previous value "CASUAL_LM" is not a
    # PEFT task type, so the causal-LM specific wrapper was never selected.
    task_type="CAUSAL_LM"
)

In [23]:
# Prepare the quantised model for k-bit training, then wrap it with the LoRA
# adapters described by `peft_config`.
# NOTE(review): both lines rebind `model`, so re-running this cell presumably
# wraps an already-wrapped model again -- reload the base model before re-running.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [24]:
from transformers import TrainingArguments

# args = TrainingArguments(
#     output_dir = "KOKO",
#     max_steps = 100,
#     per_device_train_batch_size = 4,
#     warmup_steps = 0.03,
#     logging_steps = 10,
#     save_strategy = "epoch",
#     evaluation_strategy="steps",
#     eval_steps=20,
#     learning_rate=2e-4,
#     lr_scheduler_type='constant',
# )

# Number of micro-batches accumulated per optimizer step.
# Adjust this value based on your GPU memory.
gradient_accumulation_steps = 4

args = TrainingArguments(
    output_dir = "KOKO",
    max_steps = 100,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = gradient_accumulation_steps,
    # 0.03 is a fraction of the run, so it belongs in `warmup_ratio`;
    # `warmup_steps` expects a step count.
    # NOTE(review): the 'constant' schedule below presumably applies no warm-up,
    # so this only matters if the scheduler type is changed -- confirm.
    warmup_ratio = 0.03,
    logging_steps = 10,
    save_strategy = "epoch",
    evaluation_strategy="steps",
    eval_steps=20,
    learning_rate=2e-4,
    lr_scheduler_type='constant',
)


In [25]:
from trl import SFTTrainer

# Maximum sequence length handed to the trainer
max_seq_length = 256

# Supervised fine-tuning trainer: each sample is turned into a prompt string by
# `create_prompt`, and packing is enabled.
# NOTE(review): `model` was already wrapped with `get_peft_model` in an earlier
# cell, so passing `peft_config` again may be redundant -- confirm against the
# installed trl version.
# NOTE(review): the prompts printed below are much longer than 256 characters;
# check whether `max_seq_length` is large enough to keep question and answer
# in the same training sequence.
trainer = SFTTrainer(
    model=model,
    peft_config = peft_config,
    max_seq_length=max_seq_length,
    tokenizer = tokenizer,
    formatting_func=create_prompt,
    packing = True,
    args = args,
    train_dataset = selected_dataset_dict["train"],
    eval_dataset= selected_dataset_dict["test"]
)

Generating train split: 51 examples [00:00, 2017.67 examples/s]


------------------
<s>###Instruction:
below context is from 228_4.pdf, answer the folwing questions based on the context given 


###context:
Context:
IS t 
Indian Standard 228 ( Part 5 ) - 1987 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 6 DETERMINATION OF NICKEL BY 
DIMETHYLGLYOXIME ( GRAVIMETRIC ) METHOD 
( FOR NICKEL > 0’1 PERCENT) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 5 ) ( Third Revision ) was adopted by 
the Indian Standards Institution on 16 January 1987, after the draft 
finalized by the Methods of Chemical Analysis of Ferrous Metals 
Sectional Committee had been approved by the Strtictural and Metals 
Division Council. 
0.2 IS : 228, which was issued as a tentative standard in 1952 and 
revised in 1959, covered the chemial analysis of pig iron, cast iron and 
plain carbon and low alloy steels. For the convenience it was decided 
to publish a comprehensive series on chemical analysis of steels includ- 
ing high alloy steels.

IS : ‘228 ( Part 4 

Generating train split: 6 examples [00:00, 831.60 examples/s]
max_steps is given, it will override any value given in num_train_epochs


------------------
<s>###Instruction:
below context is from 228_6.pdf, answer the folwing questions based on the context given 


###context:
Context:
IS t 
Indian Standard 228 ( Part 5 ) - 1987 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 6 DETERMINATION OF NICKEL BY 
DIMETHYLGLYOXIME ( GRAVIMETRIC ) METHOD 
( FOR NICKEL > 0’1 PERCENT) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 5 ) ( Third Revision ) was adopted by 
the Indian Standards Institution on 16 January 1987, after the draft 
finalized by the Methods of Chemical Analysis of Ferrous Metals 
Sectional Committee had been approved by the Strtictural and Metals 
Division Council. 
0.2 IS : 228, which was issued as a tentative standard in 1952 and 
revised in 1959, covered the chemial analysis of pig iron, cast iron and 
plain carbon and low alloy steels. For the convenience it was decided 
to publish a comprehensive series on chemical analysis of steels includ- 
ing high alloy steels.

IS : ‘228 ( Part 4 



In [26]:
# Start fine-tuning (the run recorded below ended with a KeyboardInterrupt)
trainer.train()



Step,Training Loss,Validation Loss
20,0.9304,No log




KeyboardInterrupt: 

In [None]:
model.eval()

# `prompt` was never defined in this notebook. Build it from a held-out test
# question by reusing `create_prompt` with an empty answer, so the layout
# matches the training prompts; then drop the trailing end-of-sequence marker
# so the model has to produce the answer itself.
eval_sample = selected_dataset_dict["test"][0]
prompt = create_prompt({**eval_sample, "answer": ""})
if prompt.endswith("</s>"):
    prompt = prompt[: -len("</s>")]

# Move the whole encoding (input ids and attention mask) to the device the
# model lives on instead of hard-coding "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# No gradients are needed for generation
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=280)

print(tokenizer.batch_decode(outputs)[0])