In [3]:
!pip install transformers bitsandbytes accelerate datasets peft trl

[0m

In [4]:
from datasets import load_dataset

# Fetch the "Dobby091/koko" dataset from the Hugging Face Hub.
dataset_id = "Dobby091/koko"
dataset = load_dataset(dataset_id)



  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 440/440 [00:00<00:00, 295kB/s]
Downloading data: 100%|██████████| 6.88k/6.88k [00:00<00:00, 8.92kB/s]
Downloading data: 100%|██████████| 3.91k/3.91k [00:00<00:00, 6.26kB/s]
Generating train split: 100%|██████████| 72/72 [00:00<00:00, 10273.16 examples/s]
Generating test split: 100%|██████████| 18/18 [00:00<00:00, 6028.22 examples/s]


In [5]:
# Peek at the first record of the test split to see which fields each example carries.
print(dataset["test"][0])

{'pdf_filename': '228_6.pdf', 'question': 'What is the reproducibility range for chromium content at 1 to 5 percent?', 'answer': 'Reproducibility is ±0.120 percent.'}


In [6]:
from datasets import Dataset, DatasetDict

# Work on a small slice only: the first 15 train rows and the first 2 test rows.
n_train, n_test = 15, 2
train_dataset = dataset['train'].select(range(n_train))
test_dataset = dataset['test'].select(range(n_test))

# Bundle both slices under their usual split names.
selected_splits = {'train': train_dataset, 'test': test_dataset}
selected_dataset_dict = DatasetDict(selected_splits)

# Report how many rows ended up in each slice.
for split_name, split_rows in (("train", train_dataset), ("test", test_dataset)):
    print(f"Number of {split_name} samples: {len(split_rows)}")

selected_dataset_dict

Number of train samples: 15
Number of test samples: 2


DatasetDict({
    train: Dataset({
        features: ['pdf_filename', 'question', 'answer'],
        num_rows: 15
    })
    test: Dataset({
        features: ['pdf_filename', 'question', 'answer'],
        num_rows: 2
    })
})

In [7]:
!pip install llama-index
!pip install llama-index-embeddings-huggingface

[0m

In [8]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [9]:
# Global llama-index configuration used by the cells that follow.
# import any embedding model on HF hub (https://huggingface.co/spaces/mteb/leaderboard)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large") # alternative model

# No LLM is configured; the cell output reports that a MockLLM is used instead.
Settings.llm = None
# Chunking parameters. NOTE(review): presumably measured in tokens — confirm against the llama-index docs.
Settings.chunk_size = 256
Settings.chunk_overlap = 25



LLM is explicitly disabled. Using MockLLM.


In [19]:
# Read every file found in the local "pdf" directory into llama-index documents.
pdf_reader = SimpleDirectoryReader("pdf")
documents = pdf_reader.load_data()

In [20]:
# some ad hoc document refinement
# A document whose text contains any of these markers is treated as boilerplate and dropped.
boilerplate_markers = ("Member-only story", "The Data Entrepreneurs", " min read")

print(len(documents))
# Build a new list instead of calling documents.remove() inside the loop:
# removing while iterating skips the element after each removal, and a document
# matching more than one marker would be removed twice (ValueError on the second call).
documents = [
    doc for doc in documents
    if not any(marker in doc.text for marker in boilerplate_markers)
]
print(len(documents))

32
32


In [21]:
# store docs into vector DB
# Builds the index that the retriever below searches.
# NOTE(review): presumably embeds with Settings.embed_model and chunks per
# Settings.chunk_size / Settings.chunk_overlap — confirm against the llama-index docs.
index = VectorStoreIndex.from_documents(documents)


In [22]:
# How many entries the retriever is asked to return per query.
top_k = 3

# Retriever bound to the vector index, limited to the `top_k` most similar entries.
retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

In [23]:
# Post-processing step configured with a similarity cutoff of 0.5.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=0.5)

# Query engine = the retriever followed by the similarity post-processor.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[similarity_filter],
)

In [24]:
# query documents
# Single example question; `response.source_nodes` is what the next cell turns into the prompt context.
query = "When was this standard adopted?"
response = query_engine.query(query)

In [25]:
# reformat response
# Concatenate the text of every returned node, each followed by a blank line.
# Iterate over the nodes that actually came back rather than over range(top_k):
# the similarity cutoff on the query engine can leave fewer than `top_k` nodes,
# in which case indexing source_nodes[i] would raise IndexError.
context = "Context:\n" + "".join(
    node.text + "\n\n" for node in response.source_nodes
)

print(context)

Context:
IS:228(Pw6)-iW7 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 8 DETERMINATION OF CHROMIUM BY 
PERSULPHATE OXIDATION METHOD 
(FOR CHROMCUM > 0’1 PERCENT ) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 6 ) ( Third Revision ) was adopted by 
the Indian Standards Institution on 16 January 1987, after thedraft 
finalized by the Methods of Chemical Analysis of Ferrous Metals 
Sectional Committee Chad been approved by the Structural and 
Metals Division Council. 
0.2 IS : 228 which was issued -as a tentative standard in 1952 and 
revised in 1959, covered the chemical analysis of pig iron, cast iron 
and plain carbon and low alloy steels. For convenience, it was decided 
to publish a comprehensive series on chemical analysis of steels 
including high alloy steels.

IS t 
Indian Standard 228 ( Part 5 ) - 1987 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 6 DETERMINATION OF NICKEL BY 
DIMETHYLGLYOXIME ( GRAVIMETRIC ) METHOD 
( FOR NICKEL > 0’1 PE

In [26]:
def create_prompt(sample, verbose=True):
    """Render one dataset record as a single training prompt string.

    Args:
        sample: Mapping with the keys 'pdf_filename', 'question' and 'answer'.
            If it also carries a 'context' key, that text is used as the
            prompt context; otherwise the module-level `context` string is used.
        verbose: When True (the default, matching the previous behaviour) the
            finished prompt is printed between two separator lines.

    Returns:
        The prompt: "<s>" + instruction + context + question + answer + "</s>".

    NOTE(review): without a per-sample 'context' field every prompt embeds the
    same module-level `context`, which was retrieved for one example query and
    is unrelated to most questions. Adding a 'context' column to the dataset
    (retrieved per question) fixes that without touching this function.
    NOTE(review): "folwing" in the instruction text is a typo; it is kept
    byte-for-byte so prompts stay consistent with anything already using them.
    """
    # Per-sample context wins; fall back to the shared module-level string.
    sample_context = sample['context'] if 'context' in sample else context

    full_prompt = "".join([
        "<s>",
        "###Instruction:\n",
        "below context is from ",
        sample['pdf_filename'],
        ", answer the folwing questions based on the context given \n",
        "\n\n###context:\n" + sample_context,
        "\n\n###question:\n" + sample['question'],
        "\n\n###answer:\n" + sample['answer'],
        "</s>",
    ])

    if verbose:
        print("------------------")
        print(full_prompt)
        print("---------------------------------------------------------------------------------------------------------------")
    return full_prompt

In [27]:
# Sanity check: render the prompt for the first training example.
create_prompt(dataset["train"][0])


------------------
<s>###Instruction:
below context is from 228_4.pdf, answer the folwing questions based on the context given 


###context:
Context:
IS:228(Pw6)-iW7 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 8 DETERMINATION OF CHROMIUM BY 
PERSULPHATE OXIDATION METHOD 
(FOR CHROMCUM > 0’1 PERCENT ) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 6 ) ( Third Revision ) was adopted by 
the Indian Standards Institution on 16 January 1987, after thedraft 
finalized by the Methods of Chemical Analysis of Ferrous Metals 
Sectional Committee Chad been approved by the Structural and 
Metals Division Council. 
0.2 IS : 228 which was issued -as a tentative standard in 1952 and 
revised in 1959, covered the chemical analysis of pig iron, cast iron 
and plain carbon and low alloy steels. For convenience, it was decided 
to publish a comprehensive series on chemical analysis of steels 
including high alloy steels.

IS t 
Indian Standard 228 ( Part 5 ) - 198

'<s>###Instruction:\nbelow context is from 228_4.pdf, answer the folwing questions based on the context given \n\n\n###context:\nContext:\nIS:228(Pw6)-iW7 \nIndian Standard \nMETHODS FOR \nCHEMICAL ANALYSIS OF STEELS \nPART 8 DETERMINATION OF CHROMIUM BY \nPERSULPHATE OXIDATION METHOD \n(FOR CHROMCUM > 0’1 PERCENT ) \n( Third Revision ) \n0. FOREWORD \n0.1 This Indian Standard ( Part 6 ) ( Third Revision ) was adopted by \nthe Indian Standards Institution on 16 January 1987, after thedraft \nfinalized by the Methods of Chemical Analysis of Ferrous Metals \nSectional Committee Chad been approved by the Structural and \nMetals Division Council. \n0.2 IS : 228 which was issued -as a tentative standard in 1952 and \nrevised in 1959, covered the chemical analysis of pig iron, cast iron \nand plain carbon and low alloy steels. For convenience, it was decided \nto publish a comprehensive series on chemical analysis of steels \nincluding high alloy steels.\n\nIS t \nIndian Standard 228 ( Part 

In [28]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# 4-bit loading with the "nf4" quantization type and double quantization enabled;
# bfloat16 is set as the compute dtype.
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
nf4_config = BitsAndBytesConfig(**quantization_options)

In [29]:
# Load the sharded Mistral-7B-Instruct checkpoint using the NF4 quantization config.
base_model_id = "filipealmeida/Mistral-7B-Instruct-v0.1-sharded"
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=nf4_config,
    device_map="auto",
    use_cache=False,
)

Downloading shards: 100%|██████████| 8/8 [03:10<00:00, 23.81s/it]
Loading checkpoint shards: 100%|██████████| 8/8 [00:03<00:00,  2.31it/s]


In [31]:
# Tokenizer that matches the base checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained("filipealmeida/Mistral-7B-Instruct-v0.1-sharded")

# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
# The attribute is `padding_side`; the previous `padding_size` was a typo that only
# created an unused attribute, so the intended right-padding was never applied.
tokenizer.padding_side = "right"



In [30]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings: rank 64, alpha 16, 10% dropout, no bias parameters trained.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    # Must be "CAUSAL_LM"; the previous "CASUAL_LM" is not a PEFT task type,
    # so the causal-LM specific wrapper was not selected.
    task_type="CAUSAL_LM"
)

In [32]:
# Ready the quantized base model for k-bit training, then wrap it with the adapters described by `peft_config`.
# Both lines rebind `model` from its previous value, so re-running this cell processes an already wrapped model again.
# NOTE(review): `peft_config` is also handed to SFTTrainer further down — confirm the model is not wrapped twice.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [33]:
from transformers import TrainingArguments

# Micro-batches accumulated before each optimizer step.
gradient_accumulation_steps = 4  # adjust this value based on your GPU memory

# Training configuration: 100 optimizer steps, evaluation every 20 steps, logging every 10.
args = TrainingArguments(
    output_dir = "KOKO",
    max_steps = 100,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = gradient_accumulation_steps,
    # 0.03 is a fraction of the run, not a number of steps, so it belongs in
    # `warmup_ratio`; `warmup_steps` expects an integer step count.
    # NOTE(review): the 'constant' scheduler below has no warmup phase — switch to
    # 'constant_with_warmup' if warmup is actually wanted; confirm.
    warmup_ratio = 0.03,
    logging_steps = 10,
    save_strategy = "epoch",
    evaluation_strategy="steps",
    eval_steps=20,
    learning_rate=2e-4,
    lr_scheduler_type='constant',
)


In [35]:
from trl import SFTTrainer

# Maximum sequence length handed to the trainer.
max_seq_length = 256

# Supervised fine-tuning trainer: each record is rendered by `create_prompt`,
# packing is enabled, and the small train/test slices selected earlier are used.
trainer = SFTTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=selected_dataset_dict["train"],
    eval_dataset=selected_dataset_dict["test"],
    formatting_func=create_prompt,
    max_seq_length=max_seq_length,
    packing=True,
)

Generating train split: 54 examples [00:00, 3956.61 examples/s]


------------------
<s>###Instruction:
below context is from 228_4.pdf, answer the folwing questions based on the context given 


###context:
Context:
IS:228(Pw6)-iW7 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 8 DETERMINATION OF CHROMIUM BY 
PERSULPHATE OXIDATION METHOD 
(FOR CHROMCUM > 0’1 PERCENT ) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 6 ) ( Third Revision ) was adopted by 
the Indian Standards Institution on 16 January 1987, after thedraft 
finalized by the Methods of Chemical Analysis of Ferrous Metals 
Sectional Committee Chad been approved by the Structural and 
Metals Division Council. 
0.2 IS : 228 which was issued -as a tentative standard in 1952 and 
revised in 1959, covered the chemical analysis of pig iron, cast iron 
and plain carbon and low alloy steels. For convenience, it was decided 
to publish a comprehensive series on chemical analysis of steels 
including high alloy steels.

IS t 
Indian Standard 228 ( Part 5 ) - 198

Generating train split: 7 examples [00:00, 2050.57 examples/s]
max_steps is given, it will override any value given in num_train_epochs


------------------
<s>###Instruction:
below context is from 228_6.pdf, answer the folwing questions based on the context given 


###context:
Context:
IS:228(Pw6)-iW7 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 8 DETERMINATION OF CHROMIUM BY 
PERSULPHATE OXIDATION METHOD 
(FOR CHROMCUM > 0’1 PERCENT ) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 6 ) ( Third Revision ) was adopted by 
the Indian Standards Institution on 16 January 1987, after thedraft 
finalized by the Methods of Chemical Analysis of Ferrous Metals 
Sectional Committee Chad been approved by the Structural and 
Metals Division Council. 
0.2 IS : 228 which was issued -as a tentative standard in 1952 and 
revised in 1959, covered the chemical analysis of pig iron, cast iron 
and plain carbon and low alloy steels. For convenience, it was decided 
to publish a comprehensive series on chemical analysis of steels 
including high alloy steels.

IS t 
Indian Standard 228 ( Part 5 ) - 198



In [37]:
# Launch fine-tuning with the trainer configured above.
# The saved output shows this run was stopped manually (KeyboardInterrupt).
trainer.train()

KeyboardInterrupt: 