In [1]:
from datasets import load_dataset

# Load the instruct_tune_dataset dataset
dataset = load_dataset("Dobby091/koko")



In [2]:
# print(dataset["test"][0])
dataset

DatasetDict({
    train: Dataset({
        features: ['pdf_filename', 'context', 'question', 'answer'],
        num_rows: 884
    })
    test: Dataset({
        features: ['pdf_filename', 'context', 'question', 'answer'],
        num_rows: 222
    })
})

In [3]:
from datasets import Dataset, DatasetDict

# Select the first 1000 samples in the train set and the first 200 samples in the test set
train_dataset = dataset['train'].select(range(150))
test_dataset = dataset['test'].select(range(2))

# Create a new `DatasetDict` to store the selected samples
selected_dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

# Print the number of samples in each split
print(f"Number of train samples: {len(train_dataset)}")
print(f"Number of test samples: {len(test_dataset)}")

selected_dataset_dict

Number of train samples: 150
Number of test samples: 2


DatasetDict({
    train: Dataset({
        features: ['pdf_filename', 'context', 'question', 'answer'],
        num_rows: 150
    })
    test: Dataset({
        features: ['pdf_filename', 'context', 'question', 'answer'],
        num_rows: 2
    })
})

In [None]:
# for now lets keep selected dataset as total dataset because of low data
# selected_dataset_dict = dataset

In [None]:
# !pip install llama-index
# !pip install llama-index-embeddings-huggingface

In [4]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [5]:
# import any embedding model on HF hub (https://huggingface.co/spaces/mteb/leaderboard)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large") # alternative model

Settings.llm = None
Settings.chunk_size = 256
Settings.chunk_overlap = 25



LLM is explicitly disabled. Using MockLLM.


In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['pdf_filename', 'context', 'question', 'answer'],
        num_rows: 884
    })
    test: Dataset({
        features: ['pdf_filename', 'context', 'question', 'answer'],
        num_rows: 222
    })
})

In [12]:
train_dataset=selected_dataset_dict["train"]

In [13]:
unique_contexts = list(set(train_dataset['context']))
# print(unique_contexts[0])

In [14]:
from llama_index.core import Document, VectorStoreIndex

documents = [Document(text=context + '||ENDOFDOC||', id=str(hash(context))) for context in unique_contexts]



In [15]:
index = VectorStoreIndex.from_documents(documents)
top_k = 3

retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=top_k,
)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
)


In [16]:
# query documents
query = "When was this standard adopted?"
response = query_engine.query(query)

# reformat response
context = "Context:\n"
for i in range(top_k):
    context = context + response.source_nodes[i].text + "\n\n"

print(context)

Context:
Indian Standard 
COLDROLLEDLOWCARBONSTEELSHEETS 
AND STRIPS-SPECIFICATION 
( Fourth Revision ) 
Third Reprint FEBRUARY 199s 
UDC 669’14’415-122’2 
@ BIS 1994 
BUR.EAU OF INDIAN STANDARDS 
MANAK BHAVAN, 9 BAHADUR SHAH ZAFAR MARG 
NEW DELHI 110002 
March 1994 Price Group 4 Wrought Steel Products Sectional Committee, MTD 4 
FOREWORD 
This Indian Standard ( Fourth Revision ) was adopted by the Bureau of Indian Standards, after 
the draft finalized by the Wrought Steel Products Sectional Committee had been approved by 
the Metallurgical Engineering Division Council. 
This standard was first published in 1954 and subsequently revised in 1963, 1973 and 1986. While 
reviewing the standard in the light of experience gained during these years, the committee decided 
to revise it toalign with the present practices being followed by the Indian Industry.

IS454: 1994 
Indian Standard 
CUTBACK BITUMEN FROM WAXY 
CRUDE - SPECIFICATION 
( Second Revision ) 
UDC 665.745 
@ BIS 1994 
BUREAU OF 

In [17]:
selected_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['pdf_filename', 'context', 'question', 'answer'],
        num_rows: 150
    })
    test: Dataset({
        features: ['pdf_filename', 'context', 'question', 'answer'],
        num_rows: 2
    })
})

In [18]:
def create_prompt(sample):
    bos_token = "<s>"
    base_prompt1 = "You are a helpful assistant. The following context is from "
    base_prompt2 = ". Based on this context, provide a concise answer to the question: "
    base_prompt3 = ". The answer should be short, clear, and direct. Avoid repeating the question or providing unnecessary information. Remember to end your answer with proper punctuation and stop after answering the question. Do not continue with additional questions or answers from the dataset.\n"
    document = sample['pdf_filename']
    context = sample['context']
    question = sample['question']
    answer = sample['answer']
    eos_token = "</s>"
    full_prompt = ""
    full_prompt += bos_token
    full_prompt += base_prompt1
    full_prompt += document
    full_prompt += "\nContext: \n"
    full_prompt += context
    full_prompt += "\n" + base_prompt2
    full_prompt += question
    full_prompt += base_prompt3
    full_prompt += answer
    full_prompt += eos_token
    print("------------------")
    print(full_prompt)
    print("---------------------------------------------------------------------------------------------------------------")
    return full_prompt

create_prompt(selected_dataset_dict["train"][1])

------------------
<s>You are a helpful assistant. The following context is from 228_14.pdf
Context: 
UDC 669.14 : 643,226 1643,942 ) ( Second Reprint SEPTEMBER 1996 ) IS:228(Partl4)-1988 
Indian Standard 
METHODS FOR CHEMICAL ANALYSIS OF STEELS 
PART 14 DETERMINATION OF CARBON BY THERMAL CONDUCTIVITY METHOD 
(FOR CARBON 0.005 TO 2.000 PERCENT) 
I. scope -This standard ( Part 14 ) covers a method for determination of carbon in all types ( 
steels and alloy steels in the range of 0’005 to 2’000 percent. 
2. Determination of Carbon by Thermal Conductivity Method 
2.1 Oofh’ne offhe Method- The sample is burnt in a stream of oxygen in presence of a met; 
accelerator. The carbon dioxide formed is selectively adsorbed on the molecular sieve at a tempera 
ture apd released by heating at 300°C. The detector is a thermistor cell which senses the differenc# 
between thermal conductivity of the carrier gas ( with helium specially for extra-ldw carbon, ant 
oxygen in other cases ) and that of the 

'<s>You are a helpful assistant. The following context is from 228_14.pdf\nContext: \nUDC 669.14 : 643,226 1643,942 ) ( Second Reprint SEPTEMBER 1996 ) IS:228(Partl4)-1988 \nIndian Standard \nMETHODS FOR CHEMICAL ANALYSIS OF STEELS \nPART 14 DETERMINATION OF CARBON BY THERMAL CONDUCTIVITY METHOD \n(FOR CARBON 0.005 TO 2.000 PERCENT) \nI. scope -This standard ( Part 14 ) covers a method for determination of carbon in all types ( \nsteels and alloy steels in the range of 0’005 to 2’000 percent. \n2. Determination of Carbon by Thermal Conductivity Method \n2.1 Oofh’ne offhe Method- The sample is burnt in a stream of oxygen in presence of a met; \naccelerator. The carbon dioxide formed is selectively adsorbed on the molecular sieve at a tempera \nture apd released by heating at 300°C. The detector is a thermistor cell which senses the differenc# \nbetween thermal conductivity of the carrier gas ( with helium specially for extra-ldw carbon, ant \noxygen in other cases ) and that of the carr

In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

device = "cuda"
# model_id="mistralai/Mistral-7B-Instruct-v0.2"
model_id="mistralai/Mistral-7B-v0.1"
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [20]:
import os
os.environ["HF_TOKEN"] = "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"
os.environ['HF_HOME'] = '/mnt/data1/backup/viswaz/Project_K/huggingface_cache/'

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=nf4_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    cache_dir = "/mnt/data1/backup/viswaz/Project_K/huggingface_cache/",
    
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_size = "right"



In [22]:
def generate_response(prompt, model):
  encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
  model_inputs = encoded_input.to('cuda')

  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=512,
                                 do_sample=True,
                                 pad_token_id=tokenizer.eos_token_id)

  decoded_output = tokenizer.batch_decode(generated_ids)

  return decoded_output[0].replace(prompt, "")

In [23]:
prompt = """
2. When was this standard adopted?
"""

generate_response(prompt, model)

"<s> 2.1. This standard was adopted and approved in final by the SPC on September 1, 2020.\n3. How are new or revised standards developed?\n\n3.1. Standards are developed in accordance to the OPC Foundation's procedures.  The OPC Foundation will periodically call for comments on standards drafts published on the Open Platform website (www.opcfoundation.org).  Publication is made in accordance with these procedures and OPC Foundation Board approval.\n\n3.2. Standards are developed by technical committees consisting of members with a variety of backgrounds.  In accordance with OPC Foundation's policies, the committees are representative of the interests at stake.  The committees have access to a variety of input from technical volunteers and other sources.\n3.3. The committees follow a process designed to develop a usable standard in the shortest time possible.  This process involves several steps, including review and re-review by the committee members and a technical committee in OPC F

In [24]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CASUAL_LM"
)

In [25]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [26]:
from transformers import TrainingArguments
gradient_accumulation_steps = 4  # adjust this value based on your GPU memory

args = TrainingArguments(
    output_dir = "KOKO",
    max_steps = 260,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = gradient_accumulation_steps,  # add this line
    warmup_steps = 0.03,
    logging_steps = 10,
    save_strategy = "epoch",
    evaluation_strategy="steps",
    eval_steps=20,
    learning_rate=2e-4,
    lr_scheduler_type='constant',
)


In [27]:
from trl import SFTTrainer

max_seq_length = 256

trainer = SFTTrainer(
    model=model,
    peft_config = peft_config,
    max_seq_length=max_seq_length,
    tokenizer = tokenizer,
    formatting_func=create_prompt,
    packing = True,
    args = args,
    train_dataset = selected_dataset_dict["train"],
    eval_dataset= selected_dataset_dict["test"]
)

max_steps is given, it will override any value given in num_train_epochs


In [28]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
20,1.802,No log
40,1.5431,No log
60,1.2397,No log
80,1.1188,No log
100,1.0166,No log
120,0.7981,No log
140,0.7189,No log
160,0.6052,No log
180,0.5241,No log
200,0.451,No log




GitError: error reading file for hashing: 

In [29]:
# prompt (no context)
intstructions_string = f""" you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature with a newline in between

-yourbot

please answer the following question
"""
prompt_template = lambda question: f'''[INST] {intstructions_string} \n{question} \n[/INST]'''

In [35]:
question = "When was the mentioned standard adopted [date]?"

prompt = prompt_template(question)
print(prompt)



[INST]  you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature after answer "-yourbot"

please answer the following question
 
When was the mentioned standard adopted [date]? 
[/INST]


In [36]:
model.eval()

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=280)

print(tokenizer.batch_decode(outputs)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST]  you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature after answer "-yourbot"

please answer the following question
 
When was the mentioned standard adopted [date]? 
[/INST] 
The Indian Standard (IS) 1008 : 1988 was first published in 1958 and subsequently revised in 1970, 1978 and 1988. The latest revision of the standard was done in 2018. 
The Indian Standard (IS) 1008 : 1988 was first published in 1958 and subsequently revised in 1970, 1978 and 1988. The latest revision of the standard was done in 2018.
The Indian Standard (IS) 1008 : 1988 was first published in 1958 and subsequently revised in 1970, 1978 and 1988. The latest revision of the standard was done in 2018.
The Indian Standard (IS) 1008 : 1988 was first published in 1958 and subsequently revised in 1970, 1978 and 1988. The latest revision of the standard was done in 2

In [37]:
# prompt (no context)
intstructions_string = f""" you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature after answer "-yourbot"

please answer the following question
"""
prompt_template_w_context = lambda context, question: f'''[INST] {intstructions_string}

{context}

Please answer to the following question. Use the context above if it is helpful.

{question}

[/INST]'''

In [38]:
prompt = prompt_template_w_context(context, question)
print(prompt)

[INST]  you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature after answer "-yourbot"

please answer the following question


Context:
Indian Standard 
COLDROLLEDLOWCARBONSTEELSHEETS 
AND STRIPS-SPECIFICATION 
( Fourth Revision ) 
Third Reprint FEBRUARY 199s 
UDC 669’14’415-122’2 
@ BIS 1994 
BUR.EAU OF INDIAN STANDARDS 
MANAK BHAVAN, 9 BAHADUR SHAH ZAFAR MARG 
NEW DELHI 110002 
March 1994 Price Group 4 Wrought Steel Products Sectional Committee, MTD 4 
FOREWORD 
This Indian Standard ( Fourth Revision ) was adopted by the Bureau of Indian Standards, after 
the draft finalized by the Wrought Steel Products Sectional Committee had been approved by 
the Metallurgical Engineering Division Council. 
This standard was first published in 1954 and subsequently revised in 1963, 1973 and 1986. While 
reviewing the standard in the light of experience gain

In [39]:

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=280)

print(tokenizer.batch_decode(outputs)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST]  you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature after answer "-yourbot"

please answer the following question


Context:
Indian Standard 
COLDROLLEDLOWCARBONSTEELSHEETS 
AND STRIPS-SPECIFICATION 
( Fourth Revision ) 
Third Reprint FEBRUARY 199s 
UDC 669’14’415-122’2 
@ BIS 1994 
BUR.EAU OF INDIAN STANDARDS 
MANAK BHAVAN, 9 BAHADUR SHAH ZAFAR MARG 
NEW DELHI 110002 
March 1994 Price Group 4 Wrought Steel Products Sectional Committee, MTD 4 
FOREWORD 
This Indian Standard ( Fourth Revision ) was adopted by the Bureau of Indian Standards, after 
the draft finalized by the Wrought Steel Products Sectional Committee had been approved by 
the Metallurgical Engineering Division Council. 
This standard was first published in 1954 and subsequently revised in 1963, 1973 and 1986. While 
reviewing the standard in the light of experience 

In [None]:
trainer.save_model("KOKO-Instruct")

In [None]:
!pip install huggingface-hub -qU
from huggingface_hub import login

login(token="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", add_to_git_credential=True)
trainer.push_to_hub("Dobby/KOKO-Instruct")

In [42]:
merged_model = model.merge_and_unload()




In [43]:
import re
def generate_response(prompt, model):
  encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
  model_inputs = encoded_input.to('cuda')

  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=256,
                                 do_sample=True,
                                 pad_token_id=tokenizer.eos_token_id)

  decoded_output = tokenizer.batch_decode(generated_ids)

  # Split the generated text into prompt and answer
  split_index = decoded_output[0].find('answer:')
  if split_index == -1:
    split_index = decoded_output[0].find('Answer:')
  decoded_output = [s.strip("'") for s in decoded_output]
  decoded_output = [s.replace('</s>', '') for s in decoded_output]
  answer = decoded_output[0][split_index+len('answer:'):].strip()
  answer = answer.replace('[INST]'or '[ANS]' , '').replace('[/INST]' or '[/ANS]' or '[//INST]', '')
  answer = re.sub(r'\[.*?\]', '', answer)

  # Stop decoding when it encounters a ### token
  stop_index = answer.find('###')
  if stop_index != -1:
    answer = answer[:stop_index].strip()

  return answer


In [44]:
prompt = """
Why is the oxygen flow rate controlled during instrument stability?
"""

generate_response(prompt, merged_model)

'hy is the oxygen flow rate controlled during instrument stability?\nThe oxygen flow rate is controlled during instrument stability to ensure that the oxygen potential is maintained at 1 atmosphere (1 bar) throughout the test. This is because the corrosion rate of most metals is significantly lower at 1 atmosphere oxygen potential (ORP) compared to higher values. Maintaining the oxygen potential at 1 atmosphere ensures that the test results accurately represent the corrosion resistance of the metal under standard (baseline) conditions.'

In [None]:
def generate_response(prompt, model, max_output_tokens=256):
  encoded_input = tokenizer(
      prompt,  return_tensors="pt", add_special_tokens=True)
  model_inputs = encoded_input.to('cuda')

  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=max_output_tokens,
                                 do_sample=True,
                                 pad_token_id=tokenizer.eos_token_id)

  decoded_output = tokenizer.batch_decode(generated_ids)

  answer = decoded_output[0]

  return answer

In [45]:
prompt = """

What is the scope of Indian Standard IS : 228 (Part 14) - 1988?
"""

generate_response(prompt, merged_model)

'What is the scope of Indian Standard IS : 228 (Part 14) - 1988?\n\nWhat are the requirements specified in IS : 228 (Part 14) - 1988?\n\nHow much tensile strength is required for mild steel sheet and strip of thicknesses above 0.635 mm and up to and including 2.000 mm?\n\nHow much tensile strength is required for mild steel sheet and strip of thicknesses above 0.635 mm and up to and including 1.000 mm?\n\nHow much tensile strength is required for mild steel sheet and strip of thickness up to and inclusive of 0.635 mm?\n\nWhat are the requirements of ductility?\n\nWhat is the minimum value of tensile strength to be stated on the label?\n\nWhat minimum value of tensile strength to be guaranteed on testing at room tempera-ture if agreed between the purchaser and the manufacturer?\n\nWhat is the definition of tensile strength ?\n\nWhat is the definition of yield strength ?\n\nWhat is tha definition of ductility ?\n\nWhat is the definition of impact test ?\n\nWhat are the test methods for t