In [3]:
# !pip install transformers bitsandbytes accelerate datasets peft trl
# !pip install transformers trl accelerate torch bitsandbytes peft datasets
# !pip install flash-attn

In [4]:
from datasets import load_dataset

# Load the instruct_tune_dataset dataset
dataset = load_dataset("Dobby091/koko")



In [5]:
print(dataset["test"][0])

{'pdf_filename': '228_6.pdf', 'question': 'What is the reproducibility range for chromium content at 1 to 5 percent?', 'answer': 'Reproducibility is ±0.120 percent.'}


In [6]:
from datasets import Dataset, DatasetDict

# Select the first 1000 samples in the train set and the first 200 samples in the test set
train_dataset = dataset['train'].select(range(15))
test_dataset = dataset['test'].select(range(2))

# Create a new `DatasetDict` to store the selected samples
selected_dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

# Print the number of samples in each split
print(f"Number of train samples: {len(train_dataset)}")
print(f"Number of test samples: {len(test_dataset)}")

selected_dataset_dict

Number of train samples: 15
Number of test samples: 2


DatasetDict({
    train: Dataset({
        features: ['pdf_filename', 'question', 'answer'],
        num_rows: 15
    })
    test: Dataset({
        features: ['pdf_filename', 'question', 'answer'],
        num_rows: 2
    })
})

In [7]:
# for now lets keep selected dataset as total dataset because of low data
selected_dataset_dict = dataset

In [8]:
# !pip install llama-index
# !pip install llama-index-embeddings-huggingface

In [9]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [10]:
# import any embedding model on HF hub (https://huggingface.co/spaces/mteb/leaderboard)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large") # alternative model

Settings.llm = None
Settings.chunk_size = 256
Settings.chunk_overlap = 25



LLM is explicitly disabled. Using MockLLM.


In [11]:
# articles available here:  {add GitHub repo}
documents = SimpleDirectoryReader("pdf").load_data()

In [12]:
# some ad hoc document refinement
print(len(documents))
for doc in documents:
    if "Member-only story" in doc.text:
        documents.remove(doc)
        continue

    if "The Data Entrepreneurs" in doc.text:
        documents.remove(doc)

    if " min read" in doc.text:
        documents.remove(doc)

print(len(documents))

32
32


In [13]:
# store docs into vector DB
index = VectorStoreIndex.from_documents(documents)
# set number of docs to retreive
top_k = 3

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=top_k,
)
# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
)
# query documents
query = "When was this standard adopted?"
response = query_engine.query(query)

# reformat response
context = "Context:\n"
for i in range(top_k):
    context = context + response.source_nodes[i].text + "\n\n"

print(context)

Context:
IS : ‘228 ( Part 4 ) - 1987 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 4 DETERYlNATlON OF TOTAL CARBON 
BY GARVIMETRIC METHOD 
(FOR CARBON > O-1 PERCENT ) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 4 ) (Third R cvision ) was adopted I)y 
the Indian Standards Institution on 16 January 1987, afirr the draft 
finalized by the Methods of Chemical Analysis of Pcrrous Metals 
Sectional Committre had been approved by the Structural and Mct;~ls 
Division Council. 
0.2 IS : 228, which was issued as a tentative standard in 1952 and 
revised in 1959, covered the chemical analysis of pig iron, cast iron and 
plain carbon and low alloy steels.

IS t 
Indian Standard 228 ( Part 5 ) - 1987 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 6 DETERMINATION OF NICKEL BY 
DIMETHYLGLYOXIME ( GRAVIMETRIC ) METHOD 
( FOR NICKEL > 0’1 PERCENT) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 5 ) ( Third Revision ) was adopted by 
the Indian

In [14]:
selected_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['pdf_filename', 'question', 'answer'],
        num_rows: 72
    })
    test: Dataset({
        features: ['pdf_filename', 'question', 'answer'],
        num_rows: 18
    })
})

In [15]:
def create_prompt(sample):
    bos_token = "<s>"
    base_prompt1 = "below context is from "
    base_prompt2 = ", answer the following questions based on the context given \n"
    document = sample['pdf_filename']
    # context = sample['context']
    answer = sample['answer']
    question = sample['question']
    eos_token = "</s>"
    full_prompt = ""
    full_prompt += bos_token
    full_prompt += "[INST]"
    full_prompt += "###Instruction:\n"
    full_prompt += base_prompt1
    full_prompt += document
    full_prompt += base_prompt2
    # full_prompt += "[INST]"
    full_prompt += "\n\n###pdf_filename:\n" + document
    full_prompt += "\n\n###context:\n" + context
    full_prompt += "\n\n###question:\n" + question
    full_prompt += "[/INST]"
    full_prompt += "\n\n###answer:\n" + answer
    full_prompt += eos_token
    print("------------------")
    print(full_prompt)
    print("---------------------------------------------------------------------------------------------------------------")
    return full_prompt

create_prompt(selected_dataset_dict["train"][1])

------------------
<s>[INST]###Instruction:
below context is from 228_4.pdf, answer the following questions based on the context given 


###pdf_filename:
228_4.pdf

###context:
Context:
IS : ‘228 ( Part 4 ) - 1987 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 4 DETERYlNATlON OF TOTAL CARBON 
BY GARVIMETRIC METHOD 
(FOR CARBON > O-1 PERCENT ) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 4 ) (Third R cvision ) was adopted I)y 
the Indian Standards Institution on 16 January 1987, afirr the draft 
finalized by the Methods of Chemical Analysis of Pcrrous Metals 
Sectional Committre had been approved by the Structural and Mct;~ls 
Division Council. 
0.2 IS : 228, which was issued as a tentative standard in 1952 and 
revised in 1959, covered the chemical analysis of pig iron, cast iron and 
plain carbon and low alloy steels.

IS t 
Indian Standard 228 ( Part 5 ) - 1987 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 6 DETERMINATION OF NICKEL BY 
DIMETH

'<s>[INST]###Instruction:\nbelow context is from 228_4.pdf, answer the following questions based on the context given \n\n\n###pdf_filename:\n228_4.pdf\n\n###context:\nContext:\nIS : ‘228 ( Part 4 ) - 1987 \nIndian Standard \nMETHODS FOR \nCHEMICAL ANALYSIS OF STEELS \nPART 4 DETERYlNATlON OF TOTAL CARBON \nBY GARVIMETRIC METHOD \n(FOR CARBON > O-1 PERCENT ) \n( Third Revision ) \n0. FOREWORD \n0.1 This Indian Standard ( Part 4 ) (Third R cvision ) was adopted I)y \nthe Indian Standards Institution on 16 January 1987, afirr the draft \nfinalized by the Methods of Chemical Analysis of Pcrrous Metals \nSectional Committre had been approved by the Structural and Mct;~ls \nDivision Council. \n0.2 IS : 228, which was issued as a tentative standard in 1952 and \nrevised in 1959, covered the chemical analysis of pig iron, cast iron and \nplain carbon and low alloy steels.\n\nIS t \nIndian Standard 228 ( Part 5 ) - 1987 \nMETHODS FOR \nCHEMICAL ANALYSIS OF STEELS \nPART 6 DETERMINATION OF NICK

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

device = "cuda"
model_id="mistralai/Mistral-7B-Instruct-v0.2"
# model_id="mistralai/Mistral-7B-v0.1"
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [17]:
import os
os.environ["HF_TOKEN"] = "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"
os.environ['HF_HOME'] = '/mnt/data1/backup/viswaz/Project_K/huggingface_cache/'

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=nf4_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    cache_dir = "/mnt/data1/backup/viswaz/Project_K/huggingface_cache/",
    
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_size = "right"



In [19]:
def generate_response(prompt, model):
  encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
  model_inputs = encoded_input.to('cuda')

  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=512,
                                 do_sample=True,
                                 pad_token_id=tokenizer.eos_token_id)

  decoded_output = tokenizer.batch_decode(generated_ids)

  return decoded_output[0].replace(prompt, "")

In [20]:
prompt = """
[INST]
###Instruction:
below context is from 228_4.pdf, answer the following questions based on the context given 


###pdf_filename:
228_4.pdf

###context:
Context:
IS : ‘228 ( Part 4 ) - 1987 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 4 DETERYlNATlON OF TOTAL CARBON 
BY GARVIMETRIC METHOD 
(FOR CARBON > O-1 PERCENT ) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 4 ) (Third R cvision ) was adopted I)y 
the Indian Standards Institution on 16 January 1987, afirr the draft 
finalized by the Methods of Chemical Analysis of Pcrrous Metals 
Sectional Committre had been approved by the Structural and Mct;~ls 
Division Council. 
0.2 IS : 228, which was issued as a tentative standard in 1952 and 
revised in 1959, covered the chemical analysis of pig iron, cast iron and 
plain carbon and low alloy steels.

IS t 
Indian Standard 228 ( Part 5 ) - 1987 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 6 DETERMINATION OF NICKEL BY 
DIMETHYLGLYOXIME ( GRAVIMETRIC ) METHOD 
( FOR NICKEL > 0’1 PERCENT) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 5 ) ( Third Revision ) was adopted by 
the Indian Standards Institution on 16 January 1987, after the draft 
finalized by the Methods of Chemical Analysis of Ferrous Metals 
Sectional Committee had been approved by the Strtictural and Metals 
Division Council. 
0.2 IS : 228, which was issued as a tentative standard in 1952 and 
revised in 1959, covered the chemial analysis of pig iron, cast iron and 
plain carbon and low alloy steels.

Accordingly, revision of IS : 228 was 
taken-up again and new series on methods of chemical analysis of 
steels including high alloy steels was published in various parts as 
IS : 228 ( Parts 1 to 13 ) ( see Appendix A ) covering separate method 
of analysis for each constituent in steels. However, IS : 228-1959* 
version has been retained for the analysis of pig iron and cast iron 
till a separate standard for analysis of pig iron and cast iron is 
published. 
0.2.1 This revision of IS : 228 (Part 6 )-1974t has been undertaken 
on the basis of experience gained during the implementation of the 
standard by the manufacturers and testing laboratories. 
0.3 In this revision, major modifications are: 
a) scope of the method has been modified by lowering the limit 
for determination of chromium from 0’5 to 0’1 percent; 
-.



###question:
2. When was this standard adopted?[/INST]
"""

generate_response(prompt, model)

'<s> This standard was adopted on 16 January 1987. It is a third revision of Indian Standard 228, Part 4 and Part 5, which were first issued as tentative standards in 1952 and revised in 1959. The revision of IS 228 was taken up again and a new series on methods of chemical analysis of steels was published in various parts from 1987 to 1994, including IS 228-Part 6 for determination of nickel by dimethylglyoxime gravimetric method for nickel > 0.1 percent.</s>'

In [21]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CASUAL_LM"
)

In [22]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [23]:
from transformers import TrainingArguments

# args = TrainingArguments(
#     output_dir = "KOKO",
#     max_steps = 100,
#     per_device_train_batch_size = 4,
#     warmup_steps = 0.03,
#     logging_steps = 10,
#     save_strategy = "epoch",
#     evaluation_strategy="steps",
#     eval_steps=20,
#     learning_rate=2e-4,
#     lr_scheduler_type='constant',
# )

gradient_accumulation_steps = 4  # adjust this value based on your GPU memory

args = TrainingArguments(
    output_dir = "KOKO",
    max_steps = 100,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = gradient_accumulation_steps,  # add this line
    warmup_steps = 0.03,
    logging_steps = 10,
    save_strategy = "epoch",
    evaluation_strategy="steps",
    eval_steps=20,
    learning_rate=2e-4,
    lr_scheduler_type='constant',
)


In [24]:
from trl import SFTTrainer

max_seq_length = 256

trainer = SFTTrainer(
    model=model,
    peft_config = peft_config,
    max_seq_length=max_seq_length,
    tokenizer = tokenizer,
    formatting_func=create_prompt,
    packing = True,
    args = args,
    train_dataset = selected_dataset_dict["train"],
    eval_dataset= selected_dataset_dict["test"]
)

max_steps is given, it will override any value given in num_train_epochs


In [25]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
20,0.9476,No log
40,0.2404,No log
60,0.1844,No log
80,0.1306,No log
100,0.0922,No log




TrainOutput(global_step=100, training_loss=0.4588971537351608, metrics={'train_runtime': 172.5, 'train_samples_per_second': 9.275, 'train_steps_per_second': 0.58, 'total_flos': 1.7410593543684096e+16, 'train_loss': 0.4588971537351608, 'epoch': 6.557377049180328})

In [26]:
# prompt (no context)
intstructions_string = f""" you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature with a newline in between

-yourbot

please answer the following question
"""
prompt_template = lambda question: f'''[INST] {intstructions_string} \n{question} \n[/INST]'''

In [27]:
question = "When was the mentioned standard adopted [date]?"

prompt = prompt_template(question)
print(prompt)



[INST]  you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature with a newline in between

-yourbot

please answer the following question
 
When was the mentioned standard adopted [date]? 
[/INST]


In [28]:
model.eval()

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=280)

print(tokenizer.batch_decode(outputs)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST]  you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature with a newline in between

-yourbot

please answer the following question
 
When was the mentioned standard adopted [date]? 
[/INST] ###answer:
[year]</s>


In [29]:
# prompt (no context)
intstructions_string = f""" you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature after answer "-yourbot"

please answer the following question
"""
prompt_template_w_context = lambda context, question: f'''[INST] {intstructions_string}

{context}

Please answer to the following question. Use the context above if it is helpful.

{question}

[/INST]'''

In [30]:
prompt = prompt_template_w_context(context, question)
print(prompt)

[INST]  you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature after answer "-yourbot"

please answer the following question


Context:
IS : ‘228 ( Part 4 ) - 1987 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 4 DETERYlNATlON OF TOTAL CARBON 
BY GARVIMETRIC METHOD 
(FOR CARBON > O-1 PERCENT ) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 4 ) (Third R cvision ) was adopted I)y 
the Indian Standards Institution on 16 January 1987, afirr the draft 
finalized by the Methods of Chemical Analysis of Pcrrous Metals 
Sectional Committre had been approved by the Structural and Mct;~ls 
Division Council. 
0.2 IS : 228, which was issued as a tentative standard in 1952 and 
revised in 1959, covered the chemical analysis of pig iron, cast iron and 
plain carbon and low alloy steels.

IS t 
Indian Standard 228 ( Part 5 )

In [31]:

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=280)

print(tokenizer.batch_decode(outputs)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST]  you are a textbot that helps in finding answers to questions in the research papers, blogs,pdf's or any text context.
,make your answers more meaningful and short,end all responses with a signature after answer "-yourbot"

please answer the following question


Context:
IS : ‘228 ( Part 4 ) - 1987 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 4 DETERYlNATlON OF TOTAL CARBON 
BY GARVIMETRIC METHOD 
(FOR CARBON > O-1 PERCENT ) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 4 ) (Third R cvision ) was adopted I)y 
the Indian Standards Institution on 16 January 1987, afirr the draft 
finalized by the Methods of Chemical Analysis of Pcrrous Metals 
Sectional Committre had been approved by the Structural and Mct;~ls 
Division Council. 
0.2 IS : 228, which was issued as a tentative standard in 1952 and 
revised in 1959, covered the chemical analysis of pig iron, cast iron and 
plain carbon and low alloy steels.

IS t 
Indian Standard 228 ( Part

In [32]:
trainer.save_model("KOKO")



In [33]:
# !pip install huggingface-hub -qU
# from huggingface_hub import login

# login(token="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", add_to_git_credential=True)
# trainer.push_to_hub("Dobby/KOKO")

In [34]:
merged_model = model.merge_and_unload()




In [35]:
import re
def generate_response(prompt, model):
  encoded_input = tokenizer(prompt,  return_tensors="pt", add_special_tokens=True)
  model_inputs = encoded_input.to('cuda')

  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=150,
                                 do_sample=True,
                                 pad_token_id=tokenizer.eos_token_id)

  decoded_output = tokenizer.batch_decode(generated_ids)

  # Split the generated text into prompt and answer
  split_index = decoded_output[0].find('answer:')
  if split_index == -1:
    split_index = decoded_output[0].find('Answer:')
  decoded_output = [s.strip("'") for s in decoded_output]
  decoded_output = [s.replace('</s>', '') for s in decoded_output]
  answer = decoded_output[0][split_index+len('answer:'):].strip()
  answer = answer.replace('[INST]'or '[ANS]' , '').replace('[/INST]' or '[/ANS]' or '[//INST]', '')
  answer = re.sub(r'\[.*?\]', '', answer)

  # Stop decoding when it encounters a ### token
  stop_index = answer.find('###')
  if stop_index != -1:
    answer = answer[:stop_index].strip()

  return answer


In [52]:
prompt = """
[INST]
###Instruction:
below context is from 228_4.pdf, answer the following questions based on the context given


###pdf_filename:
228_4.pdf

###context:
Context:
IS : ‘228 ( Part 4 ) - 1987 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 4 DETERYlNATlON OF TOTAL CARBON 
BY GARVIMETRIC METHOD 
(FOR CARBON > O-1 PERCENT ) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 4 ) (Third R cvision ) was adopted I)y 
the Indian Standards Institution on 16 January 1987, afirr the draft 
finalized by the Methods of Chemical Analysis of Pcrrous Metals 
Sectional Committre had been approved by the Structural and Mct;~ls 
Division Council. 
0.2 IS : 228, which was issued as a tentative standard in 1952 and 
revised in 1959, covered the chemical analysis of pig iron, cast iron and 
plain carbon and low alloy steels.

IS t 
Indian Standard 228 ( Part 5 ) - 1987 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 6 DETERMINATION OF NICKEL BY 
DIMETHYLGLYOXIME ( GRAVIMETRIC ) METHOD 
( FOR NICKEL > 0’1 PERCENT) 
( Third Revision ) 
0. FOREWORD 
0.1 This Indian Standard ( Part 5 ) ( Third Revision ) was adopted by 
the Indian Standards Institution on 16 January 1987, after the draft 
finalized by the Methods of Chemical Analysis of Ferrous Metals 
Sectional Committee had been approved by the Strtictural and Metals 
Division Council. 
0.2 IS : 228, which was issued as a tentative standard in 1952 and 
revised in 1959, covered the chemial analysis of pig iron, cast iron and 
plain carbon and low alloy steels.

Accordingly, revision of IS : 228 was 
taken-up again and new series on methods of chemical analysis of 
steels including high alloy steels was published in various parts as 
IS : 228 ( Parts 1 to 13 ) ( see Appendix A ) covering separate method 
of analysis for each constituent in steels. However, IS : 228-1959* 
version has been retained for the analysis of pig iron and cast iron 
till a separate standard for analysis of pig iron and cast iron is 
published. 
0.2.1 This revision of IS : 228 (Part 6 )-1974t has been undertaken 
on the basis of experience gained during the implementation of the 
standard by the manufacturers and testing laboratories. 
0.3 In this revision, major modifications are: 
a) scope of the method has been modified by lowering the limit 
for determination of chromium from 0’5 to 0’1 percent; 
-.



###question:
2. When was this standard adopted?[/INST]
"""

generate_response(prompt, merged_model)

'This Indian Standard (Part 5) (Third Revision) was adopted by the Indian Standards Institution on 16 January 1987.'