## model and tokenizer

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Model identifier passed to both load_quantized_model() and
# initialize_tokenizer() below, so the weights and tokenizer come from the same source.
model_name = "filipealmeida/Mistral-7B-Instruct-v0.1-sharded"

def load_quantized_model(model_name: str):
    """Load a causal language model with 4-bit bitsandbytes quantization.

    Args:
        model_name: Model identifier or path forwarded to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Hand the config object to from_pretrained so the double-quant, nf4 and
    # compute-dtype settings actually take effect. The bare `load_in_4bit`
    # keyword is intentionally not passed alongside it: the setting already
    # lives inside `bnb_config`, and supplying both is rejected by transformers.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
    )

    return model

def initialize_tokenizer(model_name: str):
    """Create the tokenizer for ``model_name`` and pin its BOS token id.

    Args:
        model_name: Identifier or path forwarded to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer, with ``bos_token_id`` set to 1.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    # Explicitly force the beginning-of-sequence id to 1.
    setattr(tok, "bos_token_id", 1)
    return tok

# Load the 4-bit quantized model for `model_name`.
model = load_quantized_model(model_name)

# Load the tokenizer for the same `model_name`.
tokenizer = initialize_tokenizer(model_name)

# NOTE(review): presumably the token ids at which generation should stop;
# not referenced in the cells visible here — confirm against later usage.
stop_token_ids = [0]


## check that the model is working

In [None]:
# NOTE: optional smoke test of the loaded model. The original author marked
# this cell as one that should not be run as part of the main flow.

text = "[INST]HOW WILL AI REPLACE HUMANS[/INST]"

# add_special_tokens=False: the tokenizer does not add special tokens to the
# prompt, so only the literal text above is encoded.
encoded = tokenizer(text, return_tensors="pt", add_special_tokens=False)
# The model is loaded with device_map="auto", so its weights may live on an
# accelerator while the tokenizer returns CPU tensors. Move the inputs to the
# model's device before generating to avoid a device mismatch.
model_input = encoded.to(model.device)
generate_ids = model.generate(**model_input, max_new_tokens=200, do_sample=True)
decoded = tokenizer.batch_decode(generate_ids)
print(decoded[0])

## dataset load

In [23]:
from datasets import load_dataset

# Load the "Dobby091/koko" dataset; later cells read its "train" and "test" splits.
dataset = load_dataset("Dobby091/koko")



In [24]:
# Peek at the first record of the test split to see which fields each sample carries.
print(dataset["test"][0])

{'pdf_filename': '228_6.pdf', 'context': '- \n. . . IS : 228 (Part 8)-1987 \nIndian Standard \nMETHODS FOR \nCHEMICAL ANALYSIS OF STEELS \nPART 6 DETERMINATION OF-CHROMIUM BY \nPERSULPHATE OXIDATION METHOD \n(FOR CHROMIUM)04 PERCENT) \n( Third Revision ) \nFirst Reprint JANUARY 1991 \nUDC 669.14+669*15-194*2/*3 : 543[546*76] \n/-• \n’ ! \\ . _’ \nQ Copyright 1987 \nBUREAU OF INDIAN STANDARDS \nMANAK BHAVAN, 9 BAHADUR SHAH ZAFAR MARG \nNEW DBLHI 110002 \nGr3 August 1987 IS I 228 ( Part 6 ) - 1987 \n/ Indian Standard \nMETHODS FOR \nCHEMI-CAL ANALYSIS OF STEELS \nPART 8 DETERMINATION OF CHROMIUM BY \nPERSULPHATE OXIDATION METHOD \n( FOR CHROMIUM > W PERCENT ) \n( Third Revision ) \nMethods of Chemical Analysis of Ferrous Metals \nSectional Committee, SMDC 2 \nChairman \nDR C. S. P. IYER \nMembers \nSERI G. M. APPABAO Rcprcscnting \nBhabha Atomic Research Centre, Bombay \nSteel Authority of India Ltd (Bhilai Steel Plant ), \nBhilai \nSHRI R. D. A~ABWAL ( Altrmata) \nSHRI S. V. BHAQWAT Kha

## select the train/test subset

### not used for now

In [26]:
from datasets import Dataset, DatasetDict

# Number of leading samples to keep from each split (tune here, not inline).
N_TRAIN_SAMPLES = 15
N_TEST_SAMPLES = 2

# Select the first N_TRAIN_SAMPLES rows of the train split and the first
# N_TEST_SAMPLES rows of the test split. min() clamps the request so a split
# smaller than the requested size does not raise an out-of-range error.
train_dataset = dataset['train'].select(
    range(min(N_TRAIN_SAMPLES, len(dataset['train'])))
)
test_dataset = dataset['test'].select(
    range(min(N_TEST_SAMPLES, len(dataset['test'])))
)

# Create a new `DatasetDict` to store the selected samples
selected_dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

# Print the number of samples in each split
print(f"Number of train samples: {len(train_dataset)}")
print(f"Number of test samples: {len(test_dataset)}")

selected_dataset_dict

Number of train samples: 15
Number of test samples: 2


DatasetDict({
    train: Dataset({
        features: ['pdf_filename', 'context', 'question', 'answer'],
        num_rows: 15
    })
    test: Dataset({
        features: ['pdf_filename', 'context', 'question', 'answer'],
        num_rows: 2
    })
})

## create prompt

In [31]:
def create_prompt(sample):
    """Assemble the full training prompt for one dataset record.

    The prompt is wrapped in "<s>" / "</s>" and consists of an instruction
    line naming the source document, followed by "###context:",
    "###question:" and "###answer:" sections.

    Args:
        sample: Mapping with string values under the keys 'pdf_filename',
            'context', 'answer' and 'question'.

    Returns:
        The assembled prompt string. The prompt is also printed between two
        dashed separator lines.
    """
    source_name = sample['pdf_filename']
    passage = sample['context']
    reply = sample['answer']
    query = sample['question']

    # Pieces are listed in output order. str.join (like `+`) raises TypeError
    # if any field is not a string.
    # NOTE(review): the spelling "folwing" is kept exactly as in the existing
    # prompt text so the generated prompts do not change.
    pieces = [
        "<s>",
        "below context is from ",
        source_name,
        ", answer the folwing questions based on the context given \n",
        "\n###context:\n", passage,
        "\n###question:\n", query,
        "\n###answer:\n", reply,
        "</s>",
    ]
    full_prompt = "".join(pieces)

    rule = "------------------"
    for line in (rule, full_prompt, rule):
        print(line)
    return full_prompt

In [33]:
# Build (and print) the prompt for the first training record as a sanity check.
create_prompt(dataset["train"][0])


------------------
<s>below context is from 228_4.pdf, answer the folwing questions based on the context given 

###context:
IS:228(Part4)-1987 
( ReaIhncd 1997 ) 
Indian Standard 
METHOD FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 4 DETERMINATION OF TOTAL CARBON 
BY GRAVIMETRIC METHOD 
( FOR CARBON r 0.1 PERCENT) 
( Third Revision ) 
I;wflh Reprint NOVEMBER 1998 
UDC 669.14 + 669.15-194.2/.3 : 543.21 [546.26] 
BUREAU OF INDIAN STANDARDS 
MANAK BHAVAN, 9 BAHADUR SHAH ZAFAR MARG 
NEW DELHI 110002 
Gr 2 Aqirst 1987 IS:228(Part4)-1987 
Indian Standard 
METHODS FOR 
CHEMICAL ANALYSIS OF STEELS 
PART 4 DETERMINATION OF TOTAL CARBON 
BY GRAVIMETRIC METHOD 
(FOR CARBON > O-1 PERCENT) 
( Third Revision ) 
Methods of Chemical Analysis of Ferrous Metals 
Sectional Committee, SlMDC 2 
Chuiman Rqmsmting 
DR C. S. P. IYLR Bhabha Atomic Research Centre, Bombay 
Mcmbnr 
SHKI G. M. APPAUAO Steel Authority of India Ltd ( Bhilai Steel Plant ), 
Bhilai 
SIIBX R. D. AOAIIWAL ( AllmW) 
SHEI S. V. BHAOW~Y Khande

'<s>below context is from 228_4.pdf, answer the folwing questions based on the context given \n\n###context:\nIS:228(Part4)-1987 \n( ReaIhncd 1997 ) \nIndian Standard \nMETHOD FOR \nCHEMICAL ANALYSIS OF STEELS \nPART 4 DETERMINATION OF TOTAL CARBON \nBY GRAVIMETRIC METHOD \n( FOR CARBON r 0.1 PERCENT) \n( Third Revision ) \nI;wflh Reprint NOVEMBER 1998 \nUDC 669.14 + 669.15-194.2/.3 : 543.21 [546.26] \nBUREAU OF INDIAN STANDARDS \nMANAK BHAVAN, 9 BAHADUR SHAH ZAFAR MARG \nNEW DELHI 110002 \nGr 2 Aqirst 1987 IS:228(Part4)-1987 \nIndian Standard \nMETHODS FOR \nCHEMICAL ANALYSIS OF STEELS \nPART 4 DETERMINATION OF TOTAL CARBON \nBY GRAVIMETRIC METHOD \n(FOR CARBON > O-1 PERCENT) \n( Third Revision ) \nMethods of Chemical Analysis of Ferrous Metals \nSectional Committee, SlMDC 2 \nChuiman Rqmsmting \nDR C. S. P. IYLR Bhabha Atomic Research Centre, Bombay \nMcmbnr \nSHKI G. M. APPAUAO Steel Authority of India Ltd ( Bhilai Steel Plant ), \nBhilai \nSIIBX R. D. AOAIIWAL ( AllmW) \nSHEI S. V.