In [None]:
!pip install langchain_community -qq #The langchain community builds tools that are stored here
!pip install -U langchain -qq
!pip install unstructured -qq #This will be used to load in our excel sheets (The textbooks)

In [None]:
!pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 xformers==0.0.21 bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12

In [3]:
import pandas as pd
import numpy as np
import os
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
from tqdm import tqdm
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA,ConversationalRetrievalChain
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import UnstructuredExcelLoader

In [4]:
# Labelled question/answer pairs, read from the working directory.
train = pd.read_csv(filepath_or_buffer="Train.csv")
train

Unnamed: 0,ID,Question Text,Question Answer,Reference Document,Paragraph(s) Number,Keywords
0,Q829,Compare the laboratory confirmation methods fo...,Chikungunya is confirmed using serological tes...,TG Booklet 6,"154, 166",Laboratory Confirmation For Chikungunya Vs. Di...
1,Q721,When should specimens be collected for Anthrax...,Specimens should be collected during the vesic...,TG Booklet 6,140,"Anthrax Specimen Collection: Timing, Preparati..."
2,Q464,Which key information should be recorded durin...,"During a register review, key information abou...",TG Booklet 3,439-440,"Register Review, Key Information, Suspected Ca..."
3,Q449,Why is the District log of suspected outbreaks...,The log includes information about response ac...,TG Booklet 3,412,"District Log, Response Activities, Steps Taken..."
4,Q6,What do Community based surveillance strategie...,Community-based surveillance strategies focus ...,TG Booklet 1,86,"Community-based Surveillance Strategies, Ident..."
...,...,...,...,...,...,...
743,Q413,Which section of the guidelines provides a des...,Section 11.0 of these 3rd Edition Malawi IDSR ...,TG Booklet 3,376,"Control Measures Description, Priority Disease..."
744,Q626,"Does MEF stand for an abbreviation in the TG, ...",Medical Teams International,TG Booklet 6,106,Medical Teams International
745,Q1141,In what ways do the verification and documenta...,"In emergency contexts, verification and docume...",TG Booklet 5,105-106,"Verification, Documentation, Early Warning, Em..."
746,Q331,What role does the examination of burial cerem...,Examining burial ceremonies helps identify pot...,TG Booklet 3,287,"Burial Ceremonies Examination, Exposure, Trans..."


In [5]:
# Questions to answer, read from the working directory.
test = pd.read_csv(filepath_or_buffer="Test.csv")
test

Unnamed: 0,ID,Question Text
0,Q4,"What is the definition of ""unusual event"""
1,Q5,What is Community Based Surveillance (CBS)?
2,Q9,What kind of training should members of VHC re...
3,Q10,What is indicator based surveillance (IBS)?
4,Q13,What is Case based surveillance?
...,...,...
494,Q1229,Where should completeness be evaluated in the ...
495,Q1230,Which dimensions of completeness are crucial i...
496,Q1236,How can the completeness of case reporting be ...
497,Q1239,Where should completeness and timeliness of re...


In [6]:
# Use the question stored under index label 0 as a running example prompt.
question = test.loc[0, "Question Text"]
question

'What is the definition of "unusual event"'

In [7]:
# Hugging Face Hub identifier of the chat model to load.
model_id = 'meta-llama/Llama-2-13b-chat-hf'

# Use the currently selected CUDA device when one is available, else the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Quantization settings so the large model loads with less GPU memory:
# 4-bit NF4 weights, double quantization, bfloat16 compute dtype.
_bnb_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)
bnb_config = transformers.BitsAndBytesConfig(**_bnb_options)

In [8]:
# Load the model configuration, the quantized model and its tokenizer with Transformers.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
)
# trust_remote_code is intentionally NOT enabled: the Llama architecture ships
# with transformers itself, so there is no need to allow code from the Hub
# repository to be executed locally.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,  # quantization settings defined in bnb_config
    device_map='auto',               # let accelerate decide where the weights go
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [9]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# NOTE(review): `model` is passed as an object that was loaded with its own
# quantization config and device map; confirm that torch_dtype/device_map
# still have any effect here.
query_pipeline = transformers.pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
    torch_dtype=torch.float16,
)

In [10]:
def test_model(tokenizer, pipeline, prompt_to_test):
    sequences = pipeline(
        prompt_to_test,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,)
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

In [11]:
# Baseline: ask the raw model (no retrieval) the example question.
print(question)
test_model(tokenizer=tokenizer, pipeline=query_pipeline, prompt_to_test=question)

What is the definition of "unusual event"
Result: What is the definition of "unusual event" in the context of the NRC's regulations?
A: The definition of "unusual event" in the context of the NRC's regulations can be found in 10 CFR 50.72(b), which states that an "unusual event" is any event that:

1. Is not expected to occur, or
2. Is not a normal operating condition, or
3. Is not a planned maintenance or testing activity, or
4. Is not a result of a design basis event or a natural phenomena, and
5. Could potentially have a significant impact on the safe operation of the facility or the health and safety of the public.

This definition encompasses a wide range of events that could potentially affect the safe operation of a nuclear facility, including equipment failures, human errors, and natural disasters. The NRC


###### Testing the Model with LangChain's HuggingFacePipeline Wrapper

In [12]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline LLM class.
llm = HuggingFacePipeline(pipeline=query_pipeline)
# Calling the LLM object directly is deprecated in LangChain; invoke() is the
# supported entry point and returns the generated text.
llm.invoke(question)

  warn_deprecated(


' in the context of the NRC\'s regulations?\nAnswer: The definition of "unusual event" in the context of the NRC\'s regulations can be found in 10 CFR 50.72(b), which states:\n\n"Unusual event means any event or situation that is not routine and that may have the potential to adversely affect the safe operation of the facility or the health and safety of the public."\n\nThis definition encompasses a wide range of events or situations that may be considered unusual or abnormal, such as equipment failures, natural disasters, cyber attacks, or other unforeseen events that could potentially impact the safe operation of a nuclear facility. The determination of whether an event is unusual is based on the specific circumstances of each situation and the potential for the event to have a significant impact on the safe operation of the facility or the health and safety of the public.'

In [13]:
# Folder holding the textbook booklets exported as Excel workbooks.
books_path = "MWTGBookletsExcel"
# os.listdir returns entries in arbitrary order and includes everything in the
# folder (e.g. .ipynb_checkpoints, Excel "~$" lock files). Keep only real
# workbooks and sort them so the document order is reproducible across runs.
booklets = sorted(
    name
    for name in os.listdir(books_path)
    if name.lower().endswith((".xlsx", ".xls")) and not name.startswith(("~$", "."))
)

In [14]:
# Build one Excel loader per booklet, then gather every loaded document in one flat list.
loaders = []
for booklet_name in booklets:
    loaders.append(UnstructuredExcelLoader(f"{books_path}/{booklet_name}"))
docs = [document for loader in loaders for document in loader.load()]

In [15]:
# Cut the loaded documents into chunks (size 1000, overlap 20) for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
all_splits = text_splitter.split_documents(documents=docs)

In [16]:
# Embedding model used to turn text chunks into vectors for similarity search.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Run the embedding model on the GPU when one is available and fall back to the
# CPU otherwise, instead of unconditionally requesting "cuda".
model_kwargs = {"device": "cuda" if cuda.is_available() else "cpu"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [17]:
# Create the vector DB, or reuse the persisted one.
# Chroma.from_documents adds the chunks to whatever collection already lives in
# persist_directory, so re-running this cell would store every chunk again and
# make retrieval return duplicates. Reuse the persisted store when it exists;
# delete the directory to force a rebuild after changing the booklets,
# the splitter settings or the embedding model.
PERSIST_DIR = "chroma_db"
if os.path.isdir(PERSIST_DIR) and os.listdir(PERSIST_DIR):
    vectordb = Chroma(persist_directory=PERSIST_DIR, embedding_function=embeddings)
else:
    vectordb = Chroma.from_documents(
        documents=all_splits,
        embedding=embeddings,
        persist_directory=PERSIST_DIR,
    )

In [18]:
# Retrieve the top-k (k=1) most relevant splits; combined with the prompt
# (question) they give a RAG-based answer.
# The retriever only honours k when it is passed through search_kwargs; a bare
# `k=1` keyword does not reach the similarity search, so the default number of
# documents would be retrieved instead.
TOP_K = 1
retriever = vectordb.as_retriever(search_kwargs={"k": TOP_K})

# "stuff" chain type: the retrieved chunks are placed directly into the prompt.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
)


In [19]:
# Keep only the first test row so the inference loop below finishes quickly.
test = test.iloc[:1]

In [20]:
# Answer every test question with the RAG chain and record, for each answer,
# the booklet file whose chunk is most similar to that answer.
results = []
sources = []
for question in tqdm(test["Question Text"]):
    # Chain.run is deprecated; invoke() returns a dict whose "result" entry
    # holds the generated answer text.
    answer = qa.invoke({"query": question})["result"]
    # Keep the first non-empty line, so an answer that starts with a newline
    # does not collapse to an empty string.
    result = next((line for line in answer.split("\n") if line.strip()), "")
    # Use a dedicated name here: `docs` holds the loaded booklets and must not
    # be overwritten, or re-running the splitting cell would chunk search hits.
    # Only the best match is used, so ask for a single hit.
    matches = vectordb.similarity_search(result, k=1)
    # Guard against an empty search result instead of raising IndexError.
    source = os.path.basename(matches[0].metadata["source"]) if matches else None

    results.append(result)
    sources.append(source)

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m


100%|██████████| 1/1 [00:31<00:00, 31.99s/it]


[1m> Finished chain.[0m





In [21]:
# Show the collected answers (one entry per question processed by the loop).
results

[' The definition of "unusual event" is not specified in the provided text. However, the text mentions that an event is defined as "a manifestation of disease, or an occurrence that creates a potential for disease" and that it may be insignificant or could be a significant occurrence, planned or unplanned. It also mentions that an event may be an emergency incident or occurrence that may impact the safety and security of communities. Therefore, an unusual event would likely be an event that is not commonly seen or expected, such as an outbreak of a rare disease or an unexpected increase in the number of people falling ill.']

In [22]:
# Show the source file name recorded for each answer, in the same order as `results`.
sources

['TG Booklet 6.xlsx']