# 0. Parameters

In [1]:
# --- corpus locations ---
# document_path is searched recursively for raw *.txt files (section 1.2);
# cleaned copies are written to cleaned_document_path and reloaded in section 2.1;
# embedding_store_path is the persist directory of the Chroma store
document_path = "./digitized_gazette"
cleaned_document_path = "./cleaned"
embedding_store_path = "./embeddings"
# --- LLM and retrieval settings ---
# model_name is handed to GPT4All; chunk_size / chunk_overlap are handed to the text splitter;
# chunk_count is the number of contexts the retriever returns per question ("k")
model_name = "Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf"
chunk_size = 1200
chunk_overlap = 300
chunk_count = 5
# if chunk_size * chunk_count + question_size > 8000, the llm will crash
# processing time is also affected, proportional to chunk_size * chunk_count + question_size

# --- evaluation settings ---
# random_seed is passed to random.seed() before every sampling step
random_seed = 0
# question_contexts_filepath is written in section 4.1;
# reference_answers_filepath is read for its "Q: " and "A: " lines;
# generated answers are saved under generated_answers_path in "saga" and "caqa" subfolders
question_contexts_filepath = "./question_contexts.txt"
reference_answers_filepath = "./reference_answers.txt"
generated_answers_path = "./generated_answers"
# caqa_max is the number of ChroniclingAmericaQA examples sampled in section 4.2
caqa_max = 500
# 1-based, inclusive (first, last) ranges of questions to generate answers for
test_subset = {
    "saga": (1, 100),
    "caqa": (1, caqa_max)
}
# caqa_filepath is the full ChroniclingAmericaQA JSON; caqa_subset_filepath is the sampled subset saved from it
caqa_filepath = "./test.json"
caqa_subset_filepath = "./caqa_subset.json"

## 0.1. Setup

In [2]:
# install or upgrade the third-party packages imported by this notebook
# NOTE(review): versions are not pinned, so a later run may pull different releases — consider pinning
%pip install --upgrade langchain langchain-community langchainhub gpt4all langchain-chroma nltk pyside6

Note: you may need to restart the kernel to use updated packages.


In [10]:
from langchain import hub
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.llms import GPT4All
from langchain_community.utils.math import cosine_similarity
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from nltk.metrics import scores
from pathlib import Path
from PySide6 import QtCore, QtWidgets
from statistics import mean, median, stdev
from time import perf_counter_ns
import json, nltk, random, re, time

In [3]:
# download the NLTK "punkt" data; presumably required by nltk.tokenize.word_tokenize used below — TODO confirm
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 1. Text Cleaning

## 1.1. Regular Expression Creation

In [4]:
# Two compiled patterns drive the cleaning:
#   1. any run of characters outside the valid set [\s!-;?A-\]_a-z‘’“”—£] (replaced by one space)
#   2. a whitespace character followed by more whitespace (only the first character is kept)
remove_invalid_regex = re.compile(r'[^\s!-;?A-\]_a-z‘’“”—£]+')
remove_excess_spaces_regex = re.compile(r'(\s)\s+')


def clean_str(to_clean_str):
    """Return `to_clean_str` with invalid characters blanked out and whitespace runs collapsed.

    Each run of invalid characters becomes a single space; afterwards every run of
    two or more whitespace characters is reduced to its first character, so a run
    that starts with a newline stays a newline.
    """
    without_invalid = re.sub(remove_invalid_regex, ' ', to_clean_str)
    collapsed = re.sub(remove_excess_spaces_regex, '\\1', without_invalid)
    return collapsed

In [5]:
# test the regular expression
# expected: the "¬" becomes a space, the run of spaces collapses to one space,
# and the four consecutive newlines collapse to a single newline
to_clean = "This text contains¬“special” characters that need   to be removed,\n\n\n\nsometimes across multiple lines."
print(clean_str(to_clean))

This text contains “special” characters that need to be removed,
sometimes across multiple lines.


## 1.2. Clean Documents

In [14]:
# Clean every raw document and collect file/token counts per year and overall.
# The year is taken from the name of the folder that directly contains each file.
statistics = {"total": {"files": 0, "tokens": 0}}
tokenizer = nltk.tokenize.word_tokenize

# open(..., 'w') does not create missing folders, so make sure the output folder exists
cleaned_dir = Path(cleaned_document_path)
cleaned_dir.mkdir(parents=True, exist_ok=True)

# rglob() yields files in a filesystem-dependent order; sorting keeps the table order stable
for file_path in sorted(Path(document_path).rglob('*.txt')):
    year = file_path.parent.stem
    if year not in statistics:
        statistics[year] = {"files": 0, "tokens": 0}

    statistics[year]["files"] += 1
    statistics["total"]["files"] += 1

    # prefix the year so files with the same name in different year folders do not collide
    cleaned_filename = year + '-' + file_path.name
    with file_path.open(encoding='utf-8', errors='ignore') as to_clean_file:
        to_clean_str = to_clean_file.read()
        cleaned_str = clean_str(to_clean_str)
    with open(cleaned_dir / cleaned_filename, 'w', encoding='utf-8') as cleaned_file:
        cleaned_file.write(cleaned_str)

    # token counts are taken from the raw text, i.e. before cleaning
    len_tokens = len(tokenizer(to_clean_str))
    statistics[year]["tokens"] += len_tokens
    statistics["total"]["tokens"] += len_tokens

In [15]:
# tab-separated table of the statistics gathered above: one header row,
# then one row per key of `statistics` (the "total" entry included)
print("Year\tFiles\tTokens")
for label, counts in statistics.items():
    row = (label, counts["files"], counts["tokens"])
    print(*row, sep="\t")

Year	Files	Tokens
total	249	4178719
1907	11	248237
1919	22	226873
1920	27	256913
1922	12	252174
1926	12	280533
1927	11	214923
1928	12	121251
1929	6	93030
1930	12	271482
1931	11	246402
1932	12	189186
1933	12	112095
1934	11	65673
1935	12	187264
1936	12	266664
1937	12	210850
1939	1	13978
1941	1	16862
1947	12	260970
1948	12	275835
1949	1	28910
1951	1	18168
1953	10	212656
1958	4	107790


# 2. Embedding Database Creation

## 2.1. Load Documents

In [4]:
# Load every cleaned text file as LangChain document(s).
# glob() yields files in a filesystem-dependent order; sorting fixes the order of `docs`,
# which the seeded random sampling in section 4.1 relies on to be reproducible.
docs = []
for file_path in sorted(Path(cleaned_document_path).glob('*.txt')):
    loader = TextLoader(file_path, autodetect_encoding=True)
    docs.extend(loader.load())

In [5]:
# verify documents have been loaded properly
# prints the character count of the first document and its first 5000 characters
print("Length:", len(docs[0].page_content))
print("Excerpt:", docs[0].page_content[:5000])

Length: 93576
Excerpt: 
VOL. XXXVII.
No. 495.
Price	THURSDAY, APRIL 4, 1907.	10 cents.
CONTENTS.
BIIITH.
ORDERS:Sailors taking dischargeAdulteration of Gutta .IimgknrLeper CampFirearms.
NOTIFICATIONS :Outstation Money Orders-Postcards -Money Orders Advice of Payment.
NOTICES :- Bankruptcy Court (Kuching)Probate (Simang- gang)To Master? of vessels entering SadongBankruptcy (Bau)	Bankruptcy (Kabong)Sale of Land Grant No.
800Wanted, Land Grant No. 842 of 3rd March 1907 Sale of Land Grants 684 and 792.
STRAITS SETTLEMENTS COINAGE PROCLAMATIONAPPOINTMENTS SARAWAK RACKS. 1907 THE ADVANTAGES OF A GOLD COINAGE THE CHINESE INSTITUTECLAY PIGEON SHOOTING AT Smu Ova NOTES.
MONTHLY REPORTS:Lower Rejnng-Baram--Lundu- -Sadong Ma tang EstateBintnluBauKalakaSimanggang M ukaOyn LinibangIAiwasKapit.
ADVERTISEMENTSPASSENGER LISTTRADE RETURNS- SHIPPINGALMANACMETEOROLOGICAL READINGS.
The Sarawak Gazette.
THURSDAY, APRIL 4, 1007.
Birth.
WADDELLAt Cheriton," Singapore, on the 20th March, the wife of Capt. WAD

## 2.2. Split Documents

In [6]:
# split every loaded document into overlapping chunks for embedding;
# chunk_size and chunk_overlap come from the parameters cell (section 0)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True)
docs_splits = text_splitter.split_documents(docs)

In [7]:
# verify documents have been split properly
# NOTE: indexes 10000 and 10001 are hard-coded, so this check needs more than 10001 chunks
print("Length:", len(docs_splits))
print("\nExcerpt 1:", docs_splits[10000].page_content)
print("\nExcerpt 2:", docs_splits[10001].page_content)

Length: 23967

Excerpt 1: Forestry.—Forest Ranger Abang Daud reports that various quantities of billian timber have been worked and left in the Krangan area. Notices have been issued that this will be seized and auctioned if the owners do not lay their claims within a specified time limit.
Travelling.—I left, on the 18th intending to visit the Sea Dyaks in the Ulu Sebangan. At Kepayang however, it was found that there was no water in the river and no path through the jungle. The visit was abandoned and I returned to the station on the 21st.	Marine.—Two experts from the Government Workshop arrived to attend to m.l. Vava. They could not improve matters, however, and the launch was shipped to Kuching for overhaul.
General.—The Acting Resident, Mr. J. C. Swayne visited the station from the 42th to the 15th, and settled all outstanding matters requiring his attention.	The 24th, being the anniversary of Her Highness the Ranee's birthday, the offices were closed and the flagstaff dressed.
Two

## 2.3. Create Embeddings

In [3]:
# embedding model used to build the vector store and, in section 5, to embed answers for cosine similarity
gpt4all_embedder = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf", gpt4all_kwargs={"allow_download": True})

In [9]:
# embed every chunk with gpt4all_embedder and store the vectors in a Chroma database
# NOTE(review): re-running this cell against an existing store may add the same chunks again — confirm before re-running
embeddings = Chroma.from_documents(
    documents=docs_splits,
    embedding=gpt4all_embedder,
    persist_directory=embedding_store_path
)
# because persist_directory is specified, embeddings are automatically saved

# 3. Use the Model

In [3]:
# load the saved embedder and get the Chroma database created earlier
# DO NOT run this block if the above block has already been run
# the embedder uses the same model_name as in section 2.3, where the store was built
gpt4all_embedder = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf", gpt4all_kwargs={"allow_download": True})
embeddings = Chroma(embedding_function=gpt4all_embedder, persist_directory=embedding_store_path)

## 3.1. Context Retriever

In [4]:
# create the retriever that grabs the most relevant contexts
# "similarity" is cosine similarity
# "k": n means that the n most similar contexts are plucked (here n = chunk_count)
retriever = embeddings.as_retriever(search_type="similarity", search_kwargs={"k": chunk_count})

In [5]:
# verify retriever
# the number of returned contexts should equal chunk_count; the first one is printed as the best match
retrieved_contexts = retriever.invoke("What events have happened in Limbang?")
print("Contexts:", len(retrieved_contexts))
print("Best Match:", retrieved_contexts[0].page_content)

Contexts: 5
Best Match: and falsification of books. The Resident, Fourth Division, left Miri on Feb ruary 18th for a tour in the old Fifth Division, returning on the 28th. During the course of this tour he visited Limbang, Lawas, Trusan, Sundar, Brooketon. Brunei and Labuan. Nothing of much importance was brought forward except the difficulty of keeping up Government roads, grounds and buildings on the reduced votes. Limbang Bazaar appeared to be flourishing and full of people and goods, and Lawas Bazaar has built a substantial school. Lawas and Limbang rubber estates are doing well. The District Officer, Limbang and Lawas. reports that Court work was exceedingly light during the month, and no crime of any importance was reported or detected in his district. Very few Dayaks or up-river natives visited Limbang during February, being all busy with their padi-harvests. Reaping was in full swing in both hill and swamp land by the end of the month. The pepper crop was ripening by the end of

## 3.2. Prompt Building

In [6]:
# fetch the "rlm/rag-prompt" template via the LangChain hub; it takes 'context' and 'question' inputs
# NOTE(review): presumably needs network access on every fresh run — confirm
prompt = hub.pull("rlm/rag-prompt")
print(prompt)

input_variables=['context', 'question'] metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]


In [7]:
# load the LLM interface
# model_name is set in the parameters cell; allow_download=True presumably lets GPT4All
# fetch the model file when it is missing — TODO confirm
llm = GPT4All(model=model_name, allow_download=True)

In [8]:
# The retriever hands back a list of context documents, but the prompt needs a
# single string, so the contexts are glued together with `context_sep`.
context_sep = "\n\n"


def format_docs(docs):
    """Concatenate the `page_content` of every document in `docs`, separated by `context_sep`.

    An empty `docs` yields an empty string.
    """
    pieces = []
    for doc in docs:
        pieces.append(doc.page_content)
    merged_context = context_sep.join(pieces)
    return merged_context

# this links all of the above into a single chain
# the chain's input is the raw question string: it is sent to the retriever for contexts
# and also forwarded unchanged as the prompt's "question"
rag_chain = (
    # question = user question
    # context = retriever over user question, then use format_docs on the retriever's outputs
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    # pass both to the prompt, filling it up with the question and context
    | prompt
    # pass the prompt to the LLM
    | llm
    # get the string from the LLM's output
    | StrOutputParser()
)

## 3.3. Wrapping Up
The documentation of LangChain notes that it is possible to add chat history into the application, but this results in two separate calls. Chat history is out-of-scope based on the criteria outlined in the research questions.

Instead, this section ends by creating a function that wraps the chain built above.

In [9]:
def rag(question):
    """Answer `question` with the retrieval-augmented chain and return the full answer text.

    The chain is consumed as a stream and the chunks are concatenated as they are.
    Streamed chunks already carry their own whitespace, so they must be joined with
    an empty separator; joining with a space would insert spurious spaces whenever
    the LLM streams more than one chunk.
    """
    return "".join(rag_chain.stream(question))

In [16]:
# verify chain
# smoke test: run one question through retrieval and generation, then print the answer
print(rag("What is the current situation of mines in Bau District, August 1948, and what suggestions does Mr. Richards have for them to show a profit?"))

 In August 1948, mines in Bau District were producing gold and showing promise with new developments. The Bidi Gold Mining Company was obtaining actual gold during April 1932, while the Ban Lee Mine had reports of production but showed a Nil Return. Two more mines opened by Ng Kui Hiung and Lee Tung Sen also showed potential for being relatively rich in ores. Mr. Richards suggests that mining companies should make genuine efforts to recommence operations within two years as the Supreme Council resolved upon certain measures, such as remission of rents and continuance of royalty on gold won, to assist the rehabilitation of the mining industry.


# 4. Evaluation Dataset & Generated Answer Creation
Section 3 *must* be run before this section! Section 4.1 additionally needs `docs` from Section 2.1 to be loaded.

## 4.1. Sarawak Gazette

In [17]:
# split the documents again into larger chunks (chunk_size=5000) that serve as question contexts
# requires `docs` from section 2.1 to be loaded in this kernel
text_splitter_qg = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=300, add_start_index=True)
docs_splits_qg = text_splitter_qg.split_documents(docs)

In [18]:
# verify documents have been split properly
# shows how many chunks are available for sampling
len(docs_splits_qg)

4656

In [19]:
# sample 120 random chunks: 100 are needed, plus 20 spares because some chunks might be dirty
random.seed(random_seed) # needed for reproducible results
indexes_to_retrieve = random.sample(range(len(docs_splits_qg)), 120) # an extra 20 is specified here as some chunks might be dirty
# every context is terminated by a '[NEXT EXCERPT]' marker so excerpts can be told apart in the output file
contexts = [docs_splits_qg[i].page_content + '\n\n[NEXT EXCERPT]\n\n' for i in indexes_to_retrieve]

In [20]:
# write the contexts into a file, unfortunately the contexts must be checked manually
# each context already ends with the '[NEXT EXCERPT]' separator, so writelines() needs no extra delimiter
with open(question_contexts_filepath, 'w', encoding='utf-8') as question_contexts_file:
    question_contexts_file.writelines(contexts)

## 4.2. Subset of ChroniclingAmericaQA

In [21]:
# load JSON
# caqa_json is indexed like a list of dicts below; print its size and first entry as a sanity check
with open(caqa_filepath, encoding="utf-8") as testing_file:
    caqa_json = json.load(testing_file)
print(len(caqa_json))
print(caqa_json[0])

24084
{'question': 'How many lots did Thomas Peirce have?', 'answer': '183', 'org_answer': '183', 'para_id': 'New Hampshire_18030125_16', 'context': 'Axivil Roberts, part of lot 180 108 60 Capt. George Walker, 181 140 35 George Townson, 183 48 19 Samuel Snell, 184 36 9 Samuel Waterhouse, 185 24 6 John Parker, 186 36 10 John Davis, 187 45 20 John Cross, 188 15 4 Benjamin Cross, 189 50 13 Widow Gilman, 209 21 8 George Peirce, 209 200 75 Thomas Peirce, 220 185 71 SIXTH RANGE: Col. Henry Sherburne, 241 552 150 Nathaniel Roberts, 249 30 8 Jonathan Patridge, 252 60 18 John Sherburne’s widow, 253 24 7 Edward Wells, 254 120 50 John Collins, 272 60 90 Joseph Simpson, part lot 293 40 12 Abraham Senter, 276 68 24 John Cotton’s children, 278 30 60 Francis Devere, 270 60 23 The hundred acre lots: Ephraim Jackman, part of 6 65 5 Nathaniel Mendum, 13 65 35 Joshua Peirce, 87 105 35 Mark Wentworth, Esq. 38 105 35 Nathaniel Peverly, 49 5 8 James Gray, 41 105 85 The fifty acre lots: Jonathan Hanson, 50 5

In [24]:
# draw caqa_max examples at random (seeded, so the draw is repeatable) and keep
# only the three fields the evaluation needs, in the order context / question / answer
random.seed(random_seed)
indexes_to_retrieve = random.sample(range(len(caqa_json)), caqa_max)
wanted_fields = ("context", "question", "answer")
caqa_dataset = [
    {field: caqa_json[picked_index][field] for field in wanted_fields}
    for picked_index in indexes_to_retrieve
]
print(len(caqa_dataset))
print(caqa_dataset[0])

500
{'context': 'If IMIE uodcuigtinl KxAii.i urs appointed by JL County Commissioners for Hurt turd county, will meet on the ground, On THURSDAY, February 11th, 1868. At 10 o\'clock, A. M., to ascertain and determine whether the public convenience requires a public road to be opened, commencing near Ruff\'s Mill, and passing through the heads of Samuel M. Magraw, Thomas Sutton and Thomas Hendon, to intersect, at a convenient point, the road leading from Schuck’s to Bull\'s Shops. WM T. CLARK, WM. MECHEM. R. JOHN ROUSS. janS Examiners. CARRIAGE AND TIRE BOLTS, AXLE CLIPS, And Cushion Buttons, Carriage Oil Cloths, Linings, PrinuftS In stock of MOORE & NORRIS, novii ltd Ar, Md. Im have selling; ptxfililLg )o\\ Utr-M [r\'V A. Hln .1., Willi apl. Harne, I <.. Kew Y.irk. - F CMIKTIUNI, NEW AND I\'IlEMI.- A \\. ■ I■, T„ £• ‘H, WM.AE MUSIC t POrutAß |MUC>;g. ‘ IlilrhciHk S Hcll-UiuihSurii\'aai Mmir Ir.rthf Mi 11;,,,, " uTA^v m, i. v^ ,ll “l ”f 111.- r,.i,ii,- s,. ,ir CAPTAIN JINKS OF THE HORSE

In [25]:
# save as new JSON file
# the subset is reloaded from this file in sections 4.4 and 5.3
with open(caqa_subset_filepath, 'w', encoding='utf-8') as caqa_subset_file:
    json.dump(caqa_dataset, caqa_subset_file, indent=4)

## 4.3. Generated Answers for Sarawak Gazette

In [26]:
# load up the questions: every line starting with "Q: " holds one question
questions = []
with open(reference_answers_filepath, encoding="utf-8") as testing_file:
    for line in testing_file:
        if line.startswith("Q: "):
            # drop the "Q: " prefix and the trailing newline, which would otherwise
            # become part of the question sent to the LLM
            questions.append(line[3:].strip())

In [27]:
# verify questions are properly loaded
# the count should match the number of "Q: " lines in the reference answers file
print("Questions:", len(questions))

Questions: 100


In [28]:
# feed the questions into the LLM to get the generated answers, and save after each generated answer
file_path = Path(generated_answers_path)
# open(..., 'w') does not create missing folders, so make sure the output folder exists first
saga_answers_path = file_path / "saga"
saga_answers_path.mkdir(parents=True, exist_ok=True)

# test_subset holds a 1-based, inclusive (first, last) range of question numbers
test_subset_saga = test_subset["saga"]
question_subset = questions[(test_subset_saga[0]-1):test_subset_saga[1]]
questions_len = len(question_subset)
start_time_ns = perf_counter_ns()
print("Questions to generate answers for:", questions_len)
print("({} to {})".format(test_subset_saga[0], test_subset_saga[1]))

for index, question in enumerate(question_subset):
    print("Generating answer for question \"", question, "\"...")
    generated_answer = rag(question)

    # files are named after the 1-based question number so section 5.2 can load them back in order
    with open(saga_answers_path / (str(index + test_subset_saga[0]) + '.txt'), 'w', encoding='utf-8') as generated_answer_file:
        generated_answer_file.write(generated_answer)

    print("Done. Questions Left:", questions_len - index - 1)
    # remaining time = average time per finished question * questions still to do
    time_elapsed_ns = perf_counter_ns() - start_time_ns
    time_estimated_ns = time_elapsed_ns / (index + 1) * (questions_len - (index + 1))
    # 6e10 nanoseconds = 1 minute
    print("Minutes Left: {:.2f}".format(time_estimated_ns / 6e10))

Questions to generate answers for: 100
(1 to 100)
Generating answer for question " What was the reason for issuing notices prohibiting strangers from collecting fruit in certain Dyak areas?
 "...
Done. Questions Left: 99
Minutes Left: 213.95
Generating answer for question " Who visited Saratok on June 14th, 1933?
 "...
Done. Questions Left: 98
Minutes Left: 214.92
Generating answer for question " What was the cause of the stampede among the cattle in Simanggang?
 "...
Done. Questions Left: 97
Minutes Left: 212.98
Generating answer for question " What were some of the events that occurred in Balingian during May 1927?
 "...
Done. Questions Left: 96
Minutes Left: 212.13
Generating answer for question " What were some of the challenges faced by passengers during their journey from Hong Kong to Swatow in 1927?
 "...
Done. Questions Left: 95
Minutes Left: 212.59
Generating answer for question " What were some of the challenges faced by communities living near rivers such as Simunjan and Ged

## 4.4. Generated Answers for Subset of ChroniclingAmericaQA

In [29]:
# load JSON
# reloads the subset saved in section 4.2, so this section can run without re-sampling
with open(caqa_subset_filepath, encoding="utf-8") as testing_file:
    caqa_dataset = json.load(testing_file)
print(len(caqa_dataset))
print(caqa_dataset[0])

500
{'context': 'If IMIE uodcuigtinl KxAii.i urs appointed by JL County Commissioners for Hurt turd county, will meet on the ground, On THURSDAY, February 11th, 1868. At 10 o\'clock, A. M., to ascertain and determine whether the public convenience requires a public road to be opened, commencing near Ruff\'s Mill, and passing through the heads of Samuel M. Magraw, Thomas Sutton and Thomas Hendon, to intersect, at a convenient point, the road leading from Schuck’s to Bull\'s Shops. WM T. CLARK, WM. MECHEM. R. JOHN ROUSS. janS Examiners. CARRIAGE AND TIRE BOLTS, AXLE CLIPS, And Cushion Buttons, Carriage Oil Cloths, Linings, PrinuftS In stock of MOORE & NORRIS, novii ltd Ar, Md. Im have selling; ptxfililLg )o\\ Utr-M [r\'V A. Hln .1., Willi apl. Harne, I <.. Kew Y.irk. - F CMIKTIUNI, NEW AND I\'IlEMI.- A \\. ■ I■, T„ £• ‘H, WM.AE MUSIC t POrutAß |MUC>;g. ‘ IlilrhciHk S Hcll-UiuihSurii\'aai Mmir Ir.rthf Mi 11;,,,, " uTA^v m, i. v^ ,ll “l ”f 111.- r,.i,ii,- s,. ,ir CAPTAIN JINKS OF THE HORSE

In [30]:
# make a new chain since the retriever is not used
# the input dict (with "question" and "context" keys) is passed straight through to the prompt
caqa_eval_rag_chain = (
    RunnablePassthrough()
    | prompt
    | llm
    | StrOutputParser()
)

In [31]:
def caqa_eval_rag(example):
    """Answer one ChroniclingAmericaQA example using the context that ships with it.

    `example` must provide the "question" and "context" keys; no retrieval takes place.
    The chain is consumed as a stream and the chunks are concatenated as they are.
    Streamed chunks already carry their own whitespace, so they must be joined with
    an empty separator; joining with a space would insert spurious spaces whenever
    the LLM streams more than one chunk.
    """
    question_and_context = {"question": example["question"], "context": example["context"]}
    return "".join(caqa_eval_rag_chain.stream(question_and_context))

In [32]:
# verify chain
# smoke test on the first example of the subset
print("Generated Answer:", caqa_eval_rag(caqa_dataset[0]))

Generated Answer:  The heads of Thomas Hendon were passed through by a road that was being considered for opening, commencing near Ruff's Mill and intersecting at a convenient point with the road leading from Schuck’s to Bull's Shops.


In [34]:
# feed the questions into the LLM to get the generated answers, and save after each generated answer
file_path = Path(generated_answers_path)
# open(..., 'w') does not create missing folders, so make sure the output folder exists first
caqa_answers_path = file_path / "caqa"
caqa_answers_path.mkdir(parents=True, exist_ok=True)

# test_subset holds a 1-based, inclusive (first, last) range of example numbers
test_subset_caqa = test_subset["caqa"]
caqa_subset = caqa_dataset[(test_subset_caqa[0]-1):test_subset_caqa[1]]
caqa_subset_len = len(caqa_subset)
start_time_ns = perf_counter_ns()
print("Questions to generate answers for:", caqa_subset_len)
print("({} to {})".format(test_subset_caqa[0], test_subset_caqa[1]))

for index, example in enumerate(caqa_subset):
    print("Generating answer for question \"", example["question"], "\"...")
    generated_answer = caqa_eval_rag(example)

    # files are named after the 1-based example number so section 5.3 can load them back in order
    with open(caqa_answers_path / (str(index + test_subset_caqa[0]) + '.txt'), 'w', encoding='utf-8') as generated_answer_file:
        generated_answer_file.write(generated_answer)

    print("Done. Questions Left:", caqa_subset_len - index - 1)
    # remaining time = average time per finished example * examples still to do
    time_elapsed_ns = perf_counter_ns() - start_time_ns
    time_estimated_ns = time_elapsed_ns / (index + 1) * (caqa_subset_len - (index + 1))
    # 6e10 nanoseconds = 1 minute
    print("Minutes Left: {:.2f}".format(time_estimated_ns / 6e10))

Questions to generate answers for: 500
(1 to 500)
Generating answer for question " Along with Samuel M. Magraw, Thomas Hendon and Thomas Hendon, who passed through the heads of Samuel M. Magraw? "...
Done. Questions Left: 499
Minutes Left: 509.28
Generating answer for question " When was the Masonic Temple Bonds Office Masonic Trust established? "...
Done. Questions Left: 498
Minutes Left: 448.64
Generating answer for question " Who was the Judge of the Supreme Court? "...
Done. Questions Left: 497
Minutes Left: 440.56
Generating answer for question " Along with Louisiana, Missouri, Iowa, and Minnesota, what state has a portion west of the Mississippi River? "...
Done. Questions Left: 496
Minutes Left: 410.67
Generating answer for question " How often is wheat sold in Chicago? "...
Done. Questions Left: 495
Minutes Left: 389.66
Generating answer for question " How many delegates did Buptem er Hi h 88b elect? "...
Done. Questions Left: 494
Minutes Left: 429.87
Generating answer for ques

# 5. Model Evaluation

## 5.1. Evaluation Preparation

In [31]:
# create a function to calculate metrics
tokenizer = nltk.tokenize.word_tokenize
stemmer = nltk.stem.snowball.SnowballStemmer("english")

def perform_evaluation(generated_answers: list, reference_answers: list) -> None:
    """Print summary statistics comparing generated answers with reference answers.

    For each (generated, reference) pair three metrics are computed:
      * precision and recall over the sets of stemmed tokens of both answers;
      * cosine similarity between the embeddings that `gpt4all_embedder`
        produces for the two answer strings.
    Mean, median, minimum and maximum of every metric are printed; the standard
    deviation is printed only when there is more than one pair.

    Parameters:
        generated_answers: answers produced by the model.
        reference_answers: expected answers, in the same order.

    Raises:
        ValueError: if the two lists differ in length (zip(..., strict=True)).
    """
    precisions = []
    recalls = []
    cosines = []

    for gen_answer, ref_answer in zip(generated_answers, reference_answers, strict=True):
        # stem every token and keep the unique stems of each answer
        gen_set = {stemmer.stem(token) for token in tokenizer(gen_answer)}
        ref_set = {stemmer.stem(token) for token in tokenizer(ref_answer)}

        # nltk returns None instead of a number when the generated set (precision)
        # or the reference set (recall) is empty; count that as 0.0 so the
        # summary statistics below do not fail on a None entry
        precision = scores.precision(ref_set, gen_set)
        recall = scores.recall(ref_set, gen_set)
        precisions.append(0.0 if precision is None else precision)
        recalls.append(0.0 if recall is None else recall)

        # embed answers
        gen_embedding = gpt4all_embedder.embed_query(gen_answer)
        ref_embedding = gpt4all_embedder.embed_query(ref_answer)

        # cosine_similarity works on lists of vectors, hence the wrapping and the [0][0]
        cosines.append(cosine_similarity([gen_embedding], [ref_embedding])[0][0])

    print("\tPrecision\tRecall\t\tCosine Similarity")
    print("Mean\t{:.6f}\t{:.6f}\t{:.6f}".format(mean(precisions), mean(recalls), mean(cosines)))
    print("Median\t{:.6f}\t{:.6f}\t{:.6f}".format(median(precisions), median(recalls), median(cosines)))
    print("Minimum\t{:.6f}\t{:.6f}\t{:.6f}".format(min(precisions), min(recalls), min(cosines)))
    print("Maximum\t{:.6f}\t{:.6f}\t{:.6f}".format(max(precisions), max(recalls), max(cosines)))
    # stdev() needs at least two data points
    if len(generated_answers) > 1:
        print("St.Dev.\t{:.6f}\t{:.6f}\t{:.6f}".format(stdev(precisions), stdev(recalls), stdev(cosines)))

In [32]:
# verify precision is calculated properly
# every generated answer only uses words that also occur in its reference answer,
# so precision should be 1.0 while recall stays below 1.0
perform_evaluation(
    [
        "These examples will have 100% precision.",
        "This is because excess tokens are not present."
    ],
    [
        "These examples will have 100% average precision over two examples.",
        "This is because excess tokens that are not in the reference answer are not present."
    ]
)

	Precision	Recall		Cosine Similarity
Mean	1.000000	0.685065	0.849552
Median	1.000000	0.685065	0.849552
Minimum	1.000000	0.642857	0.808039
Maximum	1.000000	0.727273	0.891066
St.Dev.	0.000000	0.059691	0.058708


In [33]:
# verify recall is calculated properly
# every reference answer only uses words that also occur in its generated answer,
# so recall should be 1.0 while precision stays below 1.0
perform_evaluation(
    [
        "These examples will have 100% average recall over two examples.",
        "This is because there are no tokens that are present in the reference answer that are also absent in the generated answer."
    ],
    [
        "These examples will have 100% recall.",
        "This is because there are no tokens that are absent."
    ]
)

	Precision	Recall		Cosine Similarity
Mean	0.657754	1.000000	0.722244
Median	0.657754	1.000000	0.722244
Minimum	0.588235	1.000000	0.559506
Maximum	0.727273	1.000000	0.884983
St.Dev.	0.098314	0.000000	0.230147


In [34]:
# verify cosine similarity is calculated properly
# NOTE: the backslash at the end of a line inside a string literal continues the literal,
# so the indentation of the following line becomes part of the string
perform_evaluation(
    [
        "Perfect cosine similarity is practically impossible to demonstrate, \
        but two sentences with different words that mean the same should yield higher values for this metric than precision and recall.",
    ],
    [
        "It is impractical to show flawless cosine similarity, \
        but this metric should give higher values than recall & precision to two sentences with similar meanings, even if the words are dissimilar."
    ]
)

	Precision	Recall		Cosine Similarity
Mean	0.677419	0.677419	0.930462
Median	0.677419	0.677419	0.930462
Minimum	0.677419	0.677419	0.930462
Maximum	0.677419	0.677419	0.930462


## 5.2. Testing Over Sarawak Gazette QA Dataset

In [35]:
# load reference answers: every line starting with "A: " holds one answer
reference_answers = []
with open(reference_answers_filepath, encoding="utf-8") as testing_file:
    for line in testing_file:
        if line.startswith("A: "):
            # drop the "A: " prefix and the trailing newline so that only the
            # answer text itself is tokenized and embedded
            reference_answers.append(line[3:].strip())
# verify reference answers are properly loaded
print("Reference Answers:", len(reference_answers))
print("Example:", reference_answers[0])

Reference Answers: 100
Example: The reason for issuing notices prohibiting strangers from collecting fruit in certain Dyak areas was because they could not afford to offer them free board and lodging this season as is their usual custom, and they hoped to raise some ready cash by trading surplus fruit in the bazaars themselves, to augment their shortage of padi.



In [36]:
# read the saved answers back in question order: <generated_answers_path>/saga/1.txt, 2.txt, ...
# one file is expected for every reference answer
file_path = Path(generated_answers_path)
saga_answers_dir = file_path / "saga"
generated_answers = [
    (saga_answers_dir / (str(answer_number) + '.txt')).read_text(encoding='utf-8')
    for answer_number in range(1, len(reference_answers) + 1)
]
# sanity check: count and first answer
print("Generated Answers:", len(generated_answers))
print("Example:", generated_answers[0])

Generated Answers: 100
Example:  The reason for issuing notices prohibiting strangers from collecting fruit in certain Dyak areas was to protect the ownership rights of the local Dyaks over their fruit trees and prevent disputes arising from outsiders encroaching on their lands or planting crops, such as rubber gardens, near their fruit groves.


In [37]:
# evaluate
# prints the metric summary table for the Sarawak Gazette answers loaded above
perform_evaluation(generated_answers, reference_answers)

	Precision	Recall		Cosine Similarity
Mean	0.487864	0.430915	0.678473
Median	0.494186	0.407077	0.719342
Minimum	0.000000	0.000000	-0.066291
Maximum	0.942857	0.938776	0.981141
St.Dev.	0.196458	0.199531	0.218081


## 5.3. Testing Over ChroniclingAmericaQA Dataset

In [38]:
# reload the saved ChroniclingAmericaQA subset and pull out just the "answer" field
# of every example, keeping the order of the file
with open(caqa_subset_filepath, encoding="utf-8") as testing_file:
    caqa_dataset = json.load(testing_file)
reference_answers = [entry["answer"] for entry in caqa_dataset]

print("Reference Answers:", len(reference_answers))
print("Example:", reference_answers[0])

Reference Answers: 500
Example: Thomas Sutton


In [39]:
# read the saved answers back in example order: <generated_answers_path>/caqa/1.txt, 2.txt, ...
# one file is expected for every reference answer
file_path = Path(generated_answers_path)
caqa_answers_dir = file_path / "caqa"
generated_answers = [
    (caqa_answers_dir / (str(answer_number) + '.txt')).read_text(encoding='utf-8')
    for answer_number in range(1, len(reference_answers) + 1)
]
# sanity check: count and first answer
print("Generated Answers:", len(generated_answers))
print("Example:", generated_answers[0])

Generated Answers: 500
Example:  The heads of Thomas Hendon were passed through by a road that was being considered for opening, commencing near Ruff's Mill and intersecting at a convenient point with the road leading from Schuck’s to Bull's Shops.


In [40]:
# evaluate
# prints the metric summary table for the ChroniclingAmericaQA answers loaded above
perform_evaluation(generated_answers, reference_answers)

	Precision	Recall		Cosine Similarity
Mean	0.076981	0.645786	0.311018
Median	0.058824	1.000000	0.310754
Minimum	0.000000	0.000000	-0.128224
Maximum	0.800000	1.000000	0.818458
St.Dev.	0.087765	0.448431	0.186698


# 6. User Interface
Section 3 *must* be run before this section!

In [13]:
class MyWidget(QtWidgets.QDialog):
    """Minimal chat dialog: a transcript label, a one-line input box and a Send button.

    `invoke_func` is called with the question text and must return the answer as a
    string. When it is None, questions are only echoed into the transcript.
    """
    start_message = "SYSTEM: Hello! I am a chatbot designed to answer questions related to the Sarawak Gazette. Type your question in the text box below."

    def __init__(self, invoke_func = None):
        super().__init__()
        self.invoke_func = invoke_func

        self.texts = QtWidgets.QLabel(self.start_message)
        self.text_input = QtWidgets.QLineEdit()
        self.button = QtWidgets.QPushButton("Send")
        self.button.clicked.connect(self.send_user_input)
        self.button.clicked.connect(self.text_input.clear)

        # NOTE(review): this attribute shadows the inherited QWidget.layout() method;
        # it is kept under the same name so existing users of `widget.layout` keep working
        self.layout = QtWidgets.QVBoxLayout()
        self.layout.addWidget(self.texts)
        self.layout.addWidget(self.text_input)
        self.layout.addWidget(self.button)
        self.setLayout(self.layout)

    @QtCore.Slot()
    def send_user_input(self):
        """Append the typed question to the transcript and, if possible, the model's answer."""
        question_str = self.text_input.text()
        self.texts.setText(self.texts.text() + "\nHUMAN: " + question_str)
        self.text_input.setText("")

        # without a backend there is nothing to wait for, so leave the button untouched
        if not self.invoke_func:
            return

        self.button.setText("Waiting for response, program may hang during this time...")
        # disable the button so a click queued during the wait cannot re-enter this slot
        self.button.setEnabled(False)
        # invoke_func blocks the event loop; flush pending events first so the
        # question and the "Waiting" label are actually painted
        QtWidgets.QApplication.processEvents()
        try:
            llm_response = self.invoke_func(question_str)
            self.texts.setText(self.texts.text() + "\nSYSTEM: " + llm_response)
        finally:
            # always restore the button, even if invoke_func raised
            self.button.setEnabled(True)
            self.button.setText("Send")

In [14]:
if __name__ == '__main__':
    # Qt allows only one QApplication per process; reuse the existing one when this
    # cell is re-run in the same kernel instead of failing on a second construction
    app = QtWidgets.QApplication.instance() or QtWidgets.QApplication([])

In [None]:
if __name__ == '__main__':
    # open the chat dialog with `rag` as its answer function and run the Qt event loop
    # (app.exec() is called after show(), so this cell keeps running while the UI is in use)
    widget = MyWidget(rag)
    widget.show()
    app.exec()