## 1. Data Preparation

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512,
                                               chunk_overlap=20)

In [24]:
from langchain.document_loaders import PyMuPDFLoader

document = PyMuPDFLoader("./data/wordWarData.pdf")
loaded_doc = document.load_and_split(text_splitter=text_splitter)
splitted_doc = [doc.page_content for doc in loaded_doc]
doc_ids = list(range(0, len(loaded_doc)))

In [5]:
splitted_doc

['A Brief Description over World War 1 and 2 \n \nWorld War 1 and World War 2: A Historical Overview \nWorld War 1 (WW1) and World War 2 (WW2) stand as two \nof the most consequential conflicts in modern history, \nprofoundly shaping global politics, societies, and economies. \nThese wars, spanning from 1914 to 1918 and 1939 to 1945 \nrespectively, involved numerous nations and had far-reaching \nimpacts on military strategy, technology, and international \nrelations. \nWorld War 1',
 'World War 1 \nCauses and Major Events: World War 1 erupted in 1914 \nfollowing the assassination of Archduke Franz Ferdinand of \nAustria-Hungary by a Serbian nationalist. This event triggered \na chain reaction of alliances and declarations of war, leading \nto the formation of two main opposing alliances: the Allies \nand the Central Powers. The Allies included France, Britain, \nRussia, and later the United States, while the Central Powers \ncomprised Germany, Austria-Hungary, and the Ottoman \nEmpire

In [6]:
from langchain.embeddings import FastEmbedEmbeddings

embedding_model = FastEmbedEmbeddings(
    model_name="BAAI/bge-small-en-v1.5"
)

vectors = embedding_model.embed_documents(splitted_doc)
EMBEDDING_DIM = len(vectors[0])
print(EMBEDDING_DIM)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

384


## 2. Storing Vectors in the Database

In [7]:
from qdrant_client import QdrantClient

client = QdrantClient(":memory:")

In [8]:
from qdrant_client.models import Distance, VectorParams

client.delete_collection(collection_name="cf_data")
client.create_collection(
    collection_name="cf_data",
    vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE)
)

client.upload_collection(
    collection_name="cf_data",
    ids=doc_ids,
    vectors=vectors
)

In [9]:
from dspy.retrieve.qdrant_rm import QdrantRM

qdrant_retriever_model = QdrantRM("cf_data", client, k=2)

## 3. Initializing the Language Model

In [10]:
import dspy
import os
import dotenv

dotenv.load_dotenv()

lm = dspy.GROQ(model='mixtral-8x7b-32768', api_key =os.getenv('GROQ_API_KEY'))


## 4. Configuring the DSPy Module

In [19]:
dspy.settings.configure(rm=qdrant_retriever_model, lm = lm)

def get_context(text):
    query_vector = embedding_model.embed_documents(text)
    print(len(query_vector[0]))
    hits = client.search(
        collection_name="cf_data",
        query_vector=query_vector[0],
        limit = 2
    )
    return hits


hits = get_context("What was the name of the German battleship sunk at the Battle of Jutland?")
hits

384


[ScoredPoint(id=13, version=0, score=0.6545749122138916, payload={}, vector=None, shard_key=None),
 ScoredPoint(id=5, version=0, score=0.646021998507323, payload={}, vector=None, shard_key=None)]

In [18]:
s=''
for x in [splitted_doc[i.id] for i in hits]:
    s = s + x

print(s)

and marked a significant shift in momentum towards Allied 
victory. 
Leaders and Strategies: World War 2 saw the emergence of 
pivotal leaders who shaped the course of the war and its 
aftermath. Winston Churchill, Prime Minister of the United 
Kingdom, provided steadfast leadership during Britain's 
darkest hours and forged strong alliances with the United 
States and the Soviet Union. Franklin D. Roosevelt, President 
of the United States, navigated the complexities of AmericanEnd and Aftermath: World War 1 concluded in 1918 with 
the signing of the Treaty of Versailles, which imposed severe 
reparations and territorial losses on Germany. The war 
resulted in the collapse of several empires, including Austria-
Hungary, the Ottoman Empire, and Tsarist Russia, leading to 
political upheaval and economic instability in Europe. The 
Treaty of Versailles, though aimed at securing peace, sowed 
seeds of resentment and economic hardship in Germany,


## 5. Signatures in DSPy

In [13]:
class GenerateAnswer(dspy.Signature):
    """
    Answer questions with short factoid answers.
    """

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc = "often 1 to 5 word answer")


## 6. Modules in DSPy

In [14]:
class RAG(dspy.Module):
    def __init__(self, num_passages=1):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
    
    def forward(self, question):
        context = get_context(question)w

        prediction = self.generate_answer(context=context,
                                          question=question)
        
        return dspy.Prediction(context=context,
                               answer=prediction.answer)

## 7. Metrics in DSPy

In [18]:
import json

testdata = json.load(open("data/testingData.json"))['example']
testset = [dspy.Example(question=e['question'], answer=e['answer']).with_inputs('question') for e in testdata]

In [19]:
rag = RAG()

from dspy.evaluate import Evaluate
import dspy.evaluate

evaluate_on_qa = Evaluate(
    devset=testset,
    num_threads=1,
    display_progress=True,
    display_table=17
)

metric = dspy.evaluate.answer_exact_match
evaluate_on_qa(rag, metric=metric)

  0%|          | 0/17 [00:00<?, ?it/s]384
Average Metric: 0 / 1  (0.0):   6%|▌         | 1/17 [00:00<00:11,  1.43it/s]384
Average Metric: 1 / 2  (50.0):  12%|█▏        | 2/17 [00:01<00:10,  1.49it/s]384
Average Metric: 1 / 3  (33.3):  18%|█▊        | 3/17 [00:01<00:08,  1.63it/s]384
Average Metric: 1 / 4  (25.0):  24%|██▎       | 4/17 [00:02<00:07,  1.74it/s]384
Average Metric: 1 / 5  (20.0):  29%|██▉       | 5/17 [00:03<00:08,  1.38it/s]384
Average Metric: 1 / 6  (16.7):  35%|███▌      | 6/17 [00:04<00:08,  1.31it/s]384
Average Metric: 1 / 7  (14.3):  41%|████      | 7/17 [00:04<00:06,  1.46it/s]384
Average Metric: 2 / 8  (25.0):  47%|████▋     | 8/17 [00:05<00:06,  1.34it/s]384
Average Metric: 2 / 9  (22.2):  53%|█████▎    | 9/17 [00:06<00:05,  1.48it/s]384
Average Metric: 2 / 10  (20.0):  59%|█████▉    | 10/17 [00:06<00:04,  1.61it/s]384
Average Metric: 2 / 11  (18.2):  65%|██████▍   | 11/17 [00:07<00:03,  1.67it/s]384
Average Metric: 2 / 12  (16.7):  71%|███████   | 12/17 [00:07<00

Unnamed: 0,question,example_answer,context,pred_answer,answer_exact_match
0,Which battle in World War 2 marked the Allied liberation of Paris from German occupation?,Battle of Normandy,"[ScoredPoint(id=13, version=0, score=0.7024385493537302, payload={}, vector=None, shard_key=None), ScoredPoint(id=12, version=0, score=0.7010490222319838, payload={}, vector=None, shard_key=None)]",Belshazzar's Feast,False
1,Who was the leader of Norway during its occupation by Germany in World War 2?,Vidkun Quisling,"[ScoredPoint(id=16, version=0, score=0.665394367954947, payload={}, vector=None, shard_key=None), ScoredPoint(id=14, version=0, score=0.6530836949804866, payload={}, vector=None, shard_key=None)]",Vidkun Quisling,✔️ [True]
2,What was the name of the Allied campaign to recapture the Philippines from Japanese forces in World War 2?,Battle of Leyte Gulf,"[ScoredPoint(id=9, version=0, score=0.5901224523392172, payload={}, vector=None, shard_key=None), ScoredPoint(id=17, version=0, score=0.5771391665277846, payload={}, vector=None, shard_key=None)]","The name of the campaign was the ""Philippine Campaign"" or ""Liberation of the Philippines."" It took place from 1944 to 1945 and was led by...",False
3,Which battle in World War 2 marked the Soviet defeat of German forces in the Battle of Berlin?,Battle of Berlin,"[ScoredPoint(id=13, version=0, score=0.7967509307303187, payload={}, vector=None, shard_key=None), ScoredPoint(id=12, version=0, score=0.7164325404737921, payload={}, vector=None, shard_key=None)]",Atomic bombs Context: [1] «id=13,False
4,Who was the leader of Belgium during its occupation by Germany in World War 2?,Hubert Pierlot,"[ScoredPoint(id=16, version=0, score=0.663686844998293, payload={}, vector=None, shard_key=None), ScoredPoint(id=14, version=0, score=0.6523693028842656, payload={}, vector=None, shard_key=None)]",King Leopold III --- Context: [1] «id=16 version=0 score=0.663686844998293 payload={} vector=None shard_key=None» [2] «id=14 version=0 score=0.65236,False
5,What was the name of the German defensive line in Italy during World War 2?,Gustav Line,"[ScoredPoint(id=13, version=0, score=0.685790496069646, payload={}, vector=None, shard_key=None), ScoredPoint(id=9, version=0, score=0.6711519251430585, payload={}, vector=None, shard_key=None)]",Gustav Line --- Context: [1] «id=13 version=0 score=0.685790496069646 payload={} vector=None shard_key=None» [2] «id=9 version=0 score=0.6711519,False
6,Which battle in World War 2 marked the Allied victory in North Africa?,Battle of El Alamein,"[ScoredPoint(id=13, version=0, score=0.6857327162660452, payload={}, vector=None, shard_key=None), ScoredPoint(id=12, version=0, score=0.6506426700397265, payload={}, vector=None, shard_key=None)]",Saudi Arabia Context: [1] «id=,False
7,Who was the leader of the Netherlands during its occupation by Germany in World War 2?,Queen Wilhelmina,"[ScoredPoint(id=16, version=0, score=0.6869896232114845, payload={}, vector=None, shard_key=None), ScoredPoint(id=14, version=0, score=0.6601423135324351, payload={}, vector=None, shard_key=None)]",Queen Wilhelmina,✔️ [True]
8,What was the name of the German offensive in the Ardennes Forest in World War 2?,Battle of the Bulge,"[ScoredPoint(id=13, version=0, score=0.6858110442262831, payload={}, vector=None, shard_key=None), ScoredPoint(id=12, version=0, score=0.6612655722662129, payload={}, vector=None, shard_key=None)]",Saudi_Arabia,False
9,Which battle in World War 2 marked the Japanese defeat at the hands of the United States Navy?,Battle of Midway,"[ScoredPoint(id=9, version=0, score=0.6862687723984738, payload={}, vector=None, shard_key=None), ScoredPoint(id=13, version=0, score=0.6852868643270191, payload={}, vector=None, shard_key=None)]",France,False


11.76

## 8. Optimizers in DSPy

In [21]:
import json

traindata = json.load(open("./data/trainingData.json"))['examples']
trainset = [dspy.Example(question=e['question'], answer=e['answer']).with_inputs('question') for e in traindata]
trainset[0]


Example({'question': 'What was the name of the German battleship sunk at the Battle of Jutland?', 'answer': 'SMS Derfflinger'}) (input_keys={'question'})

In [22]:
from dspy import teleprompt

def validate_context_and_answer(example, pred, trace=None):
    if pred.context is None:
        return False
    
    answer_EM = dspy.evaluate.answer_exact_match(example=example, pred=pred)
    answer_PM = dspy.evaluate.answer_exact_match(example=example, pred=pred)
    return answer_EM and answer_PM

teleprompt = teleprompt.BootstrapFewShot(metric=validate_context_and_answer)
compiled_rag = teleprompt.compile(RAG(), trainset=trainset)

  0%|          | 0/31 [00:00<?, ?it/s]

384


  3%|▎         | 1/31 [00:00<00:24,  1.23it/s]

384


  6%|▋         | 2/31 [00:01<00:18,  1.53it/s]

384


 10%|▉         | 3/31 [00:01<00:14,  1.87it/s]

384


 13%|█▎        | 4/31 [00:02<00:19,  1.41it/s]

384


 16%|█▌        | 5/31 [00:08<01:04,  2.50s/it]

384


 19%|█▉        | 6/31 [00:28<03:33,  8.53s/it]

384


 23%|██▎       | 7/31 [00:36<03:19,  8.31s/it]

384


 26%|██▌       | 8/31 [00:47<03:28,  9.07s/it]

384


 29%|██▉       | 9/31 [01:06<04:32, 12.37s/it]

384


 32%|███▏      | 10/31 [01:28<05:19, 15.23s/it]

384


 35%|███▌      | 11/31 [01:37<04:24, 13.22s/it]

384


 39%|███▊      | 12/31 [01:47<03:52, 12.26s/it]

384


 42%|████▏     | 13/31 [02:06<04:20, 14.45s/it]

384


 45%|████▌     | 14/31 [02:26<04:30, 15.94s/it]

384


 48%|████▊     | 15/31 [02:46<04:37, 17.35s/it]

384


 52%|█████▏    | 16/31 [02:56<03:46, 15.11s/it]

384


 55%|█████▍    | 17/31 [03:17<03:54, 16.75s/it]

384


 58%|█████▊    | 18/31 [03:37<03:52, 17.87s/it]

384


 61%|██████▏   | 19/31 [03:57<03:43, 18.60s/it]

384


 65%|██████▍   | 20/31 [04:06<02:51, 15.61s/it]

384


 68%|██████▊   | 21/31 [04:23<02:41, 16.13s/it]

384


 71%|███████   | 22/31 [04:33<02:08, 14.24s/it]

384


 74%|███████▍  | 23/31 [04:44<01:38, 12.37s/it]


In [23]:
from dspy.evaluate.evaluate import Evaluate

evaluate_on_hotpotqa = Evaluate(devset=testset,
                                num_threads=1,
                                display_progress=True,
                                display_table=17)
# Evaluate the `compiled_rag` program with the `answer_exact_match` metric.
metric = dspy.evaluate.answer_exact_match
evaluate_on_hotpotqa(compiled_rag, metric=metric)

  0%|          | 0/17 [00:00<?, ?it/s]384
Average Metric: 0 / 1  (0.0):   6%|▌         | 1/17 [00:00<00:10,  1.52it/s]384
Average Metric: 1 / 2  (50.0):  12%|█▏        | 2/17 [00:01<00:10,  1.43it/s]384
Average Metric: 2 / 3  (66.7):  18%|█▊        | 3/17 [00:01<00:08,  1.60it/s]384
Average Metric: 3 / 4  (75.0):  24%|██▎       | 4/17 [00:07<00:34,  2.67s/it]384
Average Metric: 3 / 5  (60.0):  29%|██▉       | 5/17 [00:24<01:34,  7.84s/it]384
Average Metric: 3 / 6  (50.0):  35%|███▌      | 6/17 [01:02<03:18, 18.00s/it]384
Average Metric: 3 / 7  (42.9):  41%|████      | 7/17 [01:20<02:59, 17.97s/it]384
Average Metric: 4 / 8  (50.0):  47%|████▋     | 8/17 [01:38<02:41, 17.89s/it]384
Average Metric: 5 / 9  (55.6):  53%|█████▎    | 9/17 [01:55<02:23, 17.89s/it]384
Average Metric: 6 / 10  (60.0):  59%|█████▉    | 10/17 [02:13<02:05, 17.86s/it]384
Average Metric: 6 / 11  (54.5):  65%|██████▍   | 11/17 [02:30<01:45, 17.54s/it]384
Average Metric: 6 / 12  (50.0):  71%|███████   | 12/17 [02:47<01

Unnamed: 0,question,example_answer,context,pred_answer,answer_exact_match
0,Which battle in World War 2 marked the Allied liberation of Paris from German occupation?,Battle of Normandy,"[ScoredPoint(id=13, version=0, score=0.7024385493537302, payload={}, vector=None, shard_key=None), ScoredPoint(id=12, version=0, score=0.7010490222319838, payload={}, vector=None, shard_key=None)]",Battle of Paris,False
1,Who was the leader of Norway during its occupation by Germany in World War 2?,Vidkun Quisling,"[ScoredPoint(id=16, version=0, score=0.665394367954947, payload={}, vector=None, shard_key=None), ScoredPoint(id=14, version=0, score=0.6530836949804866, payload={}, vector=None, shard_key=None)]",Vidkun Quisling,✔️ [True]
2,What was the name of the Allied campaign to recapture the Philippines from Japanese forces in World War 2?,Battle of Leyte Gulf,"[ScoredPoint(id=9, version=0, score=0.5901224523392172, payload={}, vector=None, shard_key=None), ScoredPoint(id=17, version=0, score=0.5771391665277846, payload={}, vector=None, shard_key=None)]",Battle of Leyte Gulf,✔️ [True]
3,Which battle in World War 2 marked the Soviet defeat of German forces in the Battle of Berlin?,Battle of Berlin,"[ScoredPoint(id=13, version=0, score=0.7967509307303187, payload={}, vector=None, shard_key=None), ScoredPoint(id=12, version=0, score=0.7164325404737921, payload={}, vector=None, shard_key=None)]",Battle of Berlin,✔️ [True]
4,Who was the leader of Belgium during its occupation by Germany in World War 2?,Hubert Pierlot,"[ScoredPoint(id=16, version=0, score=0.663686844998293, payload={}, vector=None, shard_key=None), ScoredPoint(id=14, version=0, score=0.6523693028842656, payload={}, vector=None, shard_key=None)]",King Leopold III --- Context: [1] «id=15 version=0 score=0.6135328481652302 payload={} vector=None shard_key=None» [2] «id=13 version=0 score=0.5967912827723711 payload={} vector=None shard,False
5,What was the name of the German defensive line in Italy during World War 2?,Gustav Line,"[ScoredPoint(id=13, version=0, score=0.685790496069646, payload={}, vector=None, shard_key=None), ScoredPoint(id=9, version=0, score=0.6711519251430585, payload={}, vector=None, shard_key=None)]",Siegfried Line,False
6,Which battle in World War 2 marked the Allied victory in North Africa?,Battle of El Alamein,"[ScoredPoint(id=13, version=0, score=0.6857327162660452, payload={}, vector=None, shard_key=None), ScoredPoint(id=12, version=0, score=0.6506426700397265, payload={}, vector=None, shard_key=None)]",Battle of Tunisia --- Context: [1] «id=13 version=0 score=0.6543212553858757 payload={} vector=None shard_key=None» [2] «id=12 version=0 score=0,False
7,Who was the leader of the Netherlands during its occupation by Germany in World War 2?,Queen Wilhelmina,"[ScoredPoint(id=16, version=0, score=0.6869896232114845, payload={}, vector=None, shard_key=None), ScoredPoint(id=14, version=0, score=0.6601423135324351, payload={}, vector=None, shard_key=None)]",Queen Wilhelmina,✔️ [True]
8,What was the name of the German offensive in the Ardennes Forest in World War 2?,Battle of the Bulge,"[ScoredPoint(id=13, version=0, score=0.6858110442262831, payload={}, vector=None, shard_key=None), ScoredPoint(id=12, version=0, score=0.6612655722662129, payload={}, vector=None, shard_key=None)]",Battle of the Bulge,✔️ [True]
9,Which battle in World War 2 marked the Japanese defeat at the hands of the United States Navy?,Battle of Midway,"[ScoredPoint(id=9, version=0, score=0.6862687723984738, payload={}, vector=None, shard_key=None), ScoredPoint(id=13, version=0, score=0.6852868643270191, payload={}, vector=None, shard_key=None)]",Battle of Midway,✔️ [True]


41.18

## 9. Testing the Model

In [26]:
compiled_rag("Who was the leader of France during its occupation by Germany in World War 2?").answer

384


'Philippe Pétain\n\n---\n\nContext:\n[1] «id=15 version=0 score=0.6113353314339161 payload={} vector=None shard_key=None»\n[2] «id=13 version=0 score=0.5915333331108093 payload={} vector=None shard_'

In [25]:
compiled_rag("What was the name of the German defensive line in Italy during World War 2?").answer


384


'Siegfried Line'