In [15]:
!pip install git+https://github.com/stanfordnlp/dspy.git
!pip install -U openai
import openai
!pip install datasets
!pip install joblib
!pip install -U sentence-transformers
!apt-get install libomp-dev
!pip install faiss-cpu

import faiss

from datasets import load_dataset
medqa_dataset = load_dataset("bigbio/med_qa")

Collecting git+https://github.com/stanfordnlp/dspy.git
  Cloning https://github.com/stanfordnlp/dspy.git to /tmp/pip-req-build-e0cn9si_
  Running command git clone --filter=blob:none --quiet https://github.com/stanfordnlp/dspy.git /tmp/pip-req-build-e0cn9si_
  Resolved https://github.com/stanfordnlp/dspy.git to commit 16764d5e68d7f209814dbb54911d1583ef8f449b
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting openai
  Using cached openai-1.6.1-py3-none-any.whl (225 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.28.1
    Uninstalling openai-0.28.1:
      Successfully uninstalled openai-0.28.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.
dspy-ai 2.0.8 requires openai

In [2]:
from google.colab import userdata
api_key = userdata.get('api_key')

In [3]:
import dspy
model=dspy.OpenAI(model='gpt-3.5-turbo', api_key=api_key)
dspy.settings.configure(lm=model)

In [73]:
#training on the first 20 questions
train_subset = medqa_dataset["train"][:20]

train_questions = train_subset["question"]
train_answers_id = train_subset["answer_idx"]
train_options= train_subset["options"]
train_answer = train_subset["answer"]

In [99]:
def formatting_options(qoptions):
  return [' '.join(f"{option['key']} {option['value']}" for option in options) for options in qoptions]

In [75]:
formatted_train_options = formatting_options(train_options)

In [29]:
print(formatted_train_options)

['A Esophagogastroduodenoscopy B CT scan of the abdomen C Hydrogen breath test D Cardiac stress test E Abdominal ultrasonography of the right upper quadrant']


In [9]:
# We are expecting a single choice answer so signature accordingly.
class MultipleChoiceQA(dspy.Signature):
    """Answer questions with single letter answers."""

    question = dspy.InputField(desc="The multiple-choice question.")
    options = dspy.InputField(desc="The set of options in the format : A option1 B option2 C option3 D option4 E option5 where A corresponds to option1, B to option2 and so on.")
    answer = dspy.OutputField(desc="A single-letter answer corresponding to the selected option.")

In [10]:
def create_train_set(train_questions, train_options, train_answers):
    train_set = []

    for question, options, answer in zip(train_questions, train_options, train_answers):
        example = dspy.Example(
            question=question,
            options=options,
            answer=answer
        ).with_inputs("question", "options")

        train_set.append(example)

    return train_set

In [100]:
val_subset = medqa_dataset["validation"][:20]

val_questions = val_subset["question"]
val_answers_id = val_subset["answer_idx"]
val_options= val_subset["options"]
val_answer = val_subset["answer"]

formatted_val_options = formatting_options(val_options)
val_set = create_train_set(val_questions, formatted_val_options, val_answers_id)

In [81]:
# Creating chain of thoughts list for questions which are answered correctly.
generate_answer = dspy.ChainOfThought(MultipleChoiceQA)

def store_correct_cot(questions: list[str], option_sets: list[str], answers: list[str]) -> list[str]:
    train_set = []
    for question, options, answer in zip(questions, option_sets, answers):
        pred_response = generate_answer(question=question, options=options)
        if pred_response.answer == answer:
          example = dspy.Example(
            question=question,
            options=options,
            context=pred_response.rationale.split('.', 1)[1].strip(),
            answer=answer
        ).with_inputs("question", "options")

          train_set.append(example)

    return train_set


In [82]:
trainset = store_correct_cot(train_questions, formatted_train_options, train_answers_id)

In [92]:
print(len(trainset))
print(trainset)

11
[Example({'question': 'A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?', 'options': 'A Ampicillin B Ceftriaxone C Ciprofloxacin D Doxycycline E Nitrofurantoin', 'context': 'We have a pregnant woman presenting with burning upon urination, which is suggestive of a urinary tract infection (UTI). She has already tried increasing her water intake and taking cranberry extract, but her symptoms have not improved. We also note that she has an absence of costoverteb

In [84]:
class MultipleQABot(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.Predict(MultipleChoiceQA)

    def forward(self, question, options):
        answer = self.generate_answer(question=question,options=options)

        return answer


In [105]:
class ChoiceShuffle(dspy.Module):
      def __init__(self):
        super().__init__()

      def shuffle_options(options):
        shuffled_options = options.copy()
        random.shuffle(shuffled_options)
        return shuffled_options

      def choice_shuffling(test_question, test_options, number_of_iterations):
        answers = []
        for i in range (number_of_iterations):
          shuffled_options = shuffle_options(test_options)
          formatted_test_options = formatting_options([shuffled_options])
          print(formatted_test_options)
          answers.append(compiled_knn(question = test_question, options = formatted_test_options[0]).answer)
        return answers

In [106]:
choice = ChoiceShuffle()

In [107]:
choice("How are you?"," A okay B Fine")

AttributeError: 'ChoiceShuffle' object has no attribute 'forward'

In [85]:
from dspy.teleprompt import KNNFewShot
from dspy.predict.knn import KNN

knn_teleprompter = KNNFewShot(KNN, 5, trainset)
compiled_knn = knn_teleprompter.compile(MultipleQABot(), trainset=trainset)

In [93]:
import random

def shuffle_options(options):
    shuffled_options = options.copy()
    random.shuffle(shuffled_options)
    return shuffled_options

def choice_shuffling(test_question, test_options, number_of_iterations):
  answers = []
  for i in range (number_of_iterations):
    shuffled_options = shuffle_options(test_options)
    formatted_test_options = formatting_options([shuffled_options])
    print(formatted_test_options)
    answers.append(compiled_knn(question = test_question, options = formatted_test_options[0]).answer)
  return answers

In [30]:
t1="A 68-year-old man comes to the physician because of recurrent episodes of nausea and abdominal discomfort for the past 4 months. The discomfort is located in the upper abdomen and sometimes occurs after eating, especially after a big meal. He has tried to go for a walk after dinner to help with digestion, but his complaints have only increased. For the past 3 weeks he has also had symptoms while climbing the stairs to his apartment. He has type 2 diabetes mellitus, hypertension, and stage 2 peripheral arterial disease. He has smoked one pack of cigarettes daily for the past 45 years. He drinks one to two beers daily and occasionally more on weekends. His current medications include metformin, enalapril, and aspirin. He is 168 cm (5 ft 6 in) tall and weighs 126 kg (278 lb); BMI is 45 kg/m2. His temperature is 36.4°C (97.5°F), pulse is 78/min, and blood pressure is 148/86 mm Hg. On physical examination, the abdomen is soft and nontender with no organomegaly. Foot pulses are absent bilaterally. An ECG shows no abnormalities. Which of the following is the most appropriate next step in diagnosis?"
t1_op=[{'key': 'A', 'value': 'Esophagogastroduodenoscopy'}, {'key': 'B', 'value': 'CT scan of the abdomen'}, {'key': 'C', 'value': 'Hydrogen breath test'}, {'key': 'D', 'value': 'Cardiac stress test'}, {'key': 'E', 'value': 'Abdominal ultrasonography of the right upper quadrant'}]

In [95]:
choice_shuffling(t1, t1_op, 5)

['D Cardiac stress test B CT scan of the abdomen C Hydrogen breath test E Abdominal ultrasonography of the right upper quadrant A Esophagogastroduodenoscopy']


 80%|████████  | 4/5 [00:00<00:00, 821.25it/s]

Bootstrapped 4 full traces after 5 examples in round 0.





['C Hydrogen breath test A Esophagogastroduodenoscopy D Cardiac stress test B CT scan of the abdomen E Abdominal ultrasonography of the right upper quadrant']


 80%|████████  | 4/5 [00:00<00:00, 812.81it/s]

Bootstrapped 4 full traces after 5 examples in round 0.





['E Abdominal ultrasonography of the right upper quadrant B CT scan of the abdomen D Cardiac stress test A Esophagogastroduodenoscopy C Hydrogen breath test']


 80%|████████  | 4/5 [00:00<00:00, 786.89it/s]


Bootstrapped 4 full traces after 5 examples in round 0.
['B CT scan of the abdomen E Abdominal ultrasonography of the right upper quadrant A Esophagogastroduodenoscopy C Hydrogen breath test D Cardiac stress test']


 80%|████████  | 4/5 [00:00<00:00, 455.61it/s]


Bootstrapped 4 full traces after 5 examples in round 0.
['A Esophagogastroduodenoscopy B CT scan of the abdomen D Cardiac stress test E Abdominal ultrasonography of the right upper quadrant C Hydrogen breath test']


 80%|████████  | 4/5 [00:00<00:00, 717.01it/s]

Bootstrapped 4 full traces after 5 examples in round 0.





['A', 'C', 'C', 'B', 'A']

In [96]:
import tqdm
import random

from dspy.teleprompt.teleprompt import Teleprompter

class Ensemble(Teleprompter):
    def __init__(self, *, reduce_fn=None, size=None, deterministic=False):
        """A common reduce_fn is dspy.majority."""

        assert deterministic is False, "TODO: Implement example hashing for deterministic ensemble."

        self.reduce_fn = reduce_fn
        self.size = size
        self.deterministic = deterministic

    def compile(self, programs):
        size = self.size
        reduce_fn = self.reduce_fn

        class EnsembledProgram(dspy.Module):
            def __init__(self):
                super().__init__()
                self.programs = programs

            def forward(self, *args, **kwargs):
                programs = random.sample(self.programs, size) if size else self.programs
                outputs = [prog(*args, **kwargs) for prog in programs]

                if reduce_fn:
                    return reduce_fn(outputs)

                return outputs

        return EnsembledProgram()

In [108]:
import tqdm
import random

from dspy.teleprompt.teleprompt import Teleprompter

class Ensemble(Teleprompter):
    def __init__(self, *, reduce_fn=None, size=None, deterministic=False):
        """A common reduce_fn is dspy.majority."""

        assert deterministic is False, "TODO: Implement example hashing for deterministic ensemble."

        self.reduce_fn = reduce_fn
        self.size = size
        self.deterministic = deterministic

    def compile(self, programs, choice_shuffle=False):
        size = self.size
        reduce_fn = self.reduce_fn

        class EnsembledProgram(dspy.Module):
            def __init__(self):
                super().__init__()
                self.programs = programs

            def forward(self, *args, **kwargs):
                outputs = []

                if choice_shuffle:
                    outputs = self.choice_shuffle(*args, **kwargs)
                else:
                    # Standard ensemble without choice shuffling
                    programs_to_use = random.sample(self.programs, size) if size else self.programs
                    outputs = [prog(*args, **kwargs) for prog in programs_to_use]

                if reduce_fn:
                    return reduce_fn(outputs)

                return outputs

            def choice_shuffle(self, *args, **kwargs):
                # Ensemble with choice shuffling
                shuffle_iterations = 5
                for _ in range(shuffle_iterations):
                    shuffled_args = []
                    for arg in args:
                        if isinstance(arg, list):
                            shuffled_options = random.sample(arg, len(arg))
                            shuffled_args.append(shuffled_options)
                        else:
                            shuffled_args.append(arg)

                    # Execute the programs with shuffled options
                    programs_to_use = random.sample(self.programs, size) if size else self.programs
                    shuffled_outputs = [prog(*shuffled_args, **kwargs) for prog in programs_to_use]
                    outputs.append(shuffled_outputs)

                return outputs

        return EnsembledProgram()


In [109]:
programs = [compiled_knn]
ensembled_program = Ensemble(reduce_fn=dspy.majority).compile(programs, choice_shuffle = False)

In [110]:
from dspy.evaluate.evaluate import Evaluate
from dspy.evaluate import Evaluate
metric = dspy.evaluate.metrics.answer_exact_match
evaluator = Evaluate(devset=val_set, num_threads=1, display_progress=True, display_table=0)

In [111]:
evaluator(ensembled_program, metric = metric)

  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:03,  1.21it/s][A
 40%|████      | 2/5 [00:01<00:01,  2.08it/s][A
 60%|██████    | 3/5 [00:01<00:01,  1.90it/s][A
 80%|████████  | 4/5 [00:02<00:00,  1.86it/s]
Average Metric: 0.0 / 1  (0.0):   5%|▌         | 1/20 [00:02<00:42,  2.24s/it]

Bootstrapped 4 full traces after 5 examples in round 0.
Error for example in dev set: 		 MultipleQABot.forward() missing 1 required positional argument: 'options'



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.13it/s][A
 40%|████      | 2/5 [00:00<00:01,  2.10it/s][A
 60%|██████    | 3/5 [00:01<00:00,  2.14it/s][A
 80%|████████  | 4/5 [00:01<00:00,  2.13it/s]
Average Metric: 0.0 / 2  (0.0):  10%|█         | 2/20 [00:04<00:37,  2.06s/it]

Bootstrapped 4 full traces after 5 examples in round 0.
Error for example in dev set: 		 MultipleQABot.forward() missing 1 required positional argument: 'options'



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:03,  1.25it/s][A
 40%|████      | 2/5 [00:01<00:01,  1.67it/s][A
 60%|██████    | 3/5 [00:01<00:00,  2.05it/s][A
 80%|████████  | 4/5 [00:02<00:00,  1.92it/s]
Average Metric: 0.0 / 3  (0.0):  15%|█▌        | 3/20 [00:06<00:36,  2.13s/it]

Bootstrapped 4 full traces after 5 examples in round 0.
Error for example in dev set: 		 MultipleQABot.forward() missing 1 required positional argument: 'options'



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.26it/s][A
 40%|████      | 2/5 [00:01<00:01,  1.55it/s][A
 60%|██████    | 3/5 [00:01<00:01,  1.80it/s][A
 80%|████████  | 4/5 [00:02<00:00,  1.94it/s]
Average Metric: 0.0 / 4  (0.0):  20%|██        | 4/20 [00:08<00:34,  2.15s/it]

Bootstrapped 4 full traces after 5 examples in round 0.
Error for example in dev set: 		 MultipleQABot.forward() missing 1 required positional argument: 'options'



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:00<00:01,  2.17it/s][A
 40%|████      | 2/5 [00:00<00:01,  2.18it/s][A
 60%|██████    | 3/5 [00:02<00:01,  1.16it/s][A
 80%|████████  | 4/5 [00:03<00:00,  1.29it/s]

Bootstrapped 4 full traces after 5 examples in round 0.





TypeError: MultipleQABot.forward() missing 1 required positional argument: 'options'