In [None]:
pip install datasets

In [None]:
from datasets import load_dataset

# Load the dataset published under the "google/boolq" identifier; later cells
# index the result by split name (dataset['train']).
dataset = load_dataset(path="google/boolq")

In [None]:
from langchain_community.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint
from langchain_core.prompts import ChatPromptTemplate
from typing import Dict

# SagemakerEndpoint is a text-completion LLM, so the prompt has to reach it verbatim.
# A ChatPromptTemplate is rendered as a chat message and, when stringified for an LLM,
# gains a "Human: " prefix in front of the <s>[INST] markers; PromptTemplate does not.
from langchain_core.prompts import PromptTemplate

# Template variables: {context} (the passage) and {question}.
# The wording asks for the capitalised labels 'True' / 'False' throughout, with
# "I don't know" as the only escape hatch, so the instructions no longer contradict
# each other (previously: "detailed answer" vs. one word, 'True' vs. 'true').
prompt = PromptTemplate.from_template("""<s>[INST]
      The following is a conversation between a human and a friendly AI.
      The AI uses the information in the context to answer the question from the human.
      It does not use any other information.
      The answer should always be written EXACTLY as follows: 'True' or 'False', NEVER add additional text other than the words True or False.
      This is the context:
      {context}.
      Instruction: Based on the above context, answer the following question: {question}
      Answer "I don't know" if the answer is not present in the context. Never provide an answer that is not based on the context, even if it is a well known fact. Otherwise your answer should always be 'True' or 'False', never add additional text.
      Solution:
      [/INST]""")


In [None]:
import json

class ContentHandler(LLMContentHandler):
    """Translate between prompt strings and the JSON payloads of the endpoint."""

    # MIME types declared for the request body and for the accepted response.
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        """Build the request body for one prompt.

        Args:
            prompt: Text stored under the ``"inputs"`` key.
            model_kwargs: Mapping stored under the ``"parameters"`` key.

        Returns:
            The JSON document encoded as UTF-8 bytes.
        """
        payload = {"inputs": prompt, "parameters": model_kwargs}
        return json.dumps(payload).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Extract the generated text from a response body.

        Args:
            output: Response body. Although annotated as ``bytes``, it is consumed
                through ``.read()``, so it must be a readable stream-like object.

        Returns:
            The ``"generated_text"`` value of the first element of the decoded JSON.
        """
        raw_body = output.read()
        decoded = json.loads(raw_body.decode("utf-8"))
        first_result = decoded[0]
        return first_result["generated_text"]

In [None]:
import boto3

# Serializer/deserializer instance handed to the endpoint wrapper below.
content_handler = ContentHandler()

# Generation settings, passed to the endpoint wrapper as `model_kwargs`.
# NOTE(review): temperature=1 and a 1024-token budget look generous for a one-word
# True/False benchmark — confirm these are intended for this endpoint.
parameters = dict(
    max_new_tokens=1024,
    temperature=1,
    stop_sequences=None,
)

# Every "###" value is a placeholder that must be filled in before running.
model = SagemakerEndpoint(
    credentials_profile_name="###",  # input
    endpoint_name="###",  # input
    region_name="###",  # input
    model_kwargs=parameters,
    endpoint_kwargs=dict(
        CustomAttributes="###",  # input
        InferenceComponentName="###",  # input
    ),
    content_handler=content_handler,
)

# Pipe the formatted prompt into the endpoint.
chain = prompt | model

In [None]:
# Parallel accumulators: element i of each list belongs to the same training example.
ground_truth = []
llm_answer = []

for data in dataset["train"]:
    # Record the label first so both lists stay aligned up to the point of any failure.
    ground_truth.append(data["answer"])
    response = chain.invoke(
        {
            "context": data["passage"],
            "question": data["question"],
        }
    )
    llm_answer.append(response)



In [None]:
import re

# Matches the first standalone "true" or "false" token, in any letter case.
_BOOL_ANSWER_RE = re.compile(r"\b(true|false)\b", re.IGNORECASE)


def _parse_boolean_answer(response: str) -> str:
    """Map a raw model response to "True", "False" or "N/A".

    The first whole-word occurrence of true/false decides the label, regardless of
    letter case. The previous case-sensitive substring test classified lowercase
    answers as "N/A" and always preferred "True" when both words were present.

    Args:
        response: Raw text returned by the model.

    Returns:
        "True" or "False" for the first matching token, "N/A" when neither occurs.
    """
    match = _BOOL_ANSWER_RE.search(response)
    if match is None:
        return "N/A"
    # Normalise e.g. "true" / "TRUE" to the canonical label "True".
    return match.group(1).capitalize()


# NOTE(review): this assumes the response holds only the generated text; if the
# endpoint echoes the prompt back, the prompt's own 'True' would match first — confirm.
parsed_llm_response = [_parse_boolean_answer(response) for response in llm_answer]

In [None]:
# Convert each ground-truth label to the same string vocabulary used for the parsed
# model answers: "True", "False", or "N/A" for anything equal to neither.
# The equality comparisons (rather than identity/truthiness) are kept on purpose.
ground_truth_str = [
    "True" if truth == True else ("False" if truth == False else "N/A")  # noqa: E712
    for truth in ground_truth
]

In [None]:
# Measure accuracy between ground truth and llm response
# The two lists must describe the same examples position by position.
if len(parsed_llm_response) != len(ground_truth_str):
    raise ValueError(
        "Prediction/label count mismatch: "
        f"{len(parsed_llm_response)} predictions vs {len(ground_truth_str)} labels"
    )

# Derive the sample count from the data instead of hardcoding the split size.
total = len(ground_truth_str)
correct = sum(
    predicted == expected
    for predicted, expected in zip(parsed_llm_response, ground_truth_str)
)

# Fraction of exact label matches in [0, 1] (previously divided by a fixed 100);
# defined as 0.0 when there is nothing to score, to avoid dividing by zero.
accuracy = correct / total if total else 0.0
print(accuracy)

In [None]:
import pandas as pd
# Pair each label with the parsed model answer, one row per example, and persist
# the table without the index column.
df = pd.DataFrame(
    {
        "ground_truth": ground_truth_str,
        "llm_response": parsed_llm_response,
    }
)
df.to_csv("llm_boolq.csv", index=False)