In [None]:
import os

from huggingface_hub import login

# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable (e.g. a Kaggle/Colab secret exported before this cell).
# When the variable is unset, token is None and login() asks for the token
# interactively instead of failing on an empty string.
login(token=os.environ.get("HF_TOKEN"))

In [1]:
!pip install deepeval

Collecting deepeval
  Downloading deepeval-1.5.9-py3-none-any.whl.metadata (977 bytes)
Collecting pytest-repeat (from deepeval)
  Downloading pytest_repeat-0.9.3-py3-none-any.whl.metadata (4.9 kB)
Collecting pytest-xdist (from deepeval)
  Downloading pytest_xdist-3.6.1-py3-none-any.whl.metadata (4.3 kB)
Collecting portalocker (from deepeval)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting langchain (from deepeval)
  Downloading langchain-0.3.8-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core (from deepeval)
  Downloading langchain_core-0.3.21-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-openai (from deepeval)
  Downloading langchain_openai-0.2.9-py3-none-any.whl.metadata (2.6 kB)
Collecting ragas (from deepeval)
  Downloading ragas-0.2.6-py3-none-any.whl.metadata (8.1 kB)
Collecting docx2txt~=0.8 (from deepeval)
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting tenacity~=8.4.1 (from 

In [2]:
!pip install lm-format-enforcer

Collecting lm-format-enforcer
  Downloading lm_format_enforcer-0.10.9-py3-none-any.whl.metadata (17 kB)
Collecting interegular>=0.3.2 (from lm-format-enforcer)
  Downloading interegular-0.3.3-py37-none-any.whl.metadata (3.0 kB)
Downloading lm_format_enforcer-0.10.9-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading interegular-0.3.3-py37-none-any.whl (23 kB)
Installing collected packages: interegular, lm-format-enforcer
Successfully installed interegular-0.3.3 lm-format-enforcer-0.10.9


In [None]:
import os

import openai

# Keep the key out of the notebook: take it from the OPENAI_API_KEY
# environment variable, falling back to the previous empty placeholder.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")


In [5]:
!pip install bitsandbytes>=0.39.0
!pip install --upgrade accelerate transformers

Collecting accelerate
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading accelerate-1.1.1-py3-none-any.whl (333 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m333.2/333.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading transformers-4.46.2-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m80.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: accelerate, transformers
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.34.2
    Uninstalling accelerate-0.34.2:
      Successfully uninstalled accelerate-0.34.2
  Attempting uninstall: transformers
    Found existing installation: transformer

In [7]:
import json
from pydantic import BaseModel
import torch
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)
from transformers import BitsAndBytesConfig, pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

from deepeval.models import DeepEvalBaseLLM


class CustomMistral7B(DeepEvalBaseLLM):
    """DeepEval judge model backed by a 4-bit quantized Mistral-7B v0.3.

    The model and tokenizer are loaded once in ``__init__``. ``generate``
    restricts decoding with lm-format-enforcer so that the generated text is
    JSON matching the pydantic schema passed in by the caller.
    """

    def __init__(self):
        """Load the quantized model and its tokenizer from the Hugging Face Hub."""
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

        model_4bit = AutoModelForCausalLM.from_pretrained(
            "mistralai/Mistral-7B-v0.3",
            device_map="auto",
            quantization_config=quantization_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            "mistralai/Mistral-7B-v0.3"
        )

        self.model = model_4bit
        self.tokenizer = tokenizer
        # Built lazily by _get_pipeline() and reused across generate() calls.
        self._pipeline = None

    def load_model(self):
        """Return the already-loaded causal LM."""
        return self.model

    def _get_pipeline(self):
        """Return the text-generation pipeline, building it on first use.

        The pipeline configuration does not depend on the prompt or schema,
        so it is created once instead of on every ``generate`` call.
        """
        cached = getattr(self, "_pipeline", None)
        if cached is None:
            cached = pipeline(
                "text-generation",
                model=self.load_model(),
                tokenizer=self.tokenizer,
                use_cache=True,
                device_map="auto",
                max_length=2500,
                do_sample=True,
                top_k=5,
                num_return_sequences=1,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id,
            )
            self._pipeline = cached
        return cached

    def generate(self, prompt: str, schema: "type[BaseModel]") -> BaseModel:
        """Generate a schema-conforming answer for ``prompt``.

        Args:
            prompt: Text passed to the text-generation pipeline.
            schema: The pydantic model *class* describing the expected JSON
                (it is called as ``schema(**fields)`` to build the result).

        Returns:
            An instance of ``schema`` populated from the generated JSON.

        Raises:
            json.JSONDecodeError: If the generated continuation is not valid JSON.
        """
        generate_pipeline = self._get_pipeline()

        # pydantic v2 names this model_json_schema(); .schema() is the v1 name
        # and is used only when the newer method is not available.
        if hasattr(schema, "model_json_schema"):
            json_schema = schema.model_json_schema()
        else:
            json_schema = schema.schema()

        # Create parser required for JSON confinement using lmformatenforcer
        parser = JsonSchemaParser(json_schema)
        prefix_function = build_transformers_prefix_allowed_tokens_fn(
            generate_pipeline.tokenizer, parser
        )

        # Output and load valid JSON; the prompt prefix is sliced off the
        # generated text before parsing.
        output_dict = generate_pipeline(prompt, prefix_allowed_tokens_fn=prefix_function)
        output = output_dict[0]["generated_text"][len(prompt) :]
        json_result = json.loads(output)

        # Return valid JSON object according to the schema DeepEval supplied
        return schema(**json_result)

    async def a_generate(self, prompt: str, schema: "type[BaseModel]") -> BaseModel:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the display name of the judge model."""
        return "Mistral-7B v0.3"

In [10]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

# Reference material the judge compares the answer against.
# Swap in the documents that were actually given to your LLM.
context=["""You are an intelligent clinical languge model.
Below is a medical question.
Write a response that appropriately completes the instruction.
The response should provide the accurate answer to the instruction, while being concise.
Polycystic ovary syndrome is a condition in which a woman has an imbalance of female sex hormones. This may lead to changes in the menstrual cycle, cysts in the ovaries, trouble getting pregnant, and other health problems.)"""]

# The answer under evaluation; swap in the real output of your LLM application.
actual_output="Polycystic ovary syndrome (PCOS) is a common health condition that affects 1 in 10 women during their reproductive years. It's caused by an imbalance of reproductive hormones. Women with PCOS may have a variety of symptoms, such as irregular menstrual periods, excess hair growth, acne, or weight"

# Bundle question, answer and reference context into one test case.
test_case = LLMTestCase(
    context=context,
    actual_output=actual_output,
    input= """What is (are) Polycystic ovary syndrome ? 
    Also called: Polycystic ovaries; Polycystic ovary disease; Stein-Leventhal syndrome; Polyfollicular ovarian disease""",
)

# Judge with the local Mistral model, in strict mode and with a written reason.
customllm = CustomMistral7B()
metric = HallucinationMetric(
    model=customllm,
    strict_mode=True,
    include_reason=True,
)

metric.measure(test_case)
print(metric.score, metric.reason, sep="\n")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Output()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


0.0
The score is 0.0 because the actual output provides a concise, accurate answer to the instruction.


In [8]:
import io
import json
import csv
import pandas as pd
path = '/kaggle/input/asclepius-medquad/Asclepius.csv'
# newline='' hands line-ending handling to the csv module, which it needs in
# order to parse line breaks embedded in quoted fields correctly.
with open(path, 'r', encoding='windows-1252', newline='') as csv_file:
    reader = csv.reader(csv_file)
    data = list(reader)
# The first row holds the column names; every remaining row is one record.
# Values are kept exactly as read (strings), with no type inference.
df_A = pd.DataFrame(data[1:], columns=data[0])
print(df_A)

                       ANSWERID  \
0      0  ADAM_0003147_Sec1.txt   
1      1  ADAM_0003147_Sec2.txt   
2      2  ADAM_0002818_Sec7.txt   
3      3  ADAM_0002818_Sec9.txt   
4      4  GARD_0004375_Sec1.txt   
..   ...                    ...   
136  136  ADAM_0001283_Sec7.txt   
137  137  ADAM_0001284_Sec1.txt   
138  138  ADAM_0001284_Sec8.txt   
139  139  ADAM_0002089_Sec1.txt   
140  140  ADAM_0002320_Sec1.txt   

                                              Question  \
0    What is (are) Polycystic ovary syndrome ? (Als...   
1    What causes Polycystic ovary syndrome ? (Also ...   
2      What are the complications of Noonan syndrome ?   
3                     How to prevent Noonan syndrome ?   
4    What are the symptoms of Neurofibromatosis-Noo...   
..                                                 ...   
136  Do I need to see a doctor for Drug abuse first...   
137  What is (are) Drug allergies ? (Also called: A...   
138  What is (are) Drug allergies ? (Also called: A...   

In [None]:
# Score every row of df_A for hallucination. The metric is built once and
# reused; each row's score and reason are collected in hallucination_results.
metric = HallucinationMetric(model = customllm,include_reason = True, strict_mode = True)
hallucination_results = []
# range(len(df_A)) covers all rows; the earlier fixed bound of 140 stopped
# one row short of the last record (index 140).
for i in range(len(df_A)):
    context = ["""You are an intelligent clinical languge model.
    Below is a medical question.
    Write a response that appropriately completes the instruction.
    The response should provide the accurate answer to the instruction, while being concise.""", df_A['Answer'][i]]
    actual_output = df_A['AesclepiusAnswer'][i]
    test_case = LLMTestCase(
        input = df_A['Question'][i],
        actual_output=actual_output,
        context=context
    )
    # Actually run the judge; without this the loop builds objects but
    # never produces a score.
    metric.measure(test_case)
    hallucination_results.append(
        {"row": i, "score": metric.score, "reason": metric.reason}
    )
    
    
    

In [None]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

# Reference material the judge compares the answer against.
# Swap in the documents that were actually given to your LLM.
context=["""You are an intelligent clinical languge model.
Below is a medical question.
Write a response that appropriately completes the instruction.
The response should provide the accurate answer to the instruction, while being concise.
Polycystic ovary syndrome is a condition in which a woman has an imbalance of female sex hormones. This may lead to changes in the menstrual cycle, cysts in the ovaries, trouble getting pregnant, and other health problems.)"""]

# The answer under evaluation; swap in the real output of your LLM application.
actual_output="Polycystic ovary syndrome (PCOS) is a common health condition that affects 1 in 10 women during their reproductive years. It's caused by an imbalance of reproductive hormones. Women with PCOS may have a variety of symptoms, such as irregular menstrual periods, excess hair growth, acne, or weight"

# Bundle question, answer and reference context into one test case.
test_case = LLMTestCase(
    context=context,
    actual_output=actual_output,
    input= """What is (are) Polycystic ovary syndrome ? 
    Also called: Polycystic ovaries; Polycystic ovary disease; Stein-Leventhal syndrome; Polyfollicular ovarian disease""",
)

# Judge with the local Mistral model using the metric's default settings.
customllm = CustomMistral7B()
metric = HallucinationMetric(model=customllm)

# Score the single case directly and show the score followed by the reason.
metric.measure(test_case)
print(metric.score, metric.reason, sep="\n")

# Bulk-evaluation entry point, here given the same single case.
evaluate([test_case], [metric])

In [None]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

# Load the judge model and build the metric ONCE. Constructing
# CustomMistral7B() inside the loop would reload the 7B checkpoint on every
# one of the 140 iterations.
customllm = CustomMistral7B()
metric = HallucinationMetric(model = customllm)

for i in range(0,140):
    # Replace this with the actual documents that you are passing as input to your LLM.
    context=["""You are an intelligent clinical languge model.
    Below is a medical question.
    Write a response that appropriately completes the instruction.
    The response should provide the accurate answer to the instruction, while being concise.
    Polycystic ovary syndrome is a condition in which a woman has an imbalance of female sex hormones. This may lead to changes in the menstrual cycle, cysts in the ovaries, trouble getting pregnant, and other health problems.)"""]

    # Replace this with the actual output from your LLM application
    actual_output="Polycystic ovary syndrome (PCOS) is a common health condition that affects 1 in 10 women during their reproductive years. It's caused by an imbalance of reproductive hormones. Women with PCOS may have a variety of symptoms, such as irregular menstrual periods, excess hair growth, acne, or weight"

    test_case = LLMTestCase(
        input= """What is (are) Polycystic ovary syndrome ? 
        Also called: Polycystic ovaries; Polycystic ovary disease; Stein-Leventhal syndrome; Polyfollicular ovarian disease""",
        actual_output=actual_output,
        context=context
    )

    # Score this iteration's test case with the shared metric.
    metric.measure(test_case)
    print(metric.score)
    print(metric.reason)

    # or evaluate test cases in bulk
    evaluate([test_case], [metric])