### Generate test cases

In [1]:
import os
import requests
import base64
import getpass
import json
import time
import re

In [3]:
from jsonl import *
from bert_score import score

In [4]:
def get_prompt(ex_usecase, ex_testcases, usecase):
    """Assemble the one-shot prompt for test case generation.

    The prompt contains, in order: a role sentence, one worked example
    (``ex_usecase`` followed by ``ex_testcases``), the target ``usecase``
    with an empty testcase section, and the generation instructions.

    All three arguments are inserted verbatim and must be strings.
    Returns the complete prompt as a single string.
    """
    instructions = "\n".join([
        "--------",
        "**Important Instruction:**",
        "    - Understand the last usecase.",
        "    - Generate test cases similar to the given example that covers both:",
        "        - **Normal** and **Edge** case scenarios",
        "        - **Positive** and **Negative** case scenarios",
        "        - **Valid** and **Invalid** case scenarios",
        "    - Do not add any explanation or any unnecessary word.",
        "    - Your generated testcase must be json parsable and must follow the style of the given example.",
        "",  # keeps the final newline of the prompt
    ])

    # NOTE: the trailing space in "## Testcase " (example section) is part of
    # the prompt text and is reproduced deliberately.
    pieces = [
        "You are a tester tasked with creating comprehensive test cases"
        " for a given usecase description.\n\n",
        "## Usecase description\n",
        ex_usecase,
        "\n\n\n## Testcase \n",
        ex_testcases,
        "\n\n\n## Usecase description\n",
        usecase,
        "\n\n## Testcase\n\n\n",
        instructions,
    ]
    return "".join(pieces)

In [5]:
# Read the API key interactively via getpass instead of hard-coding it here.
API_KEY = getpass.getpass()

In [6]:
# HTTP headers passed to requests.post in generate_testcases; the key entered
# above travels in the "api-key" header.
headers = {
    "Content-Type": "application/json",
    "api-key": API_KEY,
}

In [7]:
def parse_response(response: str) -> str:
    """Extract the payload of the last fenced code block from a model reply.

    Args:
        response: Raw text returned by the model, or ``None``.

    Returns:
        - ``''`` when ``response`` is ``None``.
        - ``response`` unchanged when it contains no triple-backtick fence.
        - Otherwise the stripped content of the last fenced block. If the
          reply contains a ```` ```json ```` fence, only such blocks are
          considered; else any fenced block is.
        - The stripped reply itself when a fence was opened but never
          closed (e.g. a truncated reply), instead of raising.
    """
    if response is None:
        return ''

    if "```" not in response:
        return response

    opening_fence = "```json" if "```json" in response else "```"

    # Exactly one capture group, so findall yields plain strings. The previous
    # pattern had a nested group, which made findall return tuples and caused
    # the last matched character to be appended to the extracted payload.
    code_blocks = re.findall(
        re.escape(opening_fence) + r"(.*?)```", response, re.DOTALL
    )

    if not code_blocks:
        # Opening fence without a closing one: nothing reliable to extract.
        return response.strip()

    return code_blocks[-1].strip()


In [8]:
def generate_testcases(ex_usecase, ex_testcases, usecase, *, timeout=120):
    """Ask the GPT-4o deployment for test cases and return them parsed.

    Builds the prompt with ``get_prompt``, posts a single-turn chat
    completion request, appends one usage line to ``stat.csv``
    (model, prompt tokens, completion tokens, cost, elapsed seconds) and
    returns the JSON-decoded model output.

    Args:
        ex_usecase: Example usecase text placed in the prompt.
        ex_testcases: Example test cases text placed in the prompt.
        usecase: Target usecase text to generate test cases for.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without it a stalled connection would block the run forever.

    Returns:
        The object produced by ``json.loads`` on the extracted model reply.

    Raises:
        requests.HTTPError: On a non-success HTTP status.
        requests.Timeout: When the server does not answer within ``timeout``.
        json.JSONDecodeError: When the extracted reply is not valid JSON.
    """
    # Payload for the request
    payload = {
        "messages": [
            {
                "role": "user",
                "content": get_prompt(ex_usecase, ex_testcases, usecase)
            },
        ],
        "temperature": 0.0,
        "top_p": 0.95,
        "max_tokens": 2000
    }

    ENDPOINT = "https://qcri-llm-rag-3.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-15-preview"

    start_time = time.perf_counter()
    response = requests.post(
        ENDPOINT, headers=headers, json=payload, timeout=timeout
    )
    end_time = time.perf_counter()

    response.raise_for_status()
    response = response.json()

    # Cost uses per-million-token rates: 2.5 for prompt tokens and 10 for
    # completion tokens (currency presumably USD; confirm against pricing).
    cost = 0
    cost += (2.5 * response["usage"]["prompt_tokens"]) / 1e6
    cost += (10 * response["usage"]["completion_tokens"]) / 1e6

    # Append-only log so statistics accumulate across calls and reruns.
    with open("stat.csv", mode="a", encoding="utf-8") as file:
        file.write(f'GPT4o,{response["usage"]["prompt_tokens"]},{response["usage"]["completion_tokens"]},{cost},{end_time-start_time}\n')

    # print(response["choices"][0]["message"]["content"])

    return json.loads(parse_response(response["choices"][0]["message"]["content"]))

In [9]:
def calculate_bert_score(reference, candidate):
    """Return (precision, recall, F1) for ``candidate`` against ``reference``.

    Scoring is currently switched off: the arguments are ignored and three
    zeros are returned. The disabled implementation is kept below.
    """
    precision = recall = f1 = 0
    return precision, recall, f1
    # P, R, F1 = score([candidate], [reference], lang="en", verbose=False)
    # return P.mean().item(), R.mean().item(), F1.mean().item()

In [11]:
# Input dataset (JSON Lines) read by the generation loop.
DATASET_PATH = "dataset/dataset-20-rag.jsonl"
# Output file (JSON Lines) the results are loaded from and rewritten to.
RESULTS_PATH = "results/GPT4o-results-20-rag.jsonl"

In [27]:
# Make sure the results file exists so it can be read on a first run.
if not os.path.exists(RESULTS_PATH):
    # Create the parent directory too; open(..., "w") does not create
    # missing directories and would fail with FileNotFoundError.
    results_dir = os.path.dirname(RESULTS_PATH)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    with open(RESULTS_PATH, mode="w", encoding='utf-8') as file:
        file.write("")

In [28]:
# Load results saved by earlier runs; the generation loop skips the first
# len(results) dataset entries and appends new ones to this collection.
results = read_jsonl(RESULTS_PATH)

In [29]:
# Load the dataset and keep at most its first 100 entries.
dataset = read_jsonl(DATASET_PATH)[:100]

In [30]:
# Generate test cases for every dataset entry that has no saved result yet.
# Results are written back after each entry, so an interrupted run resumes
# from the first unprocessed index.
for idx, data in enumerate(dataset):
    already_processed = idx < len(results)
    if already_processed:
        continue

    usecase = data["usecase"]
    rag_usecase = data["rag-example"]["usecase"]
    rag_testcases = data["rag-example"]["testcases"]

    # Drop metadata fields before prompting. NOTE: this removes the keys from
    # data["usecase"] itself, so the saved record below lacks them as well.
    for metadata_key in ("author", "id"):
        if metadata_key in usecase:
            del usecase[metadata_key]

    testcases = generate_testcases(
        json.dumps(rag_usecase, indent=4),
        json.dumps(rag_testcases, indent=4),
        json.dumps(usecase, indent=4),
    )

    reference_text = json.dumps(data["testcases"], indent=4)
    candidate_text = json.dumps(testcases, indent=4)
    p, r, f1 = calculate_bert_score(
        reference=reference_text,
        candidate=candidate_text,
    )

    record = {
        "usecase": data["usecase"],
        "testcases": data["testcases"],
        "GPT4o_testcases": testcases,
        "bert_score": {"Precision": p, "Recall": r, "F1": f1},
    }
    results.append(record)

    # Checkpoint after every entry.
    write_jsonl(RESULTS_PATH, results)
    # break
    


In [31]:
# precisions, recalls, f1_scores = [], [], []
# for res in results:
#     precisions.append(res["bert_score"]["Precision"])
#     recalls.append(res["bert_score"]["Recall"])
#     f1_scores.append(res["bert_score"]["F1"])

# print(f"Average Precision: {(sum(precisions)*100)/len(precisions):0.2f}")
# print(f"Average Recall: {(sum(recalls)*100)/len(recalls):0.2f}")
# print(f"Average F1: {(sum(f1_scores)*100)/len(f1_scores):0.2f}")

Average Precision: 93.40
Average Recall: 93.75
Average F1: 93.57
