# General Settings and Functions

## Settings

In [151]:
import os, sys
from openai import OpenAI, ChatCompletion, APITimeoutError

In [89]:
from dotenv import load_dotenv

# Load environment variables from the shared settings file (path is relative to
# the notebook's working directory). override=True lets values from the file
# replace variables that are already set in the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by OpenAI()
# comes from — confirm against the .env contents.
load_dotenv("../../settings/.env", override=True)

True

In [132]:
import importlib

# Pick up edits to the local AIChampTools module without restarting the kernel.
# Reload only when the module is already loaded: on a fresh kernel sys.modules
# has no 'AIChampTools' entry and indexing it would raise KeyError, so the plain
# import below performs the initial load instead.
if 'AIChampTools' in sys.modules:
    importlib.reload(sys.modules['AIChampTools'])
from AIChampTools import AIChampTools, LLMUsage

# Shared helper instance used by the test run and result loading cells.
aicht = AIChampTools()

## Functions

### visualize_df

In [53]:
from IPython.display import display, HTML
import pandas as pd

# Show every row and never truncate cell text, so long answers and reasoning
# strings are fully readable. Both settings are global for the kernel session.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

def visualize_df(df, widths=None):
    """Display ``df`` preceded by a ``<style>`` block that left/top-aligns the table.

    The CSS targets ``table.dataframe``: every cell defaults to 400px wide and
    the first column of each row is narrowed to 30px.

    Args:
        df: Object handed to ``display`` after the styling has been emitted.
        widths: Optional mapping ``{column position: width in px}``. Each
            position ``n`` is converted with ``int`` and applied to
            ``nth-child(n + 1)`` of both header and body cells, i.e. positions
            count from 1 starting after the narrow first column.
    """
    # Define the CSS styling
    css = '''
    <style>
        table.dataframe {
            width: 100%;
        }
        table.dataframe td, table.dataframe th {
            width: 400px;
            text-align: left !important;
            vertical-align: top;
        }
        table.dataframe th:nth-child(1), table.dataframe td:nth-child(1) {
            width: 30px !important;
        }
    '''

    # `widths or {}` replaces the former mutable default argument.
    for n, w in (widths or {}).items():
        # Header and body cells of one column share the same child position,
        # so both selectors must use the same (offset) index.
        child = int(n) + 1
        css += f'''
        table.dataframe th:nth-child({child}), table.dataframe td:nth-child({child}) {{
            width: {w}px !important;
            text-align: left !important;
        }}
        '''

    css += '''
        table.dataframe th {
            text-align: left !important;
        }
    </style>
    '''

    # Apply the CSS and display the DataFrame
    display(HTML(css))
    display(df)

# Test Setup

## Settings

In [144]:
# Embedded in the name of the report CSV written by the test run.
report_filename_prefix = "multiple_system_messages"

# First system message: three base facts plus the answering rules
# ("I don't know" for unknown facts, "Wrong" for contradictions).
system_prompt_template = """
You know 3 facts and 3 facts only:
1. apples are green
2. ducks are fun
3. roses are red

Think step by step and then answer.

If the user asks about a fact you don't know, you answer: "I don't know".

If the user makes a statement or asks a question that contradicts your knowledge, you answer: "Wrong"
"""

# Second system message: three additional facts and a rule for listing all
# facts. The experiment checks whether it extends or replaces the first one.
system_prompt2_template = """
You know 3 more facts:
4. platypuses are weird
5. people invented AI
6. otters can swim

If the user is requesting you for all the facts you know, you share them as a list.
"""

# Conversation template: two consecutive system messages followed by the user
# turn. "{question}" is a placeholder.
# NOTE(review): presumably filled from each test_data record's "question" by
# AIChampTools — confirm.
messages = [
    {"role": "system", "content": system_prompt_template},
    {"role": "system", "content": system_prompt2_template},
    {"role": "user", "content": "{question}"},
]

# Test questions, grouped by the behaviour they probe. Every question becomes a
# {"question": ..., "type": ...} record; group order and question order are
# preserved in the resulting list.
test_data = [
    {"question": question, "type": question_type}
    for question_type, questions in (
        # Is the first system message still honoured, i.e. not overwritten by
        # the second one?
        (
            "remembering previous system message",
            (
                "what are apples?",
                "apples are red",
                "which color are apples?",
                "are ducks fun?",
                "ducks are boring",
                "what do you know about the roses?",
            ),
        ),
        # Does the AI also take the second system message into account?
        (
            "usage of second system message",
            (
                "who invented AI?",
                "did god create AI?",
                "what do you know about platypuses?",
                "what do you know about otters?",
                "what do you know about dolphins?",
            ),
        ),
        # Double check that the AI combines both system messages.
        (
            "AI using both rulesets",
            (
                "list all the facts you know",
            ),
        ),
        # Double check that the AI follows the instructions and does not make
        # things up.
        (
            "AI not making things up",
            (
                "who created people?",
                "which color are mangoes?",
            ),
        ),
    )
    for question in questions
]


# Parameters for the model under test. "model" and "temperature" are also
# embedded in the report filename built by the test run.
llm_params = {
    "model": "gpt-3.5-turbo-1106",
    "temperature": 0,
}


## Assessment Functions

In [103]:
import json

def assess_response(messages, ai_response, llm_params):
    """Ask an LLM judge whether ``ai_response`` follows every system message.

    Args:
        messages: Conversation (list of ``{"role", "content"}`` dicts) that was
            sent to the model under test. It is not modified; the assistant
            turn is added to a copy.
        ai_response: Answer produced by the model under test.
        llm_params: Keyword arguments forwarded to
            ``chat.completions.create`` for the judge call.

    Returns:
        A dict with the keys ``"reasoning"``, ``"status"``, ``"llm_params"``
        and ``"llm_usage"``. ``status`` is whatever the judge returned (the
        prompt asks for "correct" or "incorrect"), or ``"timeout"`` with
        ``reasoning`` set to None when the request timed out.
        None when the request failed for any other reason, or when the reply
        is not a JSON object containing both "reasoning" and "status".
    """
    # Build a new list instead of appending in place: `messages` belongs to the
    # caller and may be reused for other validators or repetitions.
    conversation = [*messages, {"role": "assistant", "content": ai_response}]

    openai_client = OpenAI()

    system_message = f"""

    Here's a conversation between an Assistant and a User:
    --------
    {conversation}
    --------
    
    Your objective is to assess if the Assistant's response is correctly following all system messages.
    
    You must reply in JSON format with the fields:
    - "reasoning": your step by step detailed reasoning of the assessment
    - "status", all the possible options:
        - correct
        - incorrect
    """

    messages4ai = [
        {"role": "system", "content": system_message},
    ]

    try:
        completion = openai_client.chat.completions.create(**llm_params, messages=messages4ai)
        llm_response = json.loads(completion.model_dump_json())
    except APITimeoutError:
        # Report the timeout as a status so the run keeps going and the row is
        # simply not counted as "correct".
        return {
            "reasoning": None,
            "status": "timeout",
            "llm_params": llm_params,
            "llm_usage": LLMUsage(),
        }
    except Exception as e:
        print("\n\n")
        print(f"Unexpected error: {e}")
        print(f"llm_params: {llm_params}")
        print(f"messages: {conversation}")
        print("\n\n")
        # There is no response to parse; same "no result" value as a bad reply.
        return None

    try:
        res = json.loads(llm_response["choices"][0]["message"]["content"])
        reasoning = res["reasoning"]
        status = res["status"]
    except (KeyError, IndexError, TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; the others cover a reply
        # that parses but lacks the expected structure or fields.
        print("\n\n")
        print("PROBABLY NOT JSON")
        print(f"llm_response: {llm_response}")
        print("\n\n")
        return None

    return {
        "reasoning": reasoning,
        "status": status,
        "llm_params": llm_params,
        # NOTE(review): LLMUsage is constructed without arguments, so the
        # judge call's token usage is not recorded here — confirm whether it
        # should be populated from the response.
        "llm_usage": LLMUsage(),
    }

# Judge configurations that request a JSON object reply. They differ only in
# the model name and the request timeout. The validator spec defined after
# them references the GPT-4 variant.
llm_params_gpt35turbo1106json = dict(
    model="gpt-3.5-turbo-1106",
    temperature=0,
    response_format=dict(type="json_object"),
    timeout=10,
)
llm_params_gpt4turbo1106json = dict(
    model="gpt-4-1106-preview",
    temperature=0,
    response_format=dict(type="json_object"),
    timeout=20,
)

# Validator spec passed to aicht.generate_synth_data_openai via validators=[...].
# NOTE(review): the schema is interpreted by AIChampTools, which is not visible
# here. Presumably "input" lists key paths resolved per generated record and
# passed positionally (messages, ai_response), while "input2" holds literal
# extra arguments (llm_params) — confirm against AIChampTools.
validate_response_validator = {
    "function": assess_response,
    "input": [
        ["messages"],
        ["synth_data"],
    ],
    "input2": [
        llm_params_gpt4turbo1106json
    ]
}

# Show the validator function's name. The result columns selected for the
# presentation frame are prefixed "validator_assess_response." — presumably
# derived from this name (confirm).
print(validate_response_validator["function"].__name__)


assess_response


# Testing

In [145]:
from datetime import datetime

# Timestamp (local time, second resolution) that makes each run's report
# filename distinct.
now = datetime.now()
now_str = now.strftime("%Y%m%d%H%M%S")

# Report path: prefix, timestamp, model name and temperature of the model under test.
test_filename = f"reports/prompt_testing_{report_filename_prefix}_{now_str}_{llm_params['model']}-t{llm_params['temperature']}.csv"

# Run the experiment: 14 questions in test_data with reps=20; the later row
# count of 280 matches 14 x 20.
# NOTE(review): presumably this issues live OpenAI requests for every
# generation and validation, and max_data_points=0 means "no cap" — confirm
# against AIChampTools.
test_results = aicht.generate_synth_data_openai(
    llm_params=llm_params,
    messages=messages,
    data=test_data,
    reps=20,
    max_data_points=0,
    save_to=test_filename,
    validators=[validate_response_validator]
)

In [146]:
# Reload the results from the saved report file(s); this rebinds test_results.
# The commented-out paths are reports from earlier runs that can be loaded
# instead of (or together with) the current one.
# NOTE(review): flatten=1 presumably expands nested fields into dotted column
# names such as "data.question" — confirm against AIChampTools.
test_results = aicht.load_synth_data(
    [
        # "reports/prompt_testing_multiple_system_messages_20231208163137_gpt-3.5-turbo-1106-t0.csv",
        # "reports/prompt_testing_multiple_system_messages_20231208175023_gpt-3.5-turbo-1106-t0.csv",
        test_filename,
    ],
    flatten=1)

In [None]:
# test_results.columns

In [147]:
# Flattened result column -> presentation header. The dict order is the
# column order of the presentation frame.
columns = {
    "data.question": "Question",
    "data.type": "Question Type",
    "generation": "Answer",
    "validator_assess_response.assess_response_status": "Is Correct",
    "validator_assess_response.assess_response_reasoning": "Reasoning"
}
# Keep only the mapped columns and give them their presentation headers.
test_results_pres = test_results[list(columns)].rename(columns=columns)

In [148]:
# Percentage of answers the judge marked "correct", per question type.
(
    test_results_pres['Is Correct']
    .eq('correct')
    .groupby(test_results_pres['Question Type'])
    .mean()
    .mul(100)
)

Question Type
AI not making things up                100.0
AI using both rulesets                 100.0
remembering previous system message    100.0
usage of second system message         100.0
Name: Is Correct, dtype: float64

In [149]:
# Sanity check on the number of result rows: 14 questions x 20 repetitions = 280.
len(test_results_pres)

280

In [150]:
# Render the full presentation frame with the left-aligned table styling.
# All rows are shown because display.max_rows is set to None.
visualize_df(
    test_results_pres
)

Unnamed: 0,Question,Question Type,Answer,Is Correct,Reasoning
0,what are apples?,remembering previous system message,Apples are green.,correct,"The user asked 'what are apples?' which is a direct question about a fact that the Assistant has been informed of. According to the first fact provided by the system, 'apples are green'. The Assistant correctly responded with 'Apples are green.' This response is in line with the instructions given by the system."
1,what are apples?,remembering previous system message,Apples are green.,correct,"The user asked 'what are apples?' and the Assistant correctly responded with one of the facts it knows, which is 'Apples are green.' This is in line with the instructions given in the system messages, where the first fact listed is that apples are green. The Assistant's response does not contradict any of the instructions or facts provided."
2,what are apples?,remembering previous system message,Apples are green.,correct,"The user asked 'what are apples?' and the Assistant correctly responded with the fact it knows, which is 'Apples are green.' This is in line with the first fact provided by the system messages. The Assistant's response follows the instructions given and does not contradict any of the additional facts it has been told to know."
3,what are apples?,remembering previous system message,Apples are green.,correct,The user asked 'what are apples?' which is a direct question about the nature of apples. The Assistant responded with 'Apples are green.' This is in line with the first fact provided by the system: 'apples are green.' The Assistant's response is consistent with the information given to it and follows the instructions correctly.
4,what are apples?,remembering previous system message,Apples are green.,correct,"The user asked 'what are apples?' and the Assistant correctly responded with the fact it knows, which is 'Apples are green.' This is in line with the first fact provided by the system: 'apples are green.' The Assistant's response is consistent with the instructions given."
5,what are apples?,remembering previous system message,Apples are green.,correct,"The user asked 'what are apples?' which directly relates to one of the facts the Assistant knows, which is 'apples are green'. The Assistant correctly responded with the fact it knows, following the instructions given in the system messages."
6,what are apples?,remembering previous system message,Apples are green.,correct,"The user asked 'what are apples?' which directly relates to the first fact the Assistant knows: 'apples are green'. The Assistant correctly responded with 'Apples are green.' This response is in line with the instructions given in the system messages, as the Assistant is supposed to answer based on the facts it knows."
7,what are apples?,remembering previous system message,Apples are green.,correct,"The user asked 'what are apples?' and according to the first fact the Assistant knows ('apples are green'), the Assistant's response 'Apples are green.' is correct and follows the instructions given by the system messages."
8,what are apples?,remembering previous system message,Apples are green.,correct,"The user asked 'what are apples?' which is a direct question about the nature of apples. According to the facts provided by the system, the Assistant correctly responded with 'Apples are green.' This is in line with fact number 1 provided by the system. The Assistant's response does not contradict any of the instructions or facts given by the system."
9,what are apples?,remembering previous system message,Apples are green.,correct,"The user asked 'what are apples?' which is a direct question about the nature of apples. The Assistant's response 'Apples are green.' is in line with the first fact it knows ('apples are green'). Therefore, the Assistant's response is correct as it follows the instructions given in the system messages."
