In [4]:
import os, sys
import json

In [2]:
from dotenv import load_dotenv

# Load environment variables from the shared settings file. With override=True,
# values from the .env file replace variables already set in the process
# environment (python-dotenv's default is to keep existing values).
load_dotenv("../../../../settings/.env", override=True)

True

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def variability_from_most_common(all_values):
    """Score how closely a set of texts agrees with its most representative member.

    Every ordered pair of texts is compared via the cosine similarity of their
    TF-IDF vectors; the TF-IDF model is fitted on just the two texts of the
    pair. For each text the mean similarity to all texts (itself included) is
    computed, and the highest of those means is returned.

    Despite the function name, the returned number is a *similarity*: values
    close to 1.0 mean the texts are nearly identical, lower values mean the
    texts vary more.

    Args:
        all_values: Sequence of strings (e.g. LLM completions) to compare.

    Returns:
        The highest mean pairwise cosine similarity, or NaN when
        ``all_values`` is empty.
    """
    texts = list(all_values)
    if not texts:
        # Nothing to compare; keep the NaN result explicit instead of relying
        # on how pandas reduces an empty frame.
        return float("nan")

    # Repeated completions are common (that is the point of the experiment),
    # so each distinct ordered pair is vectorised only once.
    pair_scores = {}

    def _pair_similarity(first, second):
        """Cosine similarity of two texts under a TF-IDF model fitted on both."""
        key = (first, second)
        if key not in pair_scores:
            try:
                tfidf = TfidfVectorizer().fit_transform([first, second])
            except ValueError as err:
                # scikit-learn refuses to fit when neither text yields a token
                # (e.g. two empty completions). Any other ValueError is a real
                # problem and must surface.
                if "empty vocabulary" not in str(err):
                    raise
                # Convention: identical token-less texts agree fully,
                # different ones not at all.
                pair_scores[key] = 1.0 if first == second else 0.0
            else:
                pair_scores[key] = cosine_similarity(tfidf.toarray())[0][1]
        return pair_scores[key]

    similarity_matrix = [
        [_pair_similarity(first, second) for second in texts]
        for first in texts
    ]

    # Column labels are irrelevant for the reduction, so the (potentially very
    # long and duplicated) texts are not used as column names.
    return pd.DataFrame(similarity_matrix).mean().max()

In [38]:
from IPython.display import Markdown, display

def printmd(string):
    """Render ``string`` as Markdown in the notebook's cell output."""
    rendered = Markdown(string)
    display(rendered)

In [5]:
# Make the local AIChampTools module importable from this notebook.
sys.path.insert(0, '../../../../05 Own Solutions/AIChampTools')

import importlib

# Reload BEFORE binding names from the module: importlib.reload() re-executes
# the module, but names that were already pulled in with `from ... import ...`
# keep pointing at the old objects, so reloading afterwards has no effect on
# them. Assigning the result keeps the module repr out of the cell output.
prevent_output = importlib.reload(importlib.import_module('AIChampTools'))

from AIChampTools import AIChampTools, LLMUsage, PromptEngineeringExperiment

In [6]:
# Experiment handle used by the report cells below to load logged results;
# it is pointed at the ../../logs/ folder.
experiment = PromptEngineeringExperiment(name="openai_seed_parameter", logs_folder="../../logs/")

# Experiment Report

In this experiment, I test whether the `seed` parameter of the OpenAI GPT API gives stable results, and how stable they are compared with a very low `top_p`.

## Version 09

In this version, I have set a limit of 200 tokens for completions.

In [56]:
# Load the logged completions for version 09 of the experiment.
ver09_results_nf = experiment.load_results(ver="09")

# Serialise the parameter sets and message templates to strings so they can be
# compared and de-duplicated.
ver09_results_nf["llm_params"] = ver09_results_nf["llm_params"].apply(json.dumps)
ver09_results_nf["messages_template"] = ver09_results_nf["messages_template"].apply(str)

# One report section per distinct set of LLM parameters.
for variation_no, llm_params in enumerate(ver09_results_nf["llm_params"].unique(), start=1):
    uses_these_params = ver09_results_nf["llm_params"] == llm_params
    completions = ver09_results_nf[uses_these_params]

    templates = list(completions["messages_template"].unique())
    user_messages = completions["data"].apply(str).unique()
    completion_count = len(completions)

    printmd(f'### Variation {variation_no}')
    printmd(f'**Messages template**:\n{templates}')
    printmd(f'**User Messages**:\n{user_messages}')
    printmd(f"**LLM Params**:\n{llm_params}")
    printmd(f'**Completions in the experiment**:\n{completion_count}')

    # The score is a mean cosine similarity: closer to 1 means the completions
    # are more alike.
    similarity_score = variability_from_most_common(list(completions["generation"]))
    printmd(f'**Results variability**: {similarity_score}')
    printmd("")

### Variation 1

**Messages template**:
["[{'role': 'user', 'content': '{user_message}'}]"]

**User Messages**:
['{\'user_message\': \'Describe how WW2 started.\', \'expected_answer\': \'""\'}']

**LLM Params**:
{"model": "gpt-4-1106-preview", "max_tokens": 200, "seed": 33, "temperature": 0, "timeout": 30}

**Completions in the experiment**:
100

**Results variability**: 0.8988740587231018



### Variation 2

**Messages template**:
["[{'role': 'user', 'content': '{user_message}'}]"]

**User Messages**:
['{\'user_message\': \'Describe how WW2 started.\', \'expected_answer\': \'""\'}']

**LLM Params**:
{"model": "gpt-4-1106-preview", "max_tokens": 200, "top_p": 1e-06, "temperature": 0, "timeout": 30}

**Completions in the experiment**:
100

**Results variability**: 0.8403985918161336



## Version 10

In this version, I have set a limit of 50 tokens for completions.

In [57]:
# Load the logged completions for version 10 of the experiment.
ver10_results_nf = experiment.load_results(ver="10")

# Serialise the parameter sets and message templates to strings so they can be
# compared and de-duplicated.
ver10_results_nf["llm_params"] = ver10_results_nf["llm_params"].apply(json.dumps)
ver10_results_nf["messages_template"] = ver10_results_nf["messages_template"].apply(str)

# One report section per distinct set of LLM parameters.
for variation_no, llm_params in enumerate(ver10_results_nf["llm_params"].unique(), start=1):
    uses_these_params = ver10_results_nf["llm_params"] == llm_params
    completions = ver10_results_nf[uses_these_params]

    templates = list(completions["messages_template"].unique())
    user_messages = completions["data"].apply(str).unique()
    completion_count = len(completions)

    printmd(f'### Variation {variation_no}')
    printmd(f'**Messages template**:\n{templates}')
    printmd(f'**User Messages**:\n{user_messages}')
    printmd(f"**LLM Params**:\n{llm_params}")
    printmd(f'**Completions in the experiment**:\n{completion_count}')

    # The score is a mean cosine similarity: closer to 1 means the completions
    # are more alike.
    similarity_score = variability_from_most_common(list(completions["generation"]))
    printmd(f'**Results variability**: {similarity_score}')
    printmd("")

### Variation 1

**Messages template**:
["[{'role': 'user', 'content': '{user_message}'}]"]

**User Messages**:
['{\'user_message\': \'Describe how WW2 started.\', \'expected_answer\': \'""\'}']

**LLM Params**:
{"model": "gpt-4-1106-preview", "max_tokens": 50, "seed": 33, "temperature": 0, "timeout": 30}

**Completions in the experiment**:
100

**Results variability**: 0.9999999999999999



### Variation 2

**Messages template**:
["[{'role': 'user', 'content': '{user_message}'}]"]

**User Messages**:
['{\'user_message\': \'Describe how WW2 started.\', \'expected_answer\': \'""\'}']

**LLM Params**:
{"model": "gpt-4-1106-preview", "max_tokens": 50, "top_p": 1e-06, "temperature": 0, "timeout": 30}

**Completions in the experiment**:
100

**Results variability**: 0.9019718039364597



## (Preliminary) Conclusion

More variations of the experiment (both in terms of models and input data) are required to draw a definitive conclusion.

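From the tests done so far, we can see that the `seed` parameter gives more stable results than a low `top_p`, and that the longer the completion, the higher the variability. Note that the reported "Results variability" score is a cosine similarity, so values closer to 1 mean *less* variability: with `seed` it was about 0.90 at 200 tokens versus about 1.00 at 50 tokens, and with a low `top_p` about 0.84 versus about 0.90.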