In [1]:
import groq
import os
from dotenv import load_dotenv
import dspy
import numpy as np
from dspy.evaluate.metrics import answer_exact_match
import pandas as pd
from IPython.core.display import Markdown
import io
import contextlib
from dspy import Example
from dspy.teleprompt import BootstrapFewShot
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.evaluate import Evaluate
from module_v002 import FullLLMChain 
from optimize import passage_similarity_metric, custom_evaluation_function, similar_score_metric, evaluate_expectations_metric, create_dataframe_with_validation_results, capture_and_save_output
import json
from data.preprocess import create_dspy_examples_train_test_validation_sets
# Load variables from a .env file (python-dotenv), then read the OpenAI key
# from the process environment; the value is None when the variable is unset.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Llama 3 8B client served through a local Ollama instance.
# temperature=0 and max_tokens=800 mirror the GPT client below.
llama3_8b = dspy.OllamaLocal(
    model="llama3:8b",
    temperature=0,
    max_tokens=800,
)

# OpenAI chat-model client using the key read above.
gpt35turbo = dspy.OpenAI(
    model="gpt-3.5-turbo-0125",
    api_key=openai_api_key,
    temperature=0,
    max_tokens=800,
    model_type="chat",
)



# Import Data and Create Training, Testing, Validation Splits

In [2]:
# Read the labelled transcript snippets from the project's Excel workbook.
data = pd.read_excel("data/300_snippets_transcripts_all_labeled_v002.xlsx")

# Build the train / test / validation example sets, requesting 50 examples
# for each split from the project's preprocessing helper.
train_set, test_set, validation_set = create_dspy_examples_train_test_validation_sets(
    data=data, train_size=50, test_size=50, validation_size=50
)

# Run with the Unoptimized GPT Model

In [3]:
# Register the GPT-3.5 client as the LM in DSPy's global settings, then build
# a fresh FullLLMChain. No saved state is loaded into it, so this instance is
# the baseline ("unoptimized") chain.
dspy.settings.configure(lm=gpt35turbo)
full_llm_chain_unoptimized = FullLLMChain()

In [4]:
# Evaluate the baseline chain on the validation split using the project helper
# imported from `optimize`. The result is later written with `.to_excel` and
# read through an "Evaluation_Score" column, so it is presumably a pandas
# DataFrame with one row per validation example — confirm in optimize.py.
results_unoptimized = create_dataframe_with_validation_results(
    validation_examples=validation_set,
    llm_chain=full_llm_chain_unoptimized
)

Finished validation example 0 out of 50 (0.00%)
Finished validation example 1 out of 50 (2.00%)
Finished validation example 2 out of 50 (4.00%)
Finished validation example 3 out of 50 (6.00%)
Finished validation example 4 out of 50 (8.00%)
Finished validation example 5 out of 50 (10.00%)
Finished validation example 6 out of 50 (12.00%)
Finished validation example 7 out of 50 (14.00%)
Finished validation example 8 out of 50 (16.00%)
Finished validation example 9 out of 50 (18.00%)
Finished validation example 10 out of 50 (20.00%)
Finished validation example 11 out of 50 (22.00%)
Finished validation example 12 out of 50 (24.00%)
Finished validation example 13 out of 50 (26.00%)
Finished validation example 14 out of 50 (28.00%)
Finished validation example 15 out of 50 (30.00%)
Finished validation example 16 out of 50 (32.00%)
Finished validation example 17 out of 50 (34.00%)
Finished validation example 18 out of 50 (36.00%)
Finished validation example 19 out of 50 (38.00%)
Finished valida

In [5]:
# Persist the baseline validation results as an Excel workbook.
results_unoptimized.to_excel("validation/validation_unoptimized_gpt_v009.xlsx")

In [6]:
# Hand the GPT client's `inspect_history` method, a target path and n=3 to the
# project helper from `optimize`. Presumably it captures what
# `inspect_history(n=3)` prints and writes it to the file — confirm in optimize.py.
capture_and_save_output(gpt35turbo.inspect_history, "validation/prompts_gpt_unoptimized_v009.txt", n=3)

In [7]:
# Show the 3 most recent prompt/completion exchanges of the model that ran in
# this section. The baseline run is configured with `gpt35turbo`; the
# `llama3_8b` client is never used for inference in this notebook, so
# inspecting it would display nothing.
gpt35turbo.inspect_history(n=3)

# Run with the Optimized GPT Model

In [30]:
# Register the GPT-3.5 client as the LM in DSPy's global settings.
dspy.settings.configure(lm=gpt35turbo)

# Build a FullLLMChain and restore previously saved (optimized) state from JSON.
# NOTE(review): the chain file loaded here is tagged v009, while this section's
# outputs are written with a v011 suffix — confirm the version pairing is intended.
full_llm_chain_optimized = FullLLMChain()
full_llm_chain_optimized.load(path="optimized_llm_chains/optimized_gpt_chain_v009.json")

In [31]:
# Evaluate the loaded (optimized) chain on the same validation split as the
# baseline run, using the project helper imported from `optimize`. The result
# is later written with `.to_excel` and read through an "Evaluation_Score"
# column, so it is presumably a pandas DataFrame — confirm in optimize.py.
results_optimized = create_dataframe_with_validation_results(
    validation_examples=validation_set,
    llm_chain=full_llm_chain_optimized
)

Finished validation example 0 out of 50 (0.00%)
Finished validation example 1 out of 50 (2.00%)
Finished validation example 2 out of 50 (4.00%)
Finished validation example 3 out of 50 (6.00%)
Finished validation example 4 out of 50 (8.00%)
Finished validation example 5 out of 50 (10.00%)
Finished validation example 6 out of 50 (12.00%)
Finished validation example 7 out of 50 (14.00%)
Finished validation example 8 out of 50 (16.00%)
Finished validation example 9 out of 50 (18.00%)
Finished validation example 10 out of 50 (20.00%)
Finished validation example 11 out of 50 (22.00%)
Finished validation example 12 out of 50 (24.00%)
Finished validation example 13 out of 50 (26.00%)
Finished validation example 14 out of 50 (28.00%)
Finished validation example 15 out of 50 (30.00%)
Finished validation example 16 out of 50 (32.00%)
Finished validation example 17 out of 50 (34.00%)
Finished validation example 18 out of 50 (36.00%)
Finished validation example 19 out of 50 (38.00%)
Finished valida

In [32]:
# Persist the optimized-chain validation results as an Excel workbook.
results_optimized.to_excel("validation/validation_optimized_gpt_v011.xlsx")

In [33]:
# Hand the GPT client's `inspect_history` method, a target path and n=3 to the
# project helper from `optimize`. Presumably it captures what
# `inspect_history(n=3)` prints and writes it to the file — confirm in optimize.py.
capture_and_save_output(gpt35turbo.inspect_history, "validation/prompts_gpt_optimized_v011.txt", n=3)

In [34]:
# Display the 3 most recent prompt/completion exchanges recorded by the GPT
# client, to check the prompts sent during the optimized run.
gpt35turbo.inspect_history(n=3)





---CONTEXT---
    You are an experienced analyst known for your ability to succinctly summarize complex information and extract key insights from various texts.

    ---TASK---
    Your task is to summarize the role of the country in the given text excerpt, i.e. why the country is mentioned.

    ---GUIDELINES---
    - Focus on clearly and concisely capturing the main points related to the country's role.
    - Only include information on one country - the one mentioned in the keyword.

---

Follow the following format.

Country Keyword: keyword that represents a country
Excerpt: excerpt from a financial services company's earnings conference call
Answer: one or two sentences

---

Country Keyword: spain
Excerpt: e specific products, and we are in constant cooperation with the government on this. but in terms of the impact, you were more asking for the impact, i guess. for the gdp, first of all, for the gdp, this next-generation eu, our bbva research team, they do expect a 1 percen

# Look at Metrics

In [24]:
# Summarise the baseline run: mean evaluation score, plus how many rows
# scored exactly 0.5 and exactly 1.0.
unoptimized_scores = results_unoptimized["Evaluation_Score"]
metric_unoptimized = unoptimized_scores.mean()
number_05_scores = int(unoptimized_scores.eq(0.5).sum())
number_1_scores = int(unoptimized_scores.eq(1.0).sum())
print(metric_unoptimized, number_05_scores, number_1_scores)


0.37 15 11


In [25]:
# Summarise the optimized run: mean evaluation score, plus how many rows
# scored exactly 0.5, exactly 1.0 and exactly 0.0.
optimized_scores = results_optimized["Evaluation_Score"]
metric_optimized = optimized_scores.mean()
number_05_scores = int(optimized_scores.eq(0.5).sum())
number_1_scores = int(optimized_scores.eq(1.0).sum())
number_0_scores = int(optimized_scores.eq(0.0).sum())
print(metric_optimized, number_05_scores, number_1_scores, number_0_scores)

0.57 3 27 20
