# Setup

In [2]:
import dspy, os, random, re, requests, json
from pprint import pprint
from dspy import Example
import tqdm as notebook_tqdm
from pprint import pprint
from typing import Union, Literal, Optional
from dataclasses import dataclass
from collections import Counter

together_openai = dspy.OpenAI(
    api_base = os.getenv("TOGETHER_API_BASE"),
    api_key= os.getenv("TOGETHER_API_KEY"),
    # api_key=os.getenv("OPENAI_API_KEY"),
    # model="gpt-3.5-turbo-0125"
    # model="Qwen/Qwen1.5-72B-Chat",
    # model="mistralai/Mistral-7B-Instruct-v0.2", 
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    # model="mistralai/Mixtral-8x22B",
    # model="mistralai/Mixtral-8x22B-Instruct-v0.1",
    # model="meta-llama/Llama-3-70b-chat-hf",
    # model="meta-llama/Meta-Llama-3-70B",
    # model="nonexistent/snowflake-arctic-instruct",
)
dspy.configure(lm=together_openai, trace=[])
# together_openai("Hello, how are you?")

# Inference monitoring

In [4]:
# INFERENCE MONITORING
import phoenix as px
from openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

endpoint = "http://127.0.0.1:6006/v1/traces"
resource = Resource(attributes={})
tracer_provider = trace_sdk.TracerProvider(resource=resource)
span_otlp_exporter = OTLPSpanExporter(endpoint=endpoint)
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter=span_otlp_exporter))

trace_api.set_tracer_provider(tracer_provider=tracer_provider)
DSPyInstrumentor().instrument()

phoenix_session = px.launch_app()

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


# Compilation

In [33]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, COPRO, MIPRO
from llm_modules.candidates import CompiledCandidates
# from dspy.functional import TypedPredictor
from dspy.datasets import DataLoader

userprefs = [
    dict(n=1, foreign_word='Sperre', keywordcount=Counter({'spear': 8, 'pear': 1, 'spare': 1, 'sphere': 1})),
    dict(n=2, foreign_word='Hose', keywordcount=Counter({'hose': 7, 'house': 2, 'horse': 1, 'hoes': 1})),
    dict(n=3, foreign_word='Nehmen', keywordcount=Counter({'nemo': 5, 'naming': 1, 'neanderthal': 1, 'neiman': 1, 'neem': 1, 'no men': 1, 'nah man': 1})),
    dict(n=4, foreign_word='Haben', keywordcount=Counter({'haven': 6, 'have': 2, 'hobby': 1, 'habit': 1, 'happen': 1})),
    dict(n=5, foreign_word='Ecke', keywordcount=Counter({'echo': 9, 'eek': 1})),
    dict(n=6, foreign_word='Dohle', keywordcount=Counter({'dollar': 2, 'dolly': 1, 'dole': 1, 'doily': 1, 'dolphin': 1, 'ice cream': 1, 'doha': 1, 'dodo': 1, 'owl': 1, 'doll': 1})),
    dict(n=7, foreign_word='Kaufen', keywordcount=Counter({'coffin': 7, 'kaufland': 1, 'cough': 1})),
    dict(n=8, foreign_word='Fliegen', keywordcount=Counter({'flies': 4, 'fly': 3, 'flying': 2, 'fleeing': 1, 'طيارة': 1})),
    dict(n=9, foreign_word='Leiter', keywordcount=Counter({'leader': 3, 'ladder': 2, 'later': 1, 'lighter': 1, 'latter': 1, 'liter': 1})),
    dict(n=10, foreign_word='Friseur', keywordcount=Counter({'freezer': 9, 'fries': 1, 'frisbee': 1})),
    dict(n=11, foreign_word='Stellen', keywordcount=Counter({'stallion': 6, 'steal': 1, 'stellan': 1, 'stellar': 1, 'stealing': 1})),
    dict(n=12, foreign_word='Brauchen', keywordcount=Counter({'broken': 4, 'bra': 2, 'brunch': 1, 'bracken': 1, 'braunschweig': 1, 'brought': 1, 'break in': 1})),
    dict(n=13, foreign_word='Teller', keywordcount=Counter({'teller': 9, 'telly': 1, 'to tell': 1})),
    dict(n=14, foreign_word='Küche', keywordcount=Counter({'kitchen': 5, 'couch': 2, 'cutie': 1, 'cooking': 1, 'ketchup': 1, 'cook': 1})),
    dict(n=15, foreign_word='Mieten', keywordcount=Counter({'mitten': 9, 'meet': 1, 'meeting': 1})),
    dict(n=16, foreign_word='Zahlen', keywordcount=Counter({'salad': 2, 'zara': 1, 'betalen': 1, 'zaal': 1, 'sailing': 1, 'zach': 1, 'hall': 1, 'salem': 1, "sellin'": 1})),
    dict(n=17, foreign_word='Klippe', keywordcount=Counter({'clip': 7, 'clipper': 2})),
    dict(n=18, foreign_word='Fahne', keywordcount=Counter({'fan': 8, 'fane': 2})),
    dict(n=19, foreign_word='Rufen', keywordcount=Counter({'roofing': 9, 'rufus': 1})),
    dict(n=20, foreign_word='Graben', keywordcount=Counter({'grab': 11})),
    dict(n=21, foreign_word='Schere', keywordcount=Counter({'chair': 4, 'shears': 2, 'sphere': 2, 'shear': 1, 'cheer': 1, 'sheer': 1})),
    dict(n=22, foreign_word='Rasen', keywordcount=Counter({'racing': 4, 'raisin': 3, 'razor': 2, 'rasin': 1, 'raising': 1})),
    dict(n=23, foreign_word='Stoßen', keywordcount=Counter({'stone': 4, 'stolen': 3, 'stopping': 1, 'stowing': 1})),
    dict(n=24, foreign_word='Streichen', keywordcount=Counter({'strike': 4, 'stretch': 4, 'striking': 1, 'stricken': 1})),
    dict(n=25, foreign_word='Schalter', keywordcount=Counter({'shoulder': 4, 'shelter': 3, 'shatter': 2, 'shall tear': 1, 'saltier': 1})),
    dict(n=26, foreign_word='Flasche', keywordcount=Counter({'flash': 7, 'flask': 2, 'flash card': 1})),
    dict(n=27, foreign_word='Streiten', keywordcount=Counter({'straighten': 3, 'straiten': 3, 'street': 2, 'stripes': 1, 'straight': 1, 'strengthen': 1})),
    dict(n=28, foreign_word='Laufen', keywordcount=Counter({'laughing': 7, 'laugh': 2, 'launch': 1})),
    dict(n=29, foreign_word='Brücke', keywordcount=Counter({'bridge': 3, 'brooke': 1, 'broke': 1, 'broken': 1, 'brick': 1, 'brook': 1, 'bruck': 1})),
    dict(n=30, foreign_word='Messer', keywordcount=Counter({'mess': 4, 'messi': 2, 'massage': 2, 'massive': 1, 'massacre': 1, 'messier': 1})),
    dict(n=31, foreign_word='Treten', keywordcount=Counter({'treat': 9, 'tread': 1, 'treating': 1})),
    dict(n=32, foreign_word='Tragen', keywordcount=Counter({'dragon': 3, 'tragic': 2, 'take': 1, 'trigger': 1, 'dragging': 1, 'tragedy': 1})),
    dict(n=33, foreign_word='Nagel', keywordcount=Counter({'nail': 5, 'nigel': 2, 'knob': 1, 'navel': 1, 'nagael': 1})),
    dict(n=34, foreign_word='Birne', keywordcount=Counter({'burn': 6, 'bear': 1, 'bernie sanders': 1, 'beer': 1, 'berney sanders': 1})),
    dict(n=35, foreign_word='Sagen', keywordcount=Counter({'sage': 3, 'saying': 2, 'saigon': 1, 'sagan': 1, 'sagging': 1, 'sacking': 1})),
    dict(n=36, foreign_word='Reißen', keywordcount=Counter({'rice': 5, 'raisin': 1, 'rise': 1, 'rising': 1, 'rain': 1, 'reisen': 1})),
]
examples = [Example(language="German", foreign_word=up['foreign_word'], similar_word=k, ratio=v/sum((i for i in up['keywordcount'].values()))).with_inputs("language", "foreign_word")
            for up in userprefs 
            for k, v in up['keywordcount'].items()
            ]
print(len(examples))
# examples = [Example(language="German", foreign_word=up['foreign_word'], similar_words = list(up['keywordcount'])).with_inputs("language", "foreign_word") for up in userprefs]

dl = DataLoader()
splits = dl.train_test_split(examples, train_size=0.8) # `dataset` is a List of dspy.Example
trainset = splits['train']
testset = splits['test']

class SimilarOrthography(dspy.Signature):
    """Generate an English word that is as similar as possible to the foreign word, either orthographically or phonetically."""
    language = dspy.InputField()
    foreign_word = dspy.InputField()
    similar_word: list[str] = dspy.OutputField()

class SimilarWordModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_word = dspy.Predict(SimilarOrthography)

    def forward(self, language, foreign_word, translation=None):
        sim_word = self.generate_word(language=language, foreign_word=foreign_word, config=dict(n=10, stop="\n"))
        return sim_word

def get_keywordcount(foreign_word):
    for pref in userprefs:
        if pref['foreign_word'] == foreign_word:
            return pref['keywordcount']
    return None

def weighted_precision(example, prediction, trace=[]):
    # print(example)
    # print(prediction)
    foreign_word = example.foreign_word
    candidates = prediction.completions.similar_word
    candidates = [p.lower() for p in candidates]

    keywordcount = get_keywordcount(foreign_word)
    total_weight = sum(keywordcount.values())
    match_weight = sum(keywordcount.get(kw, 0) for kw in candidates)
    # print("--------------------")
    # print("foreign_word:", foreign_word, "keywordcount:", keywordcount, "candidates:", candidates, "\n", 
    #       "match_weight:", match_weight, "total_weight:", total_weight, "precision:", match_weight / total_weight if total_weight > 0 else 0)

    return match_weight / total_weight if total_weight > 0 else 0


import numpy as np
from sklearn.metrics import ndcg_score
def ndcg_score(example, prediction, trace=[]):
    candidates = prediction.completions.similar_word
    candidates_lower = [p.lower() for p in candidates]
    # Remove duplicates, sort by count
    candidates_dedup_sorted = sorted(list({p for p in candidates_lower}), key=lambda x: candidates_lower.count(x), reverse=True)
    
    keywordcount = get_keywordcount(example.foreign_word)
    scores = [candidates.count(kw) for kw in candidates_dedup_sorted]
    true_relevance = [keywordcount.get(kw, 0) for kw in candidates_dedup_sorted]
    if len(scores) < 2:
        return 0

    return ndcg_score(true_relevance, scores)


# smw = SimilarWordModule()
# smw("German", "Hause").completions
# config = dict(max_bootstrapped_demos=4, max_labeled_demos=4, num_candidate_programs=10, num_threads=2)
# teleprompter = BootstrapFewShotWithRandomSearch(metric=weighted_precision, **config)
# bfswrs_new_examples = teleprompter.compile(SimilarWordModule(), trainset=trainset)

# # COPRO
# eval_kwargs = dict(num_threads=16, display_progress=True, display_table=0)
# copro_teleprompter = COPRO(prompt_model=together_openai, metric=weighted_precision, breadth=5, depth=5, init_temperature=0.7, verbose=True)
# compiled_copro = copro_teleprompter.compile(SimilarWordModule(), trainset=trainset, eval_kwargs=eval_kwargs)

# MIPRO
# teleprompter = MIPRO(prompt_model=together_openai, task_model=model_that_solves_task, metric=your_defined_metric, num_candidates=num_new_prompts_generated, init_temperature=prompt_generation_temperature)
# kwargs = dict(num_threads=NUM_THREADS, display_progress=True, display_table=0)
# compiled_program_optimized_bayesian_signature = teleprompter.compile(your_dspy_program, trainset=trainset, num_trials=100, max_bootstrapped_demos=3, max_labeled_demos=5, eval_kwargs=kwargs)

# BootstrapFewShotWithOptuna
# from dspy.teleprompt import BootstrapFewShotWithOptuna
# fewshot_optuna_optimizer2 = BootstrapFewShotWithOptuna(metric=weighted_precision, max_bootstrapped_demos=20, num_candidate_programs=8, num_threads=4)
# compiled_optuna = fewshot_optuna_optimizer2.compile(student=SimilarWordModule(), max_demos=20, trainset=trainset, valset=trainset)


169


# Test

In [12]:
# together_openai = dspy.OpenAI(
#     api_base = os.getenv("TOGETHER_API_BASE"),
#     api_key= os.getenv("TOGETHER_API_KEY"),
#     model="meta-llama/Meta-Llama-3-70B",
# )
# dspy.configure(lm=together_openai, trace=[])

pr = dspy.Predict('question -> answer')
response = pr(question="What is the capital of Egypt?", config={"stop": "\n"})
print(response.answer)


Cairo


In [33]:

# print(str(optimized_program) + "sss")
# optimized_program.save(path="optimized_fswrs.json")
# optimized_program_2.save(path="optimized_fswrs_2.json")
# compiled_copro.save(path="compiled_copro.json")
# copmiled_optuna.save(path="compiled_optuna.json")
compiled_optuna.save(path="compiled_optuna2.json")

# Evaluate

In [40]:
from dspy.evaluate.evaluate import Evaluate
from dspy.teleprompt import LabeledFewShot
from pprint import pprint

# evaluation = Evaluate(devset=testset, metric=weighted_precision, num_threads=1, display_progress=True, display_table=0)

# lfs4 = LabeledFewShot(k=4)
# lfs4_optimized = lfs4.compile(SimilarWordModule(), trainset=trainset[:4])
# lfs4_evaluation = evaluation(lfs4_optimized)
print("LabeledFewShot evaluation", lfs4_evaluation)

lms_evaluation = evaluation(LabeledFewShot(k=40).compile(SimilarWordModule(), trainset=trainset))
print("LabeledManyShot evaluation", lms_evaluation)

# fswrs_optimized = SimilarWordModule()
# fswrs_optimized.load(path="optimized_fswrs.json")
# fswrs_evaluation = evaluation(fswrs_optimized)
print("Fewshotwithrandomsearch evaluation", fswrs_evaluation)

# fswrs_optimized_2 = SimilarWordModule()
# fswrs_optimized_2.load(path="optimized_fswrs_2.json")
# fswrs_evaluation_2 = evaluation(fswrs_optimized_2)
print("Fewshotwithrandomsearch evaluation 2", fswrs_evaluation_2)

# compiled_copro = SimilarWordModule()
# compiled_copro.load(path="compiled_copro.json")
# copro_evaluation = evaluation(compiled_copro)
print("Copro evaluation", copro_evaluation)

# compiled_optuna = SimilarWordModule()
# compiled_optuna.load(path="compiled_optuna.json")
# optuna_evaluation = evaluation(compiled_optuna)
print("Optuna evaluation", optuna_evaluation)

# compiled_optuna2 = SimilarWordModule()
# compiled_optuna2.load(path="compiled_optuna2.json")
# optuna_evaluation_2 = evaluation(compiled_optuna2)
print("Optuna evaluation 2", optuna_evaluation_2)

# normal_evaluation = evaluation(SimilarWordModule(), display_table=10)
print("Normal evaluation", normal_evaluation)

LabeledFewShot evaluation 136.09


Average Metric: 31.09696969696969 / 34  (91.5): 100%|██████████| 34/34 [00:01<00:00, 22.24it/s] 
  df = df.applymap(truncate_cell)


Average Metric: 31.09696969696969 / 34  (91.5%)
LabeledManyShot evaluation 91.46
Fewshotwithrandomsearch evaluation 136.47
Fewshotwithrandomsearch evaluation 2 141.19
Copro evaluation 108.16
Optuna evaluation 85.74
Optuna evaluation 2 115.77
Normal evaluation 108.16
