## Mixtral paraphraser

Try to deliver a function that can paraphrase a sentence without plagiarizing it.

- Objective: avoid plagiarism / copyright infringement
- Target metrics: Argmin(Jaccard similarity), Argmax(BertScore)
- Tune: prompt
- Testset: start from 5 hand-picked COVID preset questions, retrieve 10 paragraphs for each question (50 paragraphs in total), paraphrase every paragraph, and score each paraphrase against its original paragraph with the target metrics.


In [None]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd
import altair as alt
from openai import OpenAI
from requests.auth import HTTPBasicAuth

load_dotenv()

In [None]:
def fetch_paragraphs(
    topic: str, question: str, n: int = 10, timeout: float = 30.0
) -> list[str]:
    """Fetch the top paragraphs for a question from the hybrid retriever.

    Args:
        topic: Topic to search in (sent as the ``topic`` field).
        question: Question text used as the query.
        n: Number of paragraphs to request (sent as ``top_k``).
        timeout: Seconds to wait for the retriever before giving up, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The ``text`` field of every document in the JSON response.

    Raises:
        RuntimeError: If the ``RETRIEVER_APIKEY`` environment variable is
            not set.
        requests.HTTPError: If the retriever answers with an error status.
    """
    api_key = os.getenv("RETRIEVER_APIKEY")
    if not api_key:
        # Fail fast with a clear message instead of sending a request
        # without a usable key.
        raise RuntimeError(
            "RETRIEVER_APIKEY is not set; add it to the environment or .env file."
        )

    endpoint = "http://cosmos0001.chtc.wisc.edu:4502/hybrid"
    header = {"Api-Key": api_key}

    data = {
        "question": question,
        "topic": topic,
        "top_k": n,
        "doc_type": "paragraph",
    }

    response = requests.post(endpoint, headers=header, json=data, timeout=timeout)
    response.raise_for_status()

    return [doc["text"] for doc in response.json()]

In [None]:
def ask_mixtral(messages: list[dict], timeout: float = 300.0) -> str:
    """Send a chat request to the Mixtral endpoint and return the reply text.

    Example input: [{"role": "user", "content": "Hello world example in python."}]

    Args:
        messages: Chat messages, each a dict with ``role`` and ``content``.
        timeout: Seconds to wait for the server. The request is sent in
            non-streaming mode, so this must cover the whole generation.

    Returns:
        The ``content`` of the ``message`` object in the JSON response.

    Raises:
        RuntimeError: If ``MIXTRAL_URL``, ``MIXTRAL_USER`` or
            ``MIXTRAL_PASSWORD`` is missing from the environment.
        requests.HTTPError: If the server answers with an error status.
    """
    settings = {
        name: os.getenv(name)
        for name in ("MIXTRAL_URL", "MIXTRAL_USER", "MIXTRAL_PASSWORD")
    }
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        # Report the configuration problem directly rather than letting the
        # HTTP layer fail on a ``None`` URL or credentials.
        raise RuntimeError(
            f"Missing environment variable(s): {', '.join(missing)}"
        )

    data = {
        "model": "mixtral",
        "messages": messages,
        "stream": False,  # set to True to get a stream of responses token-by-token
    }
    # Non-streaming mode
    response = requests.post(
        settings["MIXTRAL_URL"],
        auth=HTTPBasicAuth(settings["MIXTRAL_USER"], settings["MIXTRAL_PASSWORD"]),
        json=data,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["message"]["content"]

In [None]:
def ask_openai(messages: list[dict], model: str = "gpt-4-1106-preview") -> str:
    """Send a chat request to OpenAI and return the reply text.

    Example input: [{"role": "user", "content": "Hello world example in python."}]

    Args:
        messages: Chat messages, each a dict with ``role`` and ``content``.
        model: Name of the chat model to query; defaults to the model this
            notebook was benchmarked with.

    Returns:
        The message content of the first completion choice.
    """
    client = OpenAI()
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    return chat_completion.choices[0].message.content

### Make a 50-paragraph paraphraser testset


In [None]:
# Manually select 5 questions in COVID preset
# with open("askem/demo/preset_questions/preset_covid_q.txt", "r") as f:
#     questions = f.readlines()

# selected = [0, 4, 12, 18, 21]
# questions = [questions[i] for i in selected]
# print(questions)

# questions

# paragraphs = set()

# for q in questions:
#     paragraphs.update(fetch_paragraphs(topic="covid", question=q, n=10))

# # Save paragraphs to file

# with open("data/covid_50_paragraphs.txt", "w") as f:
#     f.writelines(paragraphs)

In [None]:
# Load the saved test set: each line of the file becomes one entry, and every
# entry keeps its trailing newline character.
with open("data/covid_50_paragraphs.txt", "r") as paragraph_file:
    paragraphs = list(paragraph_file)

# Quick sanity check of the first few entries.
print(paragraphs[:3])

Use a basic COSTAR prompt setup

https://levelup.gitconnected.com/a-comprehensive-guide-to-prompt-engineering-unveiling-the-power-of-the-costar-template-944897251101


In [None]:
def get_prompt(
    context: str, objective: str, style: str, tone: str, audience: str, response: str
) -> str:
    """Assemble a COSTAR prompt from its six sections.

    Each section is written on its own line as ``<label>: <value>``, indented
    by four spaces; the text starts with a newline and ends with an indented
    empty line.
    """
    sections = (
        ("CONTEXT", context),
        ("OBJECTIVE", objective),
        ("STYLE", style),
        ("TONE", tone),
        ("Audience", audience),
        ("Response", response),
    )
    body = "".join(f"    {label}: {value}\n" for label, value in sections)
    return f"\n{body}    "


def costar_v1(paragraph: str) -> str:
    """Build the first version of the COSTAR paraphrasing prompt for ``paragraph``."""
    sections = {
        "context": f"Rewrite this paragraph: {paragraph}",
        "objective": "To paraphrase without plagiarizing the original content, rewrite the given paragraph.",
        "style": "Informative and Simple",
        "tone": "Neutral",
        "audience": "General Public",
        "response": "A short paragraph with the same meaning as the original paragraph.",
    }
    return get_prompt(**sections)


def costar_v2(paragraph: str) -> str:
    """Build the second version of the COSTAR paraphrasing prompt.

    Compared with v1, the objective additionally tells the model not to change
    the meaning and not to copy the original, to address hallucinations.
    """
    sections = {
        "context": f"Rewrite this paragraph: {paragraph}",
        "objective": "To paraphrase without plagiarizing the original content, rewrite the given paragraph. Do not change the meaning of the paragraph. Do not copy the original paragraph.",
        "style": "Informative and Simple",
        "tone": "Neutral",
        "audience": "General Public",
        "response": "A short paragraph with the same meaning as the original paragraph.",
    }
    return get_prompt(**sections)

### Evaluation


Target objectives:

- Meaning should be preserved -> metrics: BERTScore, higher = better
- Overlapping words should be avoided -> metrics: Jaccard similarity, lower=better


In [None]:
import string
import evaluate

# Load the BERTScore metric once so it can be reused for every paragraph.
# NOTE(review): a later cell defines a function that is also named `evaluate`,
# which rebinds the module name; re-running this cell afterwards re-imports the
# module and replaces that function again, so cell execution order matters.
bs = evaluate.load("bertscore")


def jaccard_similarity(str1: str, str2: str) -> float:
    """Calculate the Jaccard similarity between the word sets of two texts.

    It's defined as the size of the intersection divided by the size of the
    union of the sample sets. Punctuation is removed before the texts are
    split into words.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        A value between 0.0 (no shared words) and 1.0 (identical word sets).
        Two texts that both contain no words count as identical (1.0).
    """
    # Build the punctuation-stripping table once and reuse it for both texts.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    # Split on any run of whitespace: splitting on a single space would turn
    # repeated spaces into empty-string "words" and keep newlines or tabs
    # glued to the neighbouring word, distorting the overlap.
    set1 = set(str1.translate(strip_punctuation).split())
    set2 = set(str2.translate(strip_punctuation).split())

    union = set1 | set2
    if not union:
        # Neither text contains a word; avoid dividing by zero.
        return 1.0

    return len(set1 & set2) / len(union)

In [None]:
def evaluate(paragraph: str, prompt_creation_fn: callable, llm_fn: callable) -> dict:
    """Paraphrase one paragraph with an LLM and score the result.

    The prompt built by ``prompt_creation_fn`` is sent to ``llm_fn`` as a single
    user message. The returned dict holds the original ``paragraph``, the
    ``paraphrased`` text, their ``jaccard`` similarity and the ``bertscore``
    F1 value of the paraphrase against the original.

    NOTE: this function shares its name with the imported ``evaluate`` package
    and rebinds that name once this cell has run.
    """
    user_message = {"role": "user", "content": prompt_creation_fn(paragraph)}
    paraphrased = llm_fn([user_message])

    word_overlap = jaccard_similarity(paragraph, paraphrased)
    bert_result = bs.compute(
        predictions=[paraphrased], references=[paragraph], lang="en"
    )

    return {
        "paragraph": paragraph,
        "paraphrased": paraphrased,
        "jaccard": word_overlap,
        "bertscore": bert_result["f1"][0],
    }

In [None]:
# Run the COSTAR v1 prompt through Mixtral for every test paragraph, collect
# the per-paragraph results in a table and save it to disk.
eval_data = [
    evaluate(paragraph, costar_v1, ask_mixtral) for paragraph in paragraphs
]
df = pd.DataFrame(eval_data)
df.to_parquet("costar_v1_mixtral.parquet")

In [None]:
# Display the evaluation results table.
df

Draw a basic plot and show some examples


In [None]:
import pandas as pd
import altair as alt
from pathlib import Path

# Locations of the saved COSTAR v1 benchmark results for each model.
v1_openai = Path("costar_v1_openai.parquet")
v1_mixtral = Path("costar_v1_mixtral.parquet")

# Load both result tables (OpenAI first, then Mixtral).
df_v1_openai, df_v1_mixtral = (
    pd.read_parquet(results_path) for results_path in (v1_openai, v1_mixtral)
)

# Export CSV copies of both tables.
df_v1_mixtral.to_csv("costar_v1_mixtral.csv")
df_v1_openai.to_csv("costar_v1_openai.csv")

In [None]:
def report(parquet_file: Path) -> alt.Chart:
    """Build an interactive Jaccard-vs-BERTScore scatter plot from saved results.

    Args:
        parquet_file: Parquet file with ``paragraph``, ``paraphrased``,
            ``jaccard`` and ``bertscore`` columns.

    Returns:
        A 600x600 interactive chart with one circle per row; the title shows
        the file stem and the mean of both metrics.
    """
    results = pd.read_parquet(parquet_file)
    # Expose the row index as a column so it can be shown in the tooltip.
    results["idx"] = results.index

    name = parquet_file.stem
    jaccard_mean = results.jaccard.mean()
    bertscore_mean = results.bertscore.mean()
    plot_title = f"{name} benchmark (jaccard: {jaccard_mean:.2f} bert:{bertscore_mean:.2f})"

    # Both metrics are drawn on a fixed 0-1 axis.
    x_axis = alt.X("jaccard", scale=alt.Scale(domain=(0, 1)))
    y_axis = alt.Y("bertscore", scale=alt.Scale(domain=(0, 1)))
    hover_fields = ["idx", "paragraph", "paraphrased", "jaccard", "bertscore"]

    scatter = (
        alt.Chart(results)
        .mark_circle()
        .encode(x=x_axis, y=y_axis, tooltip=hover_fields)
    )
    return scatter.properties(title=plot_title, width=600, height=600).interactive()

In [None]:
# Render the scatter plot for the saved OpenAI COSTAR v1 results.
report(v1_openai)

Look at one of the best cases

In [None]:
# Export the current results table as CSV.
# NOTE(review): in the cells visible here `df` is only assigned from the
# costar_v1 / Mixtral run; confirm that a costar_v2 run populated `df` before
# writing it to a file named costar_v2.
df.to_csv("costar_v2.csv")

In [None]:
# Show every column of the row labelled 43 as a plain dictionary.
df.loc[43].to_dict()

In [None]:
# Show every column of the row labelled 22 as a plain dictionary.
df.loc[22].to_dict()

- Somewhat hallucinating? It is making a lot of assumptions, particularly in messier paragraphs.
- 