## Mixtral paraphraser
Try to deliver a function that can paraphrase a sentence without plagiarizing it.

- Objective: avoid plagiarism / copyright infringement
- Target metrics: minimize Jaccard similarity, maximize BERTScore
- Tune: prompt
- Test set: seed with a mix of questions from the test set, generate 10 paraphrases for each question, and evaluate the paraphraser by computing the target metrics before and after paraphrasing.


In [None]:
import requests
from dotenv import load_dotenv
import os
from requests.auth import HTTPBasicAuth

# Load settings from a local .env file; the functions below read them via os.getenv.
load_dotenv()


def ask_mixtral(messages: list[dict]) -> dict:
    """Send a non-streaming chat request to the Mixtral endpoint.

    The endpoint URL and the HTTP basic-auth credentials are read from the
    ``MIXTRAL_URL``, ``MIXTRAL_USER`` and ``MIXTRAL_PASSWORD`` environment
    variables.

    Args:
        messages: Chat messages to send, e.g.
            ``[{"role": "user", "content": "Hello world example in python."}]``.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the server replies with an error status code.
    """
    credentials = HTTPBasicAuth(
        os.getenv("MIXTRAL_USER"), os.getenv("MIXTRAL_PASSWORD")
    )
    # "stream": False asks for one complete reply; True would stream the
    # response token-by-token instead.
    payload = {"model": "mixtral", "messages": messages, "stream": False}

    reply = requests.post(os.getenv("MIXTRAL_URL"), auth=credentials, json=payload)
    reply.raise_for_status()
    return reply.json()

In [None]:
def fetch_paragraphs(topic: str, question: str, n: int = 10) -> list[str]:
    """Retrieve paragraphs relevant to a question from the hybrid retriever.

    The API key is read from the ``RETRIEVER_APIKEY`` environment variable and
    sent in the ``Api-Key`` header.

    Args:
        topic: Topic to search within, sent as the ``topic`` field.
        question: Query text, sent as the ``question`` field.
        n: Number of results to request, sent as ``top_k``.

    Returns:
        The ``text`` field of every document in the JSON response.

    Raises:
        requests.HTTPError: If the server replies with an error status code.
    """
    endpoint = "http://cosmos0001.chtc.wisc.edu:4502/hybrid"
    header = {"Api-Key": os.getenv("RETRIEVER_APIKEY")}

    data = {
        # Send the caller's question so the results match the request.
        "question": question,
        "topic": topic,
        "top_k": n,
        "doc_type": "paragraph",
    }

    response = requests.post(endpoint, headers=header, json=data)
    response.raise_for_status()

    return [doc["text"] for doc in response.json()]

In [None]:
# Retrieve paragraphs for a sample COVID question; paragraphs[0] is the text
# that the prompts below ask the model to paraphrase.
paragraphs = fetch_paragraphs(topic="covid", question="formulation of SIR model.")

Use a basic COSTAR prompt template

https://levelup.gitconnected.com/a-comprehensive-guide-to-prompt-engineering-unveiling-the-power-of-the-costar-template-944897251101

In [None]:
def get_prompt(
    context: str, objective: str, style: str, tone: str, audience: str, response: str
) -> str:
    """Assemble a COSTAR prompt from its six sections.

    Each section is written on its own line as ``LABEL: value``, indented by
    four spaces. The text starts with a newline and ends with a final indented
    empty line. The first four labels are upper-case while ``Audience`` and
    ``Response`` are capitalised; this casing is part of the emitted prompt.
    """
    indent = "    "
    sections = (
        ("CONTEXT", context),
        ("OBJECTIVE", objective),
        ("STYLE", style),
        ("TONE", tone),
        ("Audience", audience),
        ("Response", response),
    )
    body = "".join(f"{indent}{label}: {value}\n" for label, value in sections)
    return f"\n{body}{indent}"

In [None]:
# Tune this later

# Baseline: one plain instruction with the paragraph appended.
baseline_prompt = f"paraphrase and avoid plagiarism: {paragraphs[0]}"

# COSTAR variant of the same task, built around the same paragraph.
# NOTE(review): `response` is filled with an example sentence rather than a
# description of the desired output format - confirm this is intended.
prompt = get_prompt(
    context=f"Rewrite this paragraph: {paragraphs[0]}",
    objective="To paraphrase without plagiarizing the original content, rewrite the given paragraph.",
    style="Informative and Simple",
    tone="Neutral",
    audience="General Public",
    response="The COVID-19 vaccine is effective at preventing the spread of the virus.",
)

In [None]:
# Display the original paragraph that both prompts ask the model to rewrite.
paragraphs[0]

In [None]:
# Query the model once per prompt so the two paraphrases can be compared.
results_baseline = ask_mixtral([{"role": "user", "content": baseline_prompt}])
results_costar = ask_mixtral([{"role": "user", "content": prompt}])

In [None]:
# Display the reply text for the baseline prompt.
results_baseline["message"]["content"]

In [None]:
# Display the reply text for the COSTAR prompt.
results_costar["message"]["content"]

### Evaluation

Target objectives:
- Meaning should be preserved -> metric: BERTScore, higher = better
- Overlapping words should be avoided -> metric: Jaccard similarity, lower = better


In [None]:
import string
import evaluate

# Load the BERTScore metric once; the cells below reuse it through `bs.compute`.
bs = evaluate.load("bertscore")


def jaccard_similarity(str1: str, str2: str) -> float:
    """Calculate the word-level Jaccard similarity between two texts.

    It's defined as the size of the intersection divided by the size of the
    union of the two word sets. Punctuation is removed first, then each text
    is split into words on any run of whitespace (spaces, tabs, newlines).
    The comparison is case-sensitive.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        A value between 0.0 (no shared words) and 1.0 (identical word sets).
        Two texts that both contain no words yield 1.0.
    """
    # Remove punctuation; one translation table serves both inputs.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    # split() without a separator splits on any whitespace run and never
    # yields empty tokens. split(" ") would leave "" tokens for repeated
    # spaces (e.g. where a standalone "-" was stripped) and would not split
    # on newlines, both of which skew the score.
    set1 = set(str1.translate(strip_punctuation).split())
    set2 = set(str2.translate(strip_punctuation).split())

    union = set1 | set2
    if not union:
        # Neither text has any words: treat them as identical instead of
        # dividing by zero.
        return 1.0

    return len(set1 & set2) / len(union)

In [None]:
# Example usage
# Sanity-check both metrics on a toy sentence pair before scoring model output.
str1 = "hello world!"
str2 = "hello dolphin, happy hippo!"
similarity = jaccard_similarity(str1, str2)
print(f"Jaccard Similarity: {similarity}")
# str1 is scored as the prediction against str2 as the reference.
bert = bs.compute(predictions=[str1], references=[str2], lang="en")
print(f"BertScore: {bert}")

In [None]:
# Display the baseline reply again, next to the evaluation cells below.
results_baseline["message"]["content"]

In [None]:
def evaluate_all(x: str, y: str) -> None:
    """Print the BERTScore and the Jaccard similarity for a pair of texts.

    Args:
        x: Candidate text, passed to BERTScore as the prediction.
        y: Text to compare against, passed to BERTScore as the reference.
    """
    bert_result = bs.compute(predictions=[x], references=[y], lang="en", verbose=True)
    print(f"bertscore: {bert_result}")
    # The "=" specifier prints the expression text together with its value.
    print(f"{jaccard_similarity(x, y)=}")
In [None]:
# Score the baseline paraphrase against the original paragraph.
evaluate_all(results_baseline["message"]["content"], paragraphs[0])

In [None]:
# Score the COSTAR paraphrase against the original paragraph.
evaluate_all(results_costar["message"]["content"], paragraphs[0])