# Custom Chatbot Project

The dataset is the parsed version of the Wikipedia page https://it.wikipedia.org/wiki/Castelnuovo_di_Garfagnana, which is written in the Italian language.

It is an interesting use case because the pre-trained model is able to give some answers, but it makes a lot of errors. The data is probably only partially present in its training dataset, or under-represented.

Using RAG, it is possible to see how the correctness of the answers improves, although they tend to be more concise.

## Data Wrangling

Parse the Wikipedia page and create a pandas DataFrame with a "text" column with a sentence in each row

In [2]:
# Title of the Wikipedia page requested from the API
PAGE_TITLE = "Castelnuovo di Garfagnana"
# Destination of the CSV file containing the parsed sentences
OUTPUT_DATA_FILEPATH = "./data/wiki_it_castelnuovo_garfagnana_nb.csv"
# Language subdomain used to build the Wikipedia API URL
WIKIPEDIA_LANG = "it"
# Section anchors excluded (together with their sub-sections) from the dataset
SKIP_SECTIONS = ["Collegamenti_esterni", "Altri_progetti"]

In [18]:
import argparse
import os
from collections import defaultdict
from typing import List, Optional

import bs4
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_dict_key_from_headings(
    last_h2_level_paragraph: str,
    last_h3_level_paragraph: Optional[str] = None,
    last_h4_level_paragraph: Optional[str] = None,
) -> str:
    """
    Build the dictionary key of a section by chaining its heading names.

    The h2 heading always opens the key; the h3 and h4 headings are appended,
    each preceded by " - ", only when they are not None.
    """
    optional_headings = (last_h3_level_paragraph, last_h4_level_paragraph)
    key_parts = [f"{last_h2_level_paragraph}"]
    key_parts.extend(
        f"{heading}" for heading in optional_headings if heading is not None
    )
    return " - ".join(key_parts)


def get_cleaned_text(element) -> str:
    """
    Return the text of the element on a single line.

    The text is stripped at both ends first, then every non-breaking space
    and every newline left inside it is replaced by a regular space.
    """
    cleaned = element.get_text().strip()
    for unwanted_char in ("\xa0", "\n"):
        cleaned = cleaned.replace(unwanted_char, " ")
    return cleaned

# "query" action documentation: https://en.wikipedia.org/w/api.php?action=help&modules=query
# Don't pass "explaintext": 1 to get the text in HTML format. It is a bit more complex to parse, but we have
# all the information to understand when a list is present
# Parameters of the MediaWiki "query" action: ask for the extract of a single
# page, identified by its title, as JSON
params = {
    "action": "query",
    "prop": "extracts",
    "exlimit": 1,
    "titles": PAGE_TITLE,
    "exsectionformat": "wiki",
    "format": "json",
}

# The timeout prevents the cell from hanging indefinitely when the API does not answer
resp = requests.get(
    f"https://{WIKIPEDIA_LANG}.wikipedia.org/w/api.php",
    params=params,
    timeout=30,
)
# Stop on HTTP errors here rather than failing later while decoding or reading the payload
resp.raise_for_status()
response_dict = resp.json()

# The "pages" mapping is keyed by page id; take its first entry
# (a single title is requested with "exlimit": 1)
page_dict = next(iter(response_dict["query"]["pages"].values()))
if "extract" not in page_dict:
    # Without this check the failure would be a bare KeyError on "extract"
    raise ValueError(
        f"No extract returned for the page '{PAGE_TITLE}' "
        f"on {WIKIPEDIA_LANG}.wikipedia.org: check the page title"
    )
title = page_dict["title"]
html_text = page_dict["extract"]
soup = BeautifulSoup(html_text, "html.parser")
# Enable for DEBUG
#print(soup.prettify())

# Use BeautifulSoup to walk the extract one top-level element at a time:
#   <h2>/<h3>/<h4>                   -> start of a new section level
#   <p>...</p>                       -> one sentence of the current section
#   <ul>/<dl> with <li>/<dd>/<dt>    -> list items, merged as described below
# sentences_dict maps "h2 - h3 - h4" keys (or the page title for the intro)
# to the list of sentences found in that section
sentences_dict = defaultdict(list)
last_h2_level_paragraph = None
last_h3_level_paragraph = None
last_h4_level_paragraph = None
# Name of the previously processed tag. It is initialised here so that a list
# appearing as the very first tag does not raise a NameError when it is read
last_element_type = None
for element in soup:
    if not isinstance(element, bs4.Tag):
        # Only tags are processed; anything else between them is ignored
        continue

    if element.name == "h2":
        # First level paragraph: the deeper levels are reset
        last_h2_level_paragraph = element.attrs["data-mw-anchor"]
        last_h3_level_paragraph = None
        last_h4_level_paragraph = None
    elif element.name == "h3":
        # Second level paragraph: the deeper level is reset
        last_h3_level_paragraph = element.attrs["data-mw-anchor"]
        last_h4_level_paragraph = None
    elif element.name == "h4":
        # Third level paragraph
        last_h4_level_paragraph = element.attrs["data-mw-anchor"]
    elif element.name in ("p", "ul", "dl"):
        if last_h2_level_paragraph is None:
            # Intro before the first heading: paragraphs AND lists are stored
            # under the page title (lists used to end up under a "None" key)
            key = title
        else:
            # Concatenate the headings to provide context
            key = get_dict_key_from_headings(
                last_h2_level_paragraph,
                last_h3_level_paragraph,
                last_h4_level_paragraph,
            )
        section_sentences = sentences_dict[key]

        if element.name == "p":
            # Sentence of a paragraph. Inside a section, a <ul> nested in the
            # <p> is rejected; intro paragraphs are taken as they are
            if last_h2_level_paragraph is not None and any(
                isinstance(p_child, bs4.Tag) and p_child.name == "ul"
                for p_child in element.children
            ):
                raise ValueError("List <ul> inside a <p> not supported")
            section_sentences.append(get_cleaned_text(element))
        else:
            # Get the list elements and merge them when necessary:
            #   MERGE when the previous sentence of the section ends with ":"
            #   or when the previous tag was another list;
            #   DO NOT MERGE otherwise (each item becomes its own sentence)

            # One line per item; children without one of the item tag names are skipped
            list_content_str = "".join(
                get_cleaned_text(list_element) + "\n"
                for list_element in element.children
                if getattr(list_element, "name", None) in ("li", "dd", "dt")
            )
            # Turn the line breaks into "; " separators, drop the punctuation
            # left right before a separator and cut the trailing "; "
            list_content_str = (
                list_content_str.replace("\n", "; ")
                .replace(",;", ";")
                .replace(";;", ";")
                .replace(".;", ";")[: -len("; ")]
            )

            last_sentence_for_key = section_sentences[-1] if section_sentences else ""
            if last_sentence_for_key.endswith(":"):
                # Concatenate the list elements with the previous sentence,
                # which explains the list content
                section_sentences[-1] += " " + list_content_str
            elif last_element_type in ("ul", "dl") and section_sentences:
                # The list could have already been started by a different ul or dl element:
                # nesting is not supported, so the content is simply concatenated
                print(
                    f"WARNING: probably there is a nested list; it will be squashed into a single level, list element content: '{list_content_str}'"
                )
                section_sentences[-1] += "; " + list_content_str
            else:
                # The list is probably a section on its own and is not introduced by ":",
                # so it is worth keeping its items split
                section_sentences.extend(list_content_str.split("; "))
    else:
        raise ValueError(f"Tag {element.name} not supported")
    last_element_type = element.name

# Keys of the sub-sections belonging to a skipped section start with "<section> - "
skip_keys_start = tuple(f"{skip_section} - " for skip_section in SKIP_SECTIONS)

# One row per non-empty sentence, prefixed with its section key to give context
df_content = {
    "text": [
        f"{key} - {key_sentence}"
        for key, key_sentences in sentences_dict.items()
        if key not in SKIP_SECTIONS and not key.startswith(skip_keys_start)
        for key_sentence in key_sentences
        if key_sentence != ""
    ]
}

df = pd.DataFrame(df_content)
print(f"{len(df)} sentences obtained from the page '{PAGE_TITLE}'")

# Create the output folder when missing, then save the dataframe as CSV
output_dir = os.path.dirname(OUTPUT_DATA_FILEPATH)
os.makedirs(output_dir, exist_ok=True)
df.to_csv(OUTPUT_DATA_FILEPATH)
print(f"CSV file saved to '{OUTPUT_DATA_FILEPATH}'")

80 sentences obtained from the page 'Castelnuovo di Garfagnana'
CSV file saved to './data/wiki_it_castelnuovo_garfagnana_nb.csv'


In [19]:
# Preview the first rows of the parsed sentences
df.head()

Unnamed: 0,text
0,Castelnuovo di Garfagnana - Castelnuovo di Gar...
1,Geografia_fisica - Territorio - Sorge alla con...
2,Geografia_fisica - Clima - Classificazione sis...
3,Geografia_fisica - Clima - Classificazione cli...
4,Geografia_fisica - Clima - Diffusività atmosfe...


## Custom Query Completion

In [20]:
# Placeholder value: it is assigned to openai.api_key, so set a real key before running
VOCAREUM_OPENAI_API_KEY = "YOUR API KEY"
# Model used to compute the embeddings
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
# Number of dataframe rows sent with each embeddings request
REQUEST_SIZE = 100

In [21]:
import openai

# Route the requests through the Vocareum endpoint, authenticating with the key above
openai.api_base = "https://openai.vocareum.com/v1"
openai.api_key = VOCAREUM_OPENAI_API_KEY

# Compute the embeddings batch by batch: sending REQUEST_SIZE rows per
# `Embedding.create` call avoids a `RateLimitError`
print("Extracting embeddings...")
embeddings = []
for i in range(0, len(df), REQUEST_SIZE):
    # The embeddings are computed at sentence level (one vector per row), not per word
    batch_texts = df["text"].iloc[i : i + REQUEST_SIZE].tolist()
    response = openai.Embedding.create(
        input=batch_texts,
        engine=EMBEDDING_MODEL_NAME,
    )

    # Collect the vectors of this batch
    embeddings += [data["embedding"] for data in response["data"]]

# Store the vectors in the dataframe, each one converted to a numpy array
df["embeddings"] = embeddings
df["embeddings"] = df["embeddings"].apply(np.array)

print("'embeddings' column added to the dataframe")
embeddings_space_size = len(embeddings[0])
print(
    f"Embeddings space size using {EMBEDDING_MODEL_NAME}: {embeddings_space_size}"
)

Extracting embeddings...
'embeddings' column added to the dataframe
Embeddings space size using text-embedding-ada-002: 1536


In [22]:
# Preview the first rows, now including the "embeddings" column
df.head()

Unnamed: 0,text,embeddings
0,Castelnuovo di Garfagnana - Castelnuovo di Gar...,"[0.017287807539105415, -0.0054532866925001144,..."
1,Geografia_fisica - Territorio - Sorge alla con...,"[0.015181456692516804, -0.007471294142305851, ..."
2,Geografia_fisica - Clima - Classificazione sis...,"[0.019138965755701065, -0.0005533070070669055,..."
3,Geografia_fisica - Clima - Classificazione cli...,"[0.022875944152474403, -0.00048568969941698015..."
4,Geografia_fisica - Clima - Diffusività atmosfe...,"[-0.0010899270419031382, 0.003765960456803441,..."


### Utility functions to exploit the embeddings to answer the questions

In [None]:
def get_rows_sorted_by_relevance(
    question: str, df: pd.DataFrame, embedding_model_name: str
) -> pd.DataFrame:
    """
    Sort the rows of a dataframe by their relevance to a question.

    Args:
        question: question text
        df: dataframe with an "embeddings" column holding the embeddings
            vector of each row
        embedding_model_name: name of the model used to embed the question

    Returns:
        Copy of the input dataframe with an additional "distances" column,
        sorted by descending question relevance (ascending cosine distance)
    """
    # Imported here because nothing else in the notebook brings these helpers
    # into scope. NOTE(review): this module belongs to the pre-1.0 openai
    # package, the same API style used for `openai.Embedding.create` - confirm
    # the installed version provides it
    from openai.embeddings_utils import distances_from_embeddings, get_embedding

    # Get embeddings for the question text
    question_embeddings = get_embedding(question, engine=embedding_model_name)

    # Work on a copy, so the caller's dataframe is left untouched, and add the
    # cosine distance between each row's embeddings and the question's embeddings
    df_copy = df.copy()
    df_copy["distances"] = distances_from_embeddings(
        question_embeddings, df_copy["embeddings"].values, distance_metric="cosine"
    )

    # Shorter distance = more relevant, so the rows are sorted in ascending order
    return df_copy.sort_values("distances", ascending=True)


def count_tokens(text: str, encoding: str = "cl100k_base") -> int:
    """
    Count the number of tokens needed to represent a text

    Args:
        text: text for which you want to count the tokens
        encoding: name of the tiktoken encoding used to tokenize the text

    Returns:
        the number of tokens to represent the text
    """
    # Imported here because the notebook never imports tiktoken elsewhere,
    # so the call below used to fail with a NameError
    import tiktoken

    tokenizer = tiktoken.get_encoding(encoding)
    return len(tokenizer.encode(text))

def answer_question(
    question: str,
    df_embed: Optional[pd.DataFrame] = None,
    embedding_model_name: Optional[str] = None,
    max_prompt_tokens: int = 1800,
    max_answer_tokens: int = 150,
    closest_sentences_output_filepath: Optional[str] = None,
) -> List[str]:
    """
    Answer a question twice: with the plain model and with the retrieved context (RAG).

    The rows of the dataframe are sorted by relevance to the question and added
    to the prompt as context, most relevant first, until the token budget is used up.

    Args:
        question: question to answer
        df_embed: dataframe with the "text" and "embeddings" columns; when None
            the notebook-level `df` is used
        embedding_model_name: model used to embed the question; when None the
            notebook-level `EMBEDDING_MODEL_NAME` is used
        max_prompt_tokens: token budget for prompt template + question + context
        max_answer_tokens: maximum number of tokens of each generated answer
        closest_sentences_output_filepath: when set, the rows sorted by
            relevance are also saved to this CSV file

    Returns:
        A two-element list: the answer without context and the answer with context
    """
    # Fall back to the objects built in the previous notebook cells
    if df_embed is None:
        df_embed = df
    if embedding_model_name is None:
        embedding_model_name = EMBEDDING_MODEL_NAME

    # Embed the question and sort the rows by their distance from it
    df_sorted_distances = get_rows_sorted_by_relevance(
        question=question,
        df=df_embed,
        embedding_model_name=embedding_model_name,
    )

    if closest_sentences_output_filepath:
        df_sorted_distances.to_csv(closest_sentences_output_filepath)

    # Create the prompt with a template to get an answer to the question
    prompt_template = """
    Answer the question based on the context below, and if the question
    can't be answered based on the context, say "I don't know"

    Context: 

    {}

    ---

    Question: {}
    Answer:
    """

    # We want to exploit the available number of tokens for the model, but with a limit,
    # because we are charged based on the number of tokens
    current_token_count = count_tokens(prompt_template) + count_tokens(question)
    print(f"Prompt template + question number of tokens: {current_token_count}")

    # Add context rows, most relevant first, while they fit in the budget.
    # The separators added when joining the rows are not counted, so the final
    # prompt can end up slightly above max_prompt_tokens
    context = []
    for text in df_sorted_distances["text"].values:
        current_token_count += count_tokens(text)
        if current_token_count > max_prompt_tokens:
            # This row does not fit: stop without adding it
            break
        context.append(text)

    # Create the prompt with the context in a specific format to highlight each row
    prompt = prompt_template.format("\n\n###\n\n".join(context), question)
    print(f"Prompt: {prompt}")
    print(f"Prompt tokens: {count_tokens(prompt)}")

    # From the documentation: the token count of your prompt plus max_tokens
    # cannot exceed the model's context length.

    # Answer without using the context
    initial_answer = openai.Completion.create(
        model="gpt-3.5-turbo-instruct",
        prompt=question,
        max_tokens=max_answer_tokens,
    )["choices"][0]["text"].strip()
    print(f"Initial answer: {initial_answer}")

    # Answer using the context
    answer_with_context = openai.Completion.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=max_answer_tokens,
    )["choices"][0]["text"].strip()
    print(f"RAG answer: {answer_with_context}")

    # Both answers are returned so they can be compared before and after RAG
    return [initial_answer, answer_with_context]

## Custom Performance Demonstration

TODO: In the cells below, demonstrate the performance of your custom query using at least 2 questions. For each question, show the answer from a basic `Completion` model query as well as the answer from your custom query.

### Question 1

### Question 2