In [1]:
import glob
import logging
import os
import random
import warnings
from getpass import getpass
from typing import Any, List

from datasets import Dataset
from IPython.display import clear_output
from llama_index.core import SimpleDirectoryReader
from llama_index.core.llama_pack.base import BaseLlamaPack
from llama_index.core.llms import ChatMessage
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

In [2]:
# Never store the key in the notebook and never clobber one that is already
# exported: reuse OPENAI_API_KEY from the environment, otherwise prompt for it
# without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [3]:
# --- Run configuration ---
size = 'small'  # name of the sub-directory of `chunk_dir` that is read
chunk_dir = 'Tanenbaum_chunks'
data_dir = 'Tanenbaum_data'
num_distract_docs = 3

# 8 questions per chunk for 'small', 12 for 'medium', 15 for any other size.
num_questions_per_chunk = {'small': 8, 'medium': 12}.get(size, 15)

llm = OpenAI(temperature=0, n=1, model="gpt-3.5-turbo")

In [4]:
def get_chunks(chunk_dir: str, size: str) -> List[str]:
    """
    Read every `*.txt` file in `<chunk_dir>/<size>` and return the file contents.

    Files are read in sorted path order. `glob.glob` gives no ordering
    guarantee, and a chunk's position in the returned list is used later in
    this notebook to name its output file, so the order has to be stable
    across runs and machines.

    Args:
        chunk_dir: Root directory that holds one sub-directory per chunk size.
        size: Name of the sub-directory to read.

    Returns:
        The UTF-8 decoded text of each file, in sorted path order. Empty when
        the directory does not exist or contains no `.txt` file.
    """
    directory = os.path.join(chunk_dir, size)
    chunks = []

    for file_path in sorted(glob.glob(os.path.join(directory, '*.txt'))):
        with open(file_path, 'r', encoding="utf8") as file:
            chunks.append(file.read())

    return chunks

In [5]:
def strip_str(s) -> str:
    """
    Clean up one line of model output so that only the sentence remains.

    Steps:
      1. Drop a leading "assistant:" role prefix, if present.
      2. Drop everything before the first alphabetic character (list
         numbering, bullets, whitespace).
      3. Keep the text up to the last alphanumeric character plus one
         following character (typically the closing "?" or "."), and drop
         whatever comes after that.

    The end of the sentence is located with `isalnum`, not `isalpha`, so a
    question that ends in a number (e.g. "... IEEE 802.3?") is not cut off at
    its last letter.

    Args:
        s: Raw string to clean.

    Returns:
        The cleaned string. A string without any alphabetic character is
        returned unchanged (apart from the removed role prefix).
    """
    role_prefix = "assistant:"
    if s.startswith(role_prefix):
        s = s[len(role_prefix):]

    first_alpha = next((i for i, ch in enumerate(s) if ch.isalpha()), None)
    if first_alpha is None:
        # Nothing that looks like text: leave the string as it is.
        return s

    # A letter exists, so at least one alphanumeric character exists too.
    last_alnum = max(i for i, ch in enumerate(s) if ch.isalnum())
    # +2: one to make the slice inclusive, one to keep the trailing
    # punctuation mark. Slicing past the end of the string is harmless.
    return s[first_alpha : last_alnum + 2]

In [6]:
def encode_question_gen(question, chunk) -> List["ChatMessage"]:
    """
    Build the chat messages that ask the model to answer `question` from `chunk`.

    Args:
        question: The question to answer.
        chunk: The context the answer must be drawn from.

    Returns:
        Two `ChatMessage` objects: a system message that sets the answerer
        role, and a user message holding the question, the context and the
        answer-format instructions (step-by-step reasoning, quotes wrapped in
        ##begin_quote## / ##end_quote##, final line `<ANSWER>: ...`).
    """
    # The prompt text, including its leading whitespace, is sent to the model
    # verbatim; keep it unchanged so generated answers stay comparable.
    prompt = f"""
        Question: {question}\nContext: {chunk}\n
        Answer this question using the information given in the context above. Here is things to pay attention to:
        - First provide step-by-step reasoning on how to answer the question.
        - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
        - End your response with final answer in the form <ANSWER>: $answer, the answer should be succinct.
    """
    system_message = ChatMessage(
        role="system",
        content="You are a helpful question answerer who can provide an answer given a question and relevant context.",
    )
    user_message = ChatMessage(role="user", content=prompt)
    return [system_message, user_message]

In [7]:
def generate_label(question, context) -> str:
    """
    Generate the chain-of-thought answer to `question` from `context`.

    The prompt comes from `encode_question_gen` and is sent to the
    module-level `llm`. The stringified chat response can start with the
    speaker role ("assistant:"), the same prefix `strip_str` removes from
    generated questions. It is removed here as well so that the stored label
    contains only the model's text.

    Args:
        question: The question to answer.
        context: The text the answer must be derived from.

    Returns:
        The model's answer without the leading role prefix.
    """
    question_messages = encode_question_gen(question, context)
    response = llm.chat(question_messages)

    answer = str(response)
    role_prefix = "assistant:"
    if answer.startswith(role_prefix):
        answer = answer[len(role_prefix):].lstrip()
    return answer

In [8]:
def generate_instructions_gen(chunk, x=5) -> List[str]:
    """
    Ask the model for up to `x` questions that `chunk` can answer.

    The model reply is split on newlines, every line is cleaned with
    `strip_str`, lines without any letter are discarded and at most `x`
    questions are kept. A warning is issued when fewer than `x` remain.
    """
    system_prompt = (
        "You are a synthetic question-answer pair generator. Given a chunk of context about some topic(s), generate %s example questions a user could ask and would be answered using information from the chunk. For example, if the given context was a Wikipedia paragraph about the United States, an example question could be 'How many states are in the United States?'. The questions should be able to be answered in a few words or less."
        % (x)
    )
    conversation = [
        ChatMessage(role="system", content=system_prompt),
        ChatMessage(role="user", content=str(chunk)),
    ]

    reply_lines = str(llm.chat(conversation)).split("\n")

    # Clean every line first, then keep only those that contain a letter.
    kept = []
    for line in reply_lines:
        candidate = strip_str(line)
        if any(ch.isalpha() for ch in candidate):
            kept.append(candidate)
    questions = kept[:x]

    num_questions_generated = len(questions)
    if num_questions_generated < x:
        warnings.warn(
            f"Fewer questions generated ({num_questions_generated}) "
            f"than requested ({x})."
        )

    return questions

In [9]:
def add_chunk_to_dataset(
    chunks: List,
    chunk: str,
    x: int = 5,
    num_distract: int = 3,
    p: float = 1.0,
):
    """
    Given a chunk, create {Q, A, D} rows for it and return them as a dataset.

    For each question generated from `chunk`, a document set is assembled from
    `chunk` (the oracle) plus `num_distract` other chunks picked at random.
    With probability `1 - p` the oracle is replaced by another chunk that is
    not already among the distractors. The documents are shuffled, the answer
    is generated from the oracle chunk, and the model instruction is the
    concatenation of the `<DOCUMENT>`-wrapped documents followed by the
    question.

    Args:
        chunks: All chunks; distractors are drawn from the ones other than `chunk`.
        chunk: The oracle chunk. Must be an element of `chunks`.
        x: Number of questions to request for the chunk.
        num_distract: Number of distractor documents per question.
        p: Probability that the oracle chunk is kept in the document set.

    Returns:
        A `Dataset` with one row per generated question and the columns `id`,
        `type`, `question`, `context`, `oracle_context`, `cot_answer` and
        `instruction`, or `None` when no question was generated.

    Raises:
        ValueError: If `chunk` is not in `chunks`, or if there are fewer than
            `num_distract` other chunks to draw distractors from.
    """
    oracle_index = chunks.index(chunk)
    other_indices = [j for j in range(len(chunks)) if j != oracle_index]
    if num_distract > len(other_indices):
        # Fail before any LLM call is made instead of inside the loop.
        raise ValueError(
            f"num_distract ({num_distract}) exceeds the number of other "
            f"chunks available ({len(other_indices)})."
        )

    questions = generate_instructions_gen(chunk, x)

    # Rows are collected column-wise and turned into a Dataset once at the end.
    columns = {
        "id": [],
        "type": [],
        "question": [],
        "context": [],
        "oracle_context": [],
        "cot_answer": [],
        "instruction": [],
    }

    for row, q in enumerate(questions):
        # add distractor docs
        distractors = random.sample(other_indices, num_distract)
        docs = [chunk] + [chunks[j] for j in distractors]

        # decides whether to keep the oracle document
        keep_oracle = random.uniform(0, 1) < p
        if not keep_oracle:
            # Prefer a chunk that is not already a distractor so the context
            # does not contain the same document twice.
            unused = [j for j in other_indices if j not in distractors]
            docs[0] = chunks[random.sample(unused or other_indices, 1)[0]]
        random.shuffle(docs)

        # construct model instruction
        instruction = "".join(
            "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n" for doc in docs
        ) + q

        columns["id"].append(f"seed_task_{row}")
        columns["type"].append("general")
        columns["question"].append(q)
        columns["context"].append(
            {
                "title": [["placeholder_title"] * (num_distract + 1)],
                "sentences": [docs],
            }
        )
        columns["oracle_context"].append(chunk)
        # The answer is always generated from the oracle chunk, even when the
        # oracle was withheld from the document set.
        columns["cot_answer"].append(generate_label(q, chunk))
        columns["instruction"].append(instruction)

    if not questions:
        return None
    return Dataset.from_dict(columns)

In [10]:
# Load the text chunks for the configured size.
chunks = get_chunks(chunk_dir=chunk_dir, size=size)

In [11]:
# Sanity check: number of chunks that were loaded.
len(chunks)

261

In [12]:
# Write one JSON file per chunk into <data_dir>/<Size>/<index>.json.
# Chunks whose file already exists are skipped, so re-running this cell after
# a failure (e.g. a dropped API connection) resumes where it stopped instead
# of repeating every LLM call. Delete a file to regenerate that chunk.
output_dir = os.path.join(data_dir, size.capitalize())
os.makedirs(output_dir, exist_ok=True)

for index, chunk in enumerate(chunks):
    output_path = os.path.join(output_dir, f"{index}.json")
    if os.path.exists(output_path):
        continue

    print(f"===========Processing Chunk {index}=================")
    data = add_chunk_to_dataset(chunks, chunk, num_questions_per_chunk, num_distract_docs)
    if data is None:
        # add_chunk_to_dataset returns None when no question was generated.
        warnings.warn(f"No questions generated for chunk {index}; nothing written.")
        continue

    # Write to a temporary name first so an interrupted write never leaves a
    # partial file that a later run would mistake for a finished chunk.
    partial_path = output_path + ".tmp"
    data.to_json(partial_path)
    os.replace(partial_path, output_path)
    clear_output()



Retrying llama_index.llms.openai.base.OpenAI._chat in 0.31137766609024264 seconds as it raised APIConnectionError: Connection error..
Retrying llama_index.llms.openai.base.OpenAI._chat in 0.011838702370819743 seconds as it raised APIConnectionError: Connection error..


APIConnectionError: Connection error.