In [2]:
import json

import os

import re

import numpy as np

# Using OpenAI to generate questions and answers for fine-tuning purposes

## Reading the document

In [3]:
from PyPDF2 import PdfReader



# Open the paper PDF; later cells iterate over `reader.pages`.
reader = PdfReader("generative_agent.pdf")

# Number of pages in the loaded PDF.
number_of_pages = len(reader.pages)

# Generating Q&A pairs

## Breaking the document into overlapped chunks
### 1. Remove irrelevant content (such as authors, DOI, etc.)
### 2. Overlap each page's content with the previous and following pages

In [131]:
import re

# Number of sentences borrowed from each neighbouring page as context.
NUM_OVERLAP_SENTENCES = 10

# Running headers / arXiv stamps removed from every page's extracted text.
# Order matters: the strings are removed one after another.
PAGE_BOILERPLATE = (
    'arXiv, April, 2023, J. S.  Park, J. C.  O’Brien, C. J.  Cai, M.  Morris, P.  Liang, M. S',
    'arXiv, April, 2023, J.S. Park, J.C. O’Brien, C.J. Cai, M. Morris, P. Liang, M.S. Bernstein',
    '[cs.HC]  7 Apr 2023',
    'arXiv:2304.03442v1',
)

# Split only on whitespace that follows '.', '?' or '!'. The punctuation stays
# attached to its sentence and tokens such as "3.5" or "2304.03442" are left
# intact, so re-joining the pieces with a space reproduces the original text.
SENTENCE_BOUNDARY = re.compile(r'(?<=[.?!])\s+')


def clean_page_text(page_text):
    """Return `page_text` with every PAGE_BOILERPLATE string removed.

    A falsy input (e.g. no extractable text) is treated as an empty string.
    """
    cleaned = page_text or ''
    for boilerplate in PAGE_BOILERPLATE:
        cleaned = cleaned.replace(boilerplate, '')
    return cleaned


def split_sentences(text):
    """Split `text` into non-empty sentences, keeping terminal punctuation."""
    return [sentence for sentence in SENTENCE_BOUNDARY.split(text.strip()) if sentence]


def build_overlapped_pages(clean_pages, sentences_per_page, num_overlap=NUM_OVERLAP_SENTENCES):
    """Surround each page with sentences from its neighbours.

    Args:
        clean_pages: cleaned text of each page, in document order.
        sentences_per_page: sentence list for each page, parallel to `clean_pages`.
        num_overlap: how many sentences to take from the end of the previous
            page and from the start of the next page.

    Returns:
        One string per page: previous-page tail, the page itself and
        next-page head, separated by newlines. The first page has an empty
        head section and the last page an empty tail section.
    """
    last_index = len(clean_pages) - 1
    overlapped = []
    for index, page_text in enumerate(clean_pages):
        # `[-0:]` would return the whole list, so a zero overlap is handled explicitly.
        if index == 0 or not num_overlap:
            previous_tail = ''
        else:
            previous_tail = ' '.join(sentences_per_page[index - 1][-num_overlap:])
        if index == last_index:
            next_head = ''
        else:
            next_head = ' '.join(sentences_per_page[index + 1][:num_overlap])
        overlapped.append(previous_tail + '\n' + page_text + '\n' + next_head)
    return overlapped


# Rebuilt from scratch on every run so the cell stays idempotent.
pages_clean_text = [clean_page_text(page.extract_text()) for page in reader.pages]
pages_sentences = [split_sentences(text) for text in pages_clean_text]
pages_content = build_overlapped_pages(pages_clean_text, pages_sentences)
  



In [140]:
# Persist the overlapped page chunks; ' ---- ' lines mark chunk boundaries.
# Opening in 'w' mode does not create missing directories, so make sure it exists.
os.makedirs('data', exist_ok=True)
# The extracted text contains non-ASCII characters (e.g. ’), so the encoding
# is fixed instead of relying on the platform default.
with open(os.path.join('data', 'generative_agent.txt'), 'w', encoding='utf-8') as f:
    for page in pages_content:
        f.write(page + '\n ---- \n')


## Generate Q&A pairs for each document page
### 1. Each page's content overlaps the previous and next pages by 10 sentences to add context to the data
### 2. For each page, the model is asked to summarize the information.

In [None]:
from openai import OpenAI

# Take the key from the environment rather than embedding it in the notebook;
# if OPENAI_API_KEY is unset, None is passed and the client applies its own
# handling for a missing key.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

CHAT_MODEL = "gpt-3.5-turbo-1106"
QUESTIONS_PER_PAGE = 10

# Answer template shown to the model: one "Question i / Answer i" slot per
# pair, numbered consecutively and always separated by a blank line so the
# downstream parser can rely on a single layout.
QA_FORMAT = "\n\n".join(
    f"Question {i}: \n Answer {i}: " for i in range(1, QUESTIONS_PER_PAGE + 1)
)

QA_SYSTEM_PROMPT = (
    "You act as a training data generator for a GPT 3.5 fine-tuning job. Your job is to generate questions and answer those questions for the content of a research paper."
    " The whole paper is broken into multiple sections. Therefore, all of the following content is from the same research paper and presented to you in order."
    " You only generate questions, those are and highly-relevant to the research paper, and the correct answers to the corresponding questions. "
    " The questions and answers in following format: "
    + QA_FORMAT
)

SUMMARY_SYSTEM_PROMPT = (
    "You act as a training data generator for a GPT 3.5 fine-tuning job. Summarise the page content as precisely as possible. "
    "{\"summary\": \"\"}"
)


def ask_model(system_prompt, user_content):
    """Send one system + user message pair to CHAT_MODEL.

    Returns the completion object produced by `client.chat.completions.create`.
    """
    return client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
    )


qas = []
messages = []
summaries = []

# Two requests per overlapped page: one for Q&A pairs, one for a summary.
for page_num, page in enumerate(pages_content):
    completion = ask_model(QA_SYSTEM_PROMPT, page)
    qas.append({'page': page_num, 'completion': completion.choices[0].message.content})

    completion = ask_model(SUMMARY_SYSTEM_PROMPT, page)
    summaries.append(completion.choices[0].message.content)

# Shows the summary message of the last page processed.
print(completion.choices[0].message)

ChatCompletionMessage(content="The page content consists of a conversation between individuals discussing their plans, reactions to unexpected events, and reflections on their current inspirations and interactions. It also contains hypothetical scenarios related to Ayesha Khan's interests and potential gifts for her.", role='assistant', function_call=None, tool_calls=None)


### 3. Concatenate the page summaries and generate Q&A pairs across all summaries.

In [69]:

CROSS_PAGE_QUESTIONS = 20

# Label every page summary with its 1-based page number.
str_summaries = "".join(
    f"\nPage {page_num}'s summary: {page_summary}"
    for page_num, page_summary in enumerate(summaries, start=1)
)

# Answer template: consecutively numbered slots, each preceded by a blank line
# so every pair is laid out the same way.
cross_page_format = "".join(
    f"\n\nQuestion {i}: \n Answer {i}: " for i in range(1, CROSS_PAGE_QUESTIONS + 1)
)

completion = client.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    messages=[
        {"role": "system", "content": "You act as a training data generator for a GPT 3.5 fine-tuning job. Your job is to generate questions and answer those questions for the content of a research paper. The whole paper is broken into multiple sections. Each page's summary will be provided."
        " Provide questions and answers those are involved content from different pages. "
        " The questions and answers in following format: "
        + cross_page_format},
        {"role": "user", "content": "Summary: " + str_summaries},
    ],
)

# page == -1 marks questions that span the whole document rather than one page.
qas.append({'page': -1, 'completion': completion.choices[0].message.content})

# Opening in 'w' mode does not create missing directories.
os.makedirs('data', exist_ok=True)
with open(os.path.join('data', 'page_summary.txt'), 'w', encoding='utf-8') as f:
    # `str_summaries` is a single string, so write it in one call.
    f.write(str_summaries)

print(completion.choices[0].message.content)

Question 1: What is the main focus of the paper introduced in the summaries?

Answer 1: The main focus of the paper is on generative agents that simulate believable human behavior in an interactive sandbox environment inspired by The Sims.

Question 2: How are generative agents instantiated in the interactive sandbox environment discussed in the paper?

Answer 2: Generative agents are instantiated in an interactive sandbox environment where users can interact with a small town of 25 agents using natural language.

Question 3: What are some of the behaviors that the generative agents are capable of simulating?

Answer 3: The generative agents are capable of waking up, cooking, working, creating art, forming opinions, noticing others, and engaging in conversations.

Question 4: How are the generative agents evaluated in the paper?

Answer 4: The generative agents are evaluated based on their believable individual and emergent social behaviors, demonstrating the effectiveness of their arc

In [114]:
# System message attached to every fine-tuning example.
SYSTEM_PROMPT = (
    "Assistant is a large language model trained by OpenAI.\n\n"
    "Assistant is designed to be able to assist with a wide range of tasks, from answering simple questions to providing in-depth explanations and discussions on topics related to provided documents."
    "As a language model, Assistant is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.\n\n"
    "Additionally, Assistant is able to generate its own text based on the input it receives, allowing it to engage in discussions and provide explanations and descriptions on the topics."
)

# One "Question N: ... Answer N: ..." pair. The answer runs until the next
# numbered question or the end of the text, so colons inside the question or
# the answer are preserved and several pairs in one paragraph are all found.
QA_PAIR_PATTERN = re.compile(
    r'Question\s+\d+\s*:\s*(?P<question>.*?)\s*'
    r'Answer\s+\d+\s*:\s*(?P<answer>.*?)\s*'
    r'(?=Question\s+\d+\s*:|\Z)',
    re.DOTALL,
)


def extract_qa_pairs(completion_text):
    """Return the (question, answer) tuples found in one model completion.

    Pairs with an empty question or answer are skipped; text that does not
    match the "Question N: / Answer N:" layout is ignored instead of raising.
    """
    pairs = []
    for match in QA_PAIR_PATTERN.finditer(completion_text or ''):
        question, answer = match.group('question'), match.group('answer')
        if question and answer:
            pairs.append((question, answer))
    return pairs


def to_training_record(question, answer):
    """Serialize one Q&A pair as a single-line chat fine-tuning JSON record."""
    return json.dumps({"messages": [{"role": "system", "content": SYSTEM_PROMPT},
                                    {"role": "user", "content": question},
                                    {"role": "assistant", "content": answer}]})


# One JSON document per line, each terminated by '\n' (including the last),
# so the file can be split and shuffled line by line.
qa_examples = "".join(
    to_training_record(question, answer) + '\n'
    for qa in qas
    for question, answer in extract_qa_pairs(qa['completion'])
)

# Opening in 'w' mode does not create missing directories.
os.makedirs('data', exist_ok=True)
with open(os.path.join('data', 'QA_pairs.jsonl'), 'w', encoding='utf-8') as f:
    f.write(qa_examples)


# Split data set

In [128]:
from sklearn.model_selection import train_test_split


# Fixed seed so the train/val/test membership is reproducible across runs.
SPLIT_SEED = 42

with open(os.path.join('data', 'QA_pairs.jsonl'), 'r', encoding='utf-8') as f:
    # Skip blank lines and give every record exactly one trailing newline.
    # A record lacking it would be fused with the next one when the shuffled
    # lines are written back out.
    lines = [line.rstrip('\n') + '\n' for line in f if line.strip()]

# 80/20 train/test split, then 80/20 train/val split of the training part.
train, test = train_test_split(np.array(lines), test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)

for split_name, split_lines in (('train', train), ('val', val), ('test', test)):
    with open(os.path.join('data', f'QA_pairs_{split_name}.jsonl'), 'w', encoding='utf-8') as f:
        f.writelines(split_lines)

In [130]:
# Spot-check one serialized training example; index 61 is arbitrary and must be < len(train).
train[61]

'{"messages": [{"role": "system", "content": "Assistant is a large language model trained by OpenAI.\\n\\nAssistant is designed to be able to assist with a wide range of tasks, from answering simple questions to providing in-depth explanations and discussions on topics related to provided documents.As a language model, Assistant is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.\\n\\nAdditionally, Assistant is able to generate its own text based on the input it receives, allowing it to engage in discussions and provide explanations and descriptions on the topics."}, {"role": "user", "content": "What is the contribution of the study related to generative agents\' architecture and evaluations?"}, {"role": "assistant", "content": "The study\'s significant contribution lies in the introduction of an architecture that effectively extends a large