### Create an assistant

In [1]:
from openai import OpenAI
import os

# Read the API key from the environment and build the OpenAI client with it.
key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=key)

# Settings for the assistant: a requirements-engineering expert that can
# search the uploaded documents through the file_search tool.
assistant_settings = dict(
    model="gpt-4-turbo",
    name="Expert in Requirements Engineering",
    tools=[{"type": "file_search"}],
    instructions="You are an expert in Requirements Engineering. Your purpose is to organizing scientific data in an openly available and long-term way with respect to building, publishing, and evaluating an initial Knowledge Graph of empirical research in Requirement Engineering. To achieve this goal, you need to create an knowledge graph which enables sustainable literature reviews to synthesize a comprehensive, up-to-date, and long-term available overview of the state and evolution of empirical research in Requirement Engineering. To create such a knowledge graph, you first need to come up with some competency questions. An competency question is a natural language question that represents an information need related to the content of a knowledge graph and for which a knowledge graph must provide relevant information to anwser the question. Now you are given a set of documents about the state and evolution of this field.",
)
assistant = client.beta.assistants.create(**assistant_settings)

### Upload files and add them to a Vector Store

In [2]:
import os

def get_file_paths(folder_path):
    """Return the paths of all files found under ``folder_path``, recursively.

    Each path is the walked directory joined with the file name, so it is
    relative or absolute exactly as ``folder_path`` itself is. Directories
    are not included. A folder that cannot be walked yields an empty list.
    """
    return [
        os.path.join(current_dir, entry)
        for current_dir, _subdirs, entries in os.walk(folder_path)
        for entry in entries
    ]

# create a Vector Store
vector_store = client.beta.vector_stores.create(name="requirments-engineering")

# Collect every file under the reference folder and upload them as one batch.
folder = "reference"
file_paths = get_file_paths(folder)
file_streams = []
try:
    # Open the files one by one so that, if a later open() fails, the handles
    # already opened are still closed by the finally clause.
    for path in file_paths:
        file_streams.append(open(path, 'rb'))
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id,
        files=file_streams
    )
finally:
    # The streams are only needed for the duration of the upload; close them
    # so the notebook kernel does not leak file descriptors.
    for stream in file_streams:
        stream.close()

print(file_batch.status)
print(file_batch.file_counts)

completed
FileCounts(cancelled=0, completed=1, failed=0, in_progress=0, total=1)


### Update the assistant to use the new Vector Store

In [3]:
# Point the assistant's file_search tool at the vector store created above.
file_search_resources = {"file_search": {"vector_store_ids": [vector_store.id]}}
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tool_resources=file_search_resources,
    tools=[{"type": "file_search"}],
)

### Create a thread

In [4]:
# Create an empty thread; the queries below are posted to it as user messages.
thread = client.beta.threads.create()

### Add a message to the thread


#### Define the get_response function

In [5]:
def get_response(query,client,assistant,thread):
    """Send ``query`` to the assistant on ``thread`` and return its reply.

    The query is posted as a user message, a run is created for the assistant
    and polled until it finishes, and the first text part of the first message
    listed for that run is taken as the reply. Every annotation marker in the
    reply text is replaced by ``[index]``; for annotations that carry a file
    citation, the cited file's name is looked up and recorded.

    Args:
        query: Text of the user message.
        client: OpenAI client used for all API calls.
        assistant: Assistant object; only ``assistant.id`` is used.
        thread: Thread object; only ``thread.id`` is used.

    Returns:
        A tuple ``(text, citations)`` where ``text`` is the reply with
        annotation markers replaced and ``citations`` is a list of
        ``"[index] filename"`` strings.

    Raises:
        RuntimeError: If the run produced no message content (for example
            because the run did not complete successfully).
    """
    # add message to the thread
    client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=query
    )

    # create a run and wait for it to finish
    run = client.beta.threads.runs.create_and_poll(
        thread_id=thread.id,
        assistant_id=assistant.id
    )

    # get the messages produced by this run
    messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
    if not messages or not messages[0].content:
        # Without this guard the indexing below fails with a bare IndexError
        # that hides why the run yielded nothing.
        status = getattr(run, "status", "unknown")
        last_error = getattr(run, "last_error", None)
        raise RuntimeError(
            f"Run {run.id} produced no assistant message "
            f"(status={status!r}, last_error={last_error!r})"
        )

    message_content = messages[0].content[0].text
    print(f'message_content: {message_content}')

    # Work on a local copy of the text rather than mutating the API object.
    answer = message_content.value
    citations = []
    for index, annotation in enumerate(message_content.annotations):
        answer = answer.replace(annotation.text, f"[{index}]")
        if file_citation := getattr(annotation, "file_citation", None):
            cited_file = client.files.retrieve(file_citation.file_id)
            citations.append(f"[{index}] {cited_file.filename}")

    return answer,citations

#### query 1

In [6]:
# Ask a warm-up question and show the answer together with its cited files.
query = "what is requirments engineering"
response = get_response(query,client,assistant,thread)
answer, references = response
print(
    "************",
    f"query: {query}",
    f'response: {answer}',
    f"reference: {references}",
    sep="\n",
)

message_content: Text(annotations=[FileCitationAnnotation(end_index=861, file_citation=FileCitation(file_id='file-aqfDQ8qhVMmpluder9DszFgq', quote=None), start_index=849, text='【4:0†source】', type='file_citation')], value='Requirements Engineering (RE) is fundamentally about developing, maintaining, and managing high-quality software systems in a cost-effective and predictable manner. It involves studying real-world phenomena of software engineering, which includes developing new or modifying existing technologies such as process models, methods, techniques, tools, or languages to support software engineering activities. Additionally, it concerns the evaluation and comparison of these technologies within the complex interactions between individuals, teams, projects, organizations, various task types, and software systems. To ensure that research in this field is scientific, it employs empirical methods, which involve systematic observation and experimentation as a basis for gathering i

#### query 2

In [8]:
# Ask the assistant to derive competency questions from the uploaded documents.
query = "Now you are developing an knowledge graph about the state and evolution of the empirical research in Requirements Engineering. Derive 77 competency questions from the provided documents."
response = get_response(query,client,assistant,thread)
answer, references = response
print(
    f"query: {query}",
    f'response: {answer}',
    f"reference: {references}",
    sep="\n",
)

message_content: Text(annotations=[], value='Based on the document provided, here are 77 competency questions for a knowledge graph on the state and evolution of empirical research in Requirements Engineering (RE):\n\n1. What are the defining characteristics of empirical RE?\n2. How have empirical methods in RE evolved over the years?\n3. Which empirical RE methods are most effective for different types of projects?\n4. What key challenges do researchers face in empirical RE according to recent studies?\n5. How is empirical RE contributing to software engineering innovations?\n6. What collaborations exist between academia and industry in the field of empirical RE?\n7. How do different empirical methods impact the development and maintenance of software systems?\n8. What trends are influencing the direction of empirical RE currently?\n9. How do theoretical frameworks influence empirical RE methodologies?\n10. What tools and technologies are being developed based on empirical RE findings

### Use Sentence Transformers to compare CQs similarity

In [9]:
# !pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, util
import pandas as pd

# Load the CQ pairs: the first CSV column is read as the expert-written
# questions, the second as the generated ones (the file's own header row is
# replaced by these names).
cq_csv_path = '/Users/sherry/python-coding/Prompting/requirement-engeneering/genCQs-expertCQs-reference.csv'
data = pd.read_csv(cq_csv_path, names=['expertCQs', 'genCQs'], header=0)
genCQs, expertCQs = list(data['genCQs']), list(data['expertCQs'])

# Embed both question sets with the same sentence-embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")
genCQs_embeddings = model.encode(genCQs)
expertCQs_embeddings = model.encode(expertCQs)

# Cosine similarity for every (generated, expert) pair:
# rows follow genCQs, columns follow expertCQs.
cos_sim = util.cos_sim(genCQs_embeddings, expertCQs_embeddings)



In [10]:
# Add all pairs to a list with their cosine similarity score.
# The ranges come from the question lists themselves instead of a hard-coded
# 77, so the cell keeps working (and covers every pair) when the CSV has a
# different number of rows.
all_sentence_combinations = [
    [cos_sim[genCQs_idx][expertCQs_idx], genCQs_idx, expertCQs_idx]
    for genCQs_idx in range(len(genCQs))
    for expertCQs_idx in range(len(expertCQs))
]

# Sort list by the highest cosine similarity score
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

print("Top-5 most similar pairs:")
for score, genCQs_idx, expertCQs_idx in all_sentence_combinations[0:5]:
    # `score` is the same value as cos_sim[genCQs_idx][expertCQs_idx]; reuse it.
    print("{} \t {} \t {:.4f}".format(genCQs[genCQs_idx], expertCQs[expertCQs_idx], score))

Top-5 most similar pairs:
How have empirical methods in RE evolved over the years? 	 How has the number of empirical methods used per publication evolved over time? 	 0.8098
How have empirical methods in RE evolved over the years? 	 How often are which empirical methods used over time? 	 0.8066
What statistical methods are most commonly used in empirical RE? 	 How often are which empirical methods used? 	 0.7922
How have empirical methods in RE evolved over the years? 	 How often are which empirical methods used? 	 0.7917
How do empirical studies influence software development practices? 	 How many empirical studies are by authors working for large software development companies? 	 0.7854


### Save cosine similarity score and the corresponding CQ pairs to a csv file

In [11]:
# save cosine similarity score and the corresponding CQ pairs to a csv file
import torch

# Flatten the ranked (score, generated index, expert index) triples into three
# parallel lists; scores are stored as strings rounded to four decimals.
n = len(all_sentence_combinations)
genCQ_ls = [genCQs[gen_idx] for _, gen_idx, _ in all_sentence_combinations]
expertCQ_ls = [expertCQs[expert_idx] for _, _, expert_idx in all_sentence_combinations]
score_ls = [f"{pair_score.item():.4f}" for pair_score, _, _ in all_sentence_combinations]

# One row per CQ pair, in descending similarity order.
cos_sim_df = pd.DataFrame({
    'genCQ': genCQ_ls,
    'expertCQ': expertCQ_ls,
    'cos_score': score_ls,
})

cos_sim_df.to_csv("/Users/sherry/python-coding/Prompting/requirement-engeneering/gen-expert-CQs-cos-one-reference.csv")