In [1]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import json

# load_dotenv() runs before the key lookup below, so any values it adds to the
# process environment are visible to os.getenv.
load_dotenv()

# The API key is read from the OPEN_AI_API_KEY environment variable; the lookup
# yields None when the variable is not set.
client_open_ai = OpenAI(
    api_key=os.getenv("OPEN_AI_API_KEY"),
)

In [2]:
from opensearchpy import OpenSearch

def get_open_search_client(host='localhost', port=9200, auth=('admin', 'admin'),
                           use_ssl=True, verify_certs=False):
    """Create an OpenSearch client.

    Called with no arguments this builds exactly the client the notebook has
    always used (localhost:9200, admin/admin, SSL on, certificate verification
    off). The keyword parameters let callers point at another cluster or
    supply real credentials without editing this function.

    Args:
        host: Hostname of the OpenSearch node.
        port: Port of the OpenSearch node.
        auth: ``(username, password)`` tuple passed as ``http_auth``.
            NOTE(review): the default is a placeholder credential pair; pass
            real credentials from the environment for anything but local use.
        use_ssl: Passed through as the client's ``use_ssl`` option.
        verify_certs: Passed through as the client's ``verify_certs`` option.
            The default of False disables certificate verification.

    Returns:
        The constructed ``OpenSearch`` client instance.
    """
    return OpenSearch(
        hosts=[{'host': host, 'port': port}],
        http_auth=auth,
        use_ssl=use_ssl,
        verify_certs=verify_certs,
    )

In [3]:
def open_search_quary(search_body, index_name, client_open_search, size, from_where: int = None):
    """Run a search against an index and return the matching documents.

    Args:
        search_body: Query body passed to the client's ``search`` call.
        index_name: Name of the index to search.
        client_open_search: Client object exposing a ``search`` method.
        size: Maximum number of hits to request.
        from_where: Offset of the first hit to return; forwarded as ``from_``.

    Returns:
        The list stored at ``response['hits']['hits']`` when it is non-empty,
        otherwise ``None`` (after printing a notice).
    """
    response = client_open_search.search(
        index=index_name, body=search_body, size=size, from_=from_where
    )
    # The documents live in the inner 'hits' list. The outer response['hits']
    # is an envelope that also carries metadata, so its length says nothing
    # about whether any document matched.
    hits = response['hits']['hits']
    if len(hits) > 0:
        return hits

    print('There is no hits in open search DB.')
    return None

In [4]:
def query_with_open_AI(paper_detail):
    """Ask the chat model for QA pairs about one paper and parse the reply.

    Args:
        paper_detail: Mapping that provides the 'Abstract' and 'ArticleDate'
            entries inserted into the user message.

    Returns:
        The result of ``json.loads`` applied to the content of the first
        completion choice. The raw content is printed before it is parsed.
    """
    task_prompt = {
        "role": "system",
        "content": 'Generate around 2 QA pairs as well as the answer start point number for training for these information, inforamtion is seperated with \\n',
    }
    paper_prompt = {
        "role": "user",
        "content": f"Abstract:{paper_detail['Abstract']}\n Article date:{paper_detail['ArticleDate']}",
    }
    format_prompt = {
        "role": "system",
        "content": 'Return a dictionary with question, answer, answer_start_point as keyword, the values of them are lists of generated results, the keyword should be presented!! Like {"question":["{question1}"...],"answer":["{answer1}"...]},"start_point":["{answer1_start_point}"...]',
    }
    # Two further system instructions are currently disabled:
    #   'Every answer needs to be extractable from the context.'
    #   'answer_start_point should be digits of position where the answer starts.'

    completion = client_open_ai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[task_prompt, paper_prompt, format_prompt],
    )

    reply_text = completion.choices[0].message.content
    print(f"{reply_text}\n")

    return json.loads(reply_text)



In [5]:
import time
def generate_qa_pairs(paper_detail):
    """Build one flat row per generated QA pair for every search hit.

    Args:
        paper_detail: Iterable of hits; each hit's '_source' entry is the
            paper record handed to ``query_with_open_AI``.

    Returns:
        A list of rows, each laid out as
        [question, answer, start_point, Title, PMID, joined Authors,
        Abstract, ArticleDate].
    """
    rows = []
    for hit in paper_detail:
        source = hit['_source']
        generated = query_with_open_AI(source)

        questions = generated['question']
        answers = generated['answer']
        start_points = generated['start_point']

        title = source['Title']
        pmid = source['PMID']
        authors = ', '.join(source['Authors'])
        published = source['ArticleDate']
        abstract = source['Abstract']

        # zip stops at the shortest of the three lists, so unmatched extras
        # in any of them are not emitted.
        for question, answer, start_point in zip(questions, answers, start_points):
            rows.append(
                [question, answer, start_point, title, pmid, authors, abstract, published]
            )

        # Pause between papers to avoid OpenAI blocking.
        time.sleep(10)
    return rows

In [6]:
import csv
import urllib3

# Silence urllib3's InsecureRequestWarning for the requests made below.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

index_name = 'pubmed_intelligence'

# Alternative output file used previously: 'reformatting_testing_qa.csv'
saving_file_name = 'reformatting_testing_qa_for_evaluation.csv'

# Column order of the CSV rows written later.
header = [
    'question',
    'answer',
    'start_point',
    'article_title',
    'PMID',
    'author',
    'context',
    'article_date',
]

client_open_search = get_open_search_client()

# Match on the index name itself.
search_body = {"query": {"match": {"_index": index_name}}}

# Request 20 hits starting at offset 8.
query_result = open_search_quary(
    search_body=search_body,
    index_name=index_name,
    client_open_search=client_open_search,
    size=20,
    from_where=8,
)




In [7]:
if query_result is not None:

    data = generate_qa_pairs(query_result)

    # The file is opened in append mode, so re-running this cell adds rows to
    # an existing file. Write the header only when the file is new or empty,
    # otherwise every run would insert another header row into the data.
    needs_header = (
        not os.path.exists(saving_file_name)
        or os.path.getsize(saving_file_name) == 0
    )

    # newline='' lets the csv module control line endings itself (avoids blank
    # rows on Windows and keeps newlines embedded in abstracts intact).
    # An explicit UTF-8 encoding keeps non-ASCII abstract text from failing
    # on platforms whose default encoding cannot represent it.
    with open(saving_file_name, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            # write the header
            writer.writerow(header)
        # write the data
        writer.writerows(data)

{
"question": [
"What is the diagnostic criteria for the homogenous subgroup of children in the study?",
"How did the cognitive profiles of children with Asperger's Syndrome differ from those with autism?"
],
"answer": [
"corresponding to DSM-IV diagnosis of autism",
"Asperger children scored higher on the Verbal Comprehension Index than on other indexes, with the lowest score found on the Processing Speed Index."
],
"start_point": [
342,
1136
]
}

{
  "question": [
    "What are the two main perspectives discussed in the book chapter?",
    "When are rational associations of selective drug inhibitors expected to become a more realistic goal according to the book chapter?"
  ],
  "answer": [
    "The two main perspectives discussed in the book chapter are: 1) Future-focused perspective on devising rational associations of targeted inhibitors against distinct altered signaling-network pathways, and 2) Supporting the choice of the most convenient treatment for an individual cancer patien