In [2]:
import openai
import time
import json
import backoff
from openai.error import RateLimitError
import datetime
import pickle
import numpy as np
from numpy.linalg import norm
import re
import os

In [2]:
# Pull the OpenAI credential from the environment so the key never lives in the notebook.
# The value is None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# NOTE(review): both paths below are absolute and machine-specific; consider deriving them
# from a configurable data directory so the notebook can run elsewhere.

# Section-level textbook data. Later cells read item['positive_ctxs']['text'] from each entry.
# Opened with a context manager so the file handle is closed as soon as the JSON is parsed.
with open("/Users/nehasheth/Desktop/Research/Research - AI Chatbot TA/github/data-generator/QA_generation_from_textbook/gpt-3/GPT-3_section_level.json") as sections_file:
    sections_data = json.load(sections_file)

#full textbook embeddings - vectors
# search_index reads the 'content' and 'vector' fields of each entry in this index.
with open("/Users/nehasheth/Desktop/Research/Research - AI Chatbot TA/github/data-generator/QA_generation_from_textbook/reinforcement_learning/index.json") as input_file:
    index_data = json.load(input_file)



In [None]:
# Sanity check: number of loaded textbook sections (the original note recorded 144).
len(sections_data)

In [None]:
# Inspect the section at index 16. Original note: "16 !! till 16." -- presumably sections up to
# index 16 were already processed, which would match the later sections_data[17:] slice; TODO confirm.
sections_data[16] 

In [6]:
def gpt3_embedding(content, model='text-embedding-ada-002'):
    """Return the embedding vector for ``content`` from the OpenAI Embedding API.

    Args:
        content: Text to embed; sent as ``input`` to ``openai.Embedding.create``.
        model: Name of the embedding engine; sent as ``engine``.

    Returns:
        The ``embedding`` field of the first entry in the response's ``data`` list.

    Raises:
        openai.error.APIConnectionError: If the API could not be reached. "Failed"
            is printed and the original error is re-raised.
    """
    try:
        response = openai.Embedding.create(input=content, engine=model)
    except openai.error.APIConnectionError:
        # Re-raise after reporting: falling through to the return below would leave
        # `response` unbound and hide the real failure behind an UnboundLocalError.
        print("Failed")
        raise
    return response['data'][0]['embedding']

In [8]:
@backoff.on_exception(backoff.expo, RateLimitError)
def response_API(prompt, myKwargs=None):
    """Send ``prompt`` to the OpenAI Completion API and return the generated text.

    The call is retried with exponential backoff whenever a ``RateLimitError``
    is raised (see the decorator).

    Args:
        prompt: Full prompt text for the completion request.
        myKwargs: Optional mapping of request arguments that override or extend
            the defaults below (model, temperature, max_tokens, frequency_penalty,
            presence_penalty). ``None`` (the default) or an empty mapping keeps
            the defaults unchanged.

    Returns:
        The ``text`` of the first choice in the API response.
    """
    # Default arguments sent to the API unless overridden through myKwargs.
    kwargs = {
        "model": "text-davinci-003",
        "temperature": 0.6,
        "max_tokens": 500,
        "frequency_penalty": 1,
        "presence_penalty": 0,
    }

    # None replaces the former mutable `{}` default, so no dict object is shared
    # between calls.
    if myKwargs:
        kwargs.update(myKwargs)

    r = openai.Completion.create(prompt=prompt, **kwargs)
    return r['choices'][0]['text']

In [9]:
# Cosine similarity between two vectors.
def get_similarity(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    The dot product of the two vectors is divided by the product of their
    norms (``numpy.linalg.norm`` with default arguments). There is no guard
    against zero-norm inputs.
    """
    dot_product = np.dot(v1, v2)
    magnitude_product = norm(v1) * norm(v2)
    return dot_product / magnitude_product

In [10]:
#searching through textbook
def search_index(query, index_data, count=3):
    """Return the ``count`` index entries most similar to ``query``.

    The query is embedded with ``gpt3_embedding`` and compared against the
    ``'vector'`` of every entry using ``get_similarity``.

    Args:
        query: Text to search for.
        index_data: Iterable of dicts, each with ``'content'`` and ``'vector'`` keys.
        count: Maximum number of results to return. This argument used to be
            ignored (three results were always returned); it is now honoured,
            and its default is 3 so calls that omit it behave as before.

    Returns:
        A list of ``{'content': ..., 'score': ...}`` dicts sorted by descending
        score, holding at most ``count`` items (fewer if the index is smaller).
    """
    question_vector = gpt3_embedding(query)
    scored = [
        {'content': entry['content'],
         'score': get_similarity(question_vector, entry['vector'])}
        for entry in index_data
    ]
    # list.sort is stable, so entries with equal scores keep their index order.
    scored.sort(key=lambda d: d['score'], reverse=True)
    # Clamp so a negative count yields an empty list rather than a tail slice.
    return scored[:max(count, 0)]

In [11]:
def get_answers(fin_question, section):
    """Generate one answer per retrieved context for a single question.

    The module-level ``index_data`` is searched for the passages most relevant
    to ``fin_question``; each passage is then used as the context of a separate
    completion request made through ``response_API``.

    Args:
        fin_question: Question text (the caller prefixes it with the topic).
        section: Textbook section the question came from. Accepted for
            interface compatibility; it is not used here.

    Returns:
        A list of answer strings, one per retrieved context: three answers per
        question when the index yields at least three passages, fewer otherwise.
    """
    # Ask explicitly for the top 3 relevant contexts.
    results = search_index(fin_question, index_data, count=3)

    prompt_template = "Context : %s %s Answer this question based on the above context. The answer should have a university freshmen-level language and be very concise and to-the-point. Answer: "

    fin_answers = []
    for result in results[:3]:
        # Only the passage text goes into the prompt. Formatting the whole result
        # dict would also leak its repr (braces, quotes, the similarity score)
        # into the context given to the model.
        prompt = prompt_template % (result['content'], fin_question)
        fin_answers.append(response_API(prompt))

    return fin_answers #returns a list. 1Q - 3A


In [12]:
def question_completions_with_backoff(passages):
    """Ask the completion API to write questions about each passage.

    Every passage is combined with every instruction in ``question_prompts``
    and sent through ``response_API``; each raw completion is printed as soon
    as it is received.

    Args:
        passages: Iterable of passage texts.

    Returns:
        A list with one inner list per passage, containing that passage's raw
        completions in the order of ``question_prompts``.
    """
    question_prompts = [
    '''Generate exactly 2 objective, to-the-point and firm questions about this context. 
    The questions must specify the concept discussed in the context and be complete on its own. The questions should be different from one another. 
    Begin each question with a “[Q]” sign.''']

    # NOTE: no token-budget check is applied to instruction + passage before the request.
    question_list = []
    for passage in passages:
        completions = []
        for instruction in question_prompts:
            completion = response_API("%s \n %s" % (instruction, passage))
            completions.append(completion)
            print(completion)
        question_list.append(completions)

    return question_list

START HERE


In [13]:
# Collect the passage text of every section from index 17 onward.
# NOTE(review): the offset 17 is hard-coded; presumably the earlier sections were
# handled in a previous run -- confirm before re-running on new data.
sections_list = [item['positive_ctxs']['text'] for item in sections_data[17:]]

In [None]:
#question_list = question_completions_with_backoff(sections_list)

In [14]:
#manually cleaned
# Load the hand-cleaned questions. The context manager closes the file handle once
# the JSON is parsed.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
with open("/Users/nehasheth/Desktop/Research/Research - AI Chatbot TA/github/data-generator/QA_generation_from_textbook/reinforcement_learning/GPT-3_questions_cleaned.json") as questions_file:
    questions_manual = json.load(questions_file)

In [15]:
# Work from the manually cleaned questions. The printed length should equal
# len(sections_list), because the QA loop pairs questions with sections by index.
questions = questions_manual
print(len(questions))

127


In [20]:
# Restrict this run to the sections from offset 50 onward. The same offset has to be
# applied to `questions` (see questions1) so that both lists stay aligned by index.
sections_list1 = sections_list[50:]
print(len(sections_list1))

77


In [27]:
# Same offset (50) as sections_list1, so questions1[i] belongs to sections_list1[i].
questions1 = questions[50:]

In [28]:
# Sanity check: should match len(sections_list1), since the two lists are paired by index.
len(questions1)

77

In [None]:
#formatting the data
# Build one record per (section, question) pair. Every get_answers call issues API
# requests, so this cell is slow; results are held in memory until the dump cell runs.
qa_data = []
answer_list=[]  # not used by the cells visible here; kept in case a later cell reads it
for i, section in enumerate(sections_list1):
    # The topic is the first {...} span in the section text. It depends only on the
    # section, so it is extracted once per section instead of once per question.
    match = re.search(r'{([^}]*)}', section)
    # re.search returns None when the section has no {...} span; fall back to an empty
    # topic instead of crashing on match.group(0). When a span exists the prompt text
    # is unchanged (the braces are still included).
    topic = match.group(0) if match else ""
    for ques in questions1[i]:
        fin_question = f"Topic : {topic} Question : {ques} "
        data = {
            'textbook-paragraph': section,
            'GPT-3-RLHF-Generations': {
                'question': ques,
                'answers': get_answers(fin_question, section),
            },
        }
        print("done")
        qa_data.append(data)


In [34]:
# Persist the generated QA records as pretty-printed UTF-8 JSON. ensure_ascii=False keeps
# non-ASCII characters readable instead of escaping them.
with open('RLHF_Version3_set2.json', 'w', encoding='utf-8') as out_file:
    json.dump(qa_data, out_file, ensure_ascii=False, indent=4)

In [33]:
# Number of QA records generated (one per section/question pair processed by the loop).
len(qa_data)

108