TODO:
1. Alter the prompt (use 4- or 5-shot prompting so the model follows the pattern)
2. Final output formatting: first question and final answer as a QA pair
3. Metaprompting techniques


In [2]:
import openai
import time
import json
import transformers
from transformers import GPT2Tokenizer
import backoff
from openai.error import RateLimitError
import datetime
import pickle
import numpy as np
from numpy.linalg import norm
import os

In [3]:
# Load the pretrained "gpt2" tokenizer once for the whole notebook.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [4]:
def open_file(filepath):
    """Return the full contents of the text file at `filepath`, decoded as UTF-8."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

In [5]:
# Read the OpenAI key from the API_KEY environment variable (None when it is
# unset) and hand it to the openai client.
api_key = os.environ.get('API_KEY')
openai.api_key = api_key

In [6]:
def gpt3_embedding(content, model='text-similarity-ada-001'):
    """Return the embedding vector for `content` from the OpenAI Embedding endpoint.

    Args:
        content: Value passed as `input` to `openai.Embedding.create`.
        model: Name of the embedding model to use.

    Returns:
        The `embedding` field of the first item in the response's `data` list.

    Raises:
        openai.error.APIConnectionError: If the request cannot reach the API.
    """
    try:
        response = openai.Embedding.create(input=content, model=model)
    except openai.error.APIConnectionError:
        # Re-raise after logging: falling through to the return would hit an
        # unbound `response` and hide the real failure behind UnboundLocalError.
        print("Failed")
        raise
    return response['data'][0]['embedding']

In [7]:
# Cosine similarity between two vectors
def get_similarity(v1, v2):
    """Return the cosine of the angle between `v1` and `v2`.

    Computed as the dot product divided by the product of the two norms.
    """
    dot_product = np.dot(v1, v2)
    magnitude = norm(v1) * norm(v2)
    return dot_product / magnitude

In [8]:
# Find the textbook passage closest to a query
def search_index(query, data, count=1):
    """Return the entry of `data` most similar to `query`.

    `query` is embedded with `gpt3_embedding`; every item of `data` is then
    scored by the cosine similarity between that embedding and
    item['vector'].

    Args:
        query: Text to embed and search for.
        data: Iterable of dicts carrying 'content' and 'vector' keys.
        count: Accepted but not used by the current implementation.

    Returns:
        A dict {'content': ..., 'score': ...} for the highest-scoring item.
    """
    query_vector = gpt3_embedding(query)

    def _scored(entry):
        similarity = get_similarity(query_vector, entry['vector'])
        return {'content': entry['content'], 'score': similarity}

    ranked = sorted(map(_scored, data),
                    key=lambda candidate: candidate['score'],
                    reverse=True)
    return ranked[0]

In [9]:
@backoff.on_exception(backoff.expo, RateLimitError)
def response_API(prompt, myKwargs=None):
    """Send `prompt` to the OpenAI Completion endpoint and return the generated text.

    The call is retried with exponential backoff whenever a RateLimitError is
    raised.

    Args:
        prompt: Prompt text forwarded to `openai.Completion.create`.
        myKwargs: Optional dict of request parameters that override (or extend)
            the defaults below. `None` means "use the defaults".

    Returns:
        The `text` of the first choice in the response. The defaults request
        n=3 choices, but only the first one is returned.
    """
    # Default arguments sent to the API unless overridden via myKwargs.
    kwargs = {"model": "text-davinci-002",
              "temperature": 0.7,
              "max_tokens": 300,
              "best_of": 5,
              "n": 3,
              "top_p": 1,
              "stop": '\n\n\n',
              "presence_penalty": 0}

    # A None default avoids sharing one mutable dict across every call.
    if myKwargs:
        kwargs.update(myKwargs)

    r = openai.Completion.create(prompt=prompt, **kwargs)
    return r['choices'][0]['text']

In [10]:
def question_completions_with_backoff(passages):
    """Generate questions for each passage using three instruction prompts.

    Every passage is paired with every instruction prompt, the combined text
    is sent through `response_API`, and each raw completion is printed as it
    arrives.

    Args:
        passages: Iterable of passage texts.

    Returns:
        A list with one entry per passage; each entry is the list of raw
        completions for that passage, in instruction-prompt order.
    """
    question_prompts = ['''Generate 5 interactive and coherent questions about this context. The questions should not be repeated from the previous step. The questions should consist of reasoning and procedural steps. \n
                        The questions should be precise and factual. Start the question with a '[Q]' ''',
                        
                        '''Generate 5 objective, concise and firm questions about this context. The questions should not be repeated from the previous step. \n
                        The questions should begin with any of Why/How/Where/Who/When. Start the question with a '[Q]' ''' ,
                        
                        '''Generate 5 thoughtful and compelling, steps-based procedural questions about this context that start with Why or How. The questions should not be repeated from the previous step. \n
                        The questions should be unique and creative with an abstract and subjective aspect. Start the question with a '[Q]' ''' ]

    # NOTE: no token-budget check is performed on the assembled prompt.
    question_list = []
    for passage in passages:
        completions = []
        for instruction in question_prompts:
            prompt = "%s \n %s" % (instruction, passage)
            response = response_API(prompt)
            completions.append(response)
            print(response)
        question_list.append(completions)

    return question_list

In [11]:
def get_answer(question, data):
    """Answer `question` from the passage in `data` that best matches it.

    The best-matching passage is found with `search_index`, placed in a prompt
    together with the question, and sent through `response_API`. The answer is
    printed and returned.
    """
    best_match = search_index(question, data)  # passage where the answer is most likely to be
    template = "PASSAGE - %s \n QUESTION - %s \nAnswer this question in 2-3 concise sentences based on the passage. Be objective in the answer given and explain in a few lines only.\n"
    answer = response_API(template % (best_match['content'], question))
    print(answer)
    return answer

In [12]:
def extract_answer(generated):
    """Extract the answer text from the last line of a generated completion.

    Takes the last line of `generated`, keeps what follows its final ':' (or
    the whole line when it has no colon), then strips one leading space and
    one trailing period.

    Args:
        generated: Completion text, possibly spanning several lines.

    Returns:
        The extracted answer; an empty string when nothing follows the colon.
    """
    last_line = generated.split('\n')[-1]
    # split() returns the whole line when it contains no colon.
    after_colon = last_line.split(':')[-1]

    # startswith/endswith are safe on an empty string, unlike [0] / [-1].
    if after_colon.startswith(' '):
        after_colon = after_colon[1:]
    if after_colon.endswith('.'):
        after_colon = after_colon[:-1]

    return after_colon

In [13]:
def extract_question(generated):
    """Extract the follow-up question from the last line of a completion.

    Takes the last line of `generated`, keeps what follows its final ':' (or
    the whole line when it has no colon) and strips one leading space. A
    diagnostic is printed when the last line lacks 'Follow up:' or when the
    extracted text does not end with '?'.

    Args:
        generated: Completion text, possibly spanning several lines.

    Returns:
        The extracted question; an empty string when nothing follows the colon.
    """
    last_line = generated.split('\n')[-1]

    if 'Follow up:' not in last_line:
        print('we probably should never get here...' + generated)

    # split() returns the whole line when it contains no colon.
    after_colon = last_line.split(':')[-1]

    # startswith/endswith are safe on an empty string, unlike [0] / [-1].
    if after_colon.startswith(' '):
        after_colon = after_colon[1:]
    if not after_colon.endswith('?'):
        print('we probably should never get here...' + generated)

    return after_colon


In [14]:
def get_last_line(generated):
    """Return the text after the final newline of `generated`.

    When `generated` contains no newline the whole string is returned.
    """
    # rsplit with maxsplit=1 yields [text] when there is no newline to split on.
    return generated.rsplit('\n', 1)[-1]
  

In [19]:
# Collect the path of every entry found one level below the notes directory.
directory = '../../notes'
notes = []
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # Plain files at the top level are skipped; anything else is listed.
    if os.path.isfile(f):
        continue
    for fn in os.listdir(f):
        notes.append(f + '/' + fn)

In [20]:
# Display the collected note paths.
notes

['../../notes/part3/ece120-set-3-2-fsm-examples-part-i.tex',
 '../../notes/part3/ece120-set-3-6-memory.tex',
 '../../notes/part3/ece120-set-3-3-lab.tex',
 '../../notes/part3/ece120-set-3-5-fsm-examples-part-ii.tex',
 '../../notes/part3/ece120-set-3-7-fsm-to-computer.tex',
 '../../notes/part3/ece120-set-3-4-keyless-extension.tex',
 '../../notes/part3/ece120-set-3-1-serialize.tex',
 '../../notes/part3/ece120-set-3-8-summary.tex',
 '../../notes/part4/ece120-set-4-3-isa-design.tex',
 '../../notes/part4/ece120-set-4-1-control-unit.tex',
 '../../notes/part4/ece120-set-4-2-coding.tex',
 '../../notes/part4/ece120-set-4-4-summary.tex',
 '../../notes/part2/ece120-set-2-3-adder.tex',
 '../../notes/part2/ece120-set-2-8-summary.tex',
 '../../notes/part2/ece120-set-2-7-registers.tex',
 '../../notes/part2/ece120-set-2-2-dontcare.tex',
 '../../notes/part2/ece120-set-2-1-goodforms.tex',
 '../../notes/part2/ece120-set-2-6-sequential.tex',
 '../../notes/part2/ece120-set-2-5-abstraction.tex',
 '../../note

In [18]:
# Sections data ('../../split_textbook/sections.json').
# The file is opened in a `with` block so the handle is closed after parsing.
with open('../../split_textbook/sections.json') as s:
    sections_data = json.load(s)

# Full textbook embeddings - vectors.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
with open("/Users/nehasheth/Desktop/Research - AI Chatbot TA/github/data-generator/prompt engineering/embeddings/index.json") as input_file:
    data = json.load(input_file)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/nehasheth/Desktop/Research - AI Chatbot TA/github/data-generator/gpt-3/GPT-3_section_level.json'

In [None]:
# Keep only the passage text of every record in the sections JSON.
sections_list = [item['positive_ctxs']['text'] for item in sections_data]

In [None]:
# Load the semantic-search generations; the `with` block closes the file
# handle once the JSON has been parsed.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
with open("/Users/nehasheth/Desktop/Research - AI Chatbot TA/github/data-generator/prompt engineering/self-ask /GPT-3_semantic_search.json") as s:
    semantic_search_data = json.load(s)

In [None]:
# Pull the generated question out of every semantic-search record.
questions = [
    record['GPT-3-Semantic-Search-Generations']['question']
    for record in semantic_search_data
]

Self-ask prompting

In [None]:
# Few-shot self-ask prompt, split in two pieces:
#   prompt[0] - three worked examples, ending with "Question: " so the new
#               question can be appended directly after it;
#   prompt[1] - the suffix appended after the new question.
prompt = ['''Question: What is the output of a Gray code counter?
Are follow up questions needed here: Yes.
Follow up: What is the output of a three-bit Gray code counter?
Intermediate answer: The output of a three-bit Gray code counter is a sequence of three-bit values that differ by only one bit.
Follow up: What is the output of a two-bit Gray code counter?
Intermediate answer: The output of a two-bit Gray code counter is a sequence of two-bit values that differ by only one bit.
So the final answer is: a sequence of values that differ by only one bit.

Question: What is the design process for a digital FSM?
Are follow up questions needed here: Yes.
Follow up: What are the steps in the design process?
Intermediate answer: The steps in the design process are: develop an abstract model, specify I/O behavior, complete the specification, choose a state representation, calculate logic expressions, and implement with flip-flops and gates.
So the final answer is: develop an abstract model, specify I/O behavior, complete the specification, choose a state representation, calculate logic expressions, and implement with flip-flops and gates.

Question: How many transistors does an N-input gate require?
Are follow up questions needed here: Yes.
Follow up: How many inputs does a 2-input gate have?
Intermediate answer: A 2-input gate has 2 inputs.
Follow up: How many inputs does a 10-input gate have?
Intermediate answer: A 10-input gate has 10 inputs.
So the final answer is: a 2-input gate requires roughly 2 transistors, and a 10-input gate requires roughly 10 transistors.

Question: ''', 
'''
Are follow up questions needed here:''', ]



In [None]:
# Marker strings of the self-ask format. They are used below both as stop
# sequences for the completion calls and to detect which step was generated.
intermediate = "\nIntermediate answer:"
followup = "Follow up:"
finalans= '\nSo the final answer is:'

In [None]:
# Build the initial prompt: few-shot examples + one question + suffix.
# NOTE(review): index 10 is a hard-coded choice of which question to run.
cur_prompt = prompt[0] +  questions[10] + prompt[1]

print(cur_prompt, end ='')

# First completion; generation stops as soon as the model starts writing an
# intermediate answer.
ret_text = response_API(cur_prompt, myKwargs = {"stop" : intermediate})

In [None]:
# Self-ask loop: while the model's latest output ends with a follow-up
# question, answer that question from the textbook index, append the answer as
# the intermediate answer, and ask the model for the next step.
while followup in get_last_line(ret_text):
    cur_prompt += ret_text
    question = extract_question(ret_text)
    external_answer = get_answer(question, data)

    if external_answer is not None:
        cur_prompt += intermediate + ' ' + str(external_answer) + '.'
        print(intermediate + ' ' + str(external_answer) + '.', end='')
    else:
        # No retrieved answer: let the model write the intermediate answer
        # itself, stopping before it starts the next follow-up.
        cur_prompt += intermediate
        print(intermediate + ' ')
        gpt_answer = response_API(cur_prompt, myKwargs={"stop": '\n' + followup})
        cur_prompt += gpt_answer

    # Request the next step in both branches. Without refreshing ret_text in
    # the fallback branch, the loop would re-process the same follow-up forever.
    ret_text = response_API(cur_prompt, myKwargs={"stop": intermediate})

# If the model has not produced the final-answer line yet, prompt for it.
if finalans not in ret_text:
    cur_prompt += finalans
    print(finalans, end='')
    ret_text = response_API(cur_prompt, myKwargs={"stop": '\n'})

# Print the full transcript in every case, including when the model emitted
# the final answer on its own.
print(cur_prompt + ret_text)