In [1]:
import openai
import time
import json
import backoff
from openai.error import RateLimitError
import datetime
import pickle
import numpy as np
from numpy.linalg import norm
import re
import os
from PyPDF2 import PdfReader

In [2]:
# Take the OpenAI API key from the environment so it is never hard-coded in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [3]:
# Full-textbook embedding index (vectors), stored as JSON.
# Each entry is read later through its 'content' and 'vector' keys.
INDEX_PATH = "/home/nehasheth/chatbotai/neha/data-generator/QA_generation_from_textbook/prompt_engineering/reinforcement_learning/index_pateltextbook.json"
with open(INDEX_PATH) as index_file:
    index_data = json.load(index_file)

In [4]:
def gpt3_embedding(content, model='text-embedding-ada-002'):
    """Return the embedding vector for ``content`` from the OpenAI Embedding API.

    Args:
        content: Text to embed; passed as ``input`` to ``openai.Embedding.create``.
        model: Embedding model name, forwarded as the ``engine`` argument.

    Returns:
        The ``embedding`` field of the first item in the response's ``data`` list.

    Raises:
        openai.error.APIConnectionError: If the API cannot be reached. The
            failure is reported and then re-raised, because there is no
            response to read an embedding from.
    """
    try:
        response = openai.Embedding.create(input=content, engine=model)
    except openai.error.APIConnectionError:
        # Previously execution fell through to the return statement with
        # `response` unbound, masking the real cause behind an UnboundLocalError.
        print("Failed")
        raise
    return response['data'][0]['embedding']

In [5]:
#compute cosine similarity
def get_similarity(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Args:
        v1: First vector (any sequence accepted by ``np.dot`` / ``norm``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero norm.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        # A zero vector has no direction. Dividing would give nan (plus a
        # RuntimeWarning), and nan scores make the ranking sort unreliable.
        return 0.0
    return np.dot(v1, v2) / denominator

In [6]:
#searching through textbook
def search_index(query, index_data, count=3):
    """Rank the index entries by similarity to ``query`` and return the best ones.

    Args:
        query: Question text; it is embedded with ``gpt3_embedding``.
        index_data: Iterable of dicts, each with a ``'content'`` and a
            ``'vector'`` key.
        count: Maximum number of results to return. The default of 3 matches
            what this function returned before ``count`` was honoured (the
            argument used to be ignored and the slice was hard-coded to 3).

    Returns:
        A list of at most ``count`` dicts ``{'content': ..., 'score': ...}``,
        ordered from highest to lowest score.
    """
    question_vector = gpt3_embedding(query)
    scores = [
        {'content': entry['content'],
         'score': get_similarity(question_vector, entry['vector'])}
        for entry in index_data
    ]
    most_relevant = sorted(scores, key=lambda d: d['score'], reverse=True)
    # max() guards against a negative count silently slicing from the end.
    return most_relevant[:max(count, 0)]

In [7]:
def response_API(prompt, myKwargs=None):
    """Send ``prompt`` to the chat-completion API and return the reply text.

    Args:
        prompt: Text sent as the user message.
        myKwargs: Optional dict of request arguments that override the
            defaults below (for example ``{"temperature": 0}``).

    Returns:
        The ``content`` of the first choice's message in the API response.
    """
    # Default arguments sent to the API unless overridden through myKwargs.
    kwargs = {
        "model": "gpt-3.5-turbo",
        "temperature": 0.6,
        "max_tokens": 500,
        "frequency_penalty": 1,
        "presence_penalty": 0,
        "messages": [
            {"role": "system", "content": "You are a teaching assistant answering student questions in a concise way, the students are college freshmen."},
            {"role": "user", "content": prompt},
        ],
    }
    if myKwargs:
        kwargs.update(myKwargs)

    # The merged arguments used to be built and then discarded: the request
    # was sent with only a hard-coded model and the messages, so neither the
    # defaults above nor any caller override had an effect.
    r = openai.ChatCompletion.create(**kwargs)
    return r['choices'][0]['message']['content']

In [8]:
def get_answers(fin_question):
    """Generate candidate answers for one question.

    The question is looked up in the module-level ``index_data``. One answer
    is generated for each of the two best-matching contexts, followed by one
    answer generated without any context.

    Args:
        fin_question: The question text.

    Returns:
        A list of answer strings: the context-grounded answers first (two,
        or fewer if the search returns fewer results), then the context-free
        answer. With a normal index this is 1 question -> 3 answers.
    """
    results = search_index(fin_question, index_data) #get top 3 relevant contexts.

    context_template = "Context : %s %s Answer this question based on the above context. The answer should have a university freshmen-level language and be very concise and to-the-point. Answer: "

    fin_answers = []
    # Only the two best contexts are used. Interpolate the passage text
    # itself: formatting the whole result dict would leak its Python repr
    # (including the similarity score) into the prompt.
    for result in results[:2]:
        context_prompt = context_template % (result['content'], fin_question)
        fin_answers.append(response_API(context_prompt))

    #gpt answer
    prompt3 = "Answer this question. The answer should have a university freshmen-level language and be very concise and to-the-point. Question:%s Answer: " % fin_question
    fin_answers.append(response_API(prompt3))

    return fin_answers #returns a list. 1Q - 3A

Embedding the Patel textbook

In [9]:
# Alternative input kept for reference: PdfReader("../raw_data/notes/Student_Notes.pdf")
pdf_path = "/home/nehasheth/chatbotai/neha/non-public-datasets/raw_data/patel_textbook/Yale Patt - Introduction to Computing Systems_ From Bits & Gates to C & Beyond.pdf"
reader = PdfReader(pdf_path)
print("Total pages: ", len(reader.pages))

# Build one record per page that yields text; line breaks become spaces.
textbook = []
for page_index, pdf_page in enumerate(reader.pages):
    page_text = pdf_page.extract_text().replace("\n", " ")
    if not page_text:
        # Nothing extracted from this page - leave it out.
        continue
    textbook.append({
        "text": page_text,
        "page_number": page_index,  # 0-based position in reader.pages
        "textbook_name": "Yale-Patt_Sanjay-Patel--Intro_to_Computing_Systems",
    })

Total pages:  801


In [27]:
# Join the text of records 25..280 of `textbook` into one string.
# These are list positions: `textbook` omits pages with no text, so they
# need not equal PDF page numbers.
full_textbook = "".join(record["text"] for record in textbook[25:281])

In [29]:
# Write the concatenated textbook text to disk.
# The context manager closes the file even if the write fails, and the
# explicit UTF-8 encoding keeps non-ASCII characters from the PDF from
# depending on the platform's default encoding.
with open("patel_textbook_chap7.txt", "wt", encoding="utf-8") as text_file:
    text_file.write(full_textbook)

Generating answers

In [9]:
#get questions
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

q1 = _load_pickle('questions_set1.pkl')
q2 = _load_pickle('questions_set2.pkl')

In [10]:
# Drop the leading "N. " numbering from each question in set 1.
# Indexing with [-1] instead of [1] keeps a question unchanged when it has
# no ". " separator; [1] raised IndexError in that case.
# NOTE: not idempotent - re-running this cell strips up to the next ". "
# inside a question, so run it once per fresh load of q1.
q1 = [q.split('. ', 1)[-1] for q in q1] #remove all numbering

In [11]:
# Concatenate the two question sets, set 1 first.
questions = q1 + q2

In [None]:
# Show the combined questions as the cell output for a quick visual check.
questions

In [None]:
#formatting the data
# Each question becomes one record of the form
# {'GPT-3-RLHF-Generations': {'question': ..., 'answers': [...]}}.
qa_data = []
answer_list = []  # not used in this cell
for idx, question in enumerate(questions):
    # Progress output: index and question before the API calls, "done" after.
    print(idx)
    print(question)
    record = {
        'GPT-3-RLHF-Generations': {
            'question': question,
            'answers': get_answers(question),
        }
    }
    print("done")
    qa_data.append(record)

In [19]:
# Save the generated question/answer records as pretty-printed UTF-8 JSON
# (ensure_ascii=False keeps non-ASCII characters readable in the file).
with open('RLHF_Keywords_Set1.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(qa_data, ensure_ascii=False, indent=4))

In [20]:
# Show the kernel's current working directory; the relative file names used
# in this notebook resolve against it.
os.getcwd()

'/mnt/project/chatbotai/neha/data-generator/QA_generation_from_textbook/prompt_engineering/reinforcement_learning'