In [1]:
!pip install pypdf2 pinecone-client scikit-learn tiktoken numpy openai




[notice] A new release of pip is available: 23.0 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pinecone
import tiktoken
import openai
import os
import json
from numpy import array, average
from PyPDF2 import PdfReader

  from tqdm.autonotebook import tqdm


In [13]:
# Model name passed to get_embedding() / get_embeddings().
EMBEDDINGS_MODEL = "text-embedding-ada-002"
# Model name passed to openai.ChatCompletion.create() in answer() and generate().
GENERATIVE_MODEL = "gpt-3.5-turbo"
# Not referenced by the code visible in this notebook; it equals the 'dimension'
# reported by describe_index_stats() for the index used below.
EMBEDDING_DIMENSION = 1536
# Target chunk size, in tokens, handed to chunks(); an individual chunk may end
# anywhere between 0.5x and 1.5x this value when a sentence boundary is found.
TEXT_EMBEDDING_CHUNK_SIZE = 200
# Query matches after the first one are dropped once their score falls below this.
# NOTE(review): the name implies cosine similarity -- confirm the index metric.
COSINE_SIM_THRESHOLD = 0.7
# Maximum number of chunk texts sent in a single get_embeddings() call.
MAX_TEXTS_TO_EMBED_BATCH_SIZE = 100
# Maximum number of vectors sent in a single pinecone_index.upsert() call.
# NOTE(review): "PATCH" in the name is presumably a typo for "BATCH".
MAX_PINECONE_VECTORS_TO_UPSERT_PATCH_SIZE = 100
# Number of nearest neighbours requested from Pinecone per query (top_k).
TOP_K = 5

# os.getenv returns None when OPENAI_API_KEY is not set; no check is done here.
openai.api_key = os.getenv('OPENAI_API_KEY')

In [20]:
# Configure the Pinecone client from the PINECONE_API_KEY environment variable
# (os.getenv yields None when it is unset) and open the 'studyhub' index.
pinecone.init(api_key=os.getenv('PINECONE_API_KEY'), environment='gcp-starter')
pinecone_index = pinecone.Index('studyhub')
# Show the index statistics before anything is upserted by this notebook.
print(pinecone_index.describe_index_stats())

# Tokenizer used by chunks() to split text and by create_embeddings_for_text()
# to turn token chunks back into strings.
# NOTE(review): this is the "gpt2" encoding -- confirm it is the intended
# encoding for EMBEDDINGS_MODEL, since chunk sizes are measured with it.
tokenizer = tiktoken.get_encoding("gpt2")
# In-memory mapping filled by read_file() (topic -> full extracted text) and
# handle_file() ('<topic>/<chunk index>' -> chunk text); read by answer()/generate().
file_text_dict = {}

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [21]:
def chunks(text, n):
    """
    Yield successive token chunks of roughly n tokens from text,
    preferably ending at the end of a sentence.

    The text is encoded with the module-level ``tokenizer``. For each chunk the
    end position is searched backwards from 1.5 * n tokens down to (but not
    including) 0.5 * n tokens, stopping at the first position whose decoded
    text ends with "." or a newline. If no such position exists, the chunk is
    exactly n tokens long (or whatever is left at the end of the text).

    Args:
        text: The string to split.
        n: Target chunk size in tokens; must be at least 1.

    Yields:
        Slices of the encoded token sequence, in order, covering every token once.

    Raises:
        ValueError: If n < 1. Because this is a generator, the error surfaces
            when the generator is first iterated.
    """
    if n < 1:
        # With n <= 0 the window never advances, so the loop below would
        # yield empty chunks forever.
        raise ValueError(f"chunk size n must be >= 1, got {n!r}")

    tokens = tokenizer.encode(text)
    # Window bounds are loop-invariant, so compute them once.
    min_len = int(0.5 * n)
    max_len = int(1.5 * n)

    i = 0
    while i < len(tokens):
        # Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
        j = min(i + max_len, len(tokens))
        while j > i + min_len:
            # Decode the tokens and check for full stop or newline
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith((".", "\n")):
                break
            j -= 1
        # If no end of sentence found, use n tokens as the chunk size
        if j == i + min_len:
            j = min(i + n, len(tokens))
        yield tokens[i:j]
        i = j


def get_col_average_from_list_of_lists(list_of_lists):
    """
    Return the column-wise mean of a list of equally long lists.

    A single-row input is handed back as-is (the very same inner list object);
    otherwise the rows are stacked into an array, averaged along axis 0 and
    converted back to a plain Python list.
    """
    if len(list_of_lists) != 1:
        stacked_rows = array(list_of_lists)
        return average(stacked_rows, axis=0).tolist()
    return list_of_lists[0]


def create_embeddings_for_text(text):
    """
    Split a text into token chunks and embed every chunk.

    The text is cut with chunks() using TEXT_EMBEDDING_CHUNK_SIZE, each token
    chunk is decoded back to a string, and the strings are embedded through
    get_embeddings() in batches of at most MAX_TEXTS_TO_EMBED_BATCH_SIZE.

    Returns a pair: a list of (chunk_text, embedding) tuples in chunk order,
    and the column-wise average of all chunk embeddings.
    """
    text_chunks = [
        tokenizer.decode(token_chunk)
        for token_chunk in list(chunks(text, TEXT_EMBEDDING_CHUNK_SIZE))
    ]

    # One embeddings request per batch; results are collected in request order
    embeddings = []
    for start in range(0, len(text_chunks), MAX_TEXTS_TO_EMBED_BATCH_SIZE):
        batch = text_chunks[start:start + MAX_TEXTS_TO_EMBED_BATCH_SIZE]
        batch_response = get_embeddings(batch, EMBEDDINGS_MODEL)
        embeddings += [item["embedding"] for item in batch_response]

    chunk_embedding_pairs = list(zip(text_chunks, embeddings))
    mean_embedding = get_col_average_from_list_of_lists(embeddings)
    return chunk_embedding_pairs, mean_embedding

def read_file(filename):
    """
    Read a .pdf or .txt file from the 'data' directory and return its text
    prefixed with the topic name.

    The raw extracted text is stored in file_text_dict under the topic name
    (the filename without its 4-character extension). The returned string has
    the '\\uf0b7' bullet glyph removed and every run of whitespace collapsed to
    a single space, in the form 'Topic is <topic>; <text>'.

    Args:
        filename: Name of a file inside 'data', ending in '.pdf' or '.txt'.

    Returns:
        The cleaned text, prefixed with 'Topic is <topic>; '.

    Raises:
        ValueError: If the filename has neither a '.pdf' nor a '.txt' suffix.
    """
    path = os.path.join('data', filename)
    if filename.endswith('.pdf'):
        reader = PdfReader(path)
        # `or ""` keeps the join safe should a page yield no text at all
        extracted_text = "".join(page.extract_text() or "" for page in reader.pages)
    elif filename.endswith('.txt'):
        with open(path, 'r') as fp:
            extracted_text = fp.read()
    else:
        # Previously this fell through and failed with an UnboundLocalError
        raise ValueError(f"Unsupported file type: {filename!r} (expected .pdf or .txt)")

    # Both supported extensions are 4 characters long, so this strips them
    topic = filename[:-4]
    file_text_dict[topic] = extracted_text

    # split()/join collapses *every* whitespace run; a single replace("  ", " ")
    # pass would leave double spaces behind wherever three or more were adjacent
    clean_text = " ".join(extracted_text.replace('\uf0b7', ' ').split())
    return f'Topic is {topic}; {clean_text}'


def get_embedding(text, engine):
    """
    Return the embedding vector for a single text.

    Uses openai.Embedding.create, the supported replacement for the deprecated
    openai.Engine(...).embeddings call in the pre-1.0 SDK, and consistent with
    the openai.ChatCompletion.create calls elsewhere in this notebook.

    Args:
        text: The string to embed.
        engine: Name of the embeddings model.

    Returns:
        The "embedding" field of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(input=[text], model=engine)
    return response["data"][0]["embedding"]

def get_embeddings(text_array, engine):
    """
    Return the embeddings response items for a batch of texts.

    Uses openai.Embedding.create, the supported replacement for the deprecated
    openai.Engine(...).embeddings call in the pre-1.0 SDK.

    Args:
        text_array: List of strings to embed in one request.
        engine: Name of the embeddings model.

    Returns:
        The response's "data" list; each item carries an "embedding" entry.
    """
    response = openai.Embedding.create(input=text_array, model=engine)
    return response["data"]


In [22]:
def handle_file(filename):
    """
    Embed one file from the 'data' directory and upsert its chunks to Pinecone.

    Each chunk gets the id '<topic>/<chunk index>', where the topic is the
    filename without its 4-character extension. The chunk text is recorded in
    file_text_dict under that id, and the vectors are upserted in batches of
    at most MAX_PINECONE_VECTORS_TO_UPSERT_PATCH_SIZE.
    """
    topic = filename[:-4]
    # The average embedding returned alongside the chunks is not used here
    chunk_embeddings, _ = create_embeddings_for_text(read_file(filename))

    vectors = []
    for chunk_index, (chunk_text, embedding) in enumerate(chunk_embeddings):
        vector_id = f'{topic}/{chunk_index}'
        file_text_dict[vector_id] = chunk_text
        metadata = {"topic": topic, "topic_chunk_index": chunk_index}
        vectors.append((vector_id, embedding, metadata))

    step = MAX_PINECONE_VECTORS_TO_UPSERT_PATCH_SIZE
    for start in range(0, len(vectors), step):
        pinecone_index.upsert(vectors=vectors[start:start + step])


In [23]:
# Embed and upsert every PDF or text file found directly inside 'data';
# anything with a different extension is ignored.
for filename in os.listdir('data'):
    if filename.endswith(('.pdf', '.txt')):
        print(f'Processing file: {filename}')
        handle_file(filename)

Processing file: Domestic-Electricity.pdf
Processing file: Electrical-Components.pdf
Processing file: Electromagnetic-Induction.pdf
Processing file: Electromagnetism.pdf
Processing file: Electrostatics.pdf
Processing file: Magnetism.pdf
Processing file: Sound.pdf
Processing file: Waves.pdf


In [24]:
# Persist the id -> text mapping collected in file_text_dict as JSON.
with open('file-text-mapping.json', 'w+') as mapping_file:
    mapping_file.write(json.dumps(file_text_dict))

In [25]:
# Inspect the index statistics again, now that the files above have been upserted.
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00075,
 'namespaces': {'': {'vector_count': 75}},
 'total_vector_count': 75}

In [26]:
def answer(question):
    """
    Answer a student's question, using retrieved textbook extracts as context.

    The question is embedded, the TOP_K nearest chunks are fetched from
    Pinecone, and their texts (looked up in file_text_dict) are appended to the
    question before it is sent to the chat model.

    Args:
        question: The student's question as a string.

    Returns:
        The model's reply with surrounding whitespace stripped. If the Pinecone
        query or the chat completion raises, the exception message is returned
        as the string instead. Errors from embedding the question are not
        caught and propagate to the caller.
    """
    search_query_embedding = get_embedding(question, EMBEDDINGS_MODEL)

    try:
        query_response = pinecone_index.query(
            top_k=TOP_K,
            include_values=False,
            include_metadata=True,
            vector=search_query_embedding,
        )

        files_string = "Extract:\n"

        for i, result in enumerate(query_response.matches):
            # The first match is always considered; every later match must
            # clear the threshold. Stopping at the first miss presumes the
            # matches arrive best-first -- TODO confirm.
            if result.score < COSINE_SIM_THRESHOLD and i > 0:
                break

            file_text = file_text_dict.get(result.id)
            if file_text is None:
                # No local text for this vector id (e.g. file_text_dict was not
                # populated in this session); skip it rather than sending
                # "Content: None" to the model.
                continue

            topic = result.metadata["topic"]
            files_string += f"\nTopic: {topic}\nContent: {file_text}\n"

        messages = [
            {
                "role": "system",
                "content": """You are an intelligent teaching assistant whose goal is to answer and explain queries from the student.

Along with the student's question, you will be given extracts from the textbook (showing both topic and contents) to help you better assist the student. First, check if the student's question is related to the subject at hand (Physics). If not, reply "This is not a valid question.".

You will then go through the extracts to find answers to the student's question. If it is not found, use your own knowledge on the topic to give a reliable and accurate answer to the student. Make references to the textbook in your answer if possible."""
            },
            {
                "role": "user",
                "content": f"Question: {question}\n{files_string}"
            }
        ]

        response = openai.ChatCompletion.create(
            messages=messages,
            model=GENERATIVE_MODEL,
            max_tokens=1000,
            temperature=0.5,
        )

        return response.choices[0].message.content.strip()

    except Exception as e:
        return str(e)

In [43]:
def generate(topic):
    """
    Generate a test question and answer for a topic from textbook extracts.

    The topic name is embedded and used to query Pinecone for the TOP_K nearest
    chunks whose 'topic' metadata equals the given topic. Their texts (looked
    up in file_text_dict) are printed and sent to the chat model together with
    the topic.

    Args:
        topic: Topic name; it must match the 'topic' metadata stored with the
            vectors for any extract to be returned.

    Returns:
        The model's reply with surrounding whitespace stripped. The prompt asks
        for JSON of the form {"question": ..., "answer": ...}, but the reply is
        returned as a raw string and is not parsed or validated here.
    """
    search_query_embedding = get_embedding(topic, EMBEDDINGS_MODEL)

    query_response = pinecone_index.query(
        top_k=TOP_K,
        include_values=False,
        include_metadata=True,
        vector=search_query_embedding,
        filter={'topic': topic}
    )

    files_string = "Extracts:\n"

    for i, result in enumerate(query_response.matches):
        # The first match is always considered; every later match must clear
        # the threshold. Stopping at the first miss presumes the matches
        # arrive best-first -- TODO confirm.
        if result.score < COSINE_SIM_THRESHOLD and i > 0:
            break

        file_text = file_text_dict.get(result.id)
        if file_text is None:
            # No local text for this vector id; skip it rather than sending
            # "Content: None" to the model.
            continue

        # Kept in its own name so the `topic` argument is not overwritten
        # before it is used in the user message below.
        match_topic = result.metadata["topic"]
        files_string += f"\nTopic: {match_topic}\nContent: {file_text}\n"

    print(files_string)

    messages = [
        {
            "role": "system",
            "content": """You are a knowledgeable educator preparing a test for your students based on a given topic.

You will be given extracts from the textbook to help you better craft a question.
You will then go through the extracts and create a suitable question to test the student, and you have to make sure to create the answer as well.
Vary the questions you craft and make them difficult.

Format your response in JSON format: {"question": question, "answer": answer}"""
        },
        {
            "role": "user",
            "content": f"Topic: {topic}\n{files_string}"
        }
    ]

    response = openai.ChatCompletion.create(
        messages=messages,
        model=GENERATIVE_MODEL,
        max_tokens=1000,
        temperature=1.0,
    )

    return response.choices[0].message.content.strip()

In [44]:
# Example run: generate one question/answer pair for the 'Electrostatics' topic.
generate('Electrostatics')

Extracts:

Topic: Electrostatics
Content: Topic is Electrostatics; /Sec4/Phy sics/Electrostatics  Page 1 of 11 Hwa Chong Institution (High School) Name : _________________________ (   ) PHYSICS Notes Class : __________  Electrostatics Date : __________  Note: You should read your text book and compare with these notes.   A. Introduction Some objects (e.g. glass ro d or ebonite rod) acquire a new property of being able to attract small pieces of paper after they have been rubbed with another material (e.g. silk or fur, respectively). This phenomenon belongs to the branch of physics called electrostatics or static electricity . It involves the study of static electric charges. Before rubbing, these objects do not attract small pieces of paper. This implies that friction due to rubbing has changed the nature of the surfaces of the rods. We say that friction has caused the rods to be ‘electrified’ or ‘charged’.   B. Two types of charges  Only two kinds of charges exist: positive charge and

'{"question": "What is the purpose of an electrostatic precipitator?", "answer": "To remove flue-ash (a mixture of smoke and dust particles) from a modern coal-fired power station and prevent air pollution."}'