In [2]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
import openai
import os
import time
import sys
import pprint 
import chardet
pp = pprint.PrettyPrinter(indent=4)
from uuid import uuid4
from sentence_transformers import SentenceTransformer
import json

try:
    from dotenv import load_dotenv
except:
    pass

load_dotenv()


  from tqdm.autonotebook import tqdm


True

In [3]:
# Access the environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV = os.getenv('PINECONE_ENV')

print(OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_ENV)

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 7d0de2c4-fd69-47f0-9d6d-10355431d98f asia-southeast1-gcp-free


In [4]:
def tiktoken_len(text):
    tokenizer = tiktoken.get_encoding('cl100k_base')
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [6]:
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY)



In [7]:
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)


In [8]:
def getIds(texts):
    ids = [str(uuid4()) for _ in range(len(texts))]
    return ids

In [9]:
def get_embeddings(texts, open_source=True):
    model = SentenceTransformer('all-MiniLM-L6-v2')

    if open_source:
        return model.encode(texts)

    return embed.embed_documents(texts)

In [10]:
def get_query_embeddings(texts, open_source=True):
    model = SentenceTransformer('all-MiniLM-L6-v2')

    if open_source:
        return [float(x) for x in model.encode(texts)]

    return embed.embed_documents(texts)

In [11]:
def upload_vectors(ids, embeddings, metadataList, index):
    vectors = zip(ids, embeddings, metadataList)

    index.upsert(vectors=vectors)

    

In [None]:

def upsert_vecs(text, doc_name, cluster_name, index_name):
    completed = 0
    batch_limit = 100

    texts = []
    metadatas = []

    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    print("Initialized Pinecone Database...")

    if index_name not in pinecone.list_indexes():
        pinecone.create_index(
            name=index_name,
            metric='dotproduct',
            dimension=1536  # 1536 dim of text-embedding-ada-002
        )
    index = pinecone.Index(index_name)

    metadata = {
        "doc_name": doc_name,
        "cluster_name": cluster_name
    }

    record_texts = text_splitter.split_text(text)
    record_metadatas = []

    for j, text in enumerate(record_texts):
        record_metadatas = [{"chunk": j, "text": text, **metadata}]

        texts.append(text)
        # print(len(texts))
        metadatas.append(record_metadatas)
        # print(len(metadatas))
        # pp.pprint(metadatas)

        # print(j, len(texts), text[:50])

        if len(texts) >= batch_limit:
            print("HERE! UPSERT")
            
            ids = [str(uuid4()) for _ in range(len(texts))]
            print(ids)

            print("Getting Embeds")
            embeds = embed.embed_documents(texts)
            print(embeds)
            print("Got Embeds")
            embeds = []
            print("Upserting...")

            Vectors = []

            print(len(ids), len(embeds), len(metadatas))
            input("Upsert? ")
            for index, i in enumerate(ids):
                Vectors.append((i, embeds[index], metadatas[index]))
            
            pp.pprint(Vectors)


            index.upsert(vectors=Vectors)
            print("Upserted", j)
            texts = []
            metadatas = []
        
        # print(f"{j}/{len(record_texts)} completed....")

    if len(texts) > 0:
        print("1 Upserting...")
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        print("1 Upserted!")


In [12]:
def split_texts(text):
    record_texts = text_splitter.split_text(text)

    return record_texts

In [13]:
def create_gpt_question(query, Responses):

    FinalMessage = f"Query: '{query}'\n\nContexts:\n\n\n"

    for resp in Responses:
        FinalMessage += f"Chunk: {resp['Source']}\n"
        FinalMessage += f"Content: {resp['Text']}\n\n"
        FinalMessage += f"---\n\n\n"

    
    return FinalMessage

In [11]:
def upsert_func(text, doc_name, cluster_name, index_name):
    batch_limit = 100

    batch_texts, batch_metadas = [], []

    if index_name not in pinecone.list_indexes():
        pinecone.create_index(
            name=index_name,
            metric='dotproduct',
            dimension=1536  # 1536 dim of text-embedding-ada-002
        )
    index = pinecone.Index(index_name)

    doc_metadata = {
        "doc_name": doc_name,
        "cluster_name": cluster_name
    }

    all_texts = split_texts(text)
    record_metadatas = []



In [14]:

def query_index(query, cluster_name, index_name, filter=None, openSource=True):
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

    
    text_field = "text"
    index = pinecone.Index(index_name)

    Responses = []


    
    if openSource:

        response = index.query(
        top_k=3,
        include_values=False,
        include_metadata=True,
        vector=get_query_embeddings(query),
        filter={}
    )
        
        for match in (response["matches"]):
            source = (match["metadata"]["chunk"])
            doc_name = (match["metadata"]["doc_name"])
            text = (match["metadata"]["text"])

            Responses.append({"Source" : source, "Doc": doc_name, "Text": text})

        return create_gpt_question(query, Responses)
          
    else:
        vectorstore = Pinecone.from_existing_index(index_name, embed)
        
        response = vectorstore.similarity_search(
            query,  # our search query
        )
        for res in response:
            print(f"Chunk: {res.metadata['chunk']}")
            print(f"Content:\n{res.page_content}")
        
    print("_"*200)

# question = "Who is Percy?"
# print(query_index(question, "Trials", "open-source-index"))

In [None]:
question = "Who is Percy?"
query_index(question, "Trials", "test-index")

In [None]:

input_file1 = 'Book 3 - The Prisoner of Azkaban.txt' 
input_file = 'textDocs/' + input_file1


with open(input_file, "rb") as f:
    encoding = chardet.detect(f.read())["encoding"]
with open(input_file, "r", encoding=encoding) as f:
    txt = f.read()
    upsert_vecs(txt, input_file1, "Trials", "test-index")

In [60]:

input_file1 = 'Book 3 - The Prisoner of Azkaban.txt' 
input_file = 'textDocs/' + input_file1
# input_file = "largeText.txt"

with open(input_file, "rb") as f:
    encoding = chardet.detect(f.read())["encoding"]
with open(input_file, "r", encoding=encoding) as f:
    text = f.read()



In [61]:
batch_limit = 100
batch_texts, batch_metadatas = [], []

In [62]:
doc_name, cluster_name, index_name = input_file1, "Trials", "open-source-index"

if index_name not in pinecone.list_indexes():
        pinecone.create_index(
            name=index_name,
            metric='dotproduct',
            dimension=1536  # 1536 dim of text-embedding-ada-002
        )
index = pinecone.Index(index_name)

doc_metadata = {
        "doc_name": doc_name,
        "cluster_name": cluster_name
    }

In [63]:
all_texts = split_texts(text)
record_metadatas = []

In [None]:
for i, t in enumerate(all_texts):
    print(i, t[:200])

In [71]:
BATCHES = []
SingleBatch = []
Batch_Embedding = []

for j, text in enumerate(all_texts):
    print(j, end = ", ")

    # print(all_texts)

    chunk_metadata = {"chunk": j, "text": text, **doc_metadata}

    SingleBatch.append({
        "id" : str(uuid4()),
        "metadata": chunk_metadata
    })

    batch_texts.append(text)
    batch_metadatas.append(chunk_metadata)

    if len(batch_texts) >= batch_limit:
        print("\nLimit\n")
        embeddings = get_embeddings(batch_texts)

        for data, em in zip(SingleBatch, embeddings):
            EmList = []
            for em in list(em):
                EmList.append(float(em))
            data["values"] = list(EmList)

        BATCHES.append(SingleBatch)

        SingleBatch = []
        batch_texts, batch_metadatas = [], []
    
if len(batch_texts):
    print("\nLimit Last\n")

    embeddings = get_embeddings(batch_texts)

    for data, em in zip(SingleBatch, embeddings):
        EmList = []
        for em in list(em):
            EmList.append(float(em))
        data["values"] = list(EmList)

    BATCHES.append(SingleBatch)

    SingleBatch = []
    batch_texts, batch_metadatas = [], []


    


0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 
Limit

100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 
Limit

200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,

In [72]:
for batch in BATCHES:
    print(batch)
    index.upsert(vectors=batch)



In [None]:
question = "What is the relation between Sirius and Harry?"
query_index(question, "Trials", "open-source-index")

In [15]:

SYSTEM_MESSAGE = '''
REPLY AS A PYTHON DICTIONARY! 
You are a chat assistant that will create answers based on the information provided. 

The message will contain the Query, followed by the context. This will state the "chunk_num" followed by the chunk content. You will only use the information provided in these chunks/messages to create your response. You do not need to use all of the information provided, just what you think is relevant to the query. Do not add unnecessary details. You can also use markdown formatting if required for a better layout.
Your response should be in the form of a Python dictionary, with one key being "reply" with your response, and the other being "source" which will have a list as the value. This list should contain all the chunk numbers from which you used information/facts to formulate your answer. 
The response should look like this:
{"Reply": "Your answer here", "Source": [0.0, 1.0]} (This is just an example)
The user should be able to read that chunk to check the response.

If the context/information provided by the user does not have enough information related to the Query, you will respond stating that.

REPLY AS A PYTHON DICTIONARY!  USE MARKDOWN IF REQUIRED!


'''

def ask_gpt(UserMessage, messages=None):


    messages = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {"role": "user", "content": UserMessage}
    ]
    
    completion = openai.ChatCompletion.create(
        model = "gpt-3.5-turbo",
        messages = messages)
    response_text = completion["choices"][0]["message"]["content"]

    


    try:
        response_dict = json.loads(response_text)
        Reply = response_dict.get('Reply', None)
        Source = response_dict.get('Source', None)
    except json.JSONDecodeError:
        Reply = response_text
        Source = None

    return Reply, Source


In [18]:
def get_answer(query):
    user_message = query_index(query, "Trials", "open-source-index")
    proper_response = ask_gpt(user_message)

    return proper_response

In [20]:
query = "Who is Sirius Balck? How is he related to Harry Potter"

print(get_answer(query))


("Sirius Black is a character from Harry Potter and is related to him through his godfather. Sirius was a friend of Harry's father James Potter and they created the Marauder's Map together at Hogwarts. Sirius was also an unregistered animagus and was able to turn into a dog. He was accused of being a Death Eater and was sent to Azkaban but escaped. Harry believes he is innocent and is determined to clear his name. In the third book, Harry and his friends try to prevent Sirius from being recaptured by the dementors and prove his innocence. ", [400.0, 443.0, 439.0])
