In [94]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
import openai
import os
import time
import sys
import pprint 
import chardet
pp = pprint.PrettyPrinter(indent=4)
from uuid import uuid4
from sentence_transformers import SentenceTransformer

try:
    from dotenv import load_dotenv
except:
    pass

load_dotenv()

True

In [2]:
# Access the environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV = os.getenv('PINECONE_ENV')

print(OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_ENV)

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 7d0de2c4-fd69-47f0-9d6d-10355431d98f asia-southeast1-gcp-free


In [3]:
def tiktoken_len(text):
    tokenizer = tiktoken.get_encoding('cl100k_base')
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [5]:
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY)



In [61]:
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)


In [58]:
def getIds(texts):
    ids = [str(uuid4()) for _ in range(len(texts))]
    return ids

In [95]:
def get_embeddings(texts, open_source=True):
    model = SentenceTransformer('all-MiniLM-L6-v2')

    if open_source:
        return model.encode(texts)

    return embed.embed_documents(texts)

In [60]:
def upload_vectors(ids, embeddings, metadataList, index):
    vectors = zip(ids, embeddings, metadataList)

    index.upsert(vectors=vectors)

    

In [54]:

def upsert_vecs(text, doc_name, cluster_name, index_name):
    completed = 0
    batch_limit = 100

    texts = []
    metadatas = []

    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    print("Initialized Pinecone Database...")

    if index_name not in pinecone.list_indexes():
        pinecone.create_index(
            name=index_name,
            metric='dotproduct',
            dimension=1536  # 1536 dim of text-embedding-ada-002
        )
    index = pinecone.Index(index_name)

    metadata = {
        "doc_name": doc_name,
        "cluster_name": cluster_name
    }

    record_texts = text_splitter.split_text(text)
    record_metadatas = []

    for j, text in enumerate(record_texts):
        record_metadatas = [{"chunk": j, "text": text, **metadata}]

        texts.append(text)
        # print(len(texts))
        metadatas.append(record_metadatas)
        # print(len(metadatas))
        # pp.pprint(metadatas)

        # print(j, len(texts), text[:50])

        if len(texts) >= batch_limit:
            print("HERE! UPSERT")
            
            ids = [str(uuid4()) for _ in range(len(texts))]
            print(ids)

            print("Getting Embeds")
            embeds = embed.embed_documents(texts)
            print(embeds)
            print("Got Embeds")
            embeds = []
            print("Upserting...")

            Vectors = []

            print(len(ids), len(embeds), len(metadatas))
            input("Upsert? ")
            for index, i in enumerate(ids):
                Vectors.append((i, embeds[index], metadatas[index]))
            
            pp.pprint(Vectors)


            index.upsert(vectors=Vectors)
            print("Upserted", j)
            texts = []
            metadatas = []
        
        # print(f"{j}/{len(record_texts)} completed....")

    if len(texts) > 0:
        print("1 Upserting...")
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        print("1 Upserted!")


In [62]:
def split_texts(text):
    record_texts = text_splitter.split_text(text)

    return record_texts

In [None]:
def upsert_func(text, doc_name, cluster_name, index_name):
    batch_limit = 100

    batch_texts, batch_metadas = [], []

    if index_name not in pinecone.list_indexes():
        pinecone.create_index(
            name=index_name,
            metric='dotproduct',
            dimension=1536  # 1536 dim of text-embedding-ada-002
        )
    index = pinecone.Index(index_name)

    doc_metadata = {
        "doc_name": doc_name,
        "cluster_name": cluster_name
    }

    all_texts = split_texts(text)
    record_metadatas = []



In [7]:

def query_index(query, cluster_name, index_name, metadata=None, openSource=True):
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    
    text_field = "text"
    index = pinecone.Index(index_name)
    if openSource:
        vectorstore = Pinecone.from_existing_index(index_name, SentenceTransformer('all-MiniLM-L6-v2'))
    else:
        vectorstore = Pinecone.from_existing_index(index_name, embed)
        
    response = vectorstore.similarity_search(
        query,  # our search query
    )
    for res in response:
        print(f"Chunk: {res.metadata['chunk']}")
        print(f"Content:\n{res.page_content}")
        print("\n---\n")
    
    print("_"*200)



In [None]:
question = "Who is Harry?"
query_index(question, "Trials", "test-index")

In [None]:

input_file1 = 'Book 3 - The Prisoner of Azkaban.txt' 
input_file = 'textDocs/' + input_file1

with open(input_file, "rb") as f:
    encoding = chardet.detect(f.read())["encoding"]
with open(input_file, "r", encoding=encoding) as f:
    txt = f.read()
    upsert_vecs(txt, input_file1, "Trials", "test-index")

In [102]:

input_file1 = 'Book 3 - The Prisoner of Azkaban.txt' 
input_file = 'textDocs/' + input_file1

with open(input_file, "rb") as f:
    encoding = chardet.detect(f.read())["encoding"]
with open(input_file, "r", encoding=encoding) as f:
    text = f.read()



In [103]:
batch_limit = 100
batch_texts, batch_metadatas = [], []

In [104]:
doc_name, cluster_name, index_name = input_file1, "Trials", "open-source-index"

if index_name not in pinecone.list_indexes():
        pinecone.create_index(
            name=index_name,
            metric='dotproduct',
            dimension=1536  # 1536 dim of text-embedding-ada-002
        )
index = pinecone.Index(index_name)

doc_metadata = {
        "doc_name": doc_name,
        "cluster_name": cluster_name
    }

In [105]:
all_texts = split_texts(text)
record_metadatas = []

In [106]:
print(text)

/ 




OWL POST 

Harry Potter was a highly unusual boy in many ways. 
For one thing, he hated the summer holidays more 
than any other time of year. For another, he really 
wanted to do his homework but was forced to do it in 
secret, in the dead of night. And he also happened to 
be a wizard. 

It was nearly midnight, and he was lying on his 
stomach in bed, the blankets drawn right over his 
head like a tent, a flashlight in one hand and a large 
leather-bound book (A History of Magic by Bathilda 
Bagshot) propped open against the pillow. Harry 
moved the tip of his eagle-feather quill down the page, 
frowning as he looked for something that would help 
him write his essay, “Witch Burning in the Fourteenth 
Century Was Completely Pointless — discuss.” 

The quill paused at the top of a likely-looking 
paragraph. Harry pushed his round glasses up the 
bridge of his nose, moved his flashlight closer to the 
book, and read: 



Page | 2 Harry Potter and the Prisoner of Azkaban - J.K. R

In [113]:
BATCHES = []

for j, text in enumerate(all_texts):
    print(j, end = ", ")

    # print(all_texts)

    chunk_metadata = [{"chunk": j, "text": text, **doc_metadata}]

    batch_texts.append(text)
    batch_metadatas.append(chunk_metadata)

    if len(batch_texts) >= batch_limit:
        print("\nLimit\n")
        ids = getIds(batch_texts)
        embeddings = get_embeddings(batch_texts)
        BATCHES.append(zip(ids, embeddings, batch_metadatas))
        batch_texts, batch_metadatas = [], []
    
if len(batch_texts):
    print("\nLimit Last\n")
    ids = getIds(batch_texts)
    embeddings = get_embeddings(batch_texts)
    BATCHES.append(zip(ids, embeddings, batch_metadatas))
    batch_texts, batch_metadatas = [], []


    


0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 
Limit

100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 
Limit

200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,

In [None]:
len(BATCHES)

In [89]:
len(batch_texts)

126

In [93]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

#Our sentences we like to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print(len(embedding), "Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
384 Embedding: [-1.37173487e-02 -4.28515933e-02 -1.56286228e-02  1.40537629e-02
  3.95537876e-02  1.21796370e-01  2.94333566e-02 -3.17524336e-02
  3.54959667e-02 -7.93139935e-02  1.75878089e-02 -4.04369608e-02
  4.97259833e-02  2.54913196e-02 -7.18700886e-02  8.14969242e-02
  1.47075311e-03  4.79627810e-02 -4.50336076e-02 -9.92174894e-02
 -2.81770267e-02  6.45045564e-02  4.44670655e-02 -4.76217642e-02
 -3.52951996e-02  4.38671783e-02 -5.28565720e-02  4.32970497e-04
  1.01921506e-01  1.64072216e-02  3.26996818e-02 -3.45986672e-02
  1.21339411e-02  7.94871524e-02  4.58341883e-03  1.57778375e-02
 -9.68210213e-03  2.87625901e-02 -5.05807064e-02 -1.55793708e-02
 -2.87906490e-02 -9.62279737e-03  3.15556489e-02  2.27349345e-02
  8.71449932e-02 -3.85027416e-02 -8.84718448e-02 -8.75499658e-03
 -2.12343894e-02  2.08924413e-02 -9.02078077e-02 -5.25732860e-02
 -1.05638914e-02  2.88310964e-02 -1.61455385e-02  6.17835205e-03
 -1.2