In [7]:
import os
import json
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, SpacyTextSplitter,\
TokenTextSplitter, CharacterTextSplitter
from transformers import GPT2TokenizerFast
from langchain.embeddings.openai import OpenAIEmbeddings
import streamlit as st
from typing import List

In [2]:
def load_docs(directory:str)-> List:
  """
  Reads the documents stored under [directory] through a DirectoryLoader.
  :@param directory: folder to read from; it must already exist on disk
  :return: whatever DirectoryLoader(directory).load() returns
  :raises AssertionError: if [directory] is not an existing directory
  """
  # bail out before building the loader when the folder is missing
  assert os.path.isdir(directory), f"{directory} not found."

  # build the loader and read everything in a single expression
  return DirectoryLoader(directory).load()

# Read everything under ./data and report how many documents came back.
documents = load_docs('./data')
print("number of documents:{}".format(len(documents)))

number of documents:1


In [3]:
class UnknownSplitterType(Exception):
  """Raised by split_docs when [splitter_type] is not a supported name."""

def split_docs(documents:List,chunk_size:int=500,chunk_overlap:int=100,
               splitter_type:str='RecurChar')->List:
  """
  Breaks [documents] into chunks, passing [chunk_size] and [chunk_overlap]
  to the splitter selected by [splitter_type].
  :@param documents: documents handed to the splitter's split_documents()
  :@param chunk_size: chunk size forwarded to the chosen splitter
  :@param chunk_overlap: overlap forwarded to the chosen splitter
  :@param splitter_type: one of 'RecurChar' (RecursiveCharacterTextSplitter),
  'Spacy' (SpacyTextSplitter), 'TikToken' (TokenTextSplitter),
  'GPT2TokenizerFast' (CharacterTextSplitter built from Hugging Face's
  GPT2TokenizerFast)
  :return: the result of split_documents() on the chosen splitter
  :raises UnknownSplitterType: if [splitter_type] is none of the names above
  """
  # every splitter receives the same two sizing arguments
  sizing = dict(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

  if splitter_type == 'RecurChar':
    splitter = RecursiveCharacterTextSplitter(**sizing)
    print("Loaded RecursiveChatacterTextSplitter")
  elif splitter_type == 'Spacy':
    splitter = SpacyTextSplitter(**sizing)
    print("Loaded SpacyTextSplitter")
  elif splitter_type == 'TikToken':
    splitter = TokenTextSplitter(**sizing)
    print("Loaded TikToken")
  elif splitter_type == 'GPT2TokenizerFast':
    # the tokenizer is fetched first, then wrapped by a CharacterTextSplitter
    gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    splitter = CharacterTextSplitter.from_huggingface_tokenizer(gpt2_tokenizer,
                                                                **sizing)
    print("Loaded GPT2TokenizerFast")
  else:
    raise UnknownSplitterType("needs to be one of {'RecurChar', 'Spacy','TikToken','GPT2TokenizerFast'}")

  return splitter.split_documents(documents)


# Chunk the loaded documents with the GPT2TokenizerFast-based splitter.
docs = split_docs(
    documents,
    chunk_size=200,
    chunk_overlap=100,
    splitter_type='GPT2TokenizerFast',
)
print("Number of chunks: {}".format(len(docs)))

Created a chunk of size 219, which is longer than the specified 200
Created a chunk of size 243, which is longer than the specified 200
Created a chunk of size 205, which is longer than the specified 200
Created a chunk of size 562, which is longer than the specified 200
Created a chunk of size 326, which is longer than the specified 200
Created a chunk of size 295, which is longer than the specified 200
Created a chunk of size 312, which is longer than the specified 200
Created a chunk of size 313, which is longer than the specified 200
Created a chunk of size 215, which is longer than the specified 200
Created a chunk of size 202, which is longer than the specified 200
Created a chunk of size 206, which is longer than the specified 200


Loaded GPT2TokenizerFast
Number of chunks: 65


In [5]:
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

class UnknownEmbeddingModelType(Exception):
  pass

def get_embeddings(embedding_model:str, api_key:str="")->str:
  """
  Creates an embedder that would generate the embeddings of the query based on
  the specified model
  :@param api_key: API key used to query the end point
  :@param embedding_model: model name needs to be one of
    'OPENAI': Uses OpenAI Embeddings
  """
  match embedding_model:
    case 'OPENAI':
      assert api_key != "", "OPENAI API key must not be an empty string"
      embeddings = OpenAIEmbeddings(openai_api_key=api_key)

    case 'sentence_transformers':
      embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    case _ :
      raise UnknownSplitterType("needs to be one of {'OPENAI', sentence_transformers}")

  return embeddings

# Smoke-test the embedder: embed one short query and report the vector length.
embeddings = get_embeddings('sentence_transformers')
query_result = embeddings.embed_query("Test")
print("embedding length: {}".format(len(query_result)))

embedding length: 384


In [6]:
import pinecone
from langchain.vectorstores import Pinecone

# Initialise the Pinecone client with the key and environment read from
# Streamlit secrets (nothing is hardcoded in the notebook).
pinecone.init(api_key=st.secrets.pinecone.api_key,
              environment=st.secrets.pinecone.env)

# Build the Pinecone vector store from the chunks in `docs` using `embeddings`,
# targeting the index named in the secrets.
index = Pinecone.from_documents(
    docs,
    embeddings,
    index_name=st.secrets.pinecone.index_name,
)