In [15]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter

from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

import os

In [None]:
# splitting the docs into chunks of manageable size
def split_docs(docs_folder_path, chunk_size=1024, chunk_overlap=0):
  """Load the documents in a folder and split them into chunks.

  Args:
    docs_folder_path: Folder handed to ``DirectoryLoader``.
    chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
      Defaults to 1024, the value that used to be hard-coded here.
    chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
      Defaults to 0, the value that used to be hard-coded here.

  Returns:
    The result of ``CharacterTextSplitter.split_documents`` on the loaded docs.
  """
  # load everything the directory loader finds in the folder
  docs = DirectoryLoader(docs_folder_path).load()
  # split the loaded docs with the requested chunking settings
  char_text_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
  )
  return char_text_splitter.split_documents(docs)

doc_texts = split_docs('docs')
# show how many chunks were produced and preview a few of them,
# rather than dumping every chunk into the cell output
print(f"{len(doc_texts)} chunks")
doc_texts[:3]

In [2]:
# keys: read from environment variables (each one is None when the variable is unset)
openai_key = os.getenv("OPENAI_API_KEY")
pinecone_key = os.getenv("PINECONE_API_KEY")
pinecone_env = os.getenv("PINECONE_ENVIRONMENT")
# misspelled original name, kept as an alias because other cells still read it
pinecoen_env = pinecone_env

In [3]:
# embedding configuration
# name of the embedding model to use
EMBEDDING_MODEL: str = "text-embedding-ada-002"
# vector size; same value as the 'dimension' reported for the Pinecone index
OUTPUT_DIM: int = 1536

In [8]:
# init embeddings object; the model is taken from EMBEDDING_MODEL so that the
# constant declared in the config cell is the single place to change it
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_key)

# init pinecone with the key and environment read from the environment variables
pinecone.init(
  api_key=pinecone_key,
  environment=pinecoen_env,
)

In [6]:
# name of the Pinecone index that stores the embeddings
index_name = "woman-safety-embeddings"
# handle to that index
index = pinecone.Index(index_name)
# sanity check: fetch the index statistics and display them as the cell output
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.00014,
 'namespaces': {'': {'vector_count': 14}},
 'total_vector_count': 14}

In [None]:
# upsert the chunks to pinecone; from_documents sends each chunk's metadata
# along with its text instead of the bare page_content only
# NOTE(review): the similarity metric is presumably set on the index itself - confirm
doc_store = Pinecone.from_documents(doc_texts, embeddings, index_name=index_name)

In [None]:
# put the user's question here and wrap it in whatever instructions
# you want the chatbot to receive before and after it
question = "YOUR_QUESTION"
query = "PRE_INSTRUCTIONS {} POST_INSTRUCTIONS".format(question)

In [None]:
# run only the retrieval step (--> similarity_search <--) to inspect which
# documents the vector store returns for the current query
docs = doc_store.similarity_search(query)
docs

In [None]:
def answer_query(query):
  """Answer `query` using documents retrieved from the module-level `doc_store`.

  Args:
    query: Prompt text; used for the similarity search and also passed to
      the chain as the question.

  Returns:
    The value returned by the question-answering chain's ``run`` call.
  """
  # build the chain (chain_type="stuff") on top of the OpenAI llm
  stuff_chain = load_qa_chain(
    OpenAI(temperature=0.2, openai_api_key=openai_key),
    chain_type="stuff",
  )
  # fetch the documents that the vector store matches to the query
  matched_docs = doc_store.similarity_search(query)
  # hand the matches and the query to the chain and return its answer
  return stuff_chain.run(input_documents=matched_docs, question=query)

In [None]:
# run retrieval + question answering for the current query; the returned
# answer is displayed as the cell output
answer_query(query)

In [17]:
question = "please i want to know How to get better at football"
# Build the prompt around `question`. Interpolating `query` here would ignore
# the question set above and nest the previous prompt inside the new one.
query = f"You are a chatbot personalised for a woman safety application, you have been provided a context and now answer {question} without going out of context or hallucinating or providing fabricated information, just stick to the context "

Query the existing Pinecone index directly

In [None]:
# chat model that generates the answer
# NOTE(review): temperature=0.8 here vs 0.2 in answer_query - confirm this is intended
llm = ChatOpenAI(
  model_name="gpt-3.5-turbo",
  temperature=0.8,
  openai_api_key=openai_key,
)
chain = load_qa_chain(llm, chain_type="stuff")
# attach to the index that already exists; reuses the `embeddings` object created earlier
docsearch = Pinecone.from_existing_index(index_name, embeddings)
# retrieve the matching documents, then let the chain answer from them
docs = docsearch.similarity_search(query)
answer = chain.run(input_documents=docs, question=query)
answer