In [34]:
# New Chunker using Json file

import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
import os
import json
import pickle
import tiktoken
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings

vector_store_name = 'acwd_faiss_store.pkl'
index_name= "acwd.index"
url_list = []

tokenizer = tiktoken.get_encoding('cl100k_base')
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)
    

text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=200,
        chunk_overlap=10,
        length_function = len,
        separators=['\n\n', '\n', ' ', '']
    )

def process_file(file_path):
    chunks = []   
    metadatas = []
    # try:
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            content = data['content']
            url = data['url']
           
            # Chunk the content and create embeddings
            splits = text_splitter.split_text(content)
            chunks.extend(splits)
            metadatas.extend([{"source": url}] * len(splits)) 
            print("added to index and url list for ", url)  
    
      #except Exception as e:
    #        print(f"Error processing file {file}: {e}")
       
    return chunks, metadatas


chunks, metadatas = process_file("items.jl")
print("done processing", len(chunks)) 

embeddings = HuggingFaceEmbeddings()
store = FAISS.from_texts(chunks, embeddings, metadatas=metadatas)    
faiss.write_index(store.index, index_name)
store.index = None
with open(vector_store_name, "wb") as f:
    pickle.dump(store, f)

print("created vector store ", vector_store_name)

added to index and url list for  https://acwd.org
added to index and url list for  https://acwd.org/27/About-Us
added to index and url list for  https://acwd.org/8/Services
added to index and url list for  https://acwd.org/31/Connect-With-Us
added to index and url list for  https://acwd.org/630/I-Want-To
added to index and url list for  https://acwd.org/606/Advanced-Metering-Infrastructure-AMI
added to index and url list for  https://acwd.org/waterclips
added to index and url list for  https://acwd.org/145/Rebates
added to index and url list for  https://acwd.org/117/Starting-Stopping-Service
added to index and url list for  https://www.acwd.org/forms.aspx?fid=131
added to index and url list for  https://www.acwd.org/355/Current-Projects
added to index and url list for  https://acwd.org/90/Board-of-Directors
added to index and url list for  https://www.acwd.org/site/copyright
added to index and url list for  https://www.acwd.org/accessibility
added to index and url list for  https://ww

In [51]:
# Dec 16 chat bot

import pickle
import faiss
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

from langchain.chains import VectorDBQAWithSourcesChain
from langchain.chains import RetrievalQAWithSourcesChain
os.environ['OPENAI_API_KEY']='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'
OpenAI_key = os.environ.get("OPENAI_API_KEY")
vector_store_name = 'acwd_faiss_store.pkl'

# Load the FAISS index from disk.
index = faiss.read_index("acwd.index")

# Load the vector store from disk.
with open(vector_store_name, "rb") as f:
    store = pickle.load(f)

# merge the index and store
store.index = index

system_template = """
Your are an helpful AI assistant for Alameda County water department (ACWD). Website url is https://acwd.org. 
Try to be polite and helpful. Use the content of the transcript to answer the users question. Do not make up answers. 
If the question is not related to ACWD or water, DO NOT provide answers to the question and just reply 
"My primary function is to assist with inquiries related to the Alameda County Water District (ACWD). 
If you have any questions about ACWD services, please let me know, and I'll be glad to help!". 
----------------
{summaries}"""

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

# Build the question answering chain.
llm = ChatOpenAI(openai_api_key=OpenAI_key, model_name="gpt-3.5-turbo", temperature=0.5, max_tokens=2256,
                 request_timeout=60)  
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    #retriever=store.as_retriever(search_kwargs={'k':5}),
    retriever=store.as_retriever(),
    return_source_documents=True
    #chain_type_kwargs=chain_type_kwargs
)

# Run the chain.
query = "how should I pay my bill"
result = chain({"question": query})

# Print the answer and the sources.
print(f"Answer: {result['answer']}")
print(f"Sources: {result['sources']}")

Answer: You can pay your bill using any convenient payment method like Bank Account, MasterCard, Visa Card, Discover Card, ATM card, or Debit Card. Please confirm the details you provide on this page. You can find more information on bill payment on the ACWD website. 

Sources: 
