# Save embeddings: cached OpenAI embeddings and FAISS retrievers
[]()

# initialize

In [1]:

# Folder that holds the CSV/text source files.
directory = '../data/'

# Notebook-wide registries. Some of them are bound as default arguments
# by the helper functions defined in later cells.
tool_dict = {}
embeddings_dict = {}
db_dict = {}
retriever_dict = {}
vector_dict = {}
description_dict = {}
answer_dict = {}
conversation_dict = {}
doc_dict = {}
queries_dict = {}

In [12]:
# Separate document registry, filled by the Recycle BC cell further down.
doc_dict2 = {}

# Script from iteration 2.21 of `2023-09-30 save embeddings.ipynb`

In [3]:
import os

# documents
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader

from langchain.storage import LocalFileStore
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings

# from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import FAISS
from langchain.agents.agent_toolkits import create_retriever_tool

# Creating the Agent
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.chat_models import ChatOpenAI

# Create memory 
from langchain.memory import ConversationBufferMemory
from langchain.memory import StreamlitChatMessageHistory # for Streamlit

from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.schema.messages import SystemMessage
from langchain.prompts import MessagesPlaceholder

from langchain.agents import AgentExecutor
from langchain.agents.openai_functions_agent.agent_token_buffer_memory import AgentTokenBufferMemory

import streamlit as st


def create_documents(directory='../data', glob='**/[!.]*', show_progress=True, loader_cls=CSVLoader):
    """Load documents from a directory through a ``DirectoryLoader``.

    Parameters:
        - directory: folder to search for files.
        - glob: pattern selecting which files are loaded.
        - show_progress: forwarded unchanged to ``DirectoryLoader``.
        - loader_cls: loader class applied to each matched file.

    Returns:
        The result of ``DirectoryLoader.load()``.
    """
    loader_options = dict(glob=glob, show_progress=show_progress, loader_cls=loader_cls)
    loaded = DirectoryLoader(directory, **loader_options).load()
    # NOTE(review): the message says "files" but the count is len() of the
    # loaded documents; a loader may emit several documents per file - confirm.
    print(f'Number of files: {len(loaded)}')
    return loaded
    
def create_documents_from_csv(file_path='../data/Datajam_2023___Fine_Tuning_ChatBot_CSV_-_Recycle_BC_1.csv'):
    """Load one CSV file with ``CSVLoader`` (UTF-8 encoding) and return its documents."""
    return CSVLoader(file_path, encoding='utf-8').load()

def create_retriever(
    documents,
    site_key,
    filepath,
    embeddings_dict=embeddings_dict,
    vector_dict=vector_dict,
    text_splitter=None,
):
    """Embed `documents`, index them in FAISS and register a retriever under `site_key`.

    Parameters:
        - documents: the documents to index.
        - site_key: key used in every registry and as prefix of the cache namespace.
        - filepath: root path of the ``LocalFileStore`` that caches embeddings.
        - embeddings_dict: registry that receives the cache-backed embedder.
        - vector_dict: registry that receives the FAISS vector store.
        - text_splitter (optional): a text splitter object. If None, the documents are not split.

    Returns:
        The module-level `retriever_dict` (the whole site_key -> retriever
        mapping), not only the retriever created by this call.

    OpenAI credentials are read from the environment variables
    ``openai_organization`` and ``openai_api_key``.
    """
    # Splitting is optional; either branch hands the same kind of objects to FAISS.
    chunks = documents if text_splitter is None else text_splitter.split_documents(documents)

    base_embedder = OpenAIEmbeddings(
        openai_organization=os.environ['openai_organization'],
        openai_api_key=os.environ['openai_api_key'],
    )
    cache_store = LocalFileStore(filepath)
    cache_namespace = f'{site_key}_{base_embedder.model}'
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        base_embedder, cache_store, namespace=cache_namespace
    )
    embeddings_dict[site_key] = cached_embedder

    index = FAISS.from_documents(chunks, cached_embedder)
    vector_dict[site_key] = index

    # `retriever_dict` is the notebook-level global, not a parameter.
    retriever_dict[site_key] = index.as_retriever()
    return retriever_dict


def create_retriever_and_description_dicts(params_dict, filepath, doc_dict=doc_dict, vector_dict=vector_dict):
    """Build one retriever per loaded document set and collect the matching descriptions.

    Parameters:
        - params_dict: maps doc_id -> settings dict with the keys 'site_key' and
          'doc_description', plus an optional 'text_splitter'.
        - filepath: directory passed to `create_retriever` for the embedding cache.
        - doc_dict: maps doc_id -> loaded documents; only ids present here are processed.
        - vector_dict: registry passed through to `create_retriever`.

    Returns:
        (retriever_dict, description_dict). Both are keyed by site_key so they
        can be handed to `create_tools_list` together.

    Raises:
        KeyError: if a doc_id in `doc_dict` has no entry in `params_dict`.
    """
    retriever_dict = dict()
    description_dict = dict()
    for doc_id, documents in doc_dict.items():
        if doc_id not in params_dict:
            raise KeyError(f'params_dict has no settings for doc_id {doc_id!r}')
        params = params_dict[doc_id]
        site_key = params['site_key']
        # create_retriever returns its whole site_key -> retriever mapping;
        # keep only the retriever that was just built for this site.
        retriever_dict[site_key] = create_retriever(
            documents, site_key,
            filepath,
            vector_dict=vector_dict,
            text_splitter=params.get('text_splitter', None)
            )[site_key]
        description_dict[site_key] = params['doc_description']

    return retriever_dict, description_dict

def create_tools_list(retriever_dict, description_dict):
    """Create one retriever tool per entry of `retriever_dict`.

    Each tool is named ``search_<site_key>`` and described by
    ``description_dict[site_key]``, so both dicts must share the same keys.

    https://api.python.langchain.com/en/latest/agents/langchain.agents.agent_toolkits.conversational_retrieval.tool.create_retriever_tool.html?highlight=create_retriever_tool#langchain.agents.agent_toolkits.conversational_retrieval.tool.create_retriever_tool
    """
    return [
        create_retriever_tool(retriever, f'search_{site_key}', description_dict[site_key])
        for site_key, retriever in retriever_dict.items()
    ]


# Tool descriptions, one per document source.
recylebc = """
This document provides information from the Recycle BC website or BC government 
website. It has the most specific information 
about whether or not an item is accepted for recycling and where to recycle it.
This should be the main resource for recycling information for residents of British Columbia.
"""

CoV_mattress = """
Information from the City of Vancouver website about how to recycle mattresses.
"""

# Only the mattress document (doc_id 2) is configured in this iteration;
# the Recycle BC entry (doc_id 1, site_key 'recycle') is switched off.
params_dict = {
    2: dict(site_key='mattress', doc_description=CoV_mattress, text_splitter=None),
}

# Document loading is disabled in this cell, so nothing is added to `doc_dict`
# before the retrievers are requested.
filepath = '../embeddings/'
retriever_dict, description_dict = create_retriever_and_description_dicts(params_dict, filepath)

In [4]:
# Display the retriever mapping returned by the cell above.
retriever_dict

{}

In [5]:
# Display the description mapping returned alongside the retrievers.
description_dict

{}

# Iteration 3 

In [8]:
import os

# documents
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader

from langchain.storage import LocalFileStore
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings

# from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import FAISS
from langchain.agents.agent_toolkits import create_retriever_tool

# Creating the Agent
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.chat_models import ChatOpenAI

# Create memory 
from langchain.memory import ConversationBufferMemory
from langchain.memory import StreamlitChatMessageHistory # for Streamlit

from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.schema.messages import SystemMessage
from langchain.prompts import MessagesPlaceholder

from langchain.agents import AgentExecutor
from langchain.agents.openai_functions_agent.agent_token_buffer_memory import AgentTokenBufferMemory

import streamlit as st


def create_documents(directory='../data', glob='**/[!.]*', show_progress=True, loader_cls=CSVLoader):
    """Load documents from a directory through a ``DirectoryLoader``.

    Parameters:
        - directory: folder to search for files.
        - glob: pattern selecting which files are loaded.
        - show_progress: forwarded unchanged to ``DirectoryLoader``.
        - loader_cls: loader class applied to each matched file.

    Returns:
        The result of ``DirectoryLoader.load()``.
    """
    loader_options = dict(glob=glob, show_progress=show_progress, loader_cls=loader_cls)
    loaded = DirectoryLoader(directory, **loader_options).load()
    # NOTE(review): the message says "files" but the count is len() of the
    # loaded documents; a loader may emit several documents per file - confirm.
    print(f'Number of files: {len(loaded)}')
    return loaded
    
def create_documents_from_csv(file_path='../data/Datajam_2023___Fine_Tuning_ChatBot_CSV_-_Recycle_BC_1.csv'):
    """Load one CSV file with ``CSVLoader`` (UTF-8 encoding) and return its documents."""
    return CSVLoader(file_path, encoding='utf-8').load()

def create_retriever(
    documents,
    site_key,
    filepath,
    embeddings_dict=embeddings_dict,
    vector_dict=vector_dict,
    text_splitter=None,
):
    """Embed `documents`, index them in FAISS and register a retriever under `site_key`.

    Parameters:
        - documents: the documents to index.
        - site_key: key used in every registry and as prefix of the cache namespace.
        - filepath: root path of the ``LocalFileStore`` that caches embeddings.
        - embeddings_dict: registry that receives the cache-backed embedder.
        - vector_dict: registry that receives the FAISS vector store.
        - text_splitter (optional): a text splitter object. If None, the documents are not split.

    Returns:
        The module-level `retriever_dict` (the whole site_key -> retriever
        mapping), not only the retriever created by this call.

    OpenAI credentials are read from the environment variables
    ``openai_organization`` and ``openai_api_key``.
    """
    # Splitting is optional; either branch hands the same kind of objects to FAISS.
    chunks = documents if text_splitter is None else text_splitter.split_documents(documents)

    base_embedder = OpenAIEmbeddings(
        openai_organization=os.environ['openai_organization'],
        openai_api_key=os.environ['openai_api_key'],
    )
    cache_store = LocalFileStore(filepath)
    cache_namespace = f'{site_key}_{base_embedder.model}'
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        base_embedder, cache_store, namespace=cache_namespace
    )
    embeddings_dict[site_key] = cached_embedder

    index = FAISS.from_documents(chunks, cached_embedder)
    vector_dict[site_key] = index

    # `retriever_dict` is the notebook-level global, not a parameter.
    retriever_dict[site_key] = index.as_retriever()
    print(f'Retriever created for {site_key}')
    return retriever_dict


def create_retriever_and_description_dicts(params_dict, filepath, doc_dict=doc_dict, vector_dict=vector_dict):
    """Build one retriever per configured document set and collect the matching descriptions.

    Parameters:
        - params_dict: maps doc_id -> settings dict with the keys 'site_key' and
          'doc_description', plus an optional 'text_splitter'. Every id listed
          here is processed.
        - filepath: directory passed to `create_retriever` for the embedding cache.
        - doc_dict: maps doc_id -> loaded documents.
        - vector_dict: registry passed through to `create_retriever`.

    Returns:
        (retriever_dict, description_dict). Both are keyed by site_key so they
        can be handed to `create_tools_list` together.

    Raises:
        KeyError: if a doc_id in `params_dict` has no documents in `doc_dict`.
    """
    retriever_dict = dict()
    description_dict = dict()
    for doc_id, params in params_dict.items():
        if doc_id not in doc_dict:
            # Say which side is missing instead of surfacing a bare `KeyError: <id>`.
            raise KeyError(f'doc_dict has no documents for doc_id {doc_id!r}; load them before building retrievers')
        site_key = params['site_key']
        # create_retriever returns its whole site_key -> retriever mapping;
        # keep only the retriever that was just built for this site.
        retriever_dict[site_key] = create_retriever(
            doc_dict[doc_id], site_key,
            filepath,
            vector_dict=vector_dict,
            text_splitter=params.get('text_splitter', None)
            )[site_key]
        description_dict[site_key] = params['doc_description']
    print(f'Created retriever and description dicts for {params_dict.keys()}')
    return retriever_dict, description_dict

def create_tools_list(retriever_dict, description_dict):
    """Create one retriever tool per entry of `retriever_dict`.

    Each tool is named ``search_<site_key>`` and described by
    ``description_dict[site_key]``, so both dicts must share the same keys.

    https://api.python.langchain.com/en/latest/agents/langchain.agents.agent_toolkits.conversational_retrieval.tool.create_retriever_tool.html?highlight=create_retriever_tool#langchain.agents.agent_toolkits.conversational_retrieval.tool.create_retriever_tool
    """
    return [
        create_retriever_tool(retriever, f'search_{site_key}', description_dict[site_key])
        for site_key, retriever in retriever_dict.items()
    ]


# Tool descriptions, one per document source.
recylebc = """
This document provides information from the Recycle BC website or BC government 
website. It has the most specific information 
about whether or not an item is accepted for recycling and where to recycle it.
This should be the main resource for recycling information for residents of British Columbia.
"""

CoV_mattress = """
Information from the City of Vancouver website about how to recycle mattresses.
"""

# Only the mattress document (doc_id 2) is configured in this iteration;
# the Recycle BC entry (doc_id 1, site_key 'recycle') is switched off.
params_dict = {
    2: dict(site_key='mattress', doc_description=CoV_mattress, text_splitter=None),
}

# Document loading is still disabled here. This iteration's helper walks
# `params_dict`, so the call below raises a KeyError for doc_id 2 unless
# `doc_dict` already holds that document.
filepath = '../embeddings/'
retriever_dict, description_dict = create_retriever_and_description_dicts(params_dict, filepath)

KeyError: 2

# Iteration 3.1

In [10]:
import os

# documents
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader

from langchain.storage import LocalFileStore
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings

# from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import FAISS
from langchain.agents.agent_toolkits import create_retriever_tool

# Creating the Agent
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.chat_models import ChatOpenAI

# Create memory 
from langchain.memory import ConversationBufferMemory
from langchain.memory import StreamlitChatMessageHistory # for Streamlit

from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.schema.messages import SystemMessage
from langchain.prompts import MessagesPlaceholder

from langchain.agents import AgentExecutor
from langchain.agents.openai_functions_agent.agent_token_buffer_memory import AgentTokenBufferMemory

import streamlit as st


def create_documents(directory='../data', glob='**/[!.]*', show_progress=True, loader_cls=CSVLoader):
    """Load documents from a directory through a ``DirectoryLoader``.

    Parameters:
        - directory: folder to search for files.
        - glob: pattern selecting which files are loaded.
        - show_progress: forwarded unchanged to ``DirectoryLoader``.
        - loader_cls: loader class applied to each matched file.

    Returns:
        The result of ``DirectoryLoader.load()``.
    """
    loader_options = dict(glob=glob, show_progress=show_progress, loader_cls=loader_cls)
    loaded = DirectoryLoader(directory, **loader_options).load()
    # NOTE(review): the message says "files" but the count is len() of the
    # loaded documents; a loader may emit several documents per file - confirm.
    print(f'Number of files: {len(loaded)}')
    return loaded
    
def create_documents_from_csv(file_path='../data/Datajam_2023___Fine_Tuning_ChatBot_CSV_-_Recycle_BC_1.csv'):
    """Load one CSV file with ``CSVLoader`` (UTF-8 encoding) and return its documents."""
    return CSVLoader(file_path, encoding='utf-8').load()

def create_retriever(
    documents,
    site_key,
    filepath,
    embeddings_dict=embeddings_dict,
    vector_dict=vector_dict,
    text_splitter=None,
):
    """Embed `documents`, index them in FAISS and register a retriever under `site_key`.

    Parameters:
        - documents: the documents to index.
        - site_key: key used in every registry and as prefix of the cache namespace.
        - filepath: root path of the ``LocalFileStore`` that caches embeddings.
        - embeddings_dict: registry that receives the cache-backed embedder.
        - vector_dict: registry that receives the FAISS vector store.
        - text_splitter (optional): a text splitter object. If None, the documents are not split.

    Returns:
        The module-level `retriever_dict` (the whole site_key -> retriever
        mapping), not only the retriever created by this call.

    OpenAI credentials are read from the environment variables
    ``openai_organization`` and ``openai_api_key``.
    """
    # Splitting is optional; either branch hands the same kind of objects to FAISS.
    chunks = documents if text_splitter is None else text_splitter.split_documents(documents)

    base_embedder = OpenAIEmbeddings(
        openai_organization=os.environ['openai_organization'],
        openai_api_key=os.environ['openai_api_key'],
    )
    cache_store = LocalFileStore(filepath)
    cache_namespace = f'{site_key}_{base_embedder.model}'
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        base_embedder, cache_store, namespace=cache_namespace
    )
    embeddings_dict[site_key] = cached_embedder

    index = FAISS.from_documents(chunks, cached_embedder)
    vector_dict[site_key] = index

    # `retriever_dict` is the notebook-level global, not a parameter.
    retriever_dict[site_key] = index.as_retriever()
    print(f'Retriever created for {site_key}')
    return retriever_dict


def create_retriever_and_description_dicts(params_dict, filepath, doc_dict=doc_dict, vector_dict=vector_dict):
    """Build one retriever per loaded document set and collect the matching descriptions.

    Parameters:
        - params_dict: maps doc_id -> settings dict with the keys 'site_key' and
          'doc_description', plus an optional 'text_splitter'.
        - filepath: directory passed to `create_retriever` for the embedding cache.
        - doc_dict: maps doc_id -> loaded documents; only ids present here are processed.
        - vector_dict: registry passed through to `create_retriever`.

    Returns:
        (retriever_dict, description_dict). Both are keyed by site_key so they
        can be handed to `create_tools_list` together.

    Raises:
        KeyError: if a doc_id in `doc_dict` has no entry in `params_dict`.
    """
    retriever_dict = dict()
    description_dict = dict()
    for doc_id, documents in doc_dict.items():
        if doc_id not in params_dict:
            raise KeyError(f'params_dict has no settings for doc_id {doc_id!r}')
        params = params_dict[doc_id]
        site_key = params['site_key']
        # create_retriever returns its whole site_key -> retriever mapping;
        # keep only the retriever that was just built for this site.
        retriever_dict[site_key] = create_retriever(
            documents, site_key,
            filepath,
            vector_dict=vector_dict,
            text_splitter=params.get('text_splitter', None)
            )[site_key]
        description_dict[site_key] = params['doc_description']
    # The message lists the configured ids (params_dict), which may be a
    # superset of the ids actually processed (doc_dict).
    print(f'Created retriever and description dicts for {params_dict.keys()}')
    return retriever_dict, description_dict

def create_tools_list(retriever_dict, description_dict):
    """Create one retriever tool per entry of `retriever_dict`.

    Each tool is named ``search_<site_key>`` and described by
    ``description_dict[site_key]``, so both dicts must share the same keys.

    https://api.python.langchain.com/en/latest/agents/langchain.agents.agent_toolkits.conversational_retrieval.tool.create_retriever_tool.html?highlight=create_retriever_tool#langchain.agents.agent_toolkits.conversational_retrieval.tool.create_retriever_tool
    """
    return [
        create_retriever_tool(retriever, f'search_{site_key}', description_dict[site_key])
        for site_key, retriever in retriever_dict.items()
    ]


# Tool descriptions, one per document source.
recylebc = """
This document provides information from the Recycle BC website or BC government 
website. It has the most specific information 
about whether or not an item is accepted for recycling and where to recycle it.
This should be the main resource for recycling information for residents of British Columbia.
"""

CoV_mattress = """
Information from the City of Vancouver website about how to recycle mattresses.
"""

# Only the mattress document (doc_id 2) is processed in this cell; the
# Recycle BC document (doc_id 1) is handled in its own section further down.
params_dict = {
    2: {
        'site_key': 'mattress',
        'doc_description': CoV_mattress,
        'text_splitter': None
    }
}
filepath = '../embeddings/'

# Load the mattress text file. The notebook may be started from the project
# root or from a sub-folder, so try 'data' first and then '../data'.
doc_id = 2
try:
    directory = 'data'
    doc_dict[doc_id] = create_documents(directory=directory, glob='*.txt', loader_cls=TextLoader)
except Exception:
    # `except Exception` rather than a bare `except`, so that interrupting the
    # kernel (KeyboardInterrupt) is not mistaken for a missing folder.
    directory = '../data'
    doc_dict[doc_id] = create_documents(directory=directory, glob='*.txt', loader_cls=TextLoader)


retriever_dict, description_dict = create_retriever_and_description_dicts(params_dict, filepath)

100%|██████████| 1/1 [00:00<00:00, 1118.78it/s]


Number of files: 1
Retriever created for mattress
Created retriever and description dicts for dict_keys([2])


In [11]:
# List the keys currently held in the embedding cache store at `filepath`.
list(LocalFileStore(filepath).yield_keys())

['mattress_text-embedding-ada-0024d698914-67ab-531b-a43a-655989f5061e']

## For Recycle BC doc

In [13]:
# Configure only the Recycle BC document (doc_id 1) for this run; the
# mattress entry (doc_id 2) is left out.
params_dict = {
    1: {
        'site_key': 'recycle',
        'doc_description': recylebc,
        'text_splitter': None
    },
}
filestore = '../embeddings/'

# Try the directory loader first; if it fails for any ordinary reason,
# fall back to reading the default CSV file directly.
doc_id = 1
try:
    directory = 'data'
    doc_dict2[doc_id] = create_documents(directory=directory, glob='*.csv')
except Exception:
    # `except Exception` rather than a bare `except`, so that interrupting the
    # kernel (KeyboardInterrupt) does not silently trigger the fallback.
    doc_dict2[doc_id] = create_documents_from_csv()
    print('Done creating doc from CSV')

# `doc_dict2` is passed explicitly so only the Recycle BC documents are indexed.
retriever_dict, description_dict = create_retriever_and_description_dicts(params_dict, filestore, doc_dict=doc_dict2)

Done creating doc from CSV
Retriever created for recycle
Created retriever and description dicts for dict_keys([1])


# *End of Page*