In [None]:
# Using langchain and huggingface hub to develop a chatbot with a given Github repo

In [None]:
%pip install --upgrade langchain deeplake huggingface_hub tiktoken

In [None]:
%pip install sentence_transformers

In [None]:
# getting huggingface hub api token from https://huggingface.co/settings/token
# getting active loop token from https://app.activeloop.ai/account
# active loop provide deep lake service as an online database

In [13]:
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import DeepLake

os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'your huggingface hub api token'
os.environ['ACTIVELOOP_TOKEN'] = 'your active loop token'

In [16]:
# the embeddings are used to generate vectors and store them in the database
hf_embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [7]:
# introduce the target repo (as an example)
!git clone https://github.com/chroma-core/chroma.git

Cloning into 'chroma'...
remote: Enumerating objects: 14581, done.[K
remote: Counting objects: 100% (3378/3378), done.[K
remote: Compressing objects: 100% (932/932), done.[K
remote: Total 14581 (delta 2617), reused 2864 (delta 2378), pack-reused 11203[K
Receiving objects: 100% (14581/14581), 172.26 MiB | 38.38 MiB/s, done.
Resolving deltas: 100% (9464/9464), done.


In [10]:
# index the data -> vectorize the data
# Chroma - the open-source embedding database.
import os
from langchain.document_loaders import TextLoader
root_dir = './chroma'
docs = []
for dirpath, dirnames, filenames in os.walk(root_dir): # Load files under the git directory by iterating through them.
    for file in filenames:
        try:
            loader = TextLoader(os.path.join(dirpath,file),encoding='utf-8') # textloader: load text from file
            docs.extend(loader.load_and_split())  # get the text array from the file
        except Exception as e:
            pass
print(len(docs))  # number of docs

517


In [None]:
# split the docs array
# split the docs array into chunks of 1000 characters with 0 overlap
# the text splitter is used to split the text into chunks
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000,chunk_overlap=0)
texts = text_splitter.split_documents(docs)
print(len(texts))  # the chunks of docs
# the text loader will warn us if there are some files that are larger than our limit of 1000.

In [None]:
# generate a database under deep lake, and store the data in it, public=True means the dataset is public.
username = 'nalanwutuo'
db = DeepLake(dataset_path=f"hub://{username}/chroma",embedding_function=hf_embeddings,public=True)
db.add_documents(texts)

In [19]:
# get data from the database
db = DeepLake(dataset_path="hub://nalanwutuo/chroma",read_only=True,embedding_function=hf_embeddings)

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/nalanwutuo/chroma



\

hub://nalanwutuo/chroma loaded successfully.

Deep Lake Dataset in hub://nalanwutuo/chroma already exists, loading from the storage
Dataset(path='hub://nalanwutuo/chroma', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape      dtype  compression
  -------   -------    -------    -------  ------- 
 embedding  generic  (1386, 384)  float32   None   
    ids      text     (1386, 1)     str     None   
 metadata    json     (1386, 1)     str     None   
   text      text     (1386, 1)     str     None   


  

In [20]:
# set the parameters for the database query
retriever = db.as_retriever()
retriever.search_kwargs['distance_metric'] = 'cos'  # similarity metric
retriever.search_kwargs['fetch_k'] = 100  # fetch_k represents the number of documents to be fetched from the database for each query
retriever.search_kwargs['maximal_marginal_relevance'] = True
retriever.search_kwargs['k'] = 10  # k represents the number of documents to be returned for each query

In [21]:
# model setting, here we use flan-t5-large with huggingface hub api
from langchain import HuggingFaceHub
from langchain.chains import ConversationalRetrievalChain

# initialize
flan_t5 = HuggingFaceHub(
    repo_id = 'google/flan-t5-large',
    model_kwargs = {"temperature":1e-10,"max_length":3000}
)
qa = ConversationalRetrievalChain.from_llm(flan_t5,retriever=retriever)

In [22]:
questions = [
    "What does Chroma do?",
    "How to use Chroma"
]

chat_history = []
for question in questions:
    result = qa({"question": question, "chat_history": chat_history})
    chat_history.append((question, result['answer']))
    print(f"Question:\n {question} \n")
    print(f"Answer:\n {result['answer']} \n\n")

Question:
 What does Chroma do? 

Answer:
 embedding database 


Question:
 How to use Chroma 

Answer:
 To connect to your server and perform operations using the client only library, you can do the following: # requirements # - docker # - pip # get the code git clone https://oauth2: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@github.com/chroma-core/chroma.git #checkout the right branch cd chroma # run docker cd chroma-server docker-compose up -d --build # install chroma-client cd ../chroma-client pip3 install --upgrade pip # you have to do this or it will use UNKNOWN as the package name pip install 




In [23]:
def ask(question, chat_history):
    response = qa({"question":question,"chat_history":chat_history})
    print(f"Question:\n {question}\n")
    print(f"Answer:\n {response['answer']}\n")

In [24]:
ask("What's the main programming language used in Chroma?",chat_history)

Question:
 What's the main programming language used in Chroma?

Answer:
 python



In [25]:
ask("Summarize the storage part of Chroma",chat_history)

Question:
 Summarize the storage part of Chroma

Answer:
 disks> backups> type>local/type> path>/etc/clickhouse-server//path> /backups> /disks> /storage_configuration> backups> allowed_disk>backups/allowed_disk> allowed_path>/etc/clickhouse-server//allowed_path> /backups> /clickhouse> EOF

