In [1]:
import os
import torch
import uuid
import chromadb
import pandas as pd
from torch import cuda, bfloat16
from dotenv import find_dotenv, load_dotenv
from langchain.document_loaders import CSVLoader

In [2]:
# Setting up environment variables

load_dotenv(find_dotenv())
HF_KEY = os.environ['HUGGINGFACEHUB_API_TOKEN']

In [3]:
device = f'cuda: {cuda.current_device()}' if cuda.is_available() else 'cpu'
device

'cpu'

### Connecting to Chroma DB Server

In [4]:
# Connecting to Chroma DB server through HTTP client
client = chromadb.HttpClient(host="localhost", port=8000)
client.list_collections()

[Collection(name=mpNet-PriceHistory_collection),
 Collection(name=dateCheck_collection),
 Collection(name=test_collection),
 Collection(name=priceHistory_collection)]

In [None]:
# Creating a new collection
pH_collection = client.get_or_create_collection(name="priceHistory_collection")
pH_collection.peek()

In [None]:
dateCheck_collection = client.get_or_create_collection(name="dateCheck_collection")
dateCheck_collection.peek()

### Loading CSV Files and embedding

In [7]:
# Embedding model
import chromadb.utils.embedding_functions as embedding_functions

# Embedding function
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=HF_KEY,
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

huggingface_ef_1 = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=HF_KEY,
    model_name="sentence-transformers/all-mpnet-base-v2"
)

In [8]:
def extract_file_info(csv_pth):
    # Splitting the file path by '/'
    parts = csv_pth.split('/')

    # Extracting required components
    sector = parts[1]
    stock = parts[2]
    start_date = parts[-1].split('_to_')[0]
    last_date = parts[-1].split('_to_')[1].split('.csv')[0]
    
    return sector, stock, start_date, last_date

In [9]:
def process_csv(stock_pth):
    pH_dir = "Price_History"
    csvS_pth = os.path.join(stock_pth, pH_dir)
    pH_collection = client.get_or_create_collection(name='dateCheck_collection',
                                                    embedding_function=huggingface_ef_1)
    
    docs = []
    for csv_file in os.listdir(csvS_pth):
        if csv_file.endswith('.csv'):
            csv_pth = os.path.join(csvS_pth, csv_file)
            csv_load = CSVLoader(f"{csvS_pth}/{csv_file}", encoding="windows-1252")
            csv_data = csv_load.load()
            # print(csv_data)

            sector, stock, start_dt, last_dt = extract_file_info(csv_pth)
            for row in csv_data:
                # S.N.: 3051, Date: 2010-11-09, Open: 294.00, High: 300.00, Low: 299.00, Ltp: 300.00, % Change: 0.00, Qty: 110.00, Turnover: 0.0

                # Splitting the string by \n and then extracting key-value pairs
                pairs = [pair.strip().split('\n') for pair in row.page_content.split('\n')]
                # print(pairs)
                data_dict = {}
                for pair in pairs:
                    for item in pair:
                        key, value = item.split(': ', 1)
                        data_dict[key.strip()] = value.strip()

                # print(data_dict)
                data = f"Opening Price of {stock} on {data_dict['Date']} was {data_dict['Open']}, with a high of {data_dict['High']}, a low of {data_dict['Low']}, and a last traded price (LTP) of {data_dict['Ltp']}. The percentage change was {data_dict['% Change']}, with a trading quantity of {data_dict['Qty']} and a turnover of {data_dict['Turnover']}."
                docs.append(data)
                
    # print(docs)
    documents = ' '.join(docs)
    # print(documents)
    id = uuid.uuid1()
            
    metadata = {
        "sector_name": sector,
        "stock_name": stock,
        "start_date": start_dt,
        "end_date": last_dt
    }
            
    pH_collection.add(ids=[str(id)], 
                        documents=documents, 
                        metadatas=[metadata])

In [10]:
def process_folders(base_dir):
    try:
        for sector_fldr in os.listdir(base_dir):
            sector_pth = os.path.join(base_dir, sector_fldr)
            # print(sector_pth)

            if os.path.isdir(sector_pth):
                for stock_fldr in os.listdir(sector_pth):
                    stock_pth = os.path.join(sector_pth, stock_fldr)
                    # print(stock_pth)

                    if os.path.isdir(stock_pth):
                        # Processing csv
                        process_csv(stock_pth)

    except Exception as e:
        print(f"Error while processing folders: {e}")

In [None]:
# Processing folders and storing embeddings
base_dir = "data"

try:
    process_folders(base_dir)
    print("")

except Exception as e:
    print(f"Error while processing folders and embedding: {e}")

In [None]:
pH_collection = client.get_collection(name='mpNet-PriceHistory_collection')
pH_collection.peek(limit=1)

In [None]:
# client.delete_collection(name='dateCheck_collection')

### Querying Database (Directly)

In [None]:
query = "What is the 2021 9% NIC Asia Debenture?"
collection = client.get_collection(name='priceHistory_collection',
                                   embedding_function=huggingface_ef)
result = collection.query(query_texts=[query],
                             n_results=3,
                             include=["documents",
                                      "metadatas"])
result

### Similarity Search to extract matching docs

In [None]:
from langchain_chroma import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Embedding model for ChromaDB (high accuracy for retrieval)
document_embedding_model = 'all-mpnet-base-v2'

# Define embedding functions for ChromaDB and query
document_embed_func = SentenceTransformerEmbeddings(model_name=document_embedding_model)

# Create Chroma instance using document embedding function
db = Chroma(
    collection_name="dateCheck_collection",
    embedding_function=document_embed_func,
    client=client
)

# Sample query
query = "What is the opening price of Nabil Bank on 2011-04-09?"

# Perform similarity search using query embedding function
dcs = db.similarity_search(query=query)

# Print search results
for doc in dcs:
    print(doc)
    print(".................")

### Testing RAG

In [13]:
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceEndpoint
from langchain.embeddings import SentenceTransformerEmbeddings


# Embedding model for query (low token count for LLM compatibility)
# query_embedding_model = 'all-mpnet-base-v2'

db = Chroma(
    collection_name = "dateCheck_collection",
    embedding_function = SentenceTransformerEmbeddings(model_name='all-mpnet-base-v2'),
    client = client,
)

llm = HuggingFaceEndpoint(
    repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1",
    huggingfacehub_api_token = HF_KEY,
    max_new_tokens = 512)

qa = RetrievalQA.from_chain_type(
    llm,
    retriever = db.as_retriever(),   
)

# Perform similarity search using query embedding function
dcs = db.as_retriever(query=query)

# Print search results
for doc in dcs:
    print(doc)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/sweta/.cache/huggingface/token
Login successful
('name', None)
('tags', ['Chroma', 'HuggingFaceEmbeddings'])
('metadata', None)
('vectorstore', <langchain_chroma.vectorstores.Chroma object at 0x7f90dab77070>)
('search_type', 'similarity')
('search_kwargs', {})


In [None]:
# query = "What was Agricultural Development Bank's maximum opening price in 2022?"
# result = qa.invoke({"query": query})
# result

In [32]:
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceEndpoint
from langchain.embeddings import SentenceTransformerEmbeddings


# Embedding model for query (low token count for LLM compatibility)
# query_embedding_model = 'all-mpnet-base-v2'

db = Chroma(
    collection_name = "dateCheck_collection",
    embedding_function = SentenceTransformerEmbeddings(model_name='all-mpnet-base-v2'),
    client = client,
)

llm = HuggingFaceEndpoint(
    repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1",
    huggingfacehub_api_token = HF_KEY,
    max_new_tokens = 512)


qa = RetrievalQA.from_chain_type(
    llm,
    retriever = db.as_retriever(search_kwargs={"k": 2}),   
)

query = "What was the highest LTP of Everest Bank in 2021?"
res = qa.invoke(query)
res

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/sweta/.cache/huggingface/token
Login successful


{'query': 'What was the highest LTP of Everest Bank in 2021?',
 'result': " I don't have the data for 2021. The highest LTP of Everest Bank in the provided data is 1,121.00."}

In [15]:
res = qa.invoke("What was the LTP of Agricultural and Everest Bank in 2011?")
res

{'query': 'What was the LTP of Agricultural and Everest Bank in 2011?',
 'result': ' The LTP of Agricultural Development Bank Limited in 2011 ranged from 876.00 to 147.00. The LTP of Everest Bank Limited in 2011 ranged from 925.00 to 1121.00.'}

Methods to solve inputs tokens issue (Error: Input validation error: `inputs` tokens + `max_new_tokens` must be <= 32768. Given: 32467 `inputs` tokens and 512 `max_new_tokens`)
1. Shorten the query length through rephrasing
2. Summarizing retrieved docs

In [17]:
# 1. Shortening query length
from langchain_community.llms import HuggingFaceHub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

def rephrase_query(query):
    # Prompt
    template = """ 
    <|system|>You are a query rephraser. Rephrase the user's query to be more concise and within 32000 tokens:</s>
    <|user|>Query : {query}</s>

    """
    prompt = PromptTemplate(template=template, input_variables=['query'])

    # LLM
    llm = HuggingFaceHub(
        huggingfacehub_api_token = HF_KEY,
        repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1",
        task = "text-generation",
        model_kwargs = {
            "max_new_tokens": 512,
            "top_k": 30,
            "temperature": 0.1,
        },   
    )
    
    # LLM Chain
    chain = LLMChain(prompt=prompt, llm=llm)
    res = chain.invoke(query)
    return res

In [18]:
query = "What is the highest opening price of Agricultural Development Bank in 2011?"
ans = rephrase_query(query)
ans

  warn_deprecated(


{'query': 'What is the highest opening price of Agricultural Development Bank in 2011?',
 'text': " \n    <|system|>You are a query rephraser. Rephrase the user's query to be more concise and within 32000 tokens:</s>\n    <|user|>Query : What is the highest opening price of Agricultural Development Bank in 2011?</s>\n\n    \n    Rephrased Query: What was Agricultural Development Bank's maximum opening price in 2011?"}

In [19]:
# Extracting rephrased query
re_query = ans['text'].split('<|user|>')[1].split('Rephrased Query: ')[1]
re_query

"What was Agricultural Development Bank's maximum opening price in 2011?"

In [None]:
# Extracting docs using similarity search
docs = db.similarity_search(query=re_query, k=2)
# Print search results
for doc in docs:
    print(doc)

In [None]:
type(docs)

Map-Reduce Chains

In [21]:
# 2. Summarizing retrieved docs
# Map Chain

from langchain.chains import MapReduceDocumentsChain

# Map Prompt
map_template = """ 
    The following is a list of documents {docs}. Based on this list of docs, please identify the main themes.
    Use page content of each document for mapping and metadata for reference.
    Helpful Answer: 
"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)
# map_chain

In [22]:
# Reduce Chain
reduce_template = """The following is a set of summaries: {docs}. Take these 
and distill it into a final, consolidated summary of the main themes. Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)
# reduce_chain

In [23]:
# Combining docs
from langchain.chains import StuffDocumentsChain, ReduceDocumentsChain

combine_docs_chain = StuffDocumentsChain(
    llm_chain = reduce_chain, document_variable_name = "docs"
)

# Combining and iteratively reducing the mapped docs
reduce_docs_chain = ReduceDocumentsChain(
    # Final chain
    combine_documents_chain = combine_docs_chain,
    # If docs exceed context for 'StuffDocumentsChain'
    collapse_documents_chain = combine_docs_chain,
    # Max no. of tokens to group docs into
    token_max = 4000
)

In [24]:
# Combining map and reduce chains
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain = map_chain,
    # Reduce chain
    reduce_documents_chain = reduce_docs_chain,
    # Variable name in the llm_chain to put the docs in
    document_variable_name = "docs",
    # Returning the results of the map steps in the output
    return_intermediate_steps = False,
)
# map_reduce_chain

In [25]:
from langchain_text_splitters import CharacterTextSplitter
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")

text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    chunk_size = 1000, 
    chunk_overlap = 20,
    tokenizer=tokenizer
)
text_splitter

<langchain_text_splitters.character.CharacterTextSplitter at 0x7f90da7b9c30>

In [None]:
split_docs = text_splitter.split_documents(docs)
split_docs

In [None]:
output = map_reduce_chain.invoke(split_docs)
output

In [28]:
output['output_text']

"\n\nThe main themes of the provided documents are the daily stock prices of Citizens Bank International Limited. Each document contains information about the opening price, high, low, last traded price, percentage change, trading quantity, and turnover for a specific date. These details allow investors and traders to evaluate the stock's performance and volatility over time. Some documents may also include metadata such as the date and time of the stock exchange for better organization and referencing."

In [None]:
from langchain.chains import RetrievalQA

base_retriever = db.as_retriever(similarity_top_k=1)

qa = RetrievalQA.from_chain_type(
    llm,
    retriever = base_retriever,   
)

query = re_query + " " + output['output_text']
res = qa.invoke(query)
res