In [1]:
from langchain_astradb import AstraDBVectorStore
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os
import pandas as pd

In [2]:
load_dotenv()

True

In [3]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [4]:
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [5]:
embedding = OpenAIEmbeddings()

## Steps to setup and access AstraDB for data

In [65]:
ASTRA_DB_API_ENDPOINT = "https://4c489f2d-a32e-4446-98e1-164bacd2c6da-us-east-2.apps.astra.datastax.com"

In [66]:
ASTRA_DB_APPLICATION_TOKEN = "AstraCS:xhSzAbGIqSzGWbrmJrLFJTFs:5db1d2cb8de93689458492c1fc531b37baccfbeeb2251ca02d77cad9227f4204"

In [67]:
ASTRA_DB_KEYSPACE = "default_keyspace"

In [68]:
collection_name = "financebot"

In [69]:
# initializing astradb object with parameters
vstore = AstraDBVectorStore(
    embedding=embedding,
    collection_name=collection_name,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token = ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE
)

In [70]:
from langchain_community.document_loaders import PyPDFLoader

In [71]:
# make sure to add second \
loader = PyPDFLoader("D:\\final_attempt_at_GENAI\\7_projects\\Trading_bot\\data\\finance_data.pdf")

In [72]:
pages = loader.load()

In [73]:
len(pages)

108

#### this indicates that there is 108 pages of data in the pdf file, we will not use this much, but select a few 

In [74]:
pages = pages[10:20] # selecting 10 pages of data

In [75]:
pages[0].page_content

'Table of Contents \n9 \nunderstand root causes. Our full-reticle CV test chips use a sh ortened process flow to provide a faster \nlearning cycle for specific process modules. \n\uf0a7 Our Scribe CV test chips are inserted directly on customers’ product wafers to collect data about critical \nlayers. \n\uf0a7 Our DirectProbe™ CV test chips are designed to enable ultra-fast yield learning for new product designs \nby allowing our customers to measure components of actual product layout and identify yield issues. \n• pdFasTest® Electrical Tester – Our proprietary electrical test hardware is optimized to quickly test our CV test \nchips, enabling fast defect and p arametric characterization of manufacturing processes. As part of the system \noffering, we provide test programs for each CV test chip that are tuned to the customer’s process. This automated \nsystem provides parallel functional testing, thus minimizing th e time required to perform millions of electrical \nmeasurements to te

In [76]:
len(pages)

10

In [77]:
for i, doc in enumerate(pages):
    print(i, doc)

0 page_content='Table of Contents \n9 \nunderstand root causes. Our full-reticle CV test chips use a sh ortened process flow to provide a faster \nlearning cycle for specific process modules. \n\uf0a7 Our Scribe CV test chips are inserted directly on customers’ product wafers to collect data about critical \nlayers. \n\uf0a7 Our DirectProbe™ CV test chips are designed to enable ultra-fast yield learning for new product designs \nby allowing our customers to measure components of actual product layout and identify yield issues. \n• pdFasTest® Electrical Tester – Our proprietary electrical test hardware is optimized to quickly test our CV test \nchips, enabling fast defect and p arametric characterization of manufacturing processes. As part of the system \noffering, we provide test programs for each CV test chip that are tuned to the customer’s process. This automated \nsystem provides parallel functional testing, thus minimizing th e time required to perform millions of electrical \nmea

In [78]:
# collecting all data as string, this will eliminate page split issues

raw_text = ''
for i, doc in enumerate (pages):
    text = doc.page_content
    if text:
        raw_text += text

In [79]:
print(raw_text)

Table of Contents 
9 
understand root causes. Our full-reticle CV test chips use a sh ortened process flow to provide a faster 
learning cycle for specific process modules. 
 Our Scribe CV test chips are inserted directly on customers’ product wafers to collect data about critical 
layers. 
 Our DirectProbe™ CV test chips are designed to enable ultra-fast yield learning for new product designs 
by allowing our customers to measure components of actual product layout and identify yield issues. 
• pdFasTest® Electrical Tester – Our proprietary electrical test hardware is optimized to quickly test our CV test 
chips, enabling fast defect and p arametric characterization of manufacturing processes. As part of the system 
offering, we provide test programs for each CV test chip that are tuned to the customer’s process. This automated 
system provides parallel functional testing, thus minimizing th e time required to perform millions of electrical 
measurements to test our CV test chips. W

## Now we will make chunking using RecursiveCharacterSplit

In [80]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [81]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 100,
)

In [82]:
text_splitter.split_text(raw_text)

['Table of Contents \n9 \nunderstand root causes. Our full-reticle CV test chips use a sh ortened process flow to provide a faster \nlearning cycle for specific process modules. \n\uf0a7 Our Scribe CV test chips are inserted directly on customers’ product wafers to collect data about critical \nlayers. \n\uf0a7 Our DirectProbe™ CV test chips are designed to enable ultra-fast yield learning for new product designs \nby allowing our customers to measure components of actual product layout and identify yield issues.',
 '• pdFasTest® Electrical Tester – Our proprietary electrical test hardware is optimized to quickly test our CV test \nchips, enabling fast defect and p arametric characterization of manufacturing processes. As part of the system \noffering, we provide test programs for each CV test chip that are tuned to the customer’s process. This automated \nsystem provides parallel functional testing, thus minimizing th e time required to perform millions of electrical',
 'measurements 

In [83]:
texts = text_splitter.split_text(raw_text)

In [84]:
len(texts)

90

In [85]:
from langchain_core.documents import Document

In [86]:
# conversion into pagecontent/document format
docs = []
for i in range(len(texts)):
    doc = Document(page_content=texts[i])
    docs.append(doc)

#### data has to be passed into vectorstore in this specific format

In [87]:
docs

[Document(page_content='Table of Contents \n9 \nunderstand root causes. Our full-reticle CV test chips use a sh ortened process flow to provide a faster \nlearning cycle for specific process modules. \n\uf0a7 Our Scribe CV test chips are inserted directly on customers’ product wafers to collect data about critical \nlayers. \n\uf0a7 Our DirectProbe™ CV test chips are designed to enable ultra-fast yield learning for new product designs \nby allowing our customers to measure components of actual product layout and identify yield issues.'),
 Document(page_content='• pdFasTest® Electrical Tester – Our proprietary electrical test hardware is optimized to quickly test our CV test \nchips, enabling fast defect and p arametric characterization of manufacturing processes. As part of the system \noffering, we provide test programs for each CV test chip that are tuned to the customer’s process. This automated \nsystem provides parallel functional testing, thus minimizing th e time required to per

In [88]:
len(docs)

90

## Now we finally add the chunked data into our vector_store

In [89]:

vstore.add_documents(docs)

['74b60f15a7dd44f490bc7465a7cb3411',
 'a4ed7144bab24e02ab076a8e924db5d8',
 'f5225efba0694dc49d2e92ff55ded29e',
 '6b7daf8a056f47fdac828247f11b8577',
 '26f4f878dfd44824a1dc4226935af444',
 '260a124f1d7449fb99a837016ab77467',
 'e60fc51cf8904823b40e1fc804a4818c',
 'd7fa6c9d95a64c7493f7233b720326c8',
 '19cbaab3ff7b4cc8a2f95597a50b7028',
 '5c6cc975abf1472c9f8e793ae1fbb53b',
 '43f7f15547b2497cbe5e84ad274e5e0a',
 '6646892966364edbad343fa9384f0c67',
 '989a107b41c241999d732e9855299b72',
 'c4393244ec504aa4bbd938c355c22646',
 '239ec5cf90d3444fbfe0c744ce4b7127',
 '9eb93b3850fd49ee98f2f32d548a435a',
 'df6e336de74742acadb4b29e5606a4dc',
 'b48ab04e51bd42eab9c0781b5f4c9aad',
 '6e77fffdf27949e7a94080a68e317dd5',
 '4c2e208122a742b1a9b3082ff834e8f6',
 '6cbd891adb30430ba6a6d42800f2db65',
 '6c29fff67f694ad58c808c312a0c1db9',
 '8fe4f454d9a34c42abc8c4825b945392',
 '649c3dc1297d4a28abd74c73b3c43dc7',
 '322ecdb39c9a458a8a2e79a122d9fec9',
 '9aa93050101a477ca6d8d9e12d180dcc',
 '4e4b7c020a9d4b358df26861ab2d88c3',
 

#### data ingestion is done!!!!, retrieval and generation needs to be done

# 2. Retrival

In [None]:
retriever = vstore.as_retriever(search_kwargs={"k":3})

# k=3 is to pick the top 3 similar responses

In [None]:
# eg: will get 3 docs/responses
retriever.get_relevant_documents("What is the market for Registrant's common equity?")

[Document(page_content='organization, which would give any such competitor’s products a competitive advantage over our solutions. \nWe believe that our solutions compare favorably with respect to  competition because we have demonstrated results \nand reputation, strong core technology, ability to create innovative technology, and ability to implement solutions for new \ntechnology and product generations. See the discussions in “Risk Factors” under Item 1A for more information about risks'),
 Document(page_content='“Risk Factors”. \nSee our “Notes to Consolidated F inancial Statements”, included under Part II, Item 8. “Financial Statements and \nSupplementary Data” for additional geographic information. \nSales and Marketing \nOur sales strategy is primarily to pursue targeted accounts through a combination of our direct sales force, our service \nteams, and strategic alliances. After we are engaged by a customer and early in the services process, our engineers seek to'),
 Document(pa

In [93]:
PRODUCT_BOT_TEMPLATE = """
    Your finance bot is an expert in finance related advice.
    Ensure your answers are relevant to the query context and refrain from straying off-topic.
    Your responses should be concise and informative.

    CONTEXT:
    {context}

    QUESTION: {question}

    YOUR ANSWER:  
"""

In [95]:
from langchain_core.prompts import ChatPromptTemplate

In [96]:
prompt = ChatPromptTemplate.from_template(PRODUCT_BOT_TEMPLATE)

In [98]:
from langchain_openai import ChatOpenAI

In [100]:
llm = ChatOpenAI()

In [101]:
from langchain_core.runnables import RunnablePassthrough

In [102]:
from langchain_core.output_parsers import StrOutputParser

In [108]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
     
)

In [109]:
chain.invoke("What is the market for Registrant's common equity?")

'The market for Registrant\'s common equity is influenced by factors such as the company\'s demonstrated results, reputation, core technology, ability to innovate, and ability to implement solutions for new technology and product generations. This can impact the trading value of the company\'s securities. It is important to consider the risks associated with the company\'s business, including investments in research and development, as these factors can affect the market for Registrant\'s common equity. For more detailed information, refer to the "Risk Factors" section and the financial statements provided in the document.'

answer = 'The market for Registrant\'s common equity is influenced by factors such as the company\'s demonstrated results, reputation, core technology, ability to innovate, and ability to implement solutions for new technology and product generations. This can impact the trading value of the company\'s securities. It is important to consider the risks associated with the company\'s business, including investments in research and development, as these factors can affect the market for Registrant\'s common equity. For more detailed information, refer to the "Risk Factors" section and the financial statements provided in the document.'

## We have successfully used the chain, prompttemplate to send a question to the LLM and it has accessed the AstraDB to retrieve the response.!!!