In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
import json
import ollama
from langchain_core.messages import HumanMessage, SystemMessage
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_ollama import ChatOllama

In [22]:
#sys.path.append('../..')
#_ = load_dotenv(find_dotenv()) # read local .env file
#openai.api_key  = os.environ['OPENAI_API_KEY']

In [2]:
# Load PDF
# One loader per source file; only a single PDF is listed here.
loaders = [PyPDFLoader("pdf/book.pdf")]

# Flatten the documents returned by every loader into a single list.
docs = [document for loader in loaders for document in loader.load()]

In [3]:
# Chunking configuration: chunks of up to 1500 characters (measured with len)
# that overlap by 150 characters; separators are treated as plain strings.
splitter_options = {
    "chunk_size": 1500,
    "chunk_overlap": 150,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents into chunks for embedding.
splits = text_splitter.split_documents(docs)

In [7]:
#!pip install langchain-openai

[33mDEPRECATION: Loading egg at /Users/ivy/anaconda3/lib/python3.11/site-packages/binwalk-2.3.3-py3.11.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..[0m[33m


In [8]:
# Use only if local (Ollama) embeddings are NOT going to be used.
# NOTE(review): OpenAIEmbeddings presumably needs OPENAI_API_KEY in the
# environment -- confirm before running this cell.
import shutil

# Imported here so the cell works on a fresh kernel: neither name is imported
# by any earlier cell. langchain_community is already a dependency of this
# notebook; langchain_openai.OpenAIEmbeddings is the maintained replacement
# if that package is installed.
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

embedding = OpenAIEmbeddings()
persist_directory = 'docs/chroma/'

# Remove old database files, if any (portable equivalent of `rm -rf`).
shutil.rmtree(persist_directory, ignore_errors=True)

# Embed every chunk and persist the resulting vector store on disk.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [4]:
# If local embeddings are going to be used:
# request one embedding for a sample sentence from the local Ollama model.
sample_request = {
    "model": 'mxbai-embed-large',  # nomic-embed-text is also available
    "prompt": 'Llamas are members of the camelid family',
}
embedding = ollama.embeddings(**sample_request)

# Directory used for the persisted Chroma database.
persist_directory = 'docs/chroma/'

In [5]:
# Remove old database files, if any. Pure-Python equivalent of
# `rm -rf ./docs/chroma`: works on any OS and stays silent when the
# directory does not exist.
import shutil

shutil.rmtree('./docs/chroma', ignore_errors=True)

In [7]:
# Chroma client used by the manual embedding loop.
client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection named "docs" already exists in this client.
collection = client.get_or_create_collection(name="docs")

In [None]:
# Store each chunk in the vector database, one embedding request per chunk.
for idx, chunk in enumerate(splits):
    text = str(chunk.page_content)
    try:
        vector = ollama.embeddings(model="mxbai-embed-large", prompt=text)["embedding"]
        # The chunk's position in `splits` doubles as its id in the collection.
        collection.add(ids=[str(idx)], embeddings=[vector], documents=[text])
    except Exception as err:
        # Best effort: report the failing chunk and carry on with the rest.
        print(f"Error processing document {idx}: {err}")

In [21]:
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

persist_directory = 'docs/chroma/'
embedding_model = OllamaEmbeddings(model="mxbai-embed-large")

# Build a LangChain Chroma vector store straight from the chunks, embedding
# them with the local Ollama model and persisting to persist_directory.
# This does not reuse the chromadb collection created earlier.
store_options = {
    "documents": splits,
    "embedding": embedding_model,
    "persist_directory": persist_directory,
}
vectordb = Chroma.from_documents(**store_options)
#vector_store.add_documents(splits)

### LLM

In [28]:
### LLM
# Name of the local Ollama chat model (alternative: 'llama3.2').
local_llm = 'llama3.2:1b-instruct-fp16'

# Chat model used by the QA chains below; temperature is pinned to 0.
llm = ChatOllama(temperature=0, model=local_llm)
### Router

In [None]:
def test():
    """Smoke-test the router prompt against three sample questions.

    Builds a JSON-mode ChatOllama model from the module-level ``local_llm``,
    sends each sample question together with the routing instructions, and
    prints the three parsed JSON replies on a single line.
    """
    llm_json_mode = ChatOllama(model=local_llm, temperature=0, format='json')

    router_instructions = """You are an expert at routing a user question to a vectorstore or web search.
    
    The vectorstore contains documents related to agents, prompt engineering, and adversarial attacks.
                                        
    Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.
    
    Return JSON with single key, datasource, that is 'websearch' or 'vectorstore' depending on the question."""

    # Test router: two questions meant for web search, then one for the vectorstore.
    sample_questions = (
        "Who is favored to win the NFC Championship game in the 2024 season?",
        "What are the models released today for llama3.2?",
        "What are the types of agent memory?",
    )

    # Collect every reply first, then parse and print them together.
    replies = [
        llm_json_mode.invoke(
            [SystemMessage(content=router_instructions)] + [HumanMessage(content=sample)]
        )
        for sample in sample_questions
    ]
    print(*[json.loads(reply.content) for reply in replies])

In [29]:
# Create the chain: RetrievalQA with its default prompt, backed by the
# vector store's retriever.
retriever = vectordb.as_retriever()
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

In [30]:
# Test 1: ask the default chain a question about the book.
question = "what is beachead market? and which step is it in?"

# Use .invoke() rather than calling the chain object directly: the direct
# call is deprecated in LangChain, and the rest of this notebook already
# uses .invoke().
result = qa_chain.invoke({"query": question})
result["result"]

"A beachhead market is not explicitly defined in the provided text, but based on the context, I can make some educated guesses.\n\nIt appears to be related to a business strategy or marketing approach. Here are a few possible interpretations:\n\n1. **Beachhead market**: In military and strategic planning, a beachhead refers to a strategic location where an army or force establishes a foothold in a hostile territory. Similarly, in business, a beachhead market is a strategic area where a company establishes its presence and gains traction.\n2. **Market entry strategy**: A beachhead market could refer to the initial market entry strategy for a new product, service, or business. It's a critical step where a company decides which markets to focus on first.\n\nThe specific steps mentioned in the text seem to be related to the following:\n\n* STEP 1: Selecting a Beachhead Market (41)\n* STEP 2: Analyzing Top 6-12 Market Opportunities and Choosing One to Pursue\n* STEP 3: Profiling the Persona

In [75]:
# Prepare the prompt template: {context} and {question} are the two
# placeholders declared as input variables below.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

In [76]:
# Test the custom prompt template.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,  # adds result["source_documents"]
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)
question = 'what is selectable market in step 1'

# Use .invoke() rather than calling the chain object directly: the direct
# call is deprecated in LangChain, and the rest of this notebook already
# uses .invoke().
result = qa_chain.invoke({"query": question})
result["result"]


'thanks for asking! \n\nA selectable market refers to the first market that you can realistically and efficiently enter with your existing resources, without having to wait for a larger or more established market to mature. It\'s the initial market where you can gain high exposure among potential customers and test your product or service before expanding into other markets.\n\nIn other words, it\'s the "first shot" at success in a new market, rather than trying to enter a large or well-established market that may take time to develop.'

In [77]:
# Show where the answer came from: the first retrieved source document
# (available because the chain was built with return_source_documents=True).
result["source_documents"][0]

Document(metadata={'page': 58, 'source': 'pdf/book.pdf'}, page_content='STEP 2\nSelect a Beachhead Market\n41')

In [79]:
# Test, but with info.
# RAG doesn't work here
# template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
# {context}
# Question: {question}
# Helpful Answer:"""
# question = 'what is beachead market?'
# qa_chain = RetrievalQA.from_chain_type(llm,
#                                        retriever=vectordb.as_retriever(),
#                                        return_source_documents=True,
#                                        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})
# 
# 
# result = qa_chain({"query": question})
# result["result"]

"I don't know, thanks for asking!"

In [80]:
### Generate this is good.

# Prompt: a plain str.format template with two placeholders, {context} and
# {question}. Trailing spaces inside the literal are part of the prompt text.
rag_prompt = """You are an assistant for question-answering tasks. 

Here is the context to use to answer the question:

{context} 

Think carefully about the above context. 

Now, review the user question:

{question}

Provide an answer to this questions using only the above context. 

Use three sentences maximum and keep the answer concise.

Answer:"""

# Post-processing
def format_docs(docs):
    """Join the ``page_content`` of each document with a blank line between them.

    Args:
        docs: iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

# Test: retrieve, build the prompt by hand, and call the LLM directly.
# NOTE: `question` is whatever an earlier cell last assigned.
# The retrieved chunks get their own name so the global `docs` (the loaded
# PDF pages) is not overwritten; otherwise re-running the splitter cell
# after this one would split the retrieved chunks instead of the book.
retrieved_docs = vectordb.as_retriever().invoke(question)
docs_txt = format_docs(retrieved_docs)
rag_prompt_formatted = rag_prompt.format(context=docs_txt, question=question)
generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])
print(generation.content)

A beachhead market refers to a position or state where a business has established itself with positive cash flow before it runs out, allowing for quick achievement of positive word of mouth (WOM) that can be a source of success or failure. In military operations, a beachhead strategy involves establishing a base of operations in enemy territory to launch further attacks and capture adjacent areas. For entrepreneurs, identifying a beachhead market is crucial as it enables them to establish a strong foundation for future growth and expansion.


In [84]:
# Streamlit front end: a chat box plus a document uploader, both talking to
# a backend over HTTP.
# NOTE: Streamlit scripts are meant to be launched with `streamlit run`;
# executing this cell inside the notebook kernel only produces warnings.
import streamlit as st
import requests

# Seconds to wait for the backend before giving up. Without a timeout,
# requests.post can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30

# Set the title of the app
st.title("Chatbot with Document Upload")

# Chatbot area
st.header("Chat with AI")
user_input = st.text_input("Enter your message:")
chat_response = ""

if user_input:
    # Replace with your server/chatbot API URL
    api_url = "http://your_server_endpoint/chatbot"
    try:
        response = requests.post(
            api_url,
            json={"message": user_input},
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code == 200:
            chat_response = response.json().get("response", "")
        else:
            chat_response = "Error: Couldn't fetch the response from the chatbot."
    except (requests.RequestException, ValueError):
        # Connection failure, timeout, or a body that is not valid JSON:
        # show the same error message instead of crashing the app.
        chat_response = "Error: Couldn't fetch the response from the chatbot."

    st.write("Chatbot:", chat_response)

# Divider
st.markdown("---")

# Document Upload Area
st.header("Upload Your Documents")
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "docx", "txt"])

if uploaded_file is not None:
    # Display uploaded file name
    st.write("File uploaded:", uploaded_file.name)

    # Send the file to the backend with its original name and MIME type,
    # so the server can tell pdf/docx/txt apart. Raw bytes alone would be
    # sent under the form-field name with no content type.
    files = {
        "file": (uploaded_file.name, uploaded_file.getvalue(), uploaded_file.type)
    }

    # Replace with your server endpoint that handles document processing
    upload_url = "http://your_server_endpoint/upload"
    try:
        response = requests.post(upload_url, files=files, timeout=REQUEST_TIMEOUT)
        upload_ok = response.status_code == 200
    except requests.RequestException:
        # Network errors and timeouts count as a failed upload.
        upload_ok = False

    if upload_ok:
        st.write("Document uploaded successfully!")
    else:
        st.write("Failed to upload the document.")



2024-09-29 00:47:42.907 
  command:

    streamlit run /Users/ivy/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-09-29 00:47:42.908 Session state does not function when running a script without `streamlit run`
