# Create a local RAG application with LangChain and Milvus Lite

## Get contents from url

We need to install the `langchain-community` and `beautifulsoup4` packages:
```
pip install -qU langchain-community beautifulsoup4
```

In [None]:
import requests
from bs4 import BeautifulSoup
# print(bs4.__version__)
# from langchain_community.document_loaders import WebBaseLoader

# Fetch one web page and extract its title and body text.
# Produces the notebook-level names `title` and `content` used by later cells.

# The URL of the webpage you want to load
page_url = "YOUR_URL"  # change YOUR_URL to the URL of the webpage you want to load
# Browser-like User-Agent header; presumably needed so the site does not
# reject the request as coming from a script -- confirm for your target site.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
# Send the HTTP request and get the HTML code of the webpage.
# The timeout (seconds) keeps the cell from blocking forever when the
# server never responds; requests applies no timeout by default.
response = requests.get(page_url, headers=headers, timeout=30)
response.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx status

# Use BeautifulSoup to parse the HTML code
soup = BeautifulSoup(response.text, 'html.parser')

# The page is expected to carry its title in <h1 class="rich_media_title">
# and its body in <div class="rich_media_content">.
title_tag = soup.find('h1', class_='rich_media_title')
content_tag = soup.find('div', class_='rich_media_content')

# Resolve title and content independently, so that a missing body does not
# throw away a title that was found (and vice versa).
title = title_tag.get_text(strip=True) if title_tag is not None else 'No title found'
content = content_tag.get_text(strip=True) if content_tag is not None else 'No content found'

print('title:', title)
print("content:", content)

## Embedding contents

- Connect to the embedding model.  
  The connection parameters for the embedding model (`OPENAI_API_KEY`, `OPENAI_EMB_MODEL`, `OPENAI_EMB_URL`) are stored in the `.env` file.

LangChain provides two methods for using embeddings:
1. `embed_query()` — Used to generate embeddings for queries.
2. `embed_documents()` — Used to generate embeddings for documents, forming the content of a knowledge base.

**Notes**:  
1. Pay attention to the maximum dimension value of the current embedding model.  
2. Each embedding model has a token limit for processing; documents need to be split based on this limit.  
3. The documents passed to the embedding model must be a list of plain Python strings (for example, each chunk's `page_content`), not document objects.  



In [None]:
from langchain_openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from bs4 import BeautifulSoup

import os
import requests

# print(os.getenv("OPENAI_API_KEY"))
# print(os.getenv("OPENAI_API_BASE"))
# print(os.getenv("OPENAI_MODEL_NAME"))
# print(os.getenv("OPENAI_EMB_URL"))
# print(os.getenv("OPENAI_EMB_MODEL"))

# Split the scraped `content` into chunks and embed every chunk.
# Produces the notebook-level names `docs`, `text` and `embeddings`.

# Values from the .env file replace any already-set environment variables.
load_dotenv(override=True)

# Connection settings for the embedding endpoint, read from the environment.
embedding_settings = {
    "api_key": os.getenv("OPENAI_API_KEY"),
    "model": os.getenv("OPENAI_EMB_MODEL"),
    "base_url": os.getenv("OPENAI_EMB_URL"),
    "dimensions": 768,
}
embeddings_model = OpenAIEmbeddings(**embedding_settings)

# A deliberately small chunk size, for demonstration purposes.
# Lengths are measured with len(), i.e. in characters.
splitter_settings = {
    "chunk_size": 240,
    "chunk_overlap": 20,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# create_documents() takes a list of strings; here it is the single page body.
docs = text_splitter.create_documents([content])
# Keep only the raw strings, which is what embed_documents() is given below.
text = [chunk.page_content for chunk in docs]
print(docs)
print(text)

# One embedding is requested per chunk in `text`.
embeddings = embeddings_model.embed_documents(text)
print(embeddings)
# print(content)

## Establishing the Storage Structure for Vector Data  
Using the vector database `Milvus Lite`  
```
pip install pymilvus
```

**Note**:  
`Milvus Lite` cannot be directly installed and used in a Windows environment.  

The data structure in Milvus includes schema, collection, and index.  
You need to define the storage structure for vector data first.  

In the example below, a collection named `kgbase` will be created.  
The vector dimension must be the same as the `dimensions` value of the embedding model (768 in this example).  

There are 4 fields in total:  
- **id**: `int64` - Store the primary key, `auto_id=True`.  
- **docid**: `varchar`, `max_length=512` - Store the document ID or document name.  
- **doc_vector**: `float_vector` - Store the content vector. The `dim` parameter must equal the dimensions of the embedding model.  
- **doc_content**: `varchar`, `max_length=4096` - Store the document contents.  

In [None]:
from pymilvus import MilvusClient,DataType,connections,Collection

# Create the `kgbase` collection (schema + vector index) in a local
# Milvus Lite database file, replacing any existing collection of that name.

# Milvus Lite database stored in a local file.
client = MilvusClient("./milvus_kgbase.db")

# auto_id=True matches the `id` primary field below: primary keys are
# generated by Milvus, so inserted rows do not supply an `id`.
schema = MilvusClient.create_schema(
    auto_id=True,
    enable_dynamic_field=True,
)

schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True, auto_id=True)
schema.add_field(field_name="docid", datatype=DataType.VARCHAR, max_length=512)
# dim must equal the `dimensions` value passed to the embedding model (768).
schema.add_field(field_name="doc_vector", datatype=DataType.FLOAT_VECTOR, dim=768)
schema.add_field(field_name="doc_content", datatype=DataType.VARCHAR, max_length=4096)

# Index on the vector field; searches must use the same COSINE metric.
index = client.prepare_index_params(
    field_name="doc_vector",
    index_type="AUTOINDEX",
    metric_type="COSINE"
)

collection_name = "kgbase"
# Start from a clean slate: dropping the collection deletes any vectors
# stored by a previous run.
if client.has_collection(collection_name):
    client.drop_collection(collection_name)

client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index
)

# Report the load state of the freshly created collection.
res = client.get_load_state(
    collection_name=collection_name
)

print(res)

## Save Embedded Vectors to the Vector Database  
The vector database used is the local `Milvus Lite`.  

If any issues occur, remember to restart the Python virtual environment after fixing them.  

In [None]:
from pymilvus import MilvusClient,DataType,connections,Collection

# Insert every text chunk together with its embedding into the `kgbase`
# collection. Reads `text`, `embeddings` and `title` from earlier cells.

collection_name = "kgbase"

# Each chunk must have exactly one vector; fail loudly instead of pairing
# chunks with the wrong vectors or silently dropping some.
if len(text) != len(embeddings):
    raise ValueError(
        f"got {len(text)} text chunks but {len(embeddings)} embeddings"
    )

# One row per chunk. No `id` key is supplied because the `id` primary field
# is declared with auto_id=True in the schema cell. The page title serves as
# the document id for every chunk of this page.
data = [
    {"docid": title, "doc_vector": vector, "doc_content": chunk}
    for chunk, vector in zip(text, embeddings)
]

client = MilvusClient("./milvus_kgbase.db")
res = client.get_load_state(
    collection_name=collection_name
)

print(res)

res = client.insert(
    collection_name=collection_name,
    data=data
)

print(res)

## Question Retrieval and Answering  

This process involves:  
1. Generating embeddings for the question.  
2. Searching the Milvus database for the stored chunks most similar to the question vector.  
3. Combining the retrieved results with the question text into a prompt.  
4. Submitting the generated prompt to the large language model for answering. 


Embed the query:

In [None]:
from langchain_openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from bs4 import BeautifulSoup

import os
import requests

# Embed the user's question with the same embedding settings used for the
# documents. Produces the notebook-level names `query` and `query_vector`.

query = "YOUR QUERY"    # Change "YOUR QUERY" to your query

# Values from the .env file replace any already-set environment variables.
load_dotenv(override=True)

# dimensions=768 matches the `dim` of the doc_vector field in the collection.
embedding_settings = {
    "api_key": os.getenv("OPENAI_API_KEY"),
    "model": os.getenv("OPENAI_EMB_MODEL"),
    "base_url": os.getenv("OPENAI_EMB_URL"),
    "dimensions": 768,
}
embeddings_model = OpenAIEmbeddings(**embedding_settings)

# embed_query() is the single-text counterpart of embed_documents().
query_vector = embeddings_model.embed_query(query)
print(query_vector)

Perform retrieval in `Milvus Lite`  
Extract the contents from the retrieval results  


In [None]:
from pymilvus import MilvusClient,DataType,connections,Collection

# Search the `kgbase` collection for the chunks closest to `query_vector`
# (defined in the query-embedding cell) and collect their text.

client = MilvusClient("./milvus_kgbase.db")
collection_name = "kgbase"

res = client.search(
    collection_name=collection_name,
    anns_field="doc_vector",
    data=[query_vector],  # a single query vector
    limit=10,  # return at most 10 hits
    search_params={"metric_type": "COSINE"},  # same metric as the index
    output_fields=["docid", "doc_content"]  # Specify output fields
)

# `res` is iterated as an outer sequence of hit lists; flatten it and keep
# only the stored chunk text of every hit.
res_content = [hit['entity']['doc_content'] for hits in res for hit in hits]

print(res_content)


Generate a prompt that includes the search results and the query, then get the answer from the LLM.

In [None]:
from langchain_openai import OpenAI
from langchain.prompts import PromptTemplate
# The dotenv package is imported to load the parameters from the .env file
from dotenv import load_dotenv
import os

# Build a prompt from the retrieved chunks (`res_content`) and the question
# (`query`), both defined in earlier cells, and ask the LLM to answer it.

# Values from the .env file replace any already-set environment variables.
load_dotenv(override=True)

# Connection settings for the completion model are read from the environment.
llm=OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    model=os.getenv("OPENAI_MODEL_NAME"),
    base_url=os.getenv("OPENAI_API_BASE"),
    temperature=0.1,
#    streaming=True,
)


# Generate the prompt template
template = """
You are a knowledgeable assistant capable of answering a wide range of questions. 
Before each question, you will be provided with some reference materials, and you will organize and interpret the information and knowledge points related to the question based on these materials, then answer the question in language that is understandable to the general public.

Reference Materials: {contents}
Question: {question}
Answer:
"""

prompt=PromptTemplate(
    input_variables=["question","contents"],
    template=template,    
)

# Concatenate the retrieved chunks, each preceded by a newline.
# An empty result list yields an empty string.
contents = "".join("\n" + item for item in res_content)

generate_prompt=prompt.format(question=query,contents=contents)

response = llm.invoke(generate_prompt)
print(response)