# Database

In [1]:
from pymilvus import connections, db

In [58]:
conn = connections.connect(host="127.0.0.1", port=19530)

In [60]:
db.using_database("book")

In [59]:
try:
    database = db.create_database("book")
except Exception as e:
    print(e.message)

In [38]:
# db.drop_database("book")

In [61]:
db.list_database()

['default', 'book']

# Collection

In [2]:
from pymilvus import MilvusClient, DataType

In [3]:
client = MilvusClient()

In [44]:
client.create_collection(
    collection_name="quick_setup",
    dimension=5
)

In [3]:
client.get_load_state(
    collection_name="cities"
)

{'state': <LoadState: Loaded>}

In [4]:
client.list_collections()

['quick_setup', 'cities']

In [5]:
client.describe_collection(
    collection_name="cities"
)


{'collection_name': 'cities',
 'auto_id': True,
 'num_shards': 1,
 'description': '',
 'fields': [{'field_id': 100,
   'name': 'doc_title',
   'description': '',
   'type': <DataType.VARCHAR: 21>,
   'params': {'max_length': 65535}},
  {'field_id': 101,
   'name': 'chunk_num',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {}},
  {'field_id': 102,
   'name': 'text',
   'description': '',
   'type': <DataType.VARCHAR: 21>,
   'params': {'max_length': 65535}},
  {'field_id': 103,
   'name': 'pk',
   'description': '',
   'type': <DataType.INT64: 5>,
   'params': {},
   'auto_id': True,
   'is_primary': True},
  {'field_id': 104,
   'name': 'vector',
   'description': '',
   'type': <DataType.FLOAT_VECTOR: 101>,
   'params': {'dim': 1024}}],
 'aliases': [],
 'collection_id': 448618908884676457,
 'consistency_level': 1,
 'properties': {},
 'num_partitions': 1,
 'enable_dynamic_field': False}

In [6]:
client.list_aliases()

{'aliases': [], 'db_name': 'default'}

In [5]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
text = "How big is the city of Seatle?"
query_result = embeddings.embed_query(text)
query_result

[-0.03240237757563591,
 0.003940323833376169,
 -0.034126315265893936,
 0.036654822528362274,
 -0.02641349844634533,
 0.03161881864070892,
 0.027606409043073654,
 0.003917064052075148,
 -0.026377873495221138,
 0.033003631979227066,
 0.0460604652762413,
 -0.06516462564468384,
 0.04244999960064888,
 0.027392804622650146,
 0.033403780311346054,
 -0.026797985658049583,
 -0.006886670365929604,
 -0.022796545177698135,
 -0.03339025750756264,
 -0.01452640164643526,
 -0.05771815404295921,
 -0.02878451533615589,
 -0.018893226981163025,
 -0.011787383817136288,
 0.018746955320239067,
 0.03301381692290306,
 -0.07081154733896255,
 -0.021360285580158234,
 -0.05209425091743469,
 0.019447827711701393,
 0.03411497920751572,
 -0.05939173698425293,
 0.013601923361420631,
 0.036904770880937576,
 1.5657182075301534e-06,
 -0.005555565468966961,
 -0.018000874668359756,
 0.03439940884709358,
 0.020253153517842293,
 0.02928280271589756,
 0.040980949997901917,
 -0.014247896149754524,
 -0.012113183736801147,
 -0.0

In [7]:
len(query_result)

768

In [9]:
import json

In [10]:
file_name = "Seattle"

res = client.search(
    collection_name="cities", # Replace with the actual name of your collection
    # Replace with your query vector
    data=[query_result],
    limit=5, # Max. number of search results to return
    search_params={"metric_type": "L2", "params": {}}, # Search parameters
    output_fields=['doc_title', 'text'],
    filter=f'doc_title == "{file_name}"'
)

result = json.dumps(res, indent=4)
print(result)

[
    [
        {
            "id": 448641095281941019,
            "distance": 1.381169080734253,
            "entity": {
                "doc_title": "Seattle",
                "text": "Seattle (  see-AT-\u0259l; Lushootseed: d\u1dbbid\u1dbb\u0259lal\u0315i\u010d) is a seaport city on the West Coast of the United States. It is the seat of King County, Washington. With a 2022 population of 749,256 it is the most populous city in both the state of Washington and the Pacific Northwest region of North America. The Seattle metropolitan area's population is 4.02 million, making it the 15th-largest in the United States. Its growth rate of 21.1% between 2010 and 2020 made it one of the country's fastest-growing large cities.Seattle is situated on an isthmus between Puget Sound (an inlet of the Pacific Ocean) and Lake Washington. It is the northernmost major city in the United States, located about 100 miles (160 km) south of the Canadian border. A major gateway for trade with East Asia, the 

In [52]:
res = client.get(
    collection_name="cities",
    ids=[448641095281940840, 448641095281940749]
)
res

[{'text': "=== 21st century ===\nBoston is an intellectual, technological, and political center but has lost some important regional institutions, including the loss to mergers and acquisitions of local financial institutions such as FleetBoston Financial, which was acquired by Charlotte-based Bank of America in 2004. Boston-based department stores Jordan Marsh and Filene's have both merged into the New York Cityâ€“based Macy's.\nThe 1993 acquisition of The Boston Globe by The New York Times was reversed in 2013 when it was re-sold to Boston businessman John W. Henry. In 2016, it was announced General Electric would be moving its corporate headquarters from Connecticut to the Seaport District in Boston, joining many other companies in this rapidly developing neighborhood.\nBoston has experienced gentrification in the latter half of the 20th century, with housing prices increasing sharply since the 1990s when the city's rent control regime was struck down by statewide ballot proposition

# Trying various embeddings

In [58]:
%pip install -U langchain-mistralai

Collecting langchain-mistralai
  Obtaining dependency information for langchain-mistralai from https://files.pythonhosted.org/packages/f2/00/80322f2346fb9d4920c4d68480833a754a38103cbd4fbdcf2c2a7d176007/langchain_mistralai-0.1.0-py3-none-any.whl.metadata
  Downloading langchain_mistralai-0.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting httpx-sse<1,>=0.3.1 (from langchain-mistralai)
  Obtaining dependency information for httpx-sse<1,>=0.3.1 from https://files.pythonhosted.org/packages/e1/9b/a181f281f65d776426002f330c31849b86b31fc9d848db62e16f03ff739f/httpx_sse-0.4.0-py3-none-any.whl.metadata
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Downloading langchain_mistralai-0.1.0-py3-none-any.whl (10 kB)
Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Installing collected packages: httpx-sse, langchain-mistralai
Successfully installed httpx-sse-0.4.0 langchain-mistralai-0.1.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [64]:
from langchain_mistralai import MistralAIEmbeddings

In [65]:
embedding = MistralAIEmbeddings()

In [66]:
query_result = embeddings.embed_query(text)
query_result

[-0.03240237757563591,
 0.003940323833376169,
 -0.034126315265893936,
 0.036654822528362274,
 -0.02641349844634533,
 0.03161881864070892,
 0.027606409043073654,
 0.003917064052075148,
 -0.026377873495221138,
 0.033003631979227066,
 0.0460604652762413,
 -0.06516462564468384,
 0.04244999960064888,
 0.027392804622650146,
 0.033403780311346054,
 -0.026797985658049583,
 -0.006886670365929604,
 -0.022796545177698135,
 -0.03339025750756264,
 -0.01452640164643526,
 -0.05771815404295921,
 -0.02878451533615589,
 -0.018893226981163025,
 -0.011787383817136288,
 0.018746955320239067,
 0.03301381692290306,
 -0.07081154733896255,
 -0.021360285580158234,
 -0.05209425091743469,
 0.019447827711701393,
 0.03411497920751572,
 -0.05939173698425293,
 0.013601923361420631,
 0.036904770880937576,
 1.5657182075301534e-06,
 -0.005555565468966961,
 -0.018000874668359756,
 0.03439940884709358,
 0.020253153517842293,
 0.02928280271589756,
 0.040980949997901917,
 -0.014247896149754524,
 -0.012113183736801147,
 -0.0

In [63]:
len(query_result)

768

In [None]:
#Gemini

def my_retriever(paper_id, question):
  # Assuming embeddings.embed_query returns a vector
  query_vector = embeddings.embed_query(question)
  
  # Search for relevant documents using paper ID filter
  res = client.search(
      collection_name="cities",  # Replace with your collection name
      data=[query_vector],
      limit=1,  # Retrieve only the top 1 result (the target paper)
      search_params={"metric_type": "L2", "params": {}},  # Search parameters
      output_fields=[],  # Don't need specific fields here, as context will be retrieved later
      filter=f'doc_id == "{paper_id}"'
  )
  
  # Check if a document was found
  if res.total > 0:
    # Retrieve the context (text) for the found document (assuming 'text' field stores the document content)
    context = client.get(collection_name="cities", ids=[res.hits[0].id])["embeddings"][0]["text"]
    return context
  else:
    return None  # Indicate no relevant document found

chain = (
  {"paper_id": RunnablePassthrough(), "question": RunnablePassthrough()}
  | my_retriever  # Custom retriever function
  | PromptTemplate.from_template(template)  # Add context and question
  | llm
  | StrOutputParser()
)


In [72]:
# CLAude

from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

# Load your LLM model
llm = OpenAI(temperature=0)

# Define the prompt template
template = """Answer the question based only on the following context: {context}
Question: {question}"""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

# Retrieve relevant text based on paper_id
paper_id = "your_paper_id"
res = client.search(
    collection_name="cities",
    data=[query_result],
    limit=5,
    search_params={"metric_type": "L2", "params": {}},
    output_fields=['doc_title', 'text'],
    filter=f'doc_id == "{paper_id}"'
)

# Extract the relevant text from the search results
relevant_text = "\n".join([doc.text for doc in res])

# Create the Retrieval QA chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=None,  # Set retriever to None since we already have the relevant text
    return_intermediate_steps=True,
    prompt=prompt
)

# Get the answer from the QA chain
result = qa({"context": relevant_text, "question": "What is the main focus industry of this paper?"})
print(result['result'])

  warn_deprecated(


ValidationError: 1 validation error for OpenAI
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)