In [10]:
# %pip install pymilvus
# %pip install langchain transformers langchain_openai 
# %pip install sentence-transformers
# import os
# import time

In [11]:
from pymilvus import db, connections, Collection, utility

conn_name = 'cache_conn'
db_name = 'cache_db'
collection_name = 'llm_cache'

connections.add_connection(
    cache_conn = {
        "host":"localhost",
        "port":"19530",
        "username":"username",
        "password":"password"
    }
)

connections.connect(conn_name)

current_dbs = db.list_database(using=conn_name)

if db_name not in current_dbs:
    print(f"Creating database:{db_name}")
    resume_db = db.create_database(db_name, using=conn_name)
else:
    print(f"Database {db_name} already exists!!!")

db.using_database(db_name, using=conn_name)


Database cache_db already exists!!!


In [12]:
from pymilvus import FieldSchema, CollectionSchema, Collection, DataType

cache_id = FieldSchema(
    name='cache_id',
    dtype=DataType.INT64,
    auto_id = True,
    is_primary  = True,
)

prompt_text = FieldSchema(
    name='prompt_text',
    dtype=DataType.VARCHAR,
    max_length= 8192  # Increased to handle longer prompts
)

prompt_embedding = FieldSchema(
    name ='prompt_embedding',
    dtype=DataType.FLOAT_VECTOR,
    dim = 384
)

response_text = FieldSchema(
    name='response_text',
    dtype=DataType.VARCHAR,
    max_length= 32768  # Increased to handle longer LLM responses (max is 65535)
)

cache_schema = CollectionSchema(
    fields=[cache_id, prompt_text, response_text, prompt_embedding],
    description='cache schema',
    enable_dynamic_field = True
)

if utility.has_collection(collection_name, using=conn_name):
    print(f"Dropping existing collection: {collection_name}")
    utility.drop_collection(collection_name, using=conn_name)
else:
    print(f"Collection {collection_name} does not exist. Proceeding to create.")

cache_collection = Collection(
    name = collection_name,
    schema=cache_schema,
    using=conn_name,
    shard_num=2    #A shard is a horizontal split of your data like dividing your collection into smaller parts to parallelize queries and insertions.
)


print(f"Collection Schema: {cache_collection.schema}")


Dropping existing collection: llm_cache
Collection Schema: {'auto_id': True, 'description': 'cache schema', 'fields': [{'name': 'cache_id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'prompt_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 8192}}, {'name': 'response_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 32768}}, {'name': 'prompt_embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'enable_dynamic_field': True}


In [13]:
index_params = {
    'index_type':'IVF_FLAT',
    'metric_type':'L2', #using L2 with 
    'params': {"nlist":1024}   #number of clusters (buckets) Milvus divides your vector space into during indexing.
}

cache_collection.create_index(
    field_name='prompt_embedding',
    index_params=index_params
)

cache_collection.flush()
cache_collection.load()

#### Inference Processing with Caching

In [14]:
# %pip install -U langchain langchain-core langchain-community langchain-openai
# %pip install langchain-google-genai

In [None]:
from transformers import AutoTokenizer
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.chat_models import ChatOllama
from langchain_google_genai import ChatGoogleGenerativeAI
from sentence_transformers import SentenceTransformer
import os
import time

# llm = ChatOpenAI(temperature = 0.0,model='text-davinci-003')
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.3, api_key=GOOGLE_API_KEY)
# llm = ChatOllama(temperature = 0.0,model='llama3.2')

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

similarity_threshold = 0.95

search_params = {
    'metric_type':'L2',
    # 'limit':5, why cant i define limit in the params only? - params are more related to index than to search
    'ignore_growing':False,
    'params':{'nprobe':20, 'radius':similarity_threshold} #how many clusters Milvus searches during query time.
}




In [16]:
def get_response(prompt):
    
    start_time = time.time()

    prompt_embed = embedding_model.encode(prompt, normalize_embeddings=True)

    cache_results = cache_collection.search(
        data=[prompt_embed.tolist()],
        anns_field='prompt_embedding',
        offset=0,
        param=search_params,
        limit=1,
        expr=None,
        output_fields=["prompt_text","response_text"],
        consistency_level='Strong'
    )

    returned_response = None

    if len(cache_results[0]) > 0:
        print(f"{prompt} : Cache Hit: {cache_results[0]}")
        returned_response = cache_results[0][0].get('response_text')

    else:
        # llm_response = llm(prompt)
        llm_response = llm.invoke(prompt)  #This is used for LangChain Chatbots
        print(f"{prompt} : LLM Response : {llm_response.content}")

        prompt_text = [prompt]
        prompt_embedding = [prompt_embed.tolist()]
        response_text = [llm_response.content]

        insert_data = [prompt_text, response_text, prompt_embedding]

        mr = cache_collection.insert(insert_data)

        # This writes the data from the buffer to the storage engine.
        print("Flushing data to Milvus...")
        cache_collection.flush()

        

    end_time = time.time()

    print(f"Total Response Time : {start_time}-{end_time}")
    return returned_response

In [17]:
response = get_response("In which year was Shaq born?")
response = get_response("Distance between the earth and the moon?")
response = get_response("Which Countries have been under Civil War")
response = get_response("In Isreal a good country?")
response = get_response("Nasa is better at it's work than spaceX")

In which year was Shaq born? : LLM Response : Shaquille O'Neal (Shaq) was born in **1972**.
Flushing data to Milvus...
Total Response Time : 1762799121.073844-1762799126.485801
Distance between the earth and the moon? : LLM Response : The distance between the Earth and the Moon is not constant because the Moon's orbit around Earth is elliptical. However, we can provide an average distance and the range:

*   **Average Distance:** Approximately **384,400 kilometers (238,900 miles)**.

Here's the range:

*   **Perigee (closest point):** Around **363,104 kilometers (225,623 miles)**
*   **Apogee (farthest point):** Around **405,696 kilometers (252,088 miles)**

To give you a sense of scale, you could fit about 30 Earths side-by-side in the space between our planet and the Moon.
Flushing data to Milvus...
Total Response Time : 1762799126.486322-1762799135.654628
Which Countries have been under Civil War : LLM Response : Civil war is a broad term for an internal conflict within a country, o