## How to Cache Chat Model Responses
https://python.langchain.com/docs/how_to/chat_model_caching/
### InMemory Cache

In [2]:
%%time

from langchain_openai import ChatOpenAI
# <!-- ruff: noqa: F821 -->
from langchain_core.globals import set_llm_cache

llm = ChatOpenAI(model="gpt-4o-mini")


from langchain_core.caches import InMemoryCache

set_llm_cache(InMemoryCache())

# The first time, it is not yet in cache, so it should take longer
llm.invoke("Tell me a joke")

CPU times: user 63.6 ms, sys: 6.97 ms, total: 70.6 ms
Wall time: 1.69 s


AIMessage(content='Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 11, 'total_tokens': 29, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_bd83329f63', 'finish_reason': 'stop', 'logprobs': None}, id='run-ccd6ec31-00b3-4ff2-8e98-e0c769ee8a2d-0', usage_metadata={'input_tokens': 11, 'output_tokens': 18, 'total_tokens': 29, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [3]:
%%time
# Same prompt as the previous cell: with the in-memory cache registered above,
# this repeat call is expected to be answered from the cache and return much faster.
llm.invoke("Tell me a joke")

CPU times: user 266 μs, sys: 15 μs, total: 281 μs
Wall time: 281 μs


AIMessage(content='Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 11, 'total_tokens': 29, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_bd83329f63', 'finish_reason': 'stop', 'logprobs': None}, id='run-ccd6ec31-00b3-4ff2-8e98-e0c769ee8a2d-0', usage_metadata={'input_tokens': 11, 'output_tokens': 18, 'total_tokens': 29, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

### SQLite Cache

In [4]:
%%time

from langchain_community.cache import SQLiteCache

set_llm_cache(SQLiteCache(database_path="local_cache/.langchain.db"))

CPU times: user 1.57 s, sys: 593 ms, total: 2.16 s
Wall time: 1.14 s


In [5]:
%%time
# First call with this prompt: it is not in the cache yet, so it should take longer
llm.invoke("Complete roses are red with a line starting with V")

CPU times: user 21.8 ms, sys: 4.57 ms, total: 26.4 ms
Wall time: 1.05 s


AIMessage(content='Roses are red,  \nViolets are blue,  \nSugar is sweet,  \nAnd so are you.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 17, 'total_tokens': 41, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_bd83329f63', 'finish_reason': 'stop', 'logprobs': None}, id='run-672d3f9f-9517-4ad8-a0c0-e1826857ef8e-0', usage_metadata={'input_tokens': 17, 'output_tokens': 24, 'total_tokens': 41, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [6]:
%%time
# Second call with the identical prompt: should now be served from the SQLite cache
llm.invoke("Complete roses are red with a line starting with V")

CPU times: user 1.67 ms, sys: 1.25 ms, total: 2.92 ms
Wall time: 2.03 ms


AIMessage(content='Roses are red,  \nViolets are blue,  \nSugar is sweet,  \nAnd so are you.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 17, 'total_tokens': 41, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_bd83329f63', 'finish_reason': 'stop', 'logprobs': None}, id='run-672d3f9f-9517-4ad8-a0c0-e1826857ef8e-0', usage_metadata={'input_tokens': 17, 'output_tokens': 24, 'total_tokens': 41, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [7]:
%%time
# Almost the same prompt (V -> P): the prompt text differs, so expect a cache
# miss and a fresh, slower model call
llm.invoke("Complete roses are red with a line starting with P")

CPU times: user 11.3 ms, sys: 3.24 ms, total: 14.6 ms
Wall time: 1.04 s


AIMessage(content='Roses are red,  \nViolets are blue,  \nPetals softly fall,  \nWhispering love true.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 26, 'prompt_tokens': 17, 'total_tokens': 43, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_72ed7ab54c', 'finish_reason': 'stop', 'logprobs': None}, id='run-7e246e0f-af64-4edf-97cb-dd4df6bbf0ba-0', usage_metadata={'input_tokens': 17, 'output_tokens': 26, 'total_tokens': 43, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [8]:
%%time
# Repeat the "P" variant: the previous cell should have cached it, so this
# call is expected to be fast
llm.invoke("Complete roses are red with a line starting with P")

CPU times: user 1.65 ms, sys: 1.6 ms, total: 3.24 ms
Wall time: 2.22 ms


AIMessage(content='Roses are red,  \nViolets are blue,  \nPetals softly fall,  \nWhispering love true.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 26, 'prompt_tokens': 17, 'total_tokens': 43, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_72ed7ab54c', 'finish_reason': 'stop', 'logprobs': None}, id='run-7e246e0f-af64-4edf-97cb-dd4df6bbf0ba-0', usage_metadata={'input_tokens': 17, 'output_tokens': 26, 'total_tokens': 43, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [8]:
import os

# Honour a REDIS_URL supplied by the environment; otherwise assume a local instance.
REDIS_URL = os.environ.get("REDIS_URL", "redis://localhost:6379")
print(f"Connecting to Redis at: {REDIS_URL}")

Connecting to Redis at: redis://localhost:6379


### REDIS Cache
https://python.langchain.com/docs/integrations/caches/redis_llm_caching/
```bash

# Run the Redis Stack server image (not the bare-bones Redis image)

docker run -d --name redis-stack -p 6379:6379 redis/redis-stack-server:latest


```

In [13]:
import time
import redis
from langchain_core.globals import set_llm_cache
from langchain.schema import Generation
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_redis import RedisCache, RedisSemanticCache

In [10]:
# Initialize RedisCache
redis_cache = RedisCache(redis_url=REDIS_URL)

# Register it as the global LLM cache *before* the first timed call below.
# Otherwise that call runs against whichever cache an earlier cell registered,
# its response never reaches Redis, and the repeat call cannot be a Redis hit.
set_llm_cache(redis_cache)

# Initialize the language model
llm = ChatOpenAI(model="gpt-4o-mini")


# Function to measure execution time
def timed_completion(prompt, model=None):
    """Invoke a chat model on ``prompt`` and measure how long the call takes.

    Args:
        prompt: Input passed unchanged to the model's ``invoke`` method.
        model: Object exposing ``invoke(prompt)``. When omitted, the
            notebook-level ``llm`` is used.

    Returns:
        A ``(result, elapsed_seconds)`` tuple: whatever ``invoke`` returned,
        and the duration of the call as a float number of seconds.
    """
    if model is None:
        model = llm
    # perf_counter() is monotonic and high-resolution. time.time() follows the
    # system clock, which can be adjusted mid-measurement and may be too coarse
    # for millisecond-scale cache hits.
    start_time = time.perf_counter()
    result = model.invoke(prompt)
    return result, time.perf_counter() - start_time


# First call (not cached): time the prompt once and keep the result and
# duration (result1, time1) for the comparison made in the following cell.
prompt = "Explain the concept of caching in three sentences."
result1, time1 = timed_completion(prompt)
print(f"First call (not cached):\nResult: {result1}\nTime: {time1:.2f} seconds\n")

First call (not cached):
Result: content='Caching is a technique used to store frequently accessed data in a temporary storage layer, allowing for quicker retrieval and improved performance. By saving copies of data that are expensive to fetch or compute, caching reduces latency and decreases the load on underlying resources, such as databases or APIs. Effective caching strategies can significantly enhance the efficiency of applications and systems by minimizing redundant operations and optimizing resource usage.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 76, 'prompt_tokens': 16, 'total_tokens': 92, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_72ed7ab54c', 'finish_reason': 'stop', 'logprobs': None} id='run-de0c875

In [11]:
# Make sure the Redis cache is the active global LLM cache before repeating the call
set_llm_cache(redis_cache)

# Second call (should be cached): same prompt as the first call
result2, time2 = timed_completion(prompt)
print(f"Second call (cached):\nResult: {result2}\nTime: {time2:.2f} seconds\n")

# Ratio of the two durations; values above 1 mean the second call was faster
print(f"Speed improvement: {time1 / time2:.2f}x faster")


Second call (cached):
Result: content='Caching is a technique used to store frequently accessed data in a temporary storage layer, allowing for quicker retrieval and improved performance. By saving copies of data that are expensive to fetch or compute, caching reduces latency and decreases the load on underlying resources, such as databases or APIs. Effective caching strategies can significantly enhance the efficiency of applications and systems by minimizing redundant operations and optimizing resource usage.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 76, 'prompt_tokens': 16, 'total_tokens': 92, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_72ed7ab54c', 'finish_reason': 'stop', 'logprobs': None} id='run-de0c8759-6

In [None]:
# Clear the Redis cache so a later run of the demo starts without stored entries
redis_cache.clear()
print("Cache cleared")

In [15]:
# Embedding model handed to the semantic cache
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Build the Redis-backed semantic cache
semantic_cache = RedisSemanticCache(
    redis_url=REDIS_URL,
    embeddings=embeddings,
    distance_threshold=0.2,
)

# Make the semantic cache the global LLM cache used by LangChain
set_llm_cache(semantic_cache)


# Function to test semantic cache
def test_semantic_cache(prompt):
    start_time = time.time()
    result = llm.invoke(prompt)
    end_time = time.time()
    return result, end_time - start_time


# Original query: time the first prompt and keep its result and duration.
# NOTE: result1/time1 (and result2/time2 below) reuse the names assigned in the
# Redis exact-match cells above and overwrite those values.
original_prompt = "What is the capital of France?"
result1, time1 = test_semantic_cache(original_prompt)
print(
    f"Original query:\nPrompt: {original_prompt}\nResult: {result1}\nTime: {time1:.2f} seconds\n"
)

# Semantically similar query: worded differently from the original prompt;
# the demo expects the semantic cache to answer it from the stored response.
similar_prompt = "Can you tell me the capital city of France?"
result2, time2 = test_semantic_cache(similar_prompt)
print(
    f"Similar query:\nPrompt: {similar_prompt}\nResult: {result2}\nTime: {time2:.2f} seconds\n"
)

# Ratio of the two durations; values above 1 mean the similar query was faster
print(f"Speed improvement: {time1 / time2:.2f}x faster")

# Clear the semantic cache
semantic_cache.clear()
print("Semantic cache cleared")

Original query:
Prompt: What is the capital of France?
Result: content='The capital of France is Paris.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 8, 'prompt_tokens': 14, 'total_tokens': 22, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_bd83329f63', 'finish_reason': 'stop', 'logprobs': None} id='run-290b5136-2ad3-432e-a8d3-bbcd1200c23d-0' usage_metadata={'input_tokens': 14, 'output_tokens': 8, 'total_tokens': 22, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Time: 2.26 seconds

Similar query:
Prompt: Can you tell me the capital city of France?
Result: content='The capital of France is Paris.' additional_kwargs={'refusal': None} response_metadata={'token_