In [3]:
from langchain_anthropic import ChatAnthropic
from langchain_core.tools import tool

In [4]:
from langgraph.graph import MessageGraph
from langgraph.prebuilt import ToolNode,tools_condition

In [5]:
@tool
def divide(a:float,b:float) -> int:
    "Return a / b"
    return a/b
    

In [17]:
llm

ChatAnthropic(model='claude-3-haiku-20240307', anthropic_api_url='https://api.anthropic.com', anthropic_api_key=SecretStr('**********'), model_kwargs={})

In [18]:
tools = [divide]

In [19]:
graph_builder = MessageGraph()

/tmp/ipykernel_27495/1301157418.py:1: LangGraphDeprecatedSinceV10: MessageGraph is deprecated in LangGraph v1.0.0, to be removed in v2.0.0. Please use StateGraph with a `messages` key instead. Deprecated in LangGraph V1.0 to be removed in V2.0.
  graph_builder = MessageGraph()


In [20]:
graph_builder = MessageGraph()
graph_builder.add_node("tools", ToolNode(tools))
graph_builder.add_node("chatbot", llm.bind_tools(tools))
graph_builder.add_edge("tools", "chatbot")
graph_builder.add_conditional_edges(
    "chatbot",
    # highlight-next-line
    tools_condition,
    {
        # If it returns 'action', route to the 'tools' node
        "action": "tools",
        # If it returns '__end__', route to the end
        "__end__": "__end__",
    },
)
graph_builder.set_entry_point("chatbot")
graph = graph_builder.compile()
graph.invoke([("user", "What's 329993 divided by 13662?")])

/tmp/ipykernel_27495/668081823.py:1: LangGraphDeprecatedSinceV10: MessageGraph is deprecated in LangGraph v1.0.0, to be removed in v2.0.0. Please use StateGraph with a `messages` key instead. Deprecated in LangGraph V1.0 to be removed in V2.0.
  graph_builder = MessageGraph()


BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}, 'request_id': 'req_011CU5i3CBkDGDj5FXsUSWZu'}

In [1]:
import torch 

In [2]:
tensorA = torch.randn((13,768))

In [5]:
tensorA.mean(dim = 0).shape

torch.Size([768])

### Embeddings and RAG Experiments

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
EmbeddingsModel = HuggingFaceEmbeddings

In [3]:
type(EmbeddingsModel)

pydantic._internal._model_construction.ModelMetaclass

In [12]:
device = "cuda"

In [14]:
embedding = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2",
                                  model_kwargs = {"device":device,
                                                  "trust_remote_code":True},
                                                  encode_kwargs = {"normalize_embeddings":False})

  from .autonotebook import tqdm as notebook_tqdm


In [42]:
sentence1 = "I am vegan"
sentence2 = "I am not vegan"

In [43]:
vec1 = await embedding.aembed_documents([sentence1])
vec2 = await embedding.aembed_documents([sentence2])

In [44]:
vec1_tensor = torch.tensor(vec1)
vec2_tensor = torch.tensor(vec2)

In [45]:
cos = torch.nn.CosineSimilarity(dim=1,eps = 1e-6)

In [46]:
cos(vec1_tensor,vec2_tensor)

tensor([0.9258])

Why did this happen? The sentences "I am vegan" and "I am not vegan" are two completely opposite sentences, yet they have a very high cosine similarity score. 

The reason is sentence embeddings like sentence-BERT captures semantic proximity between two sentences, not logic or polarity.

Models like all-MiniLM-L6-v2 are trained on semantic similarity tasks — they learn to cluster related sentences, not necessarily opposite in meaning ones.

The model doesn’t actually understand negation — it just sees a small difference (“not”) in an otherwise identical sentence.

BERT-style models are trained on next sentence prediction or paraphrase detection, not on logical reasoning.

Their training doesn’t penalize failure to detect contradiction.

The loss function (contrastive / cosine-based) only rewards closeness of semantically similar pairs — not recognizing opposites.

### TextSplitters 

### Strategy to split texts:

#### Length-based

Each chunk should exceed a particular chunk-size threshold. 

- Straight forward

- Consistent chunk sizes

- Easily adaptable to different model requirements

Types of length-based splitting:
- Token-based
- Character-based

### Implementation

In [1]:
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.documents import Document

In [2]:
with open('./llm_wiki.txt','r') as f:
    content = f.read()

### Split by " " and use len() for counting chunks. i.e. using len function gives us split by character.

In [39]:
text_splitter = CharacterTextSplitter(chunk_size = 100,
                                      chunk_overlap = 0,
                                      separator = " ",
                                      length_function = len,
                                      is_separator_regex=False
                                      )

In [36]:
documents = text_splitter.create_documents(
    [content],metadatas=[{"source":"llm_wiki.txt"}]
)

In [37]:
texts = text_splitter.split_text(content)

In [38]:
len(texts[0])

99

### If let's say I wanted to split by tokens. I could create a function that returns the number of tokens for the purposed chunk(str). 

In [41]:
import tiktoken

In [119]:
tokenizer = tiktoken.get_encoding("o200k_harmony")

In [126]:
def count_token(text: str) -> int:
    return len(tokenizer.encode(text))

#### Just a little experiment with tokenizer

In [128]:
import random

for i in [random.randint(1000,5000) for _ in range(1)]:
    try:
        token_bytes = tokenizer.decode_single_token_bytes(i)
        try:
            token_str = token_bytes.decode("utf-8")
        except UnicodeDecodeError:
            token_str = repr(token_bytes)
        print(f"Token_ID: {i}, Token: {token_str}")
    except KeyError:
        print(f"Token_ID: {i} is not a valid token.")
        continue

Token_ID: 4746, Token: b'\xe1\x83\x90\xe1\x83\xa0\xe1\x83'


In [151]:
text_splitter_token = CharacterTextSplitter(
    chunk_size = 5,
    chunk_overlap = 0,
    separator= " ",
    length_function = len
)

In [152]:
texts_tokens = text_splitter.split_text(content)

In [153]:
texts_tokens[0]

'A large language model (LLM) is a language model trained with self-supervised machine learning on a'

In [154]:
len(tokenizer.encode(texts_tokens[0]))

21