In [1]:
## Data Ingestion
from langchain_community.document_loaders import TextLoader

loader=TextLoader("speech.txt")
text_documents=loader.load()
text_documents

[Document(metadata={'source': 'speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\nâ€¦\n\nIt will be all the easier for us to conduct ourselves as belligerents in a high spirit of right and fairness 

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['ANTHROPIC_API_KEY']=os.getenv('ANTHROPIC_API_KEY')

In [3]:
# web based loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

## load,chunk and index the content of the html page

loader=WebBaseLoader(web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
                     bs_kwargs=dict(parse_only=bs4.SoupStrainer(
                         class_=("post-title","post_content","post_header")

                     )))

text_documents=loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
text_documents

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='\n      LLM Powered Autonomous Agents\n    ')]

In [5]:
from langchain_community.document_loaders import PyPDFLoader
loader=PyPDFLoader('attention.pdf')
docs=loader.load( )


In [6]:
docs

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszk

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
documents=text_splitter.split_documents(docs)
documents[:5]

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszk

In [8]:
documents

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszk

In [None]:
import os
from dotenv import load_dotenv

# --- Step 1: Load Environment Variables ---
# This must be executed successfully before the embedding class is created
load_dotenv()

# --- Step 2: Vector Embedding And Vector Store ---
from langchain_community.vectorstores import Chroma
# Load model directly
from transformers import AutoModel

embedding_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True, torch_dtype="auto")
# OpenAIEmbeddings() will now automatically find the OPENAI_API_KEY 
# set in the environment by load_dotenv()
# db = Chroma.from_documents(documents, embedding_model)

print("Chroma database created successfully using OpenAI Embeddings.")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-embeddings-v4:
- custom_lora_module.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-embeddings-v4:
- qwen2_5_vl.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-embeddings-v4:
- custom_lora_module.py
- qw

In [None]:
import torch
from sentence_transformers import SentenceTransformer
import numpy as np

# --- Configuration ---
# You may need to log in to Hugging Face if the model is gated.
# To do so, run 'huggingface-cli login' in your terminal or
# use the HUGGING_FACE_HUB_TOKEN environment variable.

# Choose the Jina embedding model.
# jinaai/jina-embeddings-v2-base-en is a common English choice.
# Other options: jinaai/jina-embeddings-v3 (multilingual)
MODEL_NAME = 'jinaai/jina-embeddings-v2-base-en'

# Set your document chunks (list of strings)
document_chunks = [
    "The first paragraph discusses the origins of artificial intelligence, dating back to the 1950s.",
    "A key component in modern AI is deep learning, which uses neural networks with many layers.",
    "Jina embeddings are notable for their extended context length of up to 8192 tokens, allowing for longer document chunks.",
    "The process of creating embeddings is crucial for tasks like semantic search and retrieval-augmented generation (RAG)."
]

# --- Load Model and Encode ---

# 1. Initialize the SentenceTransformer model
# trust_remote_code=True is essential for Jina models as they include
# custom encoding logic (like attention with linear biases - ALiBi).
try:
    print(f"Loading model: {MODEL_NAME}...")
    model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)
    print("Model loaded successfully.")
    
    # Optional: Move model to GPU if available
    if torch.cuda.is_available():
        model.to('cuda')
        print("Model moved to GPU.")

except Exception as e:
    print(f"Error loading model: {e}")
    print("Ensure you have the required packages and are logged into Hugging Face if the model is gated.")
    exit()


# 2. Encode the document chunks
# The .encode() method handles tokenization, passing through the model,
# and applying mean-pooling to generate a single vector for each chunk.
print(f"\nEncoding {len(document_chunks)} document chunks...")
chunk_embeddings = model.encode(document_chunks)
print("Encoding complete.")

# --- Output and Verification ---

# 3. Print the results
print("\n--- Embeddings Results ---")

# The shape of the output: (number of chunks, embedding dimension)
print(f"Total number of embeddings generated: {chunk_embeddings.shape[0]}")

# Jina-embeddings-v2-base-en has a dimension of 768.
print(f"Embedding dimension: {chunk_embeddings.shape[1]}")

# Print the first few dimensions of the first chunk's embedding
print(f"\nFirst chunk: '{document_chunks[0][:50]}...'")
print(f"Embedding (first 5 values): {chunk_embeddings[0][:5]}")

# Example of a structured result (e.g., for storing in a vector database)
structured_results = []
for chunk, embedding in zip(document_chunks, chunk_embeddings):
    structured_results.append({
        "text_chunk": chunk,
        "embedding": embedding.tolist() # Convert numpy array to list for storage
    })

print("\nStructured Data Example (First entry):")
print(f"Text: {structured_results[0]['text_chunk']}")
print(f"Vector Dimension: {len(structured_results[0]['embedding'])}")
#

In [None]:
from huggingface_hub import login

# Option 1: Will prompt you to enter the token in a text box
login()

# Option 2: Pass the token as a string (Less secure, use environment variables if possible)
# from getpass import getpass
# token = getpass("Enter your Hugging Face token: ")
# login(token=token)

In [5]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv()
huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')
login(token=huggingface_api_key)

from langchain_community.vectorstores import Chroma
from transformers import AutoModel

# embedding_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True, torch_dtype="auto")
# OpenAIEmbeddings() will now automatically find the OPENAI_API_KEY 
# set in the environment by load_dotenv()
# db = Chroma.from_documents(documents, embedding_model)

print("Chroma database created successfully using OpenAI Embeddings.")

  from .autonotebook import tqdm as notebook_tqdm


Chroma database created successfully using OpenAI Embeddings.


In [None]:
from transformers import AutoTokenizer, AutoModel
# You should load both the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v4")
embedding_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True, torch_dtype="auto")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. 

In [1]:
import requests
from langchain_community.embeddings import JinaEmbeddings
from numpy import dot
from numpy.linalg import norm
from PIL import Image

In [2]:
text_embeddings = JinaEmbeddings(
    jina_api_key="jina_*", model_name="jina-embeddings-v2-base-en"
)

In [3]:
text = "This is a test document."

In [7]:
query_result = text_embeddings.embed_query(text)

RuntimeError: [RID: e2fdbc7f886bd94cd2a331ce0bcf057c] Invalid API key

In [1]:
# Note: JinaEmbeddings is NOT used here; we use HuggingFaceEmbeddings
# and rely on the model being downloaded locally after login.
from langchain_community.embeddings import HuggingFaceEmbeddings
import torch

# 1. Define the Hugging Face model ID
HF_MODEL_NAME = "jinaai/jina-embeddings-v2-base-en"

# 2. Instantiate HuggingFaceEmbeddings
# This will load the model from the Hub/cache using transformers
text_embeddings = HuggingFaceEmbeddings(
    model_name=HF_MODEL_NAME,
    # Configure device for local execution
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'} 
)

text = "This is a test document."
# The rest of the logic is the same
query_result = text_embeddings.embed_query(text)

print("Hugging Face Local Embedding successful.")
print(f"Embedding dimensions: {len(query_result)}")

  text_embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-base-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.de

Hugging Face Local Embedding successful.
Embedding dimensions: 768


In [2]:
query_result

[-0.026103580370545387,
 0.030853180214762688,
 0.0367080383002758,
 0.0018579147290438414,
 -0.0008549651829525828,
 -0.010201890952885151,
 -0.018863258883357048,
 0.03157246857881546,
 -0.01612643152475357,
 -0.06437158584594727,
 0.014311768114566803,
 -0.007991430349647999,
 -0.05703592300415039,
 -0.012066303752362728,
 0.0004766607016790658,
 -0.004395137075334787,
 -0.005211707204580307,
 -0.026913614943623543,
 0.019416334107518196,
 -0.029991568997502327,
 0.06431490927934647,
 -0.08003038167953491,
 -0.008734742179512978,
 -0.029141999781131744,
 0.0330209881067276,
 0.033436864614486694,
 -0.024063359946012497,
 -0.04254300892353058,
 0.02024824544787407,
 0.10005417466163635,
 -0.014807842671871185,
 0.05332331359386444,
 0.06323389708995819,
 0.0010150914313271642,
 -0.024810243397951126,
 -0.020881695672869682,
 -0.024798745289444923,
 0.005016841925680637,
 -0.02774004451930523,
 0.02153436653316021,
 0.010243836790323257,
 -0.007237004116177559,
 -0.0007312633679248393