In [16]:
import os
from dotenv import load_dotenv
# Load key/value pairs from a local .env file into the process environment so the
# cells below can read API keys via os.getenv(). The call's return value is what
# the cell displays (True in the recorded run).
load_dotenv()

True

In [3]:
# Fail fast with a readable message when the OpenAI key is missing.
#
# The original `os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")` had two problems:
#   * os.getenv() returns None for an unset variable and os.environ only accepts str
#     values, so a missing key surfaced as "TypeError: str expected, not NoneType";
#   * when the key IS set, os.getenv() reads it from os.environ, so writing it back
#     changes nothing.
# Only the validation is therefore kept; the key stays wherever it was already defined.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your environment or .env file "
        "before running the OpenAI embedding cells."
    )

In [5]:
from langchain_openai import OpenAIEmbeddings

# Embedding client for the `text-embedding-3-large` model; every other option is
# left at the library default.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)
# Bare last expression: display the client's repr to inspect its configuration.
embeddings


OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x10baf05b0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x10ccc5760>, model='text-embedding-3-large', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [6]:
# Embed a single sample sentence with the client created above; the bare last
# expression displays the returned vector (a list of floats).
text = "This is a tutorial on OPENAI embedding"
query_result = embeddings.embed_query(text)
query_result

[0.001961575821042061,
 0.04168045148253441,
 -0.01386804599314928,
 -0.039876703172922134,
 0.0239898469299078,
 0.004051495343446732,
 0.016608355566859245,
 0.05810843035578728,
 -0.017732229083776474,
 0.0012340063694864511,
 -0.01655285619199276,
 -0.0036872769705951214,
 -0.007416178938001394,
 -0.012036547996103764,
 0.013486484065651894,
 0.020978976041078568,
 0.0246419720351696,
 0.0538349375128746,
 -0.01967472769320011,
 -0.026237593963742256,
 0.014693607576191425,
 0.0013207250740379095,
 -0.04023745283484459,
 -0.03737920522689819,
 0.02554384432733059,
 -0.021076099947094917,
 0.014693607576191425,
 0.046925194561481476,
 -0.015692606568336487,
 0.04809069260954857,
 0.014187171123921871,
 -0.01034380029886961,
 0.0038884642999619246,
 -0.007388428784906864,
 -0.016941355541348457,
 0.02658446878194809,
 0.0434287004172802,
 0.04098670184612274,
 -0.0458984449505806,
 0.02120097540318966,
 0.03696295619010925,
 -0.000461776799056679,
 -0.0002547360199969262,
 -0.0214784

In [7]:
# Number of components in the embedding vector (3072 in the recorded run, where
# no `dimensions` override was passed to the client).
len(query_result)

3072

In [9]:
'''
- `OpenAIEmbeddings(...)` creates an instance of the `OpenAIEmbeddings` class
- It's configured to use OpenAI's model `text-embedding-3-large`
- The parameter `dimensions=1024` specifies that the embedding vector will have 1024 dimensions
- `text-embedding-3-large` is OpenAI's most capable embedding model
- The `embed_query()` method converts the input text into a numerical vector representation
- The resulting `query_result` will be a vector with 1024 dimensions (as specified by `dimensions=1024`)
- Each dimension in this vector represents different semantic features of the input text

**What are embeddings?**
- Embeddings are numerical representations of text that capture semantic meaning
- They convert words/sentences into vectors of numbers
- Similar texts will have similar vector representations
- These vectors can be used for:
    - Semantic search
    - Text classification
    - Finding similar documents
    - Many other NLP tasks

'''
# Same model as before, but with the output vector size requested explicitly.
embeddings_1024 = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=1024,
)
text = "This is a tutorial on OPENAI embedding"
query_result = embeddings_1024.embed_query(text)
# Confirm the requested size (1024 in the recorded run).
len(query_result)

1024

In [13]:
"""
Text Embedding and Semantic Search Implementation

This code demonstrates a semantic search system using OpenAI embeddings and Chroma vector database:
1. Loads and splits a text document into chunks
2. Converts text chunks into 1024-dimensional embeddings using OpenAI's text-embedding-3-large model
3. Stores embeddings in a Chroma vector database
4. Performs semantic similarity search on the stored documents

Key components:
- Document Loading: Uses TextLoader for reading text files
- Text Splitting: Implements RecursiveCharacterTextSplitter with 500-char chunks and 50-char overlap
- Vector Store: Chroma DB for efficient similarity search
- Embeddings: OpenAI's text-embedding-3-large model for high-quality text vectorization

The system enables semantic search beyond simple keyword matching by understanding the contextual
meaning of text through vector representations.
"""
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
loader=TextLoader('speech.txt')
docs=loader.load()
text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50)
final_documents=text_splitter.split_documents(docs)
db=Chroma.from_documents(final_documents,embeddings_1024)
### Retrieve the results from query vectorstore db
query="It will be all the easier for us to conduct ourselves as belligerents"
retrieved_results=db.similarity_search(query)
print(retrieved_results)

[Document(metadata={'source': 'speech.txt'}, page_content='It will be all the easier for us to conduct ourselves as belligerents in a high spirit of right and fairness because we act without animus, not in enmity toward a people or with the desire to bring any injury or disadvantage upon them, but only in armed opposition to an irresponsible government which has thrown aside all considerations of humanity and of right and is running amuck. We are, let me say again, the sincere friends of the German people, and shall desire nothing so much as the early'), Document(metadata={'source': 'speech.txt'}, page_content='It will be all the easier for us to conduct ourselves as belligerents in a high spirit of right and fairness because we act without animus, not in enmity toward a people or with the desire to bring any injury or disadvantage upon them, but only in armed opposition to an irresponsible government which has thrown aside all considerations of humanity and of right and is running amu

In [None]:
from langchain_community.embeddings import OllamaEmbeddings

# Ollama-backed embeddings. The model is named explicitly because, by default,
# it uses llama2.
embeddings = OllamaEmbeddings(model="mxbai-embed-large")
text = "This is a test document."
query_result = embeddings.embed_query(text)
# Bare last expression: display the embedding vector.
query_result

In [18]:
from langchain_huggingface import HuggingFaceEmbeddings

# The original first line, `os.environ['HF_TOKEN']=os.getenv("HF_TOKEN")`, was removed:
#   * os.getenv() reads from os.environ, so when HF_TOKEN is set the write-back changes nothing;
#   * when HF_TOKEN is unset, os.getenv() returns None and the assignment raises
#     "TypeError: str expected, not NoneType" before any embedding is computed.
# Any token already present in the environment (e.g. loaded from .env) is still visible
# to the Hugging Face libraries.
# NOTE(review): a missing HF_TOKEN presumably only matters for gated/private models - confirm.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
text = "this is atest documents"
query_result = embeddings.embed_query(text)
# Bare last expression: display the embedding vector.
query_result

  from .autonotebook import tqdm as notebook_tqdm


[-0.04311223700642586,
 0.13562121987342834,
 0.022339945659041405,
 0.007216723170131445,
 0.034210607409477234,
 0.02403409406542778,
 -0.02484881319105625,
 0.045667339116334915,
 0.018850579857826233,
 0.04899343475699425,
 -0.004306051414459944,
 0.05968935415148735,
 0.0029522778932005167,
 -0.059990886598825455,
 -0.11980380862951279,
 -0.005690671503543854,
 -0.020968426018953323,
 0.009721250273287296,
 0.04023446887731552,
 0.050469983369112015,
 -0.0021608059760183096,
 0.09888077527284622,
 0.021964695304632187,
 -0.05851993337273598,
 0.029561897739768028,
 0.004117683973163366,
 -0.09333007782697678,
 -0.0430552177131176,
 0.0696839988231659,
 -0.04684080556035042,
 0.04395323619246483,
 0.010073340497910976,
 0.09620824456214905,
 0.02793020009994507,
 0.07333722710609436,
 -0.01297685969620943,
 0.0761367678642273,
 -0.011923186480998993,
 0.011215229518711567,
 -0.008163193240761757,
 -0.010897375643253326,
 -0.07058069854974747,
 -0.027596015483140945,
 -0.00615309504