In [3]:
import os
from dotenv import load_dotenv

In [5]:
load_dotenv()

True

# Open AI Embeddings

In [6]:
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")

In [9]:
from langchain_openai import OpenAIEmbeddings

In [10]:
embeddings=OpenAIEmbeddings(model = "text-embedding-3-large")

In [11]:
text = "this is tutorial on embedding"
query_result = embeddings.embed_query(text)
query_result

[-0.012467957101762295,
 0.01653045415878296,
 -0.019207622855901718,
 -0.018578700721263885,
 0.0392991378903389,
 -0.007984762080013752,
 -0.002570931799709797,
 0.064082071185112,
 -0.040149033069610596,
 0.005715542938560247,
 -0.009034382179379463,
 0.026652701199054718,
 -0.0037714082282036543,
 -0.049225907772779465,
 0.022641198709607124,
 -0.000993314548395574,
 -0.029729321599006653,
 0.014184745028614998,
 -0.023763058707118034,
 -0.012969395145773888,
 0.0681275725364685,
 -0.021757308393716812,
 -0.03158209100365639,
 -0.006068249233067036,
 0.01186453178524971,
 0.022471219301223755,
 0.0018336480716243386,
 0.02605777606368065,
 -0.03535562381148338,
 -0.0007999634835869074,
 0.016020517796278,
 0.009646306745707989,
 0.0017560952110216022,
 -0.0032848434057086706,
 -0.0035716830752789974,
 0.013581319712102413,
 0.049157917499542236,
 0.02916838973760605,
 -0.052353519946336746,
 -0.003456947160884738,
 0.051741596311330795,
 -0.010343220084905624,
 -0.01405726000666618

In [12]:
len(query_result)

3072

In [13]:
len(text)

29

In [14]:
embeddings2=OpenAIEmbeddings(model = "text-embedding-3-large",dimensions = 1024)

In [15]:
query_result2 = embeddings2.embed_query(text)

In [16]:
len(query_result2)

1024

In [17]:
from langchain_community.document_loaders import TextLoader

In [18]:
txtloader = TextLoader("Data Sources/speech.txt")

In [20]:
docs = txtloader.load()
docs

[Document(metadata={'source': 'Data Sources/speech.txt'}, page_content="Dr. A.P.J. Abdul Kalam gave many speeches on a variety of topics, including education, the environment, and India's development. Here are some examples of his speeches: [1, 2, 3]  \n\nAddress to students \n\nKalam encouraged students to excel in their studies and become leaders in their chosen fields. He also emphasized the importance of having a strong moral compass and traditional values. [1]  \n\nVision for India \n\nKalam spoke about his vision for India to become a developed nation, and how the country should stand up to the world. He also emphasized the importance of protecting the environment and leaving the planet in good condition for future generations. [2, 3]  \n\nFoundation Day Lecture \n\nKalam spoke about the importance of teachers and how they can inspire students to achieve their goals. He also emphasized the importance of creating an environment where all students are valued equally. [4]  \n\nSpeec

In [21]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [22]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
final_docs = text_splitter.split_documents(docs)

In [23]:
final_docs

[Document(metadata={'source': 'Data Sources/speech.txt'}, page_content='Dr. A.P.J. Abdul Kalam gave many speeches on a variety of topics, including education, the'),
 Document(metadata={'source': 'Data Sources/speech.txt'}, page_content="education, the environment, and India's development. Here are some examples of his speeches: [1, 2,"),
 Document(metadata={'source': 'Data Sources/speech.txt'}, page_content='speeches: [1, 2, 3]'),
 Document(metadata={'source': 'Data Sources/speech.txt'}, page_content='Address to students'),
 Document(metadata={'source': 'Data Sources/speech.txt'}, page_content='Kalam encouraged students to excel in their studies and become leaders in their chosen fields. He'),
 Document(metadata={'source': 'Data Sources/speech.txt'}, page_content='chosen fields. He also emphasized the importance of having a strong moral compass and traditional'),
 Document(metadata={'source': 'Data Sources/speech.txt'}, page_content='and traditional values. [1]'),
 Document(metadata={

In [25]:
from langchain_community.vectorstores import Chroma

In [27]:
db = Chroma.from_documents(final_docs,embeddings2)

In [28]:
db

<langchain_community.vectorstores.chroma.Chroma at 0x23caa45b820>

In [29]:
query = "what did kalam speak about india?"

In [30]:
retreived_results = db.similarity_search(query)

In [31]:
retreived_results

[Document(metadata={'source': 'Data Sources/speech.txt'}, page_content='Kalam spoke about his vision for India to become a developed nation, and how the country should'),
 Document(metadata={'source': 'Data Sources/speech.txt'}, page_content='Kalam spoke about the importance of international cooperation to address issues like poverty,'),
 Document(metadata={'source': 'Data Sources/speech.txt'}, page_content='Dr. A.P.J. Abdul Kalam gave many speeches on a variety of topics, including education, the'),
 Document(metadata={'source': 'Data Sources/speech.txt'}, page_content="education, the environment, and India's development. Here are some examples of his speeches: [1, 2,")]

# OLLAMA

In [33]:
from langchain_ollama import OllamaEmbeddings

In [34]:
embeddings_ollama = OllamaEmbeddings(
    model="gemma:2b",
)

In [35]:
r1 = embeddings_ollama.embed_documents(
    ["Alpha is the first letter of Greek Alphabet",
    "Beta is the second letter of Greek Alphabet"]
)

In [36]:
len(r1)

2

In [37]:
len(r1[0])

2048

In [38]:
embeddings_ollama.embed_query("what is the second letter of Greek Alphabet")

[-0.030401707,
 -0.00071388605,
 0.0012280147,
 0.026962468,
 0.003499441,
 0.0027774586,
 -0.010109305,
 -0.0019527468,
 -0.007773112,
 -0.03132442,
 -0.00048322964,
 -0.0007269115,
 0.022775473,
 -0.014123025,
 -0.005131783,
 -0.0057400274,
 0.10831864,
 -0.015564403,
 0.013117269,
 -0.0031647705,
 0.019957634,
 -0.012732095,
 0.011175631,
 0.0071917884,
 -0.015515367,
 -0.019783832,
 0.0025546954,
 -0.0048068617,
 0.0068001556,
 0.0065726764,
 -0.0065887705,
 -0.0035143937,
 0.032706466,
 -0.0032869126,
 -0.014764161,
 -0.006108736,
 -0.031040799,
 0.013407359,
 0.002330412,
 -0.0023020036,
 0.011417752,
 0.030031992,
 -0.01028898,
 -0.01774078,
 -0.011228645,
 -0.022193335,
 0.01521478,
 -0.0022831864,
 -0.017864037,
 0.010863241,
 -0.26216194,
 -0.16053914,
 -0.017292008,
 -0.0087485295,
 -0.017964456,
 0.0062611164,
 -0.011425859,
 -0.00048964727,
 -0.0073840036,
 -0.0015851592,
 -0.018701408,
 0.0038131361,
 -0.011001167,
 -0.0097143855,
 -0.023585362,
 -0.0171705,
 0.0046469695

In [39]:
r1[1]

[-0.056596883,
 -0.032935206,
 0.019591767,
 0.065866716,
 0.0010125086,
 0.009493311,
 -0.0109762745,
 -0.014957237,
 0.019306438,
 -0.02151985,
 0.0060546445,
 0.013940233,
 0.045969892,
 -0.021931592,
 -0.022843052,
 -0.024462365,
 0.105551176,
 -0.001238318,
 0.016348975,
 0.004794427,
 0.024045566,
 0.004523449,
 -0.006763428,
 -0.0069193584,
 -0.022932222,
 -0.0320994,
 -0.0036015431,
 0.010873421,
 0.020077255,
 0.04060832,
 -0.00840203,
 0.0062012346,
 0.016932718,
 0.009434609,
 -0.033989377,
 -0.0068929656,
 -0.051311612,
 0.026190838,
 4.2008054e-05,
 -0.0005578028,
 0.032186512,
 0.022225069,
 0.022565454,
 -0.009219788,
 -0.0054784506,
 -0.038098134,
 0.03817819,
 0.0036975995,
 -0.028300084,
 -0.0023167967,
 -0.42320716,
 -0.23236611,
 -0.009545145,
 0.0014114943,
 -0.04413925,
 -0.019023806,
 -0.013956233,
 0.009616274,
 -0.027385907,
 0.012804718,
 -0.008175281,
 -0.022270229,
 -0.0073443432,
 -0.030846141,
 -0.054409187,
 -0.048494376,
 0.015630124,
 0.0070185335,
 -0.

In [41]:
embeddings_ollama2 = OllamaEmbeddings(
    model="mxbai-embed-large",
)
embeddings_ollama2.embed_query("what is the second letter of Greek Alphabet")

[0.006141204,
 0.012978657,
 0.021502227,
 -0.016592756,
 -0.079371944,
 -0.029577728,
 0.06960856,
 0.01433814,
 -0.0050159157,
 0.039703805,
 0.012970139,
 0.012581777,
 -0.020464579,
 0.007471561,
 -0.049836714,
 -0.0020065908,
 -0.0059931567,
 -0.021825898,
 -0.054613348,
 -0.00883924,
 0.058798544,
 0.043648414,
 -0.06488437,
 0.015105209,
 -0.004931051,
 0.026220618,
 -0.017163636,
 -0.027439935,
 0.009280412,
 0.049091026,
 -0.024908064,
 0.017975781,
 0.03189649,
 -0.049260184,
 0.00022970284,
 -0.020873152,
 0.038281247,
 -0.032670572,
 -0.0066524157,
 -0.064777635,
 0.038639396,
 -0.009850522,
 -0.014001946,
 -0.018116558,
 -0.03602649,
 -0.019146334,
 -0.01810843,
 -0.046307806,
 -0.0824237,
 -0.01858545,
 -0.0039228513,
 -0.012134118,
 0.049327612,
 -0.028317431,
 -0.049177166,
 0.02113196,
 0.044724453,
 -0.0061120293,
 -0.025994848,
 0.0122315325,
 0.040178973,
 -0.010169026,
 -0.00054625585,
 -0.050192647,
 -0.029460825,
 -0.01830253,
 0.040784426,
 -0.008490147,
 0.0324

# HuggingFace

In [42]:
load_dotenv()

True

In [43]:
os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")

In [46]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

In [47]:
embeddings_hf = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [48]:
text = "This is a test document."
query_result = embeddings.embed_query(text)
query_result

[-0.014365176670253277,
 -0.02717982605099678,
 -0.020029161125421524,
 0.05731477588415146,
 -0.022272864356637,
 0.021488480269908905,
 -0.023203181102871895,
 0.0640641301870346,
 -0.016727449372410774,
 0.018952911719679832,
 0.01849687471985817,
 0.024698983877897263,
 -0.015678564086556435,
 -0.04844940826296806,
 -0.007032095920294523,
 0.03867196664214134,
 -0.023349111899733543,
 -0.0012324409326538444,
 -0.012960582040250301,
 -0.02349504455924034,
 0.016554156318306923,
 0.004521610215306282,
 -0.041335225105285645,
 0.045494288206100464,
 0.015851859003305435,
 0.016873382031917572,
 -0.00019096564210485667,
 0.008008016273379326,
 0.018268857151269913,
 0.0040746936574578285,
 0.016088997945189476,
 0.04567670077085495,
 -0.023476803675293922,
 -0.027982452884316444,
 0.053301647305488586,
 -0.004232026636600494,
 0.044764626771211624,
 0.059722650796175,
 0.015587356872856617,
 -0.017046676948666573,
 0.03073691762983799,
 -0.012066748924553394,
 -0.0224187970161438,
 0.0

In [49]:
len(query_result)

3072