### Load Data using Data Loaders

In [41]:
from langchain_community.document_loaders import TextLoader

In [42]:
loader = TextLoader('onepiece.txt')
docs  = loader.load()

In [43]:
full_text  = "\n".join([doc.page_content for doc in docs])

### Split into chunk using RecursiveCharacterSplitter

In [44]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [45]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)

In [46]:
chunks = splitter.create_documents([full_text])
chunks

[Document(metadata={}, page_content='Luffy dreams of becoming the Pirate King.He sails across the vast \nseas in search of adventure. Along the way, he meets friends who \nshare his dream. They face dangers, islands with mysteries, \nand powerful enemies. Each challenge teaches them courage, loyalty, and hope. \nTogether, they form a crew that cannot be broken. The journey is long, \nbut their determination never fades. Every step brings them closer to their ultimate goal.'),
 Document(metadata={}, page_content='Together, they form a crew that cannot be broken. The journey is long, \nbut their determination never fades. Every step brings them closer to their ultimate goal. \nEven in darkness, their friendship shines like a guiding light. \nThe oceans are unpredictable, with storms that test their will and \nstrange creatures that threaten their voyage. Yet Luffyâ€™s spirit inspires \neveryone around him to keep moving forward.\nHe listens to the stories of those he meets and learns fro

### Converting chunks into embeddings using
    - OpenAI Embeddings (Paid)
    - OLLama Embeddings (Open source)
    - HuggingFace (Open source)

#### We will skip OpenAI Embeddings and move forward

### Ollama

In [47]:
from langchain_community.embeddings import OllamaEmbeddings

In [None]:
# Define embedding method
embedding = OllamaEmbeddings(model="mxbai-embed-large")

In [None]:
# Initialize chroma db
from langchain_community.vectorstores import Chroma

vector_db = Chroma(
    collection_name="onepiece",
    embedding_function=embedding,
    persist_directory="./chromadb"
)

# Add documents to chroma
vector_db.add_documents(chunks)

# Persist to disk
vector_db.persist()

vector_db.similarity_search("")

In [61]:
query = "He listens to the stories of those he meets"
search_result = vector_db.similarity_search(query=query)
search_result

[Document(metadata={}, page_content='everyone around him to keep moving forward.\nHe listens to the stories of those he meets and learns from their experiences. \nEvery ally, every foe, and every island leaves a mark on their journey. \nThe crew celebrates victories, mourns losses, and grows stronger with each passing day. \nThey discover treasures, ancient secrets, and the true meaning of\nfreedom. Luffyâ€™s laughter echoes across the waves, reminding his friends to \nnever give up. Through storms and trials, the crewâ€™s'),
 Document(metadata={}, page_content='everyone around him to keep moving forward.\nHe listens to the stories of those he meets and learns from their experiences. \nEvery ally, every foe, and every island leaves a mark on their journey. \nThe crew celebrates victories, mourns losses, and grows stronger with each passing day. \nThey discover treasures, ancient secrets, and the true meaning of\nfreedom. Luffyâ€™s laughter echoes across the waves, reminding his friends

### HuggignFace Embeddings

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")

##### Sentence Transformer In HuggingFace Embeddings

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model="all-MiniLM-L6-v2")

In [7]:
text = "This is a test text used to check how HunggingFace Embeddings works"
query_result = embedding.embed_query(text)

In [8]:
query_result

[-0.039458323270082474,
 0.032614246010780334,
 0.0168529711663723,
 0.02940402738749981,
 0.04992632567882538,
 -0.00021676089090760797,
 -0.001053126878105104,
 -0.016775881871581078,
 -0.020794672891497612,
 0.0023823988158255816,
 0.07275105267763138,
 0.041822243481874466,
 -0.0003433232777751982,
 -0.021644143387675285,
 -0.06521336734294891,
 0.03196694701910019,
 0.020151525735855103,
 -0.03479117900133133,
 -0.0008459604578092694,
 0.07416307181119919,
 0.013020235113799572,
 0.04040542617440224,
 0.04266686365008354,
 -0.03830939531326294,
 0.0425090566277504,
 0.04174739494919777,
 -0.07364597171545029,
 0.07668173313140869,
 0.09430322051048279,
 -0.010736869648098946,
 -0.0001312023086939007,
 0.041115522384643555,
 -0.013989885337650776,
 0.0588948093354702,
 -0.015760071575641632,
 0.028391843661665916,
 0.03788631781935692,
 0.035987336188554764,
 -0.07057615369558334,
 0.06085406243801117,
 0.003934813663363457,
 -0.0654352456331253,
 0.006587972864508629,
 0.037918705

In [9]:
len(query_result)

384