In [1]:
# import dependency
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

### 1. Load and transform the document

In [2]:
# Load the source text file into LangChain Document objects.
# The path is relative to the notebook's working directory.
# `docs` is a list; for this file it holds a single Document (see the length check in a later cell).
loader = TextLoader("../temp/file.txt")
docs = loader.load()

In [3]:
# Split the loaded document(s) into smaller chunks for embedding.
# NOTE(review): chunk_size=10 is not honoured here — the warnings in this cell's output
# report chunks of 107/298/218 characters, and the resulting chunks line up with the
# blank-line-separated paragraphs of the file. Presumably CharacterTextSplitter only cuts
# at its separator and never inside a paragraph — confirm against the LangChain docs and
# pick a chunk_size (or a different splitter) that matches the intent.
text_splitter = CharacterTextSplitter(chunk_size=10, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

Created a chunk of size 107, which is longer than the specified 10
Created a chunk of size 298, which is longer than the specified 10
Created a chunk of size 218, which is longer than the specified 10


In [4]:
# compare how many Documents were loaded with how many chunks the splitter produced
n_loaded = len(docs)
n_chunks = len(texts)
print("length of doument: {} and length of splitted text: {}".format(n_loaded, n_chunks))

length of doument: 1 and length of splitted text: 4


In [7]:
# show the raw loaded document: a single Document holding the whole file, with the source path in its metadata
print(docs)

[Document(page_content='    Batman does not possess any superpowers, instead relying on his intellect, fighting skills, and wealth.\n\n    As a baby, his parents sent him to Earth in a small spaceship moments before Krypton was destroyed in a natural cataclysm. His ship landed in the American countryside, near the fictional town of Smallville. He was found and adopted by farmers Jonathan and Martha Kent, who named him Clark Kent.\n\n    In her homeland, the island nation of Themyscira, her official title is Princess Diana of Themyscira. When blending into the society outside of her homeland, she sometimes adopts her civilian identity Diana Prince.\n\n    This new Flash was Barry Allen, a police scientist who gained super-speed when bathed by chemicals after a shelf of them was struck by lightning. He adopted the name The Scarlet Speedster after reading a comic book featuring the Golden Age Flash.', metadata={'source': '../temp/file.txt'})]


In [8]:
# show the chunks produced by the splitter (one Document per paragraph, surrounding whitespace stripped)
texts

[Document(page_content='Batman does not possess any superpowers, instead relying on his intellect, fighting skills, and wealth.', metadata={'source': '../temp/file.txt'}),
 Document(page_content='As a baby, his parents sent him to Earth in a small spaceship moments before Krypton was destroyed in a natural cataclysm. His ship landed in the American countryside, near the fictional town of Smallville. He was found and adopted by farmers Jonathan and Martha Kent, who named him Clark Kent.', metadata={'source': '../temp/file.txt'}),
 Document(page_content='In her homeland, the island nation of Themyscira, her official title is Princess Diana of Themyscira. When blending into the society outside of her homeland, she sometimes adopts her civilian identity Diana Prince.', metadata={'source': '../temp/file.txt'}),
 Document(page_content='This new Flash was Barry Allen, a police scientist who gained super-speed when bathed by chemicals after a shelf of them was struck by lightning. He adopted t

### 2. Generate embeddings for the transformed document

In [9]:
# Set up the embedding backend from a local model file.
# NOTE(review): the model path is hard-coded and relative to the notebook's working directory;
# presumably the model file has to be present locally before this cell runs — confirm.
from langchain.embeddings import LlamaCppEmbeddings
embeddings = LlamaCppEmbeddings(model_path="../models/llama-7b.ggmlv3.q5_K_S.bin")

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 


In [10]:
# Extract the plain text of every chunk: embed_documents() below is given strings, not Document objects.
# A comprehension avoids index-based access and does not leave a stray loop index in the kernel namespace.
_texts = [chunk.page_content for chunk in texts]

In [11]:
# inspect the extracted strings (same text as the chunks, without the Document wrapper and metadata)
_texts

['Batman does not possess any superpowers, instead relying on his intellect, fighting skills, and wealth.',
 'As a baby, his parents sent him to Earth in a small spaceship moments before Krypton was destroyed in a natural cataclysm. His ship landed in the American countryside, near the fictional town of Smallville. He was found and adopted by farmers Jonathan and Martha Kent, who named him Clark Kent.',
 'In her homeland, the island nation of Themyscira, her official title is Princess Diana of Themyscira. When blending into the society outside of her homeland, she sometimes adopts her civilian identity Diana Prince.',
 'This new Flash was Barry Allen, a police scientist who gained super-speed when bathed by chemicals after a shelf of them was struck by lightning. He adopted the name The Scarlet Speedster after reading a comic book featuring the Golden Age Flash.']

In [12]:
# Embed every chunk string; the result holds one embedding vector per input string
# (4 vectors of length 4096 in the recorded output of the next cell).
embedded_text = embeddings.embed_documents(_texts)

In [16]:
# report how many vectors came back and the dimensionality of a single vector
n_vectors = len(embedded_text)
vector_dim = len(embedded_text[0])
print("length of embedded text: {} and length of single list embedding text: {}".format(n_vectors, vector_dim))

length of embedded text: 4 and length of single list embedding text: 4096


In [17]:
# Embed a single query string with the same embedding model that was used for the chunks
query = "What skills did batman had?"
embedded_query = embeddings.embed_query(query)
# dimensionality of the query vector (4096 in the recorded output, matching the chunk vectors)
len(embedded_query)

4096

In [18]:
# peek at the first four components of the query embedding
embedded_query[:4]

[0.940786600112915,
 -0.14845183491706848,
 -0.9119601249694824,
 -5.006768226623535]

### 3. Create the vector store and retrieve documents

In [19]:
# import dependency
from langchain.vectorstores import Chroma

In [20]:
# Create a Chroma vector store from the chunk Documents, using `embeddings` as the embedding function
# NOTE(review): this receives `texts` and `embeddings`, not `embedded_text`, so the vectors computed
# earlier are presumably recomputed here rather than reused — confirm.
db = Chroma.from_documents(texts, embeddings)

In [21]:
# Retrieve the single most similar chunk (k=1) for a natural-language query.
# The result gets its own name so that this cell does not overwrite the `docs` list loaded from the file.
# NOTE(review): the recorded output is the Batman paragraph, not the Flash one — retrieval quality
# for this query looks poor with these embeddings and is worth investigating.
query = "What is a name of flash?"
similar_docs = db.similarity_search(query, k=1)
similar_docs

[Document(page_content='Batman does not possess any superpowers, instead relying on his intellect, fighting skills, and wealth.', metadata={'source': '../temp/file.txt'})]

In [22]:
# Same search, but embed the query explicitly and search the store with the resulting vector
# NOTE(review): assigning to `docs` replaces whatever was bound to that name earlier in the notebook;
# the next cell displays this `docs`, so a rename has to change both cells together.
query = "What is a name of flash?"
query_vector = embeddings.embed_query(query)
docs = db.similarity_search_by_vector(query_vector, k=1)

In [23]:
# display the match returned by the vector search (the recorded output is again the Batman paragraph)
docs

[Document(page_content='Batman does not possess any superpowers, instead relying on his intellect, fighting skills, and wealth.', metadata={'source': '../temp/file.txt'})]