# Installation

In [None]:
!pip install --upgrade langchain openai  -q

In [None]:
!pip install sentence_transformers -q

In [None]:
!pip install unstructured -q
!pip install unstructured[local-inference] -q
!pip install detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6

In [None]:
!apt-get install poppler-utils

In [None]:
!pip install chromadb -q

# Imports

In [6]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
from sentence_transformers import SentenceTransformer

# Data

In [8]:
# Loader over the .txt files directly inside /content/data, each one read
# with TextLoader.
loader = DirectoryLoader(
    "/content/data",
    glob="./*.txt",
    loader_cls=TextLoader,
)

In [10]:
# Run the loader and keep everything it returns; the splitter cell below
# consumes `document`.
document = loader.load()

In [16]:
# Sanity check: how many documents were loaded (3 in the recorded run).
len(document)

3

In [13]:
# Break the loaded documents into chunks: chunk_size 500 with an overlap of
# 30 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=30,
)
text = text_splitter.split_documents(document)

In [None]:
# Preview only the first few chunks instead of dumping the entire list into
# the notebook output.
text[:3]

In [34]:
# Keep just the raw string of every chunk; these strings are what gets
# embedded and stored later on.
data = [chunk.page_content for chunk in text]
len(data)

37

In [None]:
# Spot-check the content of a single chunk (index 10) from the split output.
text[10].page_content

# ChromaDB

In [None]:
# Folder name for the on-disk vector store, and the sentence-embedding model
# used to encode both the chunks and the test query.
persist_directory = "db"
embedding = SentenceTransformer("paraphrase-MiniLM-L3-v2")

In [29]:
# Encode every chunk string in `data` and convert the result to plain Python
# lists with .tolist().
# NOTE: the vectors live in `embeding` (single "d"), which is a different name
# from the model object `embedding`; later cells refer to this exact spelling.
embeding = embedding.encode(data).tolist()

In [31]:
# Length of the first embedding vector (384 in the recorded run).
len(embeding[0])

384

In [None]:
# One string id per chunk, numbered from "1". Deriving the upper bound from
# len(data) keeps ids, documents and embeddings the same length even when the
# corpus (and therefore the chunk count) changes.
ids = [str(i) for i in range(1, len(data) + 1)]


In [47]:

import chromadb

# Open the on-disk store in `persist_directory` (the "db" folder). The former
# throwaway chromadb.Client() call was dropped because its result was
# overwritten immediately.
client = chromadb.PersistentClient(path=persist_directory)

# get_or_create_collection lets this cell be re-run: create_collection raises
# once "test" already exists in the persisted store.
collection = client.get_or_create_collection("test")

In [56]:
# Store each chunk together with its precomputed vector under the matching id.
collection.add(ids=ids, documents=data, embeddings=embeding)

In [57]:
# LangChain wrapper around the collection populated above.
# - client / collection_name: reuse the existing PersistentClient and point at
#   "test"; without collection_name the wrapper opens its own default
#   collection, which does not contain the chunks added above.
# - embedding_function: LangChain expects an Embeddings object (embed_query /
#   embed_documents), not a raw SentenceTransformer, so the same model is
#   wrapped with HuggingFaceEmbeddings.
from langchain.embeddings import HuggingFaceEmbeddings

vectordb = Chroma(
    client=client,
    collection_name="test",
    embedding_function=HuggingFaceEmbeddings(model_name="paraphrase-MiniLM-L3-v2"),
)

In [58]:
# Echo the vector-store object to confirm it was constructed.
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x7b5d644e9090>

In [59]:

!zip -r 'db.zip' '/content/db'

  adding: content/db/ (stored 0%)
  adding: content/db/a869f3ca-a745-4499-9c7f-97f774bf8071/ (stored 0%)
  adding: content/db/a869f3ca-a745-4499-9c7f-97f774bf8071/data_level0.bin (deflated 61%)
  adding: content/db/a869f3ca-a745-4499-9c7f-97f774bf8071/link_lists.bin (stored 0%)
  adding: content/db/a869f3ca-a745-4499-9c7f-97f774bf8071/header.bin (deflated 61%)
  adding: content/db/a869f3ca-a745-4499-9c7f-97f774bf8071/length.bin (deflated 41%)
  adding: content/db/chroma.sqlite3 (deflated 70%)


# Testing

In [62]:
# Smoke test: embed a question with the same model used for the chunks and
# ask the collection for its single closest entry.
query = "what is rivalry between india and pakistan?"
input_em = embedding.encode(query).tolist()

results = collection.query(query_embeddings=[input_em], n_results=1)
results

{'ids': [['1']],
 'distances': [[9.17283107813053]],
 'metadatas': [[None]],
 'embeddings': None,
 'documents': [["The cricketing rivalry between India and Pakistan is one of the most fiercely contested and deeply ingrained rivalries in the world of sports. Dating back to the partition of India in 1947, which led to the creation of Pakistan, this rivalry transcends the boundaries of sport and reflects the complex political, cultural, and historical dynamics between the two neighboring nations. Here's an exploration of the rich history of the Indian cricket vs. Pakistan cricket rivalry:"]],
 'uris': None,
 'data': None}