# Installation

In [None]:
!pip install --upgrade langchain openai  -q

In [None]:
!pip install sentence_transformers -q

In [None]:
!pip install unstructured -q
!pip install unstructured[local-inference] -q
!pip install detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6

In [None]:
!apt-get install poppler-utils

In [None]:
!pip install chromadb -q

# Import

In [6]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [None]:
from langchain_community.chat_models import ChatOpenAI
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains.question_answering import load_qa_chain

# Data

In [16]:
# Loader for the .txt files directly inside /content/data; each file is read
# with TextLoader.
loader = DirectoryLoader(
    "/content/data",
    glob="./*.txt",
    loader_cls=TextLoader,
)

In [17]:
# Run the loader; the result is kept in `document` and split into chunks below.
document = loader.load()

In [18]:
# Sanity check: how many items the loader returned.
len(document)

3

In [19]:
# Break the loaded documents into smaller, slightly overlapping chunks
# (chunk_size=500, chunk_overlap=30) so each piece can be embedded on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=30,
)
text = text_splitter.split_documents(document)

In [20]:
# Number of chunks produced by the splitter.
len(text)

37

In [21]:
# Plain-text content of every chunk, in the same order as `text`.
# A comprehension replaces the manual append loop and does not leave a stray
# loop variable behind in the notebook namespace.
data = [chunk.page_content for chunk in text]
len(data)

37

In [22]:
# Spot-check the text of a single chunk (index 10 is presumably an arbitrary sample).
text[10].page_content

'Pathway to Peace:\nDespite the political tensions and occasional hostilities, cricket has often been seen as a pathway to peace and reconciliation between India and Pakistan. The cricketing diplomacy, exemplified by initiatives such as the "Cricket for Peace" series and the "Aman ki Asha" (Hope for Peace) campaign, underscores the potential of sport to bridge divides and foster understanding between nations.'

In [28]:
# Chunk count again, shown right before the chunks are indexed.
len(text)

37

# ChromaDB

In [None]:
# Relative directory name handed to Chroma as `persist_directory` further down.
persist_directory = "db"
# Embedding function (sentence-transformers model "all-MiniLM-L6-v2") passed to
# the Chroma stores created below.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [32]:
# Vector store that the similarity searches in the Test section query.
# No persist_directory is given here, so this store is presumably in-memory
# only — confirm against the Chroma wrapper documentation.
db = Chroma.from_documents(documents=text, embedding=embeddings)

In [34]:
# Second store built from the same chunks and embeddings, this time given
# `persist_directory` so Chroma has an on-disk location for it.
vectordb = Chroma.from_documents(
    documents=text,
    embedding=embeddings,
    persist_directory=persist_directory,
)

# Test

In [40]:

# Retrieval smoke test: fetch the 3 chunks most similar to the question from `db`.
query = "What is the rivalry between india and pakistan?"
matching_docs = db.similarity_search(query, k=3)


In [None]:
# Display the retrieved chunks for manual inspection.
matching_docs

In [42]:
# Take the OpenAI key from the environment and prompt only when it is missing.
# The key is never written into the notebook, and a key that is already
# exported is no longer overwritten with an empty string.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [44]:
# Name of the OpenAI chat model to use.
model_name = "gpt-3.5-turbo"
# Chat model instance handed to the question-answering chain below.
llm = ChatOpenAI(model_name=model_name)

In [47]:
# Question-answering chain driven by `llm`. In LangChain the "stuff" chain type
# places all supplied documents into a single prompt.
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:

# End-to-end check: retrieve the 3 most similar chunks, then let the QA chain
# answer the question from them.
query = "What is the rivalry between india and pakistan?"
matching_docs = db.similarity_search(query, k=3)
# `Chain.run` is deprecated; `invoke` takes the inputs as a dict and returns a
# dict whose "output_text" entry holds the generated answer.
answer = chain.invoke({"input_documents": matching_docs, "question": query})["output_text"]
answer

# End Test

In [49]:

# Recursively zip the /content/db directory into db.zip.
# NOTE(review): this assumes the notebook's working directory is /content, so
# that the relative persist_directory "db" resolves to /content/db — confirm.
!zip -r 'db.zip' '/content/db'

  adding: content/db/ (stored 0%)
  adding: content/db/dcc23105-c247-4bf9-8367-7527c863b090/ (stored 0%)
  adding: content/db/dcc23105-c247-4bf9-8367-7527c863b090/data_level0.bin (deflated 49%)
  adding: content/db/dcc23105-c247-4bf9-8367-7527c863b090/link_lists.bin (stored 0%)
  adding: content/db/dcc23105-c247-4bf9-8367-7527c863b090/header.bin (deflated 61%)
  adding: content/db/dcc23105-c247-4bf9-8367-7527c863b090/length.bin (deflated 53%)
  adding: content/db/chroma.sqlite3 (deflated 70%)
