In [1]:
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone, Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI

In [2]:
# Credentials must never be committed inside a notebook. Use the key already
# exported in the environment, and only fall back to a hidden prompt when it
# is missing.
# NOTE: a key that was ever hardcoded in this cell should be treated as leaked
# and revoked with the provider.
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

> [LangChain document loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders.html)

In [3]:
from langchain.document_loaders import DirectoryLoader

# Every loader reads from the same dataset folder; only the glob pattern differs.
DATASET_DIR = '../dataset/'

# One DirectoryLoader per file type (PDF, Markdown, plain text), in that order.
pdf_doc_loader, markdown_doc_loader, txt_doc_loader = (
    DirectoryLoader(DATASET_DIR, glob=pattern)
    for pattern in ("**/*.pdf", "**/*.md", "**/*.txt")
)

In [4]:
# Collect all the loaders so they can be processed in one pass.
doc_loaders = [pdf_doc_loader, markdown_doc_loader, txt_doc_loader]

# Load from each loader in turn and flatten the results into a single list,
# preserving loader order.
documents = [
    loaded_doc
    for loader in doc_loaders
    for loaded_doc in loader.load()
]

[nltk_data] Downloading package punkt to /home/sm-ce-36/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sm-ce-36/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [7]:
# Report how many documents were loaded and the character count of each one.
# Iterating (instead of indexing documents[0] .. documents[3]) avoids an
# IndexError when fewer than four files are loaded and still reports every
# document when there are more.
print(f'You have {len(documents)} document(s) in your data')
for position, document in enumerate(documents, start=1):
    print(f'There are {len(document.page_content)} characters in document {position}')

You have 4 document(s) in your data
There are 4379 characters in document 1
There are 11243 characters in document 2
There are 638 characters in document 3
There are 894 characters in document 4


In [8]:
# Display the document at index 3 to inspect its page_content and metadata.
documents[3]

Document(page_content='Our History Anaconda was founded in 2012 by Peter Wang and Travis Oliphant out of the need to bring Python into business data analytics, which was rapidly transforming as a result of emerging technology trends. Additionally, the open-source community lacked an entity that could organize and collectivize it to maximize its impact. Since that time, the Python ecosystem has significantly expanded, with Python being the most popular programming language used today. Alongside this expansion, Anaconda has provided value to students learning Python and data science, individual practitioners, small teams, and enterprise businesses. We aim to meet every user where they are in their data science journey. Anaconda now has over 300 full-time employees based in the United States, Canada, Germany, United Kingdom, Australia, India, and Japan. We are proud to serve over 35 million users worldwide.', metadata={'source': '../dataset/anaconda.txt'})

> Split the documents into smaller text chunks

In [9]:
# Splitter settings, named so they are easy to tune.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 40

text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# NOTE: `documents` is rebound to the list of chunks here, so re-running this
# cell on its own would split the chunks again. Re-run the loading cell first.
documents = text_splitter.split_documents(documents)
print(len(documents))

21


> Storing embeddings in the vector store

In [10]:
# Embedding client, created with its default settings; it is passed to the
# vector store when the index is built further down.
# NOTE(review): presumably authenticates via the OPENAI_API_KEY environment
# variable set earlier in this notebook — confirm against the library docs.
embeddings = OpenAIEmbeddings()

> Using vector store Pinecone

* [Pinecone](https://python.langchain.com/docs/integrations/vectorstores/pinecone?highlight=pinecone#pinecone)
* What is a [vector store](https://www.pinecone.io/learn/vector-database/)?
* [Get your pinecone api key and env](https://app.pinecone.io/)

In [None]:
# Optional: uncomment the line below to install the Pinecone client.
# The cells that follow build the vector store with Chroma, so this install
# is only needed if you switch to Pinecone.
# %pip install pinecone-client

> Using vector store Chroma

In [12]:
from langchain.vectorstores import Chroma

# Build the Chroma vector store from the split documents, using the
# embedding client created earlier.
vs = Chroma.from_documents(documents=documents, embedding=embeddings)

In [13]:
# Sanity-check retrieval by querying the vector store directly, without an LLM.
query = 'what is anaconda ?'
docs = vs.similarity_search(query=query)

In [14]:
# How many matches the similarity search returned.
len(docs)

4

In [22]:
# Print the text of every match, each followed by a dashed divider.
for hit in docs:
    print(hit.page_content, '\n--------------------')

Our History Anaconda was founded in 2012 by Peter Wang and Travis Oliphant out of the need to bring Python into business data analytics, which was rapidly transforming as a result of emerging technology trends. Additionally, the open-source community lacked an entity that could organize and collectivize it to maximize its impact. Since that time, the Python ecosystem has significantly expanded, with Python being the most popular programming language used today. Alongside this expansion, Anaconda has provided value to students learning Python and data science, individual practitioners, small teams, and enterprise businesses. We aim to meet every user where they are in their data science journey. Anaconda now has over 300 full-time employees based in the United States, Canada, Germany, United Kingdom, Australia, India, and Japan. We are proud to serve over 35 million users worldwide. 
--------------------
Our History Anaconda was founded in 2012 by Peter Wang and Travis Oliphant out of t

> Talk to the documents?

In [23]:
from langchain.llms import OpenAI

# Retriever over the vector store: similarity search with k=2.
retriever = vs.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=2),
)

# LLM configured with temperature=0, wired to the retriever in a
# conversational retrieval chain.
llm = OpenAI(temperature=0)
qa = ConversationalRetrievalChain.from_llm(llm, retriever)

In [24]:
# First turn: the conversation starts with an empty history.
chat_history = []
query = "what is anaconda ?"

chain_inputs = {"question": query, "chat_history": chat_history}
result = qa(chain_inputs)
result["answer"]

' Anaconda is a company founded in 2012 that provides value to students learning Python and data science, individual practitioners, small teams, and enterprise businesses. It has over 300 full-time employees based in the United States, Canada, Germany, United Kingdom, Australia, India, and Japan, and serves over 35 million users worldwide.'

In [25]:
# Record the latest (question, answer) pair so later turns can see it,
# then display the history.
latest_turn = (query, result["answer"])
chat_history.extend([latest_turn])
chat_history

[('what is anaconda ?',
  ' Anaconda is a company founded in 2012 that provides value to students learning Python and data science, individual practitioners, small teams, and enterprise businesses. It has over 300 full-time employees based in the United States, Canada, Germany, United Kingdom, Australia, India, and Japan, and serves over 35 million users worldwide.')]

In [26]:
# Second turn of the conversation.
query = "What are the pricings available for anaconda ?"
result = qa({"question": query, "chat_history": chat_history})

# Record this turn as well; otherwise the next question is asked against a
# history that is missing this exchange. The membership check keeps a re-run
# of this cell from appending the same turn twice.
turn = (query, result["answer"])
if turn not in chat_history:
    chat_history.append(turn)

result["answer"]

' Anaconda has three pricing options: FREE, STARTER ($9/mo), and PRO ($25/mo).'

In [27]:
# Third turn: ask how to create an environment, passing the accumulated history.
# NOTE(review): the recorded answer mentions `conda build`; environments are
# normally created with `conda create` — check the retrieved context before
# trusting this output.
query = "Command to create anaconda environment ?"
chain_inputs = {"question": query, "chat_history": chat_history}
result = qa(chain_inputs)
result["answer"]

' You can create an Anaconda environment by using the conda build command. You can include any version of Python packaged with conda when creating the environment.'