In [11]:
!pip3 install -r requirements.txt

INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community (from -r requirements.txt (line 12))
  Downloading langchain_community-0.2.3-py3-none-any.whl.metadata (9.0 kB)
Downloading langchain_community-0.2.3-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: langchain-community
  Attempting uninstall: langchain-community
    Found existing installation: langchain-community 0.0.18
    Uninstalling langchain-community-0.0.18:
      Successfully uninstalled langchain-community-0.0.18
Successfully installed langchain-community-0.2.3


In [7]:
from dotenv import dotenv_values
import os
# Read the key/value pairs stored in the local .env file.
SECRETS = dotenv_values(".env")

# Environment variable to set -> name of the entry in .env that holds its value.
# Note that the Apify secret is stored as APIFY_API_KEY but exported as APIFY_API_TOKEN.
_ENV_TO_SECRET = {
    "OPENAI_API_KEY": "OPENAI_API_KEY",
    "APIFY_API_TOKEN": "APIFY_API_KEY",
}

# Fail early, with one readable message, when an entry is absent or has no value;
# otherwise the failure shows up later as an opaque TypeError from os.environ or
# as an authentication error in a downstream cell.
_missing = [name for name in _ENV_TO_SECRET.values() if not SECRETS.get(name)]
if _missing:
    raise KeyError(f"Missing or empty entries in .env: {', '.join(_missing)}")

for _env_name, _secret_name in _ENV_TO_SECRET.items():
    os.environ[_env_name] = SECRETS[_secret_name]

In [25]:
# Import both OpenAI wrappers from `langchain_openai`; the older
# `langchain.embeddings.openai` path is deprecated.
from langchain_openai import OpenAI, OpenAIEmbeddings

# Embedding model used to vectorise the documents, with the progress bar enabled.
embeddings = OpenAIEmbeddings(show_progress_bar=True)

# Completion model used to answer questions, created with temperature=0.
llm = OpenAI(temperature=0)

# Initializing the Apify content crawler

In [18]:
from langchain.utilities import ApifyWrapper
from langchain.docstore.document import Document

def _to_document(item):
    """Turn one crawled dataset item into a ``Document``.

    The item's ``"text"`` becomes the page content (an empty string when the
    value is falsy) and the item's ``"url"`` is stored as the ``"source"``
    metadata entry.
    """
    text = item["text"] or ""
    return Document(page_content=text, metadata={"source": item["url"]})


link = "https://medium.com/data-and-beyond/vector-databases-a-beginners-guide-b050cbbe9ca0"

# Input handed to the crawler actor: a single start URL, with aggressivePrune
# switched on to avoid duplicated documents.
crawler_input = {
    "startUrls": [{"url": link}],
    "aggressivePrune": True,
}

apify = ApifyWrapper()
loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input=crawler_input,
    dataset_mapping_function=_to_document,
)

In [17]:
# Display the loader object returned by `apify.call_actor`.
loader

ApifyDatasetLoader(apify_client=<apify_client.client.ApifyClient object at 0x10fcb58e0>, dataset_id='MqLDMo79Rhdcpgxgb', dataset_mapping_function=<function <lambda> at 0x10b979620>)

Why use `RecursiveCharacterTextSplitter`?

- This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is 
`["\n\n", "\n", " ", ""]`. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.

In [23]:
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings: chunk_size=500 and chunk_overlap=0.
splitter_options = {"chunk_size": 500, "chunk_overlap": 0}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Build the index from the Apify loader using the splitter and the embeddings.
index_creator = VectorstoreIndexCreator(
    text_splitter=text_splitter,
    embedding=embeddings,
)
index = index_creator.from_loaders([loader])

index

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1/1 [00:00<00:00,  1.42it/s]


VectorStoreIndexWrapper(vectorstore=<langchain_community.vectorstores.inmemory.InMemoryVectorStore object at 0x1746c48c0>)

In [27]:
# Question asked against the indexed article.
query = " What is the main benefit of a vector database ?"

# Query the index with the LLM, then show the result as the cell output.
answer_with_sources = index.query_with_sources(query, llm=llm)
answer_with_sources

100%|██████████| 1/1 [00:00<00:00,  3.64it/s]


{'question': ' What is the main benefit of a vector database ?',
 'answer': ' The main benefit of a vector database is its ability to handle complex, high-dimensional data while offering efficient querying and retrieval mechanisms. This makes it a critical tool for extracting meaningful insights and unlocking new opportunities in various industries. \n',
 'sources': 'https://medium.com/data-and-beyond/vector-databases-a-beginners-guide-b050cbbe9ca0'}

In [30]:
from langchain.chains import RetrievalQA

# Expose the index's vector store as a retriever for the QA chain.
retriever = index.vectorstore.as_retriever()

qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# `Chain.run` is deprecated; `invoke` takes the inputs as a dict and returns a
# dict of outputs. RetrievalQA reads the question from "query" and puts the
# answer under "result", so indexing it shows the same answer string.
qa.invoke({"query": query})["result"]

  warn_deprecated(
100%|██████████| 1/1 [00:00<00:00,  2.04it/s]


' The main benefit of a vector database is its ability to handle complex, high-dimensional data while offering efficient querying and retrieval mechanisms.'