# Indexing data

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import os

# Root folder of the course data. Set the LANGCHAIN_DATA_DIR environment
# variable to point somewhere else; the default is the location this notebook
# was originally written against, so existing setups keep working.
DATA_DIR = os.environ.get(
    'LANGCHAIN_DATA_DIR',
    '/Users/damienbenveniste/Projects/Teaching/Introduction_Langchain/data'
)
file_path = os.path.join(DATA_DIR, 'mixed_data', 'element_of_SL.pdf')

loader = PyPDFLoader(file_path=file_path)

# Chunks are capped at chunk_size=500 with no overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0
)

# Load the PDF and split it into Document chunks in one call.
data = loader.load_and_split(text_splitter=text_splitter)

# Preview the first chunks only; displaying the whole list floods the output.
data[:2]

[Document(page_content='Springer Series in Statistics\nTrevor Hastie\nRobert TibshiraniJerome FriedmanSpringer Series in Statistics\nThe Elements of\nStatistical Learning\nData Mining, Inference, and Prediction\nThe Elements of Statistical LearningDuring the past decade there has been an explosion in computation and information tech-', metadata={'source': '/Users/damienbenveniste/Projects/Teaching/Introduction_Langchain/data/mixed_data/element_of_SL.pdf', 'page': 0}),
 Document(page_content='nology. With it have come vast amounts of data in a variety of fields such as medicine, biolo-gy, finance, and marketing. The challenge of understanding these data has led to the devel-opment of new tools in the field of statistics, and spawned new areas such as data mining,machine learning, and bioinformatics. Many of these tools have common underpinnings butare often expressed with different terminology. This book describes the important ideas inthese areas in a common conceptual framework.', met

In [3]:
# Raw text of the first chunk produced by the splitter.
data[0].page_content

'Springer Series in Statistics\nTrevor Hastie\nRobert TibshiraniJerome FriedmanSpringer Series in Statistics\nThe Elements of\nStatistical Learning\nData Mining, Inference, and Prediction\nThe Elements of Statistical LearningDuring the past decade there has been an explosion in computation and information tech-'

In [5]:
# CPU build of FAISS, needed for the `FAISS` vector store used later in this notebook.
%pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [13]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding client shared by every later cell; show_progress_bar=True is what
# produces the tqdm progress lines in the recorded outputs.
embeddings = OpenAIEmbeddings(show_progress_bar=True)

# Embed one query string into a single vector.
vector1 = embeddings.embed_query('How are you?')

# Dimensionality of the returned embedding.
len(vector1)

  0%|          | 0/1 [00:00<?, ?it/s]

1536

In [10]:
import numpy as np
from numpy.linalg import norm

def get_cosine(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector, a 1-D sequence of numbers (list or NumPy array).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product of the vectors divided by the product of their
        norms: a value between -1 and 1, up to floating-point rounding.
        If either vector has norm 0 the similarity is undefined, and 0.0
        is returned instead of dividing by zero.
    """
    denominator = norm(vec1) * norm(vec2)
    if denominator == 0:
        # 0/0 would otherwise yield nan plus a RuntimeWarning.
        return 0.0
    return np.dot(vec1, vec2) / denominator
    
# Embed two related phrases and compare them with get_cosine.
vector1 = embeddings.embed_query('machine learning')
vector2 = embeddings.embed_query('artificial intelligence')
cosine = get_cosine(vector1, vector2)
cosine

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0.8819345646753848

In [11]:
# Contrast case: an unrelated phrase compared against vector2 ('artificial intelligence').
# NOTE(review): 'peperoni' looks like a misspelling of 'pepperoni'; left untouched
# because changing the text changes the embedding and therefore the score.
vector3 = embeddings.embed_query('peperoni pizza')
cosine = get_cosine(vector2, vector3)
cosine

  0%|          | 0/1 [00:00<?, ?it/s]

0.7401281950537992

In [15]:
from langchain.vectorstores import FAISS

# Embed every chunk in `data` with `embeddings` and build a FAISS vector store from them.
index = FAISS.from_documents(data, embeddings)

  0%|          | 0/5 [00:00<?, ?it/s]

In [16]:
# Return the stored chunks closest to the question as (Document, score) pairs.
index.similarity_search_with_relevance_scores(
    "What is machine learning?"
)

  0%|          | 0/1 [00:00<?, ?it/s]

[(Document(page_content='This is page 1\nPrinter: Opaque this\n1\nIntroduction\nStatistical learning plays a key role in many areas of science, ﬁnance and\nindustry. Here are some examples of learning problems:\n•Predict whether a patient, hospitalized due to a heart attac k, will\nhave a second heart attack. The prediction is to be based on de mo-\ngraphic, diet and clinical measurements for that patient.\n•Predict the price of a stock in 6 months from now, on the basis o f\ncompany performance measures and economic data.', metadata={'source': '/Users/damienbenveniste/Projects/Teaching/Introduction_Langchain/data/mixed_data/element_of_SL.pdf', 'page': 19}),
  0.7547787193298542),
 (Document(page_content='This is page 389\nPrinter: Opaque this\n11\nNeural Networks\n11.1 Introduction\nIn this chapter we describe a class of learning methods that w as developed\nseparately in diﬀerent ﬁelds—statistics and artiﬁcial inte lligence—based\non essentially identical models. The central idea is 

In [22]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import StdOutCallbackHandler

# Maximal marginal relevance (MMR) must be requested through `search_type`.
# Setting search_kwargs['maximal_marginal_relevance'] = True does nothing on a
# FAISS retriever: the flag is forwarded to a plain similarity search, which
# ignores unknown keyword arguments. With search_type='mmr', `fetch_k`
# candidates are fetched and `k` of them are kept.
retriever = index.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 10, 'fetch_k': 20}
)

llm = ChatOpenAI()

# verbose=True prints the intermediate chain steps and the formatted prompt.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True
)

# `handler` and `llm` are reused by later cells.
handler = StdOutCallbackHandler()

chain.run(
    'What is machine learning?',
    callbacks=[handler]
)



[1m> Entering new RetrievalQA chain...[0m


  0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
This is page 1
Printer: Opaque this
1
Introduction
Statistical learning plays a key role in many areas of science, ﬁnance and
industry. Here are some examples of learning problems:
•Predict whether a patient, hospitalized due to a heart attac k, will
have a second heart attack. The prediction is to be based on de mo-
graphic, diet and clinical measurements for that patient.
•Predict the price of a stock in 6 months from now, on the basis o f
company performance measures and economic data.

This is page 389
Printer: Opaque this
11
Neural Networks
11.1 Introduction
In this chapter we describe a class of learning methods that w as developed
separately in diﬀerent ﬁelds—statis

'Machine learning is a field of study that involves the development of algorithms and models that can learn from data and make predictions or decisions without being explicitly programmed. It focuses on creating computer systems that can automatically learn and improve from experience, rather than being explicitly programmed for specific tasks. Machine learning algorithms analyze large amounts of data to identify patterns, make predictions, or learn from examples and feedback. It is widely used in various fields such as science, finance, and industry for tasks like predicting stock prices, medical diagnoses, and customer behavior analysis.'

# Loading data into a Vector Database

In [19]:
# Client for the Pinecone vector database used in the next cell.
%pip install pinecone-client

Note: you may need to restart the kernel to use updated packages.


In [20]:
import pinecone 
from langchain.vectorstores import Pinecone

import os

# Credentials come from the environment. No visible cell defines
# PINECONE_API_KEY / PINECONE_ENV, so on a fresh kernel this cell would
# otherwise fail with a NameError (and keys must never be typed into the notebook).
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']  # find at app.pinecone.io
PINECONE_ENV = os.environ['PINECONE_ENV']  # next to api key in console

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

# Embed the chunks in `data` and store them in the named Pinecone index.
# NOTE(review): presumably the index must already exist in the Pinecone project — confirm.
index_name = "langchain-demo"
db = Pinecone.from_documents(
    data,
    embeddings,
    index_name=index_name
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
# Same question as the FAISS example, this time retrieving from the Pinecone store `db`.
chain = RetrievalQA.from_chain_type(retriever=db.as_retriever(), llm=llm, verbose=True)

chain.run('What is machine learning?', callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m


  0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
This is page 1
Printer: Opaque this
1
Introduction
Statistical learning plays a key role in many areas of science, ﬁnance and
industry. Here are some examples of learning problems:
•Predict whether a patient, hospitalized due to a heart attac k, will
have a second heart attack. The prediction is to be based on de mo-
graphic, diet and clinical measurements for that patient.
•Predict the price of a stock in 6 months from now, on the basis o f
company performance measures and economic data.

This is page 389
Printer: Opaque this
11
Neural Networks
11.1 Introduction
In this chapter we describe a class of learning methods that w as developed
separately in diﬀerent ﬁelds—statis

'Machine learning is a field of study that focuses on developing algorithms and models that enable computers to learn and make predictions or decisions without being explicitly programmed. It involves training a computer system on a large amount of data and allowing it to learn patterns and relationships within the data, which it can then use to make predictions or take actions in new, unseen situations. In essence, machine learning enables computers to learn from experience and improve their performance over time.'

# Providing sources

In [24]:
# Python client for NewsAPI, used in the next cell to fetch recent articles.
%pip install newsapi-python

Note: you may need to restart the kernel to use updated packages.


In [25]:
from datetime import date, timedelta
from newsapi import NewsApiClient

import os

# The API key is read from the environment. No visible cell defines
# NEWS_API_KEY, so a fresh kernel would otherwise raise NameError here.
NEWS_API_KEY = os.environ['NEWS_API_KEY']
newsapi = NewsApiClient(api_key=NEWS_API_KEY)

# Query window: the seven days ending today, formatted as YYYY-MM-DD.
today = date.today()
last_week = today - timedelta(days=7)

latest_news = newsapi.get_everything(
    q='artificial intelligence',
    from_param=last_week.strftime("%Y-%m-%d"),
    to=today.strftime("%Y-%m-%d"),
    sort_by='relevancy',
    language='en'
)

In [27]:
# Inspect the raw article dictionaries returned by the API call above.
latest_news['articles']

[{'source': {'id': None, 'name': 'Lifehacker.com'},
  'author': 'Stephen Johnson',
  'title': 'An AI Moderator Is Coming With ‘Call of Duty: Modern Warfare 3’',
  'description': 'When Call of Duty: Modern Warfare 3 comes out on Nov. 10, all players’ voice chats will be silently monitored by artificial intelligence. The AI-powered moderation technology, Toxmod, is designed to identify toxic speech in real time in multiplayer games to c…',
  'url': 'https://lifehacker.com/an-ai-moderator-is-coming-with-call-of-duty-modern-wa-1850793420',
  'urlToImage': 'https://i.kinja-img.com/gawker-media/image/upload/c_fill,f_auto,fl_progressive,g_center,h_675,pg_1,q_80,w_1200/1c72c41d607fc3ffe4fb1c5268fd3449.jpg',
  'publishedAt': '2023-08-31T22:30:00Z',
  'content': 'When Call of Duty: Modern Warfare 3 comes out on Nov. 10, all players voice chats will be silently monitored by artificial intelligence. The AI-powered moderation technology, Toxmod, is designed to i… [+2695 chars]'},
 {'source': {'id':

In [28]:
from langchain.docstore.document import Document
# One Document per article: title and description as the text, the article
# URL as its `source`. NewsAPI can return null for `title` or `description`,
# so both fall back to '' instead of raising TypeError on concatenation.
docs = [
    Document(
        page_content=(article.get('title') or '') + '\n\n' + (article.get('description') or ''),
        metadata={
            'source': article['url'],
        }
    ) for article in latest_news['articles']
]

# Sanity check on the first document.
print(docs[0].page_content)
print(docs[0].metadata)

An AI Moderator Is Coming With ‘Call of Duty: Modern Warfare 3’

When Call of Duty: Modern Warfare 3 comes out on Nov. 10, all players’ voice chats will be silently monitored by artificial intelligence. The AI-powered moderation technology, Toxmod, is designed to identify toxic speech in real time in multiplayer games to c…
{'source': 'https://lifehacker.com/an-ai-moderator-is-coming-with-call-of-duty-modern-wa-1850793420'}


In [29]:
from langchain.chains import create_qa_with_sources_chain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate

# LLM chain that produces an answer together with the sources it relied on.
qa_chain = create_qa_with_sources_chain(llm)

# Rendering applied to each retrieved document before it reaches the LLM:
# the text followed by the `source` stored in its metadata.
doc_prompt = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Content: {page_content}\nSource: {source}",
)

# Stuff all rendered documents into the `context` variable of qa_chain.
final_qa_chain = StuffDocumentsChain(
    document_prompt=doc_prompt,
    document_variable_name="context",
    llm_chain=qa_chain,
)

# Vector index over the news documents (rebinds `index`).
index = FAISS.from_documents(docs, embedding=embeddings)

# Retrieval from the index, followed by the custom combine-documents chain.
chain = RetrievalQA(combine_documents_chain=final_qa_chain, retriever=index.as_retriever())


  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
# The triple-quoted literal keeps its leading and trailing newline.
question = """
What is the most important news about artificial intelligence from last week?
"""

# Retrieve the relevant news documents and ask the LLM for an answer with sources.
answer = chain.run(question)

print(answer)

  0%|          | 0/1 [00:00<?, ?it/s]

{
  "answer": "The most important news about artificial intelligence from last week is the use of AI to train on the works of authors Stephen King and Margaret Atwood. These authors responded to the revelation that their work is being used to train AI. Additionally, AI took the stage at the Edinburgh Fringe festival, raising the question of whether AI can deliver a satisfying punchline. Furthermore, a tech expert from the University of Oxford highlighted the potential workplace threats of AI, including the possibility of AI becoming a monitoring boss. Finally, AI is being seen as a tool that can help companies connect with customers in a more personalized and efficient way.",
  "sources": [
    "https://www.theatlantic.com/newsletters/archive/2023/09/books-briefing-ai-stephen-king-margaret-atwood/675213/?utm_source=feed",
    "https://www.cnet.com/tech/ai-took-the-stage-at-the-worlds-largest-arts-festival-heres-what-happened/",
    "https://www.foxnews.com/tech/tech-expert-existential-

# Indexing a website

In [31]:
# Apify client for the crawler below; chromadb is the store behind the index shown two cells later.
%pip install apify-client chromadb

Note: you may need to restart the kernel to use updated packages.


In [33]:
from langchain.utilities import ApifyWrapper
from langchain.document_loaders.base import Document

apify = ApifyWrapper()

# Run the website-content-crawler actor on the newsletter site. Each dataset
# item becomes a Document whose text is item["text"] ('' when that value is
# empty or None) and whose `source` is the page URL.
loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input={"startUrls": [{"url": "https://newsletter.theaiedge.io/"}], "aggressivePrune": True},
    dataset_mapping_function=lambda record: Document(
        page_content=record["text"] or "",
        metadata={"source": record["url"]},
    ),
)

In [34]:
from langchain.indexes import VectorstoreIndexCreator

# Same splitter settings as for the PDF: chunk_size=500, no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=500)

# Build a vector-store index straight from the loader, splitting with `text_splitter`.
index = VectorstoreIndexCreator(text_splitter=text_splitter).from_loaders([loader])

index

VectorStoreIndexWrapper(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x1272497d0>)

In [35]:
# Ask the index wrapper directly; the result is a dict with question, answer and sources.
query = "What is the main subject of the aiedge newsletter?"

index.query_with_sources(query)

{'question': 'What is the main subject of the aiedge newsletter?',
 'answer': ' The main subject of the AiEdge newsletter is Machine Learning applications, Machine Learning System Design, MLOps, and the latest techniques and news about the field.\n',
 'sources': ''}

In [36]:
# Retrieve from the wrapper's underlying vector store through a RetrievalQA chain.
retriever = index.vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(retriever=retriever, llm=llm)

query = "What is the most recent article of the aiedge newsletter?"

qa.run(query, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
The AiEdge Newsletter
A newsletter for continuous learning about Machine Learning applications, Machine Learning System Design, MLOps, the latest techniques and news. Subscribe and receive a free Machine Learning book PDF!

Keep reading with a 7-day free trial
Subscribe to 
The AiEdge Newsletter
to keep reading this post and get 7 days of free access to the full post archives.

Keep reading with a 7-day free trial
Subscribe to 
The AiEdge Newsletter
to keep reading this post and get 7 days of free access to the full post archives.

The AiEdge Newsletter is a simple way to keep learning about Artificial Intelligence and Machine 

"I'm sorry, but I don't have access to the specific articles or the most recent content of the AiEdge Newsletter. As an AI language model, I don't have real-time access to current articles or newsletters. It would be best to subscribe to the newsletter and check the latest edition for the most recent article."

# Indexing a GitHub repo

In [37]:
# Git bindings needed by the GitLoader used in the next cell.
%pip install GitPython

Note: you may need to restart the kernel to use updated packages.


In [39]:
from langchain.document_loaders import GitLoader

# Load the `master` branch of the LangChain repository through ./data/repo/,
# keeping only files whose path ends in ".py".
loader = GitLoader(
    branch="master",
    clone_url="https://github.com/langchain-ai/langchain",
    repo_path="./data/repo/",
    file_filter=lambda path: path.endswith(".py"),
)
documents = loader.load()

In [41]:
# Show the text of the first loaded document.
print(documents[0].page_content)

"""Configuration file for the Sphinx documentation builder."""
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

import json
import os
import sys
from pathlib import Path

import toml
from docutils import nodes
from sphinx.util.docutils import SphinxDirective

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.

_DIR = Path(__file__).parent.absolute()
sys.path.insert(0, os.path.abspath("."))
sys.path.insert(0, os.path.abspath("../../libs/langchain"))
sys.path.insert(0, os.path.abspath("../../libs/experimental"))

with (_DIR.parents[1] / "libs" /

In [42]:
# Number of documents loaded from the repository (before splitting).
len(documents)

1763

In [43]:
from langchain.text_splitter import Language

# Splitter configured for Python source: chunk_size=1000 with an overlap of 200.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_overlap=200, chunk_size=1000, language=Language.PYTHON
)

# Rebinds `documents` from the loaded documents to their chunks, so re-running
# this cell alone would split the chunks again.
documents = python_splitter.split_documents(documents)

In [44]:
# First chunk after splitting.
documents[0]

Document(page_content='"""Configuration file for the Sphinx documentation builder."""\n# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common options. For a full\n# list see the documentation:\n# https://www.sphinx-doc.org/en/master/usage/configuration.html\n\n# -- Path setup --------------------------------------------------------------\n\nimport json\nimport os\nimport sys\nfrom pathlib import Path\n\nimport toml\nfrom docutils import nodes\nfrom sphinx.util.docutils import SphinxDirective\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n\n_DIR = Path(__file__).parent.absolute()\nsys.path.insert(0, os.path.abspath("."))\nsys.path.insert(0, os.path.abspath("../../libs/langchain"))\nsys.path.insert(0, os.path.abspath("../../libs/e

In [45]:
# Number of chunks after splitting.
len(documents)

10906

In [46]:
# Embed the code chunks into a fresh FAISS index (rebinds `index` and `retriever`).
index = FAISS.from_documents(documents, embeddings)
retriever = index.as_retriever()

qa = RetrievalQA.from_chain_type(retriever=retriever, llm=llm)

query = "What is a stuff chain?"

qa.run(query, callbacks=[handler])

  0%|          | 0/11 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


  0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
def _load_stuff_chain(
    llm: BaseLanguageModel,
    prompt: BasePromptTemplate = stuff_prompt.PROMPT,
    document_prompt: BasePromptTemplate = stuff_prompt.EXAMPLE_PROMPT,
    document_variable_name: str = "summaries",
    verbose: Optional[bool] = None,
    **kwargs: Any,
) -> StuffDocumentsChain:
    llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)
    return StuffDocumentsChain(
        llm_chain=llm_chain,
        document_variable_name=document_variable_name,
        document_prompt=document_prompt,
        verbose=verbose,
        **kwargs,
    )

def _load_stuff_chain(
    llm: BaseLanguageModel,
    prompt: Optional[BasePromptTemplate] = None,
    

'A stuff chain is a sequence of operations performed on a language model (LLM) to generate or process text. It typically consists of a language model chain (LLMChain) and a document chain (StuffDocumentsChain). The LLMChain is responsible for generating text based on a prompt, while the StuffDocumentsChain is used to process and manipulate documents or summaries. The specific details and functionality of a stuff chain can vary depending on the context and configuration.'

In [48]:
# Rebuild the retriever with MMR enabled instead of mutating the shared one.
# `distance_metric` and `maximal_marginal_relevance` are not FAISS search
# options: as search_kwargs they are forwarded to a plain similarity search
# and silently ignored. search_type='mmr' is what actually turns MMR on;
# `fetch_k` candidates are fetched and `k` of them are kept.
retriever = index.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 20, 'fetch_k': 200}
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
)

query = "When should I use a map reduce chain?"

qa.run(query, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m


  0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
class MapReduceChain(Chain):
    """Map-reduce chain."""

    combine_documents_chain: BaseCombineDocumentsChain
    """Chain to use to combine documents."""
    text_splitter: TextSplitter
    """Text splitter to use."""
    input_key: str = "input_text"  #: :meta private:
    output_key: str = "output_text"  #: :meta private:

collapse_documents_chain=collapse_chain,
        token_max=token_max,
        verbose=verbose,
    )
    return MapReduceDocumentsChain(
        llm_chain=map_chain,
        document_variable_name=map_reduce_document_variable_name,
        reduce_documents_chain=reduce_documents_chain,
        verbose=verbose,
        callback_manager=callback_mana


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


'A MapReduce chain is useful when you have a large document that needs to be processed in parallel. It splits the document into smaller parts, applies a map function to each part, and then combines the results using a reduce function. This allows for efficient processing of large amounts of data by distributing the workload across multiple processors or machines.'

In [49]:
# Reuses the `qa` chain built in the previous cell.
# NOTE(review): "map rank" is presumably meant as "map rerank" (the retrieved
# context is about MapRerankDocumentsChain) — confirm before changing the query.
query = "When should I use a map rank chain?"

qa.run(query, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m


  0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
def _load_map_rerank_documents_chain(
    config: dict, **kwargs: Any
) -> MapRerankDocumentsChain:
    if "llm_chain" in config:
        llm_chain_config = config.pop("llm_chain")
        llm_chain = load_chain_from_config(llm_chain_config)
    elif "llm_chain_path" in config:
        llm_chain = load_chain(config.pop("llm_chain_path"))
    else:
        raise ValueError("One of `llm_chain` or `llm_chain_config` must be present.")
    return MapRerankDocumentsChain(llm_chain=llm_chain, **config)

def _load_map_rerank_chain(
    llm: BaseLanguageModel,
    prompt: BasePromptTemplate = MAP_RERANK_PROMPT,
    verbose: bool = False,
    document_variable_name: str = "context"


[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


'You should use a MapRerankDocumentsChain when you want to combine multiple documents by mapping a chain over them and then reranking the results. This algorithm calls an LLMChain on each input document and uses an OutputParser to parse the results into an answer and a score. The answer with the highest score is then returned.'