In [1]:
!pip install -q newspaper3k==0.2.8 python-dotenv

In [2]:
!pip install langchain deeplake openai tiktoken

Collecting langchain
  Downloading langchain-0.0.239-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting deeplake
  Downloading deeplake-3.6.12.tar.gz (527 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting langsmith<0.1.0,>=0.0.11 (from langchain)
  Downloading langsmith-0.0.

In [16]:
import os

os.environ["OPENAI_API_KEY"] = "<openai key"
os.environ["ACTIVELOOP_TOKEN"] = "<activeloop key>"

In [17]:
import requests
from newspaper import Article # https://github.com/codelucas/newspaper
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

article_urls = [
          "https://www.britannica.com/place/India/History",
    "https://en.wikipedia.org/wiki/History_of_India"                # add as many source you want
]

session = requests.Session()
pages_content = [] # where we save the scraped articles

for url in article_urls:
    try:
        time.sleep(2) # sleep two seconds for gentle scraping
        response = session.get(url, headers=headers, timeout=10)

        if response.status_code == 200:
            article = Article(url)
            article.download() # download HTML of webpage
            article.parse() # parse HTML to extract the article text
            pages_content.append({ "url": url, "text": article.text })
        else:
            print(f"Failed to fetch article at {url}")
    except Exception as e:
        print(f"Error occurred while fetching article at {url}: {e}")



In [18]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# TODO: use your organization id here. (by default, org id is your username)
my_activeloop_org_id = "<org_id>"
my_activeloop_dataset_name = "langchain_course_qabot_with_source2"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)

Deep Lake Dataset in hub://kumarrupesh2002/langchain_course_qabot_with_source2 already exists, loading from the storage


In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

all_texts, all_metadatas = [], []
for d in pages_content:
    chunks = text_splitter.split_text(d["text"])
    for chunk in chunks:
        all_texts.append(chunk)
        all_metadatas.append({ "source": d["url"] })

In [20]:
db.add_texts(all_texts, all_metadatas)

\

Dataset(path='hub://kumarrupesh2002/langchain_course_qabot_with_source2', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
 embedding  embedding  (185, 1536)  float32   None   
    id        text      (185, 1)      str     None   
 metadata     json      (185, 1)      str     None   
   text       text      (185, 1)      str     None   


 

['9de48bd4-2864-11ee-8205-0242ac130202',
 '9de48d3c-2864-11ee-8205-0242ac130202',
 '9de48dd2-2864-11ee-8205-0242ac130202',
 '9de48e22-2864-11ee-8205-0242ac130202',
 '9de48e68-2864-11ee-8205-0242ac130202',
 '9de48eae-2864-11ee-8205-0242ac130202',
 '9de48efe-2864-11ee-8205-0242ac130202',
 '9de490d4-2864-11ee-8205-0242ac130202',
 '9de4914c-2864-11ee-8205-0242ac130202',
 '9de491a6-2864-11ee-8205-0242ac130202',
 '9de491ec-2864-11ee-8205-0242ac130202',
 '9de49232-2864-11ee-8205-0242ac130202',
 '9de49278-2864-11ee-8205-0242ac130202',
 '9de492d2-2864-11ee-8205-0242ac130202',
 '9de49318-2864-11ee-8205-0242ac130202',
 '9de4935e-2864-11ee-8205-0242ac130202',
 '9de493ae-2864-11ee-8205-0242ac130202',
 '9de493fe-2864-11ee-8205-0242ac130202',
 '9de49444-2864-11ee-8205-0242ac130202',
 '9de49494-2864-11ee-8205-0242ac130202',
 '9de494da-2864-11ee-8205-0242ac130202',
 '9de49520-2864-11ee-8205-0242ac130202',
 '9de49566-2864-11ee-8205-0242ac130202',
 '9de495ac-2864-11ee-8205-0242ac130202',
 '9de495f2-2864-

In [21]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI

llm = OpenAI(model_name="text-davinci-003", temperature=0)

chain = RetrievalQAWithSourcesChain.from_chain_type(llm= llm,
                                                    chain_type="stuff",
                                                    retriever=db.as_retriever()
                                                     )

In [22]:
d_response = chain({"question": "create bullet points of Indian History after 1800"})

print("Response:")
print(d_response["answer"])
print("Sources:")
for source in d_response["sources"].split(", "):
    print("- " + source)

Response:

Bullet points of Indian History after 1800:
- East India Company gradually annexed large regions of India from mid-18th century to mid-19th century
- Indian Rebellion of 1857 led to dissolution of the company and India was ruled directly by the British Crown in the British Raj
- Nationwide struggle for independence launched by Indian National Congress led by Mahatma Gandhi
- All-India Muslim League advocated for a separate Muslim-majority nation state
- British Indian Empire was partitioned in August 1947 into the Dominion of India and Dominion of Pakistan
- Colonial government strengthened and expanded infrastructure via court system, legal procedures, and statutes
- Indian Penal Code came into being
- Thomas Babington Macaulay made schooling a priority for the Raj and English was used as the medium of instruction
- Indian economy grew at about 1% per year from 1880 to 1920
- Indian private industry began to grow significantly
- India built a modern railway system in the la