In [1]:
import requests
from googlesearch import search
import chromadb
from bs4 import BeautifulSoup
from dotenv import load_dotenv

## Set up summarization model and prompt

In [2]:
from groq import Groq
import os
# Load environment variables from ../.env (expected to define GROQ_API_KEY).
load_dotenv('../.env')
# Groq API client; the key is read from the environment rather than hard-coded here.
client = Groq(api_key = os.getenv('GROQ_API_KEY'))

def summarize(client, transcript, model="llama-3.3-70b-versatile"):
    """Ask a chat-completion model to summarize a piece of text.

    Args:
        client: Object exposing ``chat.completions.create(messages=..., model=...)``;
            in this notebook it is the ``Groq`` client.
        transcript: Text to summarize. It is interpolated verbatim into the
            user prompt.
        model: Model identifier forwarded to the API. The default is the model
            this function was previously hard-wired to, so existing calls are
            unaffected.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that summarizes text.",
        },
        {
            "role": "user",
            "content": f"Please summarize the following transcription:\n\n{transcript}",
        },
    ]
    chat_completion = client.chat.completions.create(messages=messages, model=model)

    # Only the first choice is used.
    return chat_completion.choices[0].message.content


## Do Google search

In [3]:
query = "US tariff news, analysis, and predictions"

# Collect the search results into a list so they can be sliced and reused
# by the cells below.
search_res = list(
    search(
        query,
        num_results=90,
        unique=True,
    )
)
# Preview the first five results.
search_res[0:5]

['https://www.cnn.com/2025/05/02/economy/europe-lower-inflation-us-tariffs-analysis-intl',
 'https://www.cnbc.com/2025/05/02/heres-how-china-could-retaliate-against-us-tariffs.html',
 'https://www.jpmorgan.com/insights/global-research/current-events/us-tariffs',
 'https://finance.yahoo.com/news/live/trump-tariffs-live-updates-china-says-door-is-open-to-trade-talks-with-the-us-191201877.html',
 'https://www.investors.com/news/trump-tariffs-trade-war-us-ports-empty/']

In [4]:
# Debug-only truncation: keep just the first 10 search results.
# Also a workaround: there is a webpage in the results that hangs when requested.
search_res = search_res[:10]

## Follow each link, and pull out the text

In [5]:
%%time
## TODO handle video - either skip it or extract the text from the audio
## TODO handle pages that are taking too long to respond

def get_text(url, timeout=30):
    """Download a web page and return the text BeautifulSoup extracts from it.

    The HTTP status code is not checked, so the body of an error page is
    returned as text just like any other page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            a timeout, ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        The result of ``get_text()`` on the parsed HTML, with leading and
        trailing whitespace stripped.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    print(f'downloading {url}...')
    # Bound the wait so a single stalled server cannot hang the whole download loop.
    response = requests.get(url, timeout=timeout)
    html = response.text
    text = BeautifulSoup(html, features="html.parser").get_text().strip()
    return text
    
# Fetch every result page in order; documents[i] is the text of search_res[i].
documents = list(map(get_text, search_res))
# Positional ids ('id0', 'id1', ...) and one metadata dict per page, both
# aligned with documents.
ids = [f'id{position}' for position, _ in enumerate(documents)]
metadatas = [{'source-url': link} for link in search_res]

# Preview: the first 100 characters of the first document
documents[0][:100]

downloading https://www.cnn.com/2025/05/02/economy/europe-lower-inflation-us-tariffs-analysis-intl...
downloading https://www.cnbc.com/2025/05/02/heres-how-china-could-retaliate-against-us-tariffs.html...
downloading https://www.jpmorgan.com/insights/global-research/current-events/us-tariffs...
downloading https://finance.yahoo.com/news/live/trump-tariffs-live-updates-china-says-door-is-open-to-trade-talks-with-the-us-191201877.html...
downloading https://www.investors.com/news/trump-tariffs-trade-war-us-ports-empty/...
downloading https://www.nytimes.com/2025/05/02/business/jobs-report-april-tariffs.html...
downloading https://www.cnbc.com/2025/05/02/trump-tariffs-what-to-expect-on-prime-day-july-4-and-black-friday.html...
downloading https://www.reuters.com/business/tariffs/...
downloading https://www.cnn.com/2025/04/27/politics/tariffs-trump-trade-war/index.html...
downloading https://budgetlab.yale.edu/research/where-we-stand-fiscal-economic-and-distributional-effects-all-us-tariff

'Analysis: US tariffs could make Europe ‘Great Again’ by lowering prices | CNN Business\n\n\n\n\n\n\n\n\n\n\n\n\n\n'

## Set up vector store for document embeddings

In [6]:
%%time

chroma_client = chromadb.Client()
collection_name = "webpages"

# Start from an empty collection: if one with this name is left over from an
# earlier run, drop it before re-creating it.
try:
    chroma_client.get_collection(collection_name)
except Exception:
    # The lookup failed, which is taken to mean the collection does not exist.
    # Catching `Exception` (not a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate instead of being silently swallowed.
    pass
else:
    chroma_client.delete_collection(collection_name)

collection = chroma_client.create_collection(name=collection_name)

# TODO need to handle the case where search_res has a URL that we did not use
# `add` uses Chroma's default sentence embedding model
collection.add(documents=documents, ids=ids, metadatas=metadatas)

print(f'collection has {collection.count()} documents')


collection has 10 documents
CPU times: user 1.86 s, sys: 821 ms, total: 2.68 s
Wall time: 1.78 s


## Pull the most relevant documents based on the query

In [7]:
%%time

# Ask the vector store for the pages closest to the question; only the document
# text and the metadata are requested back.
retrieval = collection.query(
    include=["documents", "metadatas"],
    n_results=10,
    query_texts=[
        "what are the most important bits of news to know about US tariff policy and analysis of future actions?"
    ],
)
# A single query text was sent, so index 0 holds its list of documents.
retrieved_docs = retrieval["documents"][0]

CPU times: user 887 ms, sys: 306 ms, total: 1.19 s
Wall time: 534 ms


## Generate summaries

In [8]:
%%time

# One summarization request per retrieved page, in retrieval order.
summaries = [summarize(client, page_text) for page_text in retrieved_docs]

CPU times: user 124 ms, sys: 27.3 ms, total: 151 ms
Wall time: 1min 8s


## Print out results

In [9]:
# Kept under its own name: rebinding `metadatas` here would replace the
# download-order list that the vector-store cell passes to `collection.add`,
# so re-running that cell afterwards would pair documents with the wrong URLs.
retrieved_metadatas = retrieval['metadatas'][0]
urls = [data['source-url'] for data in retrieved_metadatas]
# `summaries` was generated from the retrieved documents in this same order.
for idx, (url, summary) in enumerate(zip(urls, summaries)):
    print(f'{idx+1}: {url}\n---------------------\n{summary}\n\n')

1: https://budgetlab.yale.edu/research/where-we-stand-fiscal-economic-and-distributional-effects-all-us-tariffs-enacted-2025-through-april
---------------------
The Budget Lab at Yale has analyzed the fiscal, economic, and distributional effects of all US tariffs enacted in 2025 through April 2. The key takeaways are:

1. **Increased Tariff Rate**: The average effective US tariff rate has risen to 22.5%, the highest since 1909, due to the new tariffs.
2. **Price Increases**: The price level is expected to rise by 2.3% in the short run, resulting in an average household loss of $3,800 in 2024.
3. **GDP Impact**: US real GDP growth is expected to be 0.9% lower in 2025, and the economy will be 0.6% smaller in the long run, equivalent to $180 billion annually.
4. **Distributional Effects**: Tariffs are regressive, with households at the bottom of the income ladder bearing a larger burden. The average annual cost to households in the 2nd decile is $1,700, while those in the top decile face 