In [1]:
import requests
from googlesearch import search
import chromadb
from bs4 import BeautifulSoup

## Set up summarization model and prompt

In [2]:
from groq import Groq
import os

# Groq API client; the key is looked up in the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get('GROQ_API_KEY'))

def summarize(client, transcript, model="llama-3.3-70b-versatile"):
    """Ask a chat-completion model for a summary of ``transcript``.

    Args:
        client: Object exposing ``chat.completions.create(messages=..., model=...)``
            (the Groq client created in this notebook).
        transcript: Text to summarize; it is embedded verbatim in the user prompt.
        model: Identifier of the chat model to use. Defaults to the model this
            notebook originally hard-coded, so existing calls behave the same.

    Returns:
        The ``content`` of the first choice's message in the completion response.
    """
    # NOTE(review): the prompt says "transcription" although the notebook feeds it
    # web-page text — wording kept as-is so model behavior does not change.
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that summarizes text."
        },
        {
            "role": "user",
            "content": f"Please summarize the following transcription:\n\n{transcript}",
        }
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )

    return chat_completion.choices[0].message.content


## Do Google search

In [3]:
# Search phrase sent to Google.
query = "US tariff news, analysis, and predictions"

# Collect the search results into a list so later cells can slice and re-iterate them.
search_res = [*search(query, num_results=90, unique=True)]
search_res[0:5]

['https://www.cnbc.com/2025/04/24/trump-tariffs-pepsi-chipotle-pg-cut-earnings-forecasts.html',
 'https://finance.yahoo.com/news/live/trump-tariffs-live-updates-china-pushes-back-as-trump-claims-progress-on-trade-talks-191201016.html',
 'https://www.pbs.org/newshour/show/economic-forecasts-show-trumps-tariffs-having-major-global-impact',
 'https://finance.yahoo.com/news/live/trump-tariffs-live-updates-trump-says-china-tariffs-wont-stay-at-145-bessent-hints-at-deescalation-191201492.html',
 'https://www.deseret.com/business/2025/04/23/donald-trump-trade-tariffs-imf-report-slower-economic-growth-higher-inflation/']

In [4]:
# Debugging aid: work with only the first 10 results. This also avoids a
# web page in the full result list that hangs when it is fetched.
search_res = search_res[0:10]

## Follow each link, and pull out the text

In [5]:
%%time
## TODO handle video - either skip it or extract the text from the audio
## TODO handle pages that are taking too long to respond

def get_text(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server to connect or to send
            data before giving up. Without a timeout ``requests.get`` can block
            indefinitely on a server that stops responding.

    Returns:
        The text BeautifulSoup extracts from the response body, with leading and
        trailing whitespace stripped. The HTTP status code is not checked, so an
        error page (e.g. "Access Denied") is returned as text like any other page.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    print(f'downloading {url}...')
    response = requests.get(url, timeout=timeout)
    html = response.text
    text = BeautifulSoup(html, features="html.parser").get_text().strip()
    return text
    
# Fetch every page. A URL whose request fails is reported and skipped so that a
# single bad site does not abort the run and throw away the pages already
# downloaded. ids and metadatas are filled in the same loop so the three lists
# stay index-aligned even when a URL is skipped.
documents, ids, metadatas = [], [], []
for url in search_res:
    try:
        page_text = get_text(url)
    except requests.exceptions.RequestException as err:
        print(f'skipping {url}: {err}')
        continue
    ids.append(f'id{len(documents)}')
    documents.append(page_text)
    metadatas.append({'source-url': url})

# print out first characters of first document (empty string if nothing was downloaded)
documents[0][0:100] if documents else ''

downloading https://www.cnbc.com/2025/04/24/trump-tariffs-pepsi-chipotle-pg-cut-earnings-forecasts.html...
downloading https://finance.yahoo.com/news/live/trump-tariffs-live-updates-china-pushes-back-as-trump-claims-progress-on-trade-talks-191201016.html...
downloading https://www.pbs.org/newshour/show/economic-forecasts-show-trumps-tariffs-having-major-global-impact...
downloading https://finance.yahoo.com/news/live/trump-tariffs-live-updates-trump-says-china-tariffs-wont-stay-at-145-bessent-hints-at-deescalation-191201492.html...
downloading https://www.deseret.com/business/2025/04/23/donald-trump-trade-tariffs-imf-report-slower-economic-growth-higher-inflation/...
downloading https://www.pbs.org/newshour/politics/americans-expect-higher-prices-from-trumps-tariffs-new-poll-shows...
downloading https://www.theguardian.com/us-news/2025/apr/16/trump-tariffs-will-send-global-trade-into-reverse-this-year-warns-wto...
downloading https://www.jpmorgan.com/insights/global-research/current-ev

'Access Denied\n\nAccess Denied\n \nYou don\'t have permission to access "http://www.cnbc.com/2025/04/24/t'

## Set up vector store for document embeddings

In [6]:
%%time

chroma_client = chromadb.Client()
collection_name = "webpages"

# Get rid of any old collection so the collection created below starts out empty.
try:
    chroma_client.get_collection(collection_name)
except Exception:
    # Collection does not exist. `Exception` (rather than a bare `except:`) is
    # caught so KeyboardInterrupt / SystemExit are not silently swallowed.
    pass
else:
    chroma_client.delete_collection(collection_name)

collection = chroma_client.create_collection(name=collection_name)

# TODO need to handle the case where search_res has a URL that we did not use
# `add` uses Chroma's default sentence embedding model
collection.add(documents=documents, ids=ids, metadatas=metadatas)

print(f'collection has {collection.count()} documents')


collection has 10 documents
CPU times: user 1.87 s, sys: 777 ms, total: 2.64 s
Wall time: 1.52 s


## Pull the most relevant documents based on the query

In [7]:
%%time

# Query the vector store with a natural-language question and request up to
# 10 matches, returning both the stored text and its metadata.
retrieval = collection.query(
    n_results=10,
    include=["documents", "metadatas"],
    query_texts=[
        "what are the most important bits of news to know about US tariff policy and analysis of future actions?"
    ],
)

# Only one query text was sent; index 0 selects the documents returned for it.
retrieved_docs = retrieval['documents'][0]

CPU times: user 882 ms, sys: 323 ms, total: 1.2 s
Wall time: 552 ms


## Generate summaries

In [8]:
%%time

# One LLM call per retrieved page, in retrieval order.
summaries = [summarize(client, page_text) for page_text in retrieved_docs]

CPU times: user 107 ms, sys: 27.4 ms, total: 134 ms
Wall time: 53.9 s


## Print out results

In [9]:
# Read the source URLs straight from the retrieval result instead of rebinding
# the global `metadatas`: that name holds the list index-aligned with `documents`,
# and overwriting it with retrieval-ordered metadata would mislabel documents if
# the vector-store cell were re-run afterwards.
# Relies on `retrieval['metadatas'][0]` being in the same order as the
# `retrieval['documents'][0]` entries the summaries were generated from.
urls = [meta['source-url'] for meta in retrieval['metadatas'][0]]
for rank, (url, summary) in enumerate(zip(urls, summaries), start=1):
    print(f'{rank}: {url}\n---------------------\n{summary}\n\n')

1: https://www.deseret.com/business/2025/04/23/donald-trump-trade-tariffs-imf-report-slower-economic-growth-higher-inflation/
---------------------
The article discusses the latest data on the impact of tariffs on the US economy. According to a new report from the International Monetary Fund (IMF), the US is headed for a "tariff double-shock" of higher inflation and slower growth. The report predicts that US economic growth will slow to 1.8% this year, down from a previous forecast of 2.7%, and that inflation will rise by 1 percentage point to 3%. The IMF attributes the decline in growth and increase in inflation to trade tensions and high policy uncertainty, with tariffs accounting for 0.4 percentage point of the reduction in growth.

The report also notes that the US has raised tariffs on imports from China to 145%, and that tariffs on steel and aluminum imports, as well as goods from Canada and Mexico, are also in place. However, there are reports that the White House is considering