In [1]:
import os
from typing import TypedDict, Annotated, List
from langgraph.graph import Graph, END
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_core.runnables.graph import MermaidDrawMethod
from datetime import datetime
import re

from newsapi import NewsApiClient
import requests
from bs4 import BeautifulSoup

from IPython.display import display, Image as IPImage
from langchain_core.messages import SystemMessage

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
os.environ['summarizorenv'] ="utf-8"

#invoke llm
llm= ChatOpenAI(
    model= "gpt-4o-mini",
    temperature = 0.5,
    api_key = OPENAI_API_KEY
)

class GraphState(TypedDict):
    news_query: Annotated[str, "Input query to extract news search parameters from."]
    num_searches_remaining: Annotated[int, "Number of articles to search for."]
    newsapi_params: Annotated[dict, "Structured argument for the News API."]
    past_searches: Annotated[List[dict], "List of search params already used."]
    articles_metadata: Annotated[list[dict], "Article metadata response from the News API"]
    scraped_urls: Annotated[List[str], "List of urls already scraped."]
    num_articles_tldr: Annotated[int, "Number of articles to create TL;DR for."]
    potential_articles: Annotated[List[dict[str, str, str]], "Article with full text to consider summarizing."]
    tldr_articles: Annotated[List[dict[str, str, str]], "Selected article TL;DRs."]
    formatted_results: Annotated[str, "Formatted results to display."]

In [4]:
class NewsApiParams(BaseModel):
    q: str = Field(description="1-3 concise keyword search terms that are not too specific")
    sources: str =Field(description="comma-separated list of sources from: 'abc-news,abc-news-au,associated-press,australian-financial-review,axios,bbc-news,bbc-sport,bloomberg,business-insider,cbc-news,cbs-news,cnn,financial-post,fortune'")
    from_param: str = Field(description="date in format 'YYYY-MM-DD' Two days ago minimum. Extend up to 30 days on second and subsequent requests.")
    to: str = Field(description="date in format 'YYYY-MM-DD' today's date unless specified")
    language: str = Field(description="language of articles 'en' unless specified one of ['ar', 'de', 'en', 'es', 'fr', 'he', 'it', 'nl', 'no', 'pt', 'ru', 'se', 'ud', 'zh']")
    sort_by: str = Field(description="sort by 'relevancy', 'popularity', or 'publishedAt'")

In [5]:
class NewsState(TypedDict,total=False):
    retry_count: int

In [6]:
"""This function generates News API params as per user's question."""

def generate_newsapi_params(state: GraphState):
    today_date = datetime.now().strftime("%Y-%m-%d")
    news_query = state.get("news_query", "")
    num_searches_remaining = state.get("num_searches_remaining", 0)
    past_searches = state.get("past_searches") or []

    sys_prompt = """
    Today's date is {today_date}
    Create a param dict for the News API on the user query:
    {query}

    These searches have already been made. Loosen the search terms to get more results.
    {past_searches}

    Including this one, you have {num_searches_remaining} searches remaining. If this is your last search, use all news resources and 30 days search range.
"""
    
    sys_msg = sys_prompt.format(
        today_date=today_date,
        query=news_query,
        past_searches=past_searches,
        num_searches_remaining=num_searches_remaining
    )

    llm_with_news_structured_output = llm.with_structured_output(NewsApiParams)

    result = llm_with_news_structured_output.invoke([SystemMessage(content=sys_msg)])

    params = {
        "q": result.q,
        "sources": result.sources,
        "from_param": result.from_param,
        "to": result.to,
        "language": result.language,
        "sort_by": result.sort_by
    }

    state["newsapi_params"] = params

    return state


In [7]:
def retrieve_article_metadata(state: GraphState):
    """This gives metadata about the articles"""
    
    newsapi_params = state.get("newsapi_params", {})
    scraped_urls = state.get("scraped_urls") or []
    potential_articles = state.get("potential_articles") or []
    past_searches = state.get("past_searches") or []
        
    # initiate NewsAPI Client
    newsapi = NewsApiClient(api_key='e99bd6f587ed4652a89f8e326f082aee')
        
    # get the articles
    articles = newsapi.get_everything(**newsapi_params)
    
    # add the parameters for the history
    past_searches.append(newsapi_params)

    new_articles = []
        
    for article in articles['articles']:
        if article['url'] not in scraped_urls and len(potential_articles) + len(new_articles) < 10:
            new_articles.append(article)
    
    state['articles_metadata'] = new_articles
    return state


In [8]:
def retrieve_article_text(state: GraphState):
    """scrape the websites metadata"""

    article_metadata = state.get("articles_metadata") or []

    potential_articles = []

    # header for scraping
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }

    for article in article_metadata:
        url = article['url']

        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            text = soup.get_text(strip=True)

            potential_articles.append({
                "title": article["title"],
                "url": url,
                "description": article["description"],
                "text": text
            })

            scraped_urls = state.get("scraped_urls") or []
            scraped_urls.append(url)
            state["scraped_urls"] = scraped_urls

    state.get("potential_articles", []).extend(potential_articles)
    return state


In [9]:
def select_top_urls(state: GraphState) -> GraphState:
    """Based on the article synoses, choose the top-n articles to summarize."""
    news_query = state.get("news_query", "")
    num_articles_tldr = state.get("num_articles_tldr", 0)
    
    # load all processed articles with full text but no summaries
    potential_articles = state.get("potential_articles") or []

    # format the metadata
    formatted_metadata = "\n".join([f"{article['url']}\n{article['description']}\n" for article in potential_articles])

    prompt = f"""
    Based on the user news query:
    {news_query}

    Reply with a list of strings of up to {num_articles_tldr} relevant urls.
    Don't add any urls that are not relevant or aren't listed specifically.
    {formatted_metadata}
    """
    result = llm.invoke(prompt).content

    # use regex to extract the urls as a list
    url_pattern = r'(https?://[^\s",]+)'

    # Find all URLs in the text
    urls = re.findall(url_pattern, result)

    # add the selected article metadata to the state
    tldr_articles = [article for article in potential_articles if article['url'] in urls]

    state["tldr_articles"] = tldr_articles
    return state


In [10]:
async def summarize_articles_parallel(state: GraphState) -> GraphState:
    """Summarize the articles based on full text."""
    tldr_articles = state.get("tldr_articles") or []

    prompt = """
    Create a * bulleted summarizing tldr for the article:
    {text}
    Be sure to follow the following format exaxtly with nothing else:
    {title}
    {url}
    * tl;dr bulleted summary
    * use bullet points for each sentence
    """

    # iterate over the selected articles and collect summaries synchronously
    for i in range(len(tldr_articles)):
        text = tldr_articles[i]["text"]
        title = tldr_articles[i]["title"]
        url = tldr_articles[i]["url"]
        # invoke the llm synchronously
        result = llm.invoke(prompt.format(title=title, url=url, text=text))
        tldr_articles[i]["summary"] = result.content

    state["tldr_articles"] = tldr_articles

    return state


In [11]:
def format_results(state: GraphState) -> GraphState:
    """Format the results for display."""
    # load a list of past search queries
    q = [newsapi_params["q"] for newsapi_params in (state.get("past_searches") or [])]
    formatted_results = f"Here are the top {len(state.get('tldr_articles') or [])} articles based on search terms:\n{', '.join(q)}\n\n"

    # load the summarized articles
    tldr_articles = state.get("tldr_articles") or []

    # format article tl;dr summaries
    tldr_articles = "\n\n".join([f"{article['summary']}" for article in tldr_articles])

    # concatenate summaries to the formatted results
    formatted_results += tldr_articles

    state["formatted_results"] = formatted_results

    return state


In [12]:
'''def format_results(state: GraphState) -> GraphState:
    """Format the results for display."""
    # load a list of past search queries
    q = [newsapi_params["q"] for newsapi_params in state["past_searches"]]
    formatted_results = f"Here are the top {len(state['tldr_articles'])} articles based on search terms:\\n{', '.join(q)}\\n\\n"

    # load the summarized articles
    tldr_articles = state["tldr_articles"]

    # format article tl;dr summaries
    tldr_articles = "\\n\\n".join([f"{article['summary']}" for article in tldr_articles])

    # concatenate summaries to the formatted results
    formatted_results += tldr_articles

    state["formatted_results"] = formatted_results

    return state'''

'def format_results(state: GraphState) -> GraphState:\n    """Format the results for display."""\n    # load a list of past search queries\n    q = [newsapi_params["q"] for newsapi_params in state["past_searches"]]\n    formatted_results = f"Here are the top {len(state[\'tldr_articles\'])} articles based on search terms:\\n{\', \'.join(q)}\\n\\n"\n\n    # load the summarized articles\n    tldr_articles = state["tldr_articles"]\n\n    # format article tl;dr summaries\n    tldr_articles = "\\n\\n".join([f"{article[\'summary\']}" for article in tldr_articles])\n\n    # concatenate summaries to the formatted results\n    formatted_results += tldr_articles\n\n    state["formatted_results"] = formatted_results\n\n    return state'

In [13]:
def articles_text_decision(state: GraphState) -> str:
    """Check results of retrieve_articles_text to determine next step."""
    
    if state.get("num_searches_remaining", 0) == 0:
        # if no articles with text were found return END
        if len(state.get("potential_articles") or []) == 0:
            state["formatted_results"] = "No articles with text found."
            return "END"
        # if some articles were found, move on to selecting the top urls
        else:
            return "select_top_urls"
    else:
        # if the number of articles found is less than the number of articles to summarize, continue searching
        if len(state.get("potential_articles") or []) < state.get("num_articles_tldr", 0):
            return "generate_newsapi_params"
        # otherwise move on to selecting the top urls
        else:
            return "select_top_urls"


In [14]:
from langgraph.graph import Graph, START, END

workflow = Graph()

# define nodes
workflow.add_node("generate_newsapi_params", generate_newsapi_params)
workflow.add_node("retrieve_articles_metadata", retrieve_article_metadata)
workflow.add_node("retrieve_articles_text", retrieve_article_text)
workflow.add_node("select_top_urls", select_top_urls)
workflow.add_node("summarize_articles_parallel", summarize_articles_parallel)
workflow.add_node("format_results", format_results)

# entry edge: connect START to first node
workflow.add_edge(START, "generate_newsapi_params")

# define edges
workflow.add_edge("generate_newsapi_params", "retrieve_articles_metadata")
workflow.add_edge("retrieve_articles_metadata", "retrieve_articles_text")
workflow.add_conditional_edges(
    "retrieve_articles_text",
    articles_text_decision,
    {
        # keys here should match possible return values of articles_text_decision(state)
        "generate_newsapi_params": "generate_newsapi_params",
        "select_top_urls": "select_top_urls",
        "END": END
    }
)
workflow.add_edge("select_top_urls", "summarize_articles_parallel")
workflow.add_conditional_edges(
    "summarize_articles_parallel",
    lambda state: "format_results" if len(state.get("tldr_articles", [])) > 0 else "END",
    {
        "format_results": "format_results",
        "END": END
    }
)
workflow.add_edge("format_results", END)

app = workflow.compile()


In [15]:
async def run_workflow(query: str, num_searches_remaining: int = 3, num_articles_tldr: int = 2):
    """Run the LangGraph workflow and display results."""
    initial_state = {
        "news_query": query,
        "num_searches_remaining": num_searches_remaining,
        "newsapi_params": {},
        "past_searches": [],
        "articles_metadata": [],
        "scraped_urls": [],
        "num_articles_tldr": num_articles_tldr,
        "potential_articles": [],
        "tldr_articles": [],
        "formatted_results": "No articles with text found."
    }
    try:
        result = await app.ainvoke(initial_state)
        
        return result["formatted_results"]
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

In [20]:
query = "Apple Iphone 15"
result=await run_workflow(query, num_articles_tldr=3)
print(result)




Here are the top 3 articles based on search terms:


Apple just joined Nvidia and Microsoft in the $4 trillion club  
https://markets.businessinsider.com/news/stocks/apple-stock-price-market-cap-iphone-17-nvidia-microsoft-ai-2025-10  
* Apple achieved a market capitalization of $4 trillion for the first time.  
* It becomes the third public company to reach this milestone, following Nvidia and Microsoft.  
* The surge in Apple’s stock price, reaching around $270, was driven by strong sales of the new iPhone 17.  
* The iPhone 17 outsold its predecessor in the US and China during its initial release period.  
* Despite the milestone, Apple’s stock has only increased about 7% year to date, amid concerns about its position in the AI sector.  
* Nvidia's stock has risen 44% this year, while Microsoft's has increased by 29%.  
* Apple reported net sales of $391 billion and net income of $94 billion in its last financial year.  
* The company generated over $200 billion from iPhone sales alo