# Building a News Article Summarizer

In [None]:
!pip install newspaper3k lxml_html_clean langchain langchain-cohere python-dotenv

The `COHERE_API_KEY` is stored in a `.env` file.

In [None]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Calls made through `warnings.warn` now hit the no-op above; a blanket
# "ignore" filter is installed as well.
warnings.warn = warn
warnings.filterwarnings('ignore')

import os
import cohere
import json

# NOTE(review): no visible cell loads the .env file (e.g. via python-dotenv) —
# confirm COHERE_API_KEY actually reaches the kernel's environment.
cohere_api_key = os.getenv("COHERE_API_KEY")
co = cohere.Client(cohere_api_key)

Use the newspaper library to extract the title and text of the article.

In [None]:
import requests
from newspaper import Article
# Browser-like request headers sent with the page fetch.
# NOTE(review): presumably needed because the site rejects default client
# user agents — confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
article_url = """https://www.artificialintelligence-news.com/news/meta-claims-new-ai-supercomputer-will-set-records/"""

session = requests.Session()

try:
    response = session.get(article_url, headers=headers, timeout=10)
    if response.status_code == 200:
        article = Article(article_url)
        # Hand newspaper the HTML fetched above instead of letting it issue a
        # second request that would not carry the custom headers.
        article.download(input_html=response.text)
        article.parse()

        print(f"Title : {article.title}")
        print(f"Text : {article.text}")
    else:
        print(f"Failed to fetch article at {article_url}")
except Exception as e:
    # f-string prefix added: the original printed the placeholders literally.
    print(f"Error occured while fetching article at {article_url} : {e}")

The HumanMessage is a structured data format that captures user messages within chat-based interactions. In the following code, the ChatCohere class is employed for interaction with the AI model and the HumanMessage schema provides a standardized format for user messages.

In [None]:
from langchain.schema import(
    HumanMessage
)
# Keep the parsed fields under plain names; the structured-output cell
# further down reads article_title / article_text directly.
article_title = article.title
article_text = article.text

# Single-turn prompt: role instruction, the article itself, then the request.
template = """
You are a very good assistant that summarizes online articles.
Here's the artcile you want to summarize.
===================
Title: {article_title}
{article_text}

===================
Write a summary of the previous article
"""

# Fill in the placeholders and wrap the result as one user message.
prompt = template.format(article_text=article_text, article_title=article_title)
messages = [HumanMessage(content=prompt)]

In [None]:
import os
import cohere
from langchain_cohere import ChatCohere

# Read the key once and hand it to both clients.
api_key = os.getenv("COHERE_API_KEY")

# Direct Cohere SDK client.
co = cohere.Client(api_key)

# LangChain chat wrapper for the Cohere model, temperature fixed at 0.
llm = ChatCohere(
    model="command-a-03-2025",
    temperature=0,
    cohere_api_key=api_key,
)

The model processes the prompt and returns the summary :

In [None]:
# Send the prompt to the model and show only the text of its reply.
reply = llm.invoke(messages)
print(reply.content)

Modify the prompt to ask for a bulleted list format:

In [None]:
# Same task as before, but the instructions now request a bulleted list.
template = """You are an advanced AI assistant that summarizes online articles into bullet lists.
Here's the article you want to summarize.

========================
Title : {article_title}

{article_text}
========================

Now, provide a summarized version of the article in a bulleted list format.
"""

# Fill the template with the parsed article.
prompt = template.format(article_text=article.text, article_title=article.title)

# Wrap the prompt as a single user message, query the model, print the reply text.
message = [HumanMessage(content=prompt)]
bullet_reply = llm.invoke(message)
print(bullet_reply.content)

### Improving the summarizer

I will use the PydanticOutputParser class. This output parser allows users to specify an arbitrary JSON schema and query LLMs for JSON outputs that conform to that schema. The goal here is to convert the language model's string output into a structured data format. I will use a custom class derived from the BaseModel class of the Pydantic package.

In [None]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import field_validator
from pydantic import BaseModel, Field
from typing import List

# create output parser class
# Output schema: the article title plus the summary as a list of bullet lines.
# (Documented with a comment rather than a class docstring, because Pydantic
# copies a model's docstring into its JSON schema description.)
class ArticleSummary(BaseModel):
    title: str = Field(description="Title of the article")
    summary: List[str] = Field(description="""Bulleted list summary of the article""")

    # Pydantic v2 field validators are class methods; the explicit
    # @classmethod goes beneath @field_validator.
    @field_validator('summary')
    @classmethod
    def has_three_or_more_lines(cls, list_of_lines: List[str]) -> List[str]:
        """Reject summaries with fewer than three bullet points.

        Args:
            list_of_lines: The candidate value of the ``summary`` field.

        Returns:
            ``list_of_lines`` unchanged when it has at least three items.

        Raises:
            ValueError: If ``list_of_lines`` has fewer than three items.
        """
        if len(list_of_lines) < 3:
            raise ValueError("""Generated summary has less than three bullet points!""")
        return list_of_lines


# Output parser bound to the ArticleSummary model.
parser = PydanticOutputParser(pydantic_object=ArticleSummary)

In [None]:
# The prompt template integrates the custom parser to format the prompts
from langchain.prompts import PromptTemplate

# Prompt body: the article followed by the parser's format instructions.
template = """You are a very good assistant that summarizes online articles.
Here's the article you want to summarize.
===================
Title: {article_title}
{article_text}

===================
{format_instructions}
"""

# The format instructions come from the parser and are bound up front, so
# only the two article fields remain to be supplied at call time.
format_instructions = parser.get_format_instructions()
prompt_template = PromptTemplate(
    input_variables=["article_title", "article_text"],
    partial_variables={"format_instructions": format_instructions},
    template=template,
)

Transforming the string output from the model into a specified format :

In [None]:
from langchain import LLMChain
from langchain_cohere import ChatCohere

model = ChatCohere(
    cohere_api_key=os.getenv("COHERE_API_KEY"),
    model="command-a-03-2025",
    temperature=0,
)

# Compose prompt and model with the runnable pipe operator; this replaces the
# deprecated LLMChain class and its deprecated .run() method.
chain = prompt_template | model

# Use the model to generate a summary. The chat model returns a message
# object, so take its text via .content to keep `output` a plain string.
output = chain.invoke(
    {"article_title": article_title, "article_text": article_text}
).content

# Parse the output into the Pydantic model
parsed_output = parser.parse(output)
print(parsed_output)