In [1]:
! pip install markdown



In [2]:
! pip install transformers



In [3]:
# get the respone using requests module
import requests

URL = "https://www.indiatoday.in/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(URL, headers=headers)

print("status code:",response.status_code)

status code: 200


In [4]:
# creating an html file
with open('news.html', 'w', encoding='utf-8') as f:
    f.write(response.text)

In [5]:
# setting up beautiful soup
from bs4 import BeautifulSoup

with open("news.html", 'r', encoding='utf-8') as f:
    html_doc = f.read()
soup = BeautifulSoup(html_doc, 'html.parser')

In [6]:
# get the story grid
story_grids = soup.find_all("div", class_="story__grid")
print(len(story_grids))

6


In [7]:
# get all the links and titles
set_lt = set()

for story_grid in story_grids:
    anchors = story_grid.find_all('a', href=True, title=True)
    for a in anchors:
        if(a['href'][0] == '/'):
            set_lt.add((a['href'], a['title']))

set_lt

{('/business/story/onion-prices-surge-rising-demand-traders-hold-stock-nashik-maharashtra-india-2551660-2024-06-11',
  'Onion prices increase 30-50% in 2 weeks as demand rises: Report '),
 ('/elections/story/nda-south-india-bjp-ministers-modi-30-andhra-pradesh-telangana-karnataka-tamil-nadu-kerala-2551436-2024-06-11',
  "13 South ministers add to Modi 3.0 gunpowder, show BJP's resolve"),
 ('/india/andhra-pradesh/story/pawan-kalyan-seeks-deputy-chief-minister-post-in-andhra-cabinet-sources-2551641-2024-06-11',
  'Pawan Kalyan seeks Deputy Chief Minister post in Andhra cabinet: Sources'),
 ('/india/karnataka/story/kannada-actor-darshan-thoogudeepa-arrested-renukaswamy-murder-case-bengaluru-police-2551669-2024-06-11',
  'Kannada actor Darshan Thoogudeepa, wife Pavithra Gowda detained in murder case'),
 ('/india/story/bengal-man-denies-making-sexual-exploitation-charge-against-amit-malviya-2551622-2024-06-11',
  "Bengal man denies making sexual exploitation charge against BJP's Amit Malviy

In [8]:
# group the news using the first part of the links
groups = {}
for lt in set_lt:
    news_type = lt[0][1: lt[0][1:].find('/')+1]
    if news_type not in groups:
        groups[news_type] = [lt]
    else:
        groups[news_type].append(lt)

In [9]:
# hugging face pipelines
from transformers import pipeline

summarizer =  pipeline("summarization", model="google-t5/t5-base", tokenizer="google-t5/t5-base")

2024-06-11 07:19:44.937907: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 07:19:44.937959: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 07:19:44.939374: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
# create an abstraction to scrape paths
import os

def scrape_path(path):
    
    # use the requesets library to retrive the html file
    pathURL = "https://www.indiatoday.in" + path
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = requests.get(pathURL, headers=headers)
    
    if response.status_code != 200:
        return None
    
    with open('temp.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    with open("temp.html", 'r', encoding='utf-8') as f:
        html_doc = f.read()
        
    # create the soup
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    # get the story description
    story_description = ""
    all_div = soup.find_all('div')
    for div in all_div:
        all_classes = set(div.get('class', []))
        if any("story_description" in cls.lower() for cls in all_classes):
            all_ps = div.find_all('p')
            for p in all_ps:
                if not p.attrs: 
                    story_description += p.text
    story_description = story_description.replace('\n', '')
    
    # get the given summary
    story_summary = ""
    all_ul = soup.find_all("ul")
    for ul in all_ul:
        all_classes = set(ul.get('class', []))
        if any("story_high" in cls.lower() for cls in all_classes):
            all_li = ul.find_all('li')
            for li in all_li:
                story_summary += li.text + '. ';
    
    # get pipeline summary
    pipeline_summary = summarizer(story_description)
    
    # delete the file
    if os.path.exists("temp.html"):
        os.remove("temp.html")
        
    return story_summary + pipeline_summary[0]['summary_text']

In [11]:
final_markdown_string = "# News \n"
for group in groups:
    final_markdown_string += f"## {group} \n"
    for path in groups[group]:
        path_summary = scrape_path(path[0])
        if(path_summary is not None):
            final_markdown_string += f"### {path[1]} \n" + path_summary + ' \n'

Your max_length is set to 200, but your input_length is only 3. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1)


In [12]:
# Convert markdown to HTML
import markdown

html_content = markdown.markdown(final_markdown_string)

# Optionally, write the HTML content to a file
with open('summary.html', 'w', encoding='utf-8') as f:
      f.write(html_content)

In [13]:
# delete the news file
# delete the file
if os.path.exists("news.html"):
    os.remove("news.html")