In [1]:
import requests
import pandas as pd
import re
import bs4
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import time

In [2]:
def safe_get(url):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an error for HTTP errors (e.g., 404, 500)
        return response
    except RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

In [3]:
def get_data(soup):
    title=soup.find("meta", {"property": "og:title"})["content"]
    if title:
        title = title.strip()
    else:
        title=soup.title.string
        index = title.find("-")
        if index != -1:
            title = title[index+1:].strip()
    
    description = soup.find("meta", {"name": "description"})
    if description:
        description = description.get("content", "").strip()
    else:
        description = "No description found"
        
    image=soup.find("meta", {"property": "og:image"})
    if image:
        image = image.get("content", "").strip()
    else:
        image = ""
        
    site_url = soup.find("meta", {"property": "og:url"})
    if site_url:
        site_url = site_url.get("content", "").strip()
    else:
        site_url = ""
    
    paragraphs = soup.find("div", class_="article-content").find_all("p")
    if not paragraphs:
        content=""
    else:
        paragraph_texts = [p.get_text() for p in paragraphs]
        content=" ".join(paragraph_texts).strip()
    
    info=soup.find("div", class_="article-info--col").find_all("p")[0].get_text()
    if info:
        info=info[info.find(":") + 1:].strip()
        publish_date, publish_time=info.split("at")
    else:
        publish_date = ""
        publish_time = ""
    return {
        "title": title,
        "description": description,
        "content": content,
        "image": image,
        "site_url": site_url,
        "publish_date": publish_date.strip(),
        "publish_time": publish_time.strip()
    }

In [4]:
response = safe_get("https://www.bangkokpost.com/most-recent")

In [5]:
if response is not None:
    soup = BeautifulSoup(response.text, "html.parser")

In [6]:
figures = soup.find("div", class_="page--link").find_all("figure")
links=[]
for figure in figures:
    link = figure.find("a")
    if link:
        href = link.get("href")
        if href:
            links.append(href.strip())
links = list(set(links))  # Remove duplicates

In [7]:
links

['/thailand/general/2996892/no-signs-of-life-more-bodies-expected-at-collapsed-tower-bangkok-governor',
 '/world/2996902/frances-far-right-leftwingers-rally-supporters-after-le-pen-conviction',
 '/business/general/2996926/indonesia-will-not-retaliate-against-trump-tariff-official-says',
 '/business/general/2996861/us-vietnamese-businesses-ask-trump-to-delay-46-tariffs-on-vietnam',
 '/thailand/general/2996942/friendship-wont-taint-probe-into-bangkok-tower-collapse-anutin',
 '/world/2996917/recovering-pope-surprises-crowd-at-vatican-square',
 '/business/general/2996912/uk-readies-to-protect-industry-as-us-tariffs-upend-global-order-starmer',
 '/video/thailand/2996869/deeper-dive-thailand-vaping-debate-takeaways',
 '/thailand/politics/2996936/bhumjaithai-party-debuts-new-all-blue-logo']

In [8]:
for link in links:
    print("https://www.bangkokpost.com" + link)

https://www.bangkokpost.com/thailand/general/2996892/no-signs-of-life-more-bodies-expected-at-collapsed-tower-bangkok-governor
https://www.bangkokpost.com/world/2996902/frances-far-right-leftwingers-rally-supporters-after-le-pen-conviction
https://www.bangkokpost.com/business/general/2996926/indonesia-will-not-retaliate-against-trump-tariff-official-says
https://www.bangkokpost.com/business/general/2996861/us-vietnamese-businesses-ask-trump-to-delay-46-tariffs-on-vietnam
https://www.bangkokpost.com/thailand/general/2996942/friendship-wont-taint-probe-into-bangkok-tower-collapse-anutin
https://www.bangkokpost.com/world/2996917/recovering-pope-surprises-crowd-at-vatican-square
https://www.bangkokpost.com/business/general/2996912/uk-readies-to-protect-industry-as-us-tariffs-upend-global-order-starmer
https://www.bangkokpost.com/video/thailand/2996869/deeper-dive-thailand-vaping-debate-takeaways
https://www.bangkokpost.com/thailand/politics/2996936/bhumjaithai-party-debuts-new-all-blue-log

In [None]:
res=safe_get("https://www.bangkokpost.com/video/thailand/2996869/deeper-dive-thailand-vaping-debate-takeaways")
if res:
    soup = BeautifulSoup(res.text, "html.parser")
    
    data = get_data(soup)
    print(soup)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [18]:
news=[]
sleep_for=5
count=0
for link in links:
    time.sleep(sleep_for)  # import time
    target="https://www.bangkokpost.com" + link
    print("Fetching: "+target)
    response= safe_get(target)
    print("Response: "+str(response))
    if response:
        soup = BeautifulSoup(response.text, "html.parser")
        if(soup.find("div", class_="article-content") is None):
            continue
        data = get_data(soup)
        print("Count = ",count,data)
        news.append(data)
        count+=1


Fetching: https://www.bangkokpost.com/thailand/general/2996892/no-signs-of-life-more-bodies-expected-at-collapsed-tower-bangkok-governor
Response: <Response [200]>
Count =  0 {'title': 'No signs of life, more bodies expected at collapsed tower: Bangkok governor', 'description': 'Heavy machines are increasingly being used to excavate the rubble of the collapsed State Audit Office building and rescuers are likely to find more bodies as no further signs of life have been detected, Bangkok governor Chadchart Sittipunt said on Sunday.', 'content': 'Heavy machines are increasingly being used to excavate the rubble of the collapsed State Audit Office building and rescuers are likely to find more bodies as no further signs of life have been detected, Bangkok governor Chadchart Sittipunt said on Sunday. With chances of finding more survivors faded, heavy machines are operating at full speed to remove the top of the rubble pile and dig holes into its sides to reach areas where bodies are likely 

In [None]:
news

[{'title': 'CP chief Dhanin tops Thailand wealth list with $15bn',
  'description': 'Dhanin Chearavanont , senior chairman of the Charoen Pokphand (CP) Group, leads the list of Thailand&rsquo;s richest people on on the Forbes magazine World&rsquo;s Billionaires 2025 list, with a net worth of US$15.2 billion.',
  'content': 'Dhanin Chearavanont, senior chairman of the Charoen Pokphand (CP) Group, leads the list of Thailand’s richest people on on the Forbes magazine World’s Billionaires 2025 list, with a net worth of US$15.2 billion. He is ranked 141st among 3,028 billionaires on the list. The United States has a record 902 billionaires, followed by China (516, including Hong Kong) and India (205). Thailand has 25 billionaires on the list. Globally, Forbes said, 15 people have a net worth of at least $100 billion each. In Thailand, Mr Dhanin, 85, is followed by Sarath Ratanavadi, CEO of Gulf Development Plc, who has a net worth of $12.9 billion and is 184th on the global list. Charoen Si

In [None]:
from datetime import datetime

publish_date = "5 Apr 2025"
publish_time = "20:20"

# Combine and parse into datetime
dt = datetime.strptime(f"{publish_date} {publish_time}", "%d %b %Y %H:%M")

print(dt)  # Output: 2025-04-05 20:20:00


2025-04-05 20:20:00


In [None]:
type(dt)

datetime.datetime

In [None]:
dt=dt.strftime("%Y-%m-%d %H:%M:%S")
print(dt)  # Output: 2025-04-05 20:20:00

2025-04-05 20:20:00


In [None]:
type(dt)  # Output: <class 'datetime.datetime'>

str

In [None]:
import html
from io import BytesIO
def clean_text(text):
    return html.unescape(BeautifulSoup(text, "html.parser").get_text()).replace('\xa0', ' ').strip()

for news in news:
    news['title'] = clean_text(news['title'])
    news['description'] = clean_text(news['description'])
    news['content'] = clean_text(news['content'])

In [None]:
news

{'title': 'Arsenal draw at Everton leaves Liverpool smiling',
 'description': 'LIVERPOOL, England - Everton did neighbours Liverpool a massive favour by holding title-chasing Arsenal to a 1-1 Premier League draw at Goodison Park on Saturday.',
 'content': 'LIVERPOOL, England - Everton did neighbours Liverpool a massive favour by holding title-chasing Arsenal to a 1-1 Premier League draw at Goodison Park on Saturday. Iliman Ndiaye’s penalty cancelled out a Leandro Trossard opener for the visitors, who now have 62 points from 31 matches, 11 behind leaders Liverpool, who have a game at hand when they go to Fulham on Sunday. Everton climb to 14th place with 35 points from their 31 matches, 15 points clear of the relegation zone. Arsenal took the lead in the 34th minute with the first shot on target in the game as a poor header in midfield allowed Raheem Sterling to race clear and feed Trossard, who took a touch and drilled his shot low into the far right corner. Everton were awarded a pena