In [None]:
import requests
import pandas as pd
import re
import bs4
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import time

In [2]:
def safe_get(url):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an error for HTTP errors (e.g., 404, 500)
        return response
    except RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

In [3]:
def get_data(soup):
    title=soup.find("meta", {"property": "og:title"})["content"]
    if title:
        title = title.strip()
    else:
        title=soup.title.string
        index = title.find("-")
        if index != -1:
            title = title[index+1:].strip()
    
    description = soup.find("meta", {"name": "description"})
    if description:
        description = description.get("content", "").strip()
    else:
        description = "No description found"
        
    image=soup.find("meta", {"property": "og:image"})
    if image:
        image = image.get("content", "").strip()
    else:
        image = ""
        
    site_url = soup.find("meta", {"property": "og:url"})
    if site_url:
        site_url = site_url.get("content", "").strip()
    else:
        site_url = ""
    
    paragraphs = soup.find("div", class_="article-content").find_all("p")
    if not paragraphs:
        content=""
    else:
        paragraph_texts = [p.get_text() for p in paragraphs]
        content=" ".join(paragraph_texts).strip()
    
    info=soup.find("div", class_="article-info--col").find_all("p")[0].get_text()
    if info:
        info=info[info.find(":") + 1:].strip()
        publish_date, publish_time=info.split("at")
    else:
        publish_date = ""
        publish_time = ""
    return {
        "title": title,
        "description": description,
        "content": content,
        "image": image,
        "site_url": site_url,
        "publish_date": publish_date.strip(),
        "publish_time": publish_time.strip()
    }

In [4]:
response = safe_get("https://www.bangkokpost.com/most-recent")

In [5]:
if response is not None:
    soup = BeautifulSoup(response.text, "html.parser")

In [6]:
figures = soup.find("div", class_="page--link").find_all("figure")
links=[]
for figure in figures:
    link = figure.find("a")
    if link:
        href = link.get("href")
        if href:
            links.append(href.strip())
links = list(set(links))  # Remove duplicates

In [7]:
links

['/thailand/general/2996427/uk-couple-happy-to-be-home-after-thailand-ordeal',
 '/world/2996402/india-and-uae-to-develop-sri-lanka-energy-hub',
 '/world/2996386/usaid-team-fired-during-myanmar-quake-mission',
 '/world/2996376/manila-sees-beijings-arrests-of-spies-as-retaliation',
 '/thailand/general/2996356/din-daeng-toll-plaza-closing-at-night-from-saturday-to-monday',
 '/thailand/general/2996392/chinese-tourist-found-dead-on-pattaya-beach',
 '/thailand/general/2996349/steel-samples-at-building-collapse-site-sent-for-testing',
 '/world/2996339/trumps-global-baseline-tariff-takes-effect',
 '/thailand/politics/2996304/about-politics-nominees-feel-the-blues']

In [8]:
for link in links:
    print("https://www.bangkokpost.com" + link)

https://www.bangkokpost.com/thailand/general/2996427/uk-couple-happy-to-be-home-after-thailand-ordeal
https://www.bangkokpost.com/world/2996402/india-and-uae-to-develop-sri-lanka-energy-hub
https://www.bangkokpost.com/world/2996386/usaid-team-fired-during-myanmar-quake-mission
https://www.bangkokpost.com/world/2996376/manila-sees-beijings-arrests-of-spies-as-retaliation
https://www.bangkokpost.com/thailand/general/2996356/din-daeng-toll-plaza-closing-at-night-from-saturday-to-monday
https://www.bangkokpost.com/thailand/general/2996392/chinese-tourist-found-dead-on-pattaya-beach
https://www.bangkokpost.com/thailand/general/2996349/steel-samples-at-building-collapse-site-sent-for-testing
https://www.bangkokpost.com/world/2996339/trumps-global-baseline-tariff-takes-effect
https://www.bangkokpost.com/thailand/politics/2996304/about-politics-nominees-feel-the-blues


In [10]:
news=[]
sleep_for=5
count=0
for link in links:
    time.sleep(sleep_for)  # import time
    target="https://www.bangkokpost.com" + link
    print("Fetching: "+target)
    response= safe_get(target)
    print("Response: "+str(response))
    if response is not None:
        soup = BeautifulSoup(response.text, "html.parser")
        data = get_data(soup)
        print("Count = ",count,data)
        news.append(data)
        count+=1


Fetching: https://www.bangkokpost.com/thailand/general/2996427/uk-couple-happy-to-be-home-after-thailand-ordeal
Response: <Response [200]>
Count =  0 {'title': 'UK couple happy to be home after Thailand ordeal', 'description': 'A British couple whose Thailand retirement dream turned into a nightmare are back home and say they &ldquo;couldn&rsquo;t be happier&rdquo; as they prepare to rebuild their lives.', 'content': 'A British couple whose Thailand retirement dream turned into a nightmare are back home and say they “couldn’t be happier” as they prepare to rebuild their lives. “I wish we’d never gone to Thailand but we can’t turn the clock back and Des and I are going to have a wonderful retirement together,” Mary Byrne, 69, told the BBC. She and her husband Des, 77, were badly beaten by two neighbours after a property dispute in Hua Hin, where they had purchased a villa. They said they acted in self-defence but feared they would be jailed when Thai authorities said they and their assa

In [11]:
news

[{'title': 'UK couple happy to be home after Thailand ordeal',
  'description': 'A British couple whose Thailand retirement dream turned into a nightmare are back home and say they &ldquo;couldn&rsquo;t be happier&rdquo; as they prepare to rebuild their lives.',
  'content': 'A British couple whose Thailand retirement dream turned into a nightmare are back home and say they “couldn’t be happier” as they prepare to rebuild their lives. “I wish we’d never gone to Thailand but we can’t turn the clock back and Des and I are going to have a wonderful retirement together,” Mary Byrne, 69, told the BBC. She and her husband Des, 77, were badly beaten by two neighbours after a property dispute in Hua Hin, where they had purchased a villa. They said they acted in self-defence but feared they would be jailed when Thai authorities said they and their assailants would all face charges. \xa0 The incident took place in December 2023 but the charges were not laid until June last year. The couple’s pas

In [12]:
from datetime import datetime

publish_date = "5 Apr 2025"
publish_time = "20:20"

# Combine and parse into datetime
dt = datetime.strptime(f"{publish_date} {publish_time}", "%d %b %Y %H:%M")

print(dt)  # Output: 2025-04-05 20:20:00


2025-04-05 20:20:00


In [13]:
type(dt)

datetime.datetime

In [14]:
dt=dt.strftime("%Y-%m-%d %H:%M:%S")
print(dt)  # Output: 2025-04-05 20:20:00

2025-04-05 20:20:00


In [15]:
type(dt)  # Output: <class 'datetime.datetime'>

str