In [1]:
import requests
import pandas as pd
import re
import bs4
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import time

In [2]:
def safe_get(url):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an error for HTTP errors (e.g., 404, 500)
        return response
    except RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

In [3]:
def get_data(soup):
    title=soup.find("meta", {"property": "og:title"})["content"]
    if title:
        title = title.strip()
    else:
        title=soup.title.string
        index = title.find("-")
        if index != -1:
            title = title[index+1:].strip()
    
    description = soup.find("meta", {"name": "description"})
    if description:
        description = description.get("content", "").strip()
    else:
        description = "No description found"
        
    image=soup.find("meta", {"property": "og:image"})
    if image:
        image = image.get("content", "").strip()
    else:
        image = ""
        
    site_url = soup.find("meta", {"property": "og:url"})
    if site_url:
        site_url = site_url.get("content", "").strip()
    else:
        site_url = ""
    
    paragraphs = soup.find("div", class_="article-content").find_all("p")
    if not paragraphs:
        content=""
    else:
        paragraph_texts = [p.get_text() for p in paragraphs]
        content=" ".join(paragraph_texts).strip()
    
    info=soup.find("div", class_="article-info--col").find_all("p")[0].get_text()
    if info:
        info=info[info.find(":") + 1:].strip()
        publish_date, publish_time=info.split("at")
    else:
        publish_date = ""
        publish_time = ""
    return {
        "title": title,
        "description": description,
        "content": content,
        "image": image,
        "site_url": site_url,
        "publish_date": publish_date.strip(),
        "publish_time": publish_time.strip()
    }

In [4]:
response = safe_get("https://www.bangkokpost.com/most-recent")

In [5]:
if response is not None:
    soup = BeautifulSoup(response.text, "html.parser")

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Setup Chrome WebDriver
options = Options()
options.add_argument("--start-maximized")  # optional: start maximized
service = Service('path/to/chromedriver')  # replace with your actual path
driver = webdriver.Chrome(service=service, options=options)

# Open the page with the news
driver.get("https://bangkokpost.com/most-recent")  # replace with actual URL

try:
    # Wait until the 'MORE' button is clickable and then click
    wait = WebDriverWait(driver, 10)
    more_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//a[@class="btn btn-border" and text()="MORE"]')))
    more_button.click()
    print("Clicked the 'MORE' button successfully.")
except Exception as e:
    print("Failed to click the 'MORE' button:", e)
finally:
    driver.quit()


NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


In [6]:
figures = soup.find("div", class_="page--link").find_all("figure")
links=[]
for figure in figures:
    link = figure.find("a")
    if link:
        href = link.get("href")
        if href:
            links.append(href.strip())
links = list(set(links))  # Remove duplicates

In [7]:
links

['/world/2997586/buchenwald-camp-survivor-recounts-liberation-80-years-on',
 '/thailand/general/2997562/sniffer-dogs-withdrawn-from-bangkoks-collapsed-tower-building',
 '/world/2997577/china-holds-military-drills-at-newly-expanded-cambodian-naval-base',
 '/thailand/general/2997569/cat-lovers-eager-to-adopt-small-feline-found-alive-in-collapsed-highrise',
 '/thailand/general/2997546/seven-provinces-flagged-for-summer-storms-gusting-winds',
 '/world/2997596/belgian-prince-seeks-social-security-on-top-of-allowance',
 '/world/2997604/hong-kongs-cathay-pacific-bans-pilots-cabin-crew-from-taking-power-banks-on-flights',
 '/business/general/2997556/dollar-funding-demand-grows-as-risk-aversion-rattles-markets',
 '/thailand/general/2997537/frequent-small-tremors-in-chiang-mai-mae-hong-son']

In [8]:
for link in links:
    print("https://www.bangkokpost.com" + link)

https://www.bangkokpost.com/world/2997586/buchenwald-camp-survivor-recounts-liberation-80-years-on
https://www.bangkokpost.com/thailand/general/2997562/sniffer-dogs-withdrawn-from-bangkoks-collapsed-tower-building
https://www.bangkokpost.com/world/2997577/china-holds-military-drills-at-newly-expanded-cambodian-naval-base
https://www.bangkokpost.com/thailand/general/2997569/cat-lovers-eager-to-adopt-small-feline-found-alive-in-collapsed-highrise
https://www.bangkokpost.com/thailand/general/2997546/seven-provinces-flagged-for-summer-storms-gusting-winds
https://www.bangkokpost.com/world/2997596/belgian-prince-seeks-social-security-on-top-of-allowance
https://www.bangkokpost.com/world/2997604/hong-kongs-cathay-pacific-bans-pilots-cabin-crew-from-taking-power-banks-on-flights
https://www.bangkokpost.com/business/general/2997556/dollar-funding-demand-grows-as-risk-aversion-rattles-markets
https://www.bangkokpost.com/thailand/general/2997537/frequent-small-tremors-in-chiang-mai-mae-hong-son


In [None]:
# res=safe_get("https://www.bangkokpost.com/video/thailand/2996869/deeper-dive-thailand-vaping-debate-takeaways")
# if res:
#     soup = BeautifulSoup(res.text, "html.parser")
    
#     data = get_data(soup)
#     print(soup)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [10]:
news=[]
sleep_for=5
count=0
for link in links:
    time.sleep(sleep_for)  # import time
    target="https://www.bangkokpost.com" + link
    print("Fetching: "+target)
    response= safe_get(target)
    print("Response: "+str(response))
    if response:
        soup = BeautifulSoup(response.text, "html.parser")
        if(soup.find("div", class_="article-content") is None):
            continue
        data = get_data(soup)
        print("Count = ",count,data)
        news.append(data)
        count+=1


Fetching: https://www.bangkokpost.com/world/2997586/buchenwald-camp-survivor-recounts-liberation-80-years-on
Response: <Response [200]>
Count =  0 {'title': 'Buchenwald camp survivor recounts liberation, 80 years on', 'description': 'PARIS - Jacques Moalic, a 102-year-old former Agence France-Presse journalist, remembers vividly the liberation of Buchenwald concentration camp by American troops 80 years ago.', 'content': 'PARIS - Jacques Moalic, a 102-year-old former Agence France-Presse journalist, remembers vividly the liberation of Buchenwald concentration camp by American troops 80 years ago. In an interview with AFP, where he worked for 40 years after World War II, the Buchenwald survivor spoke of his last months in captivity and the arrival of American troops on April 11, 1945. He recounted his ordeal once again, speaking ahead of the 80th anniversary of the camp\'s liberation. Around 56,000 Jews, Roma and Soviet prisoners died there between 1937 and 1945. In 1943, Moalic, then a

In [None]:
news

In [None]:
from datetime import datetime

publish_date = "5 Apr 2025"
publish_time = "20:20"

# Combine and parse into datetime
dt = datetime.strptime(f"{publish_date} {publish_time}", "%d %b %Y %H:%M")

print(dt)  # Output: 2025-04-05 20:20:00


In [None]:
type(dt)

In [None]:
dt=dt.strftime("%Y-%m-%d %H:%M:%S")
print(dt)  # Output: 2025-04-05 20:20:00

In [None]:
type(dt)  # Output: <class 'datetime.datetime'>

In [None]:
import html
from io import BytesIO
def clean_text(text):
    return html.unescape(BeautifulSoup(text, "html.parser").get_text()).replace('\xa0', ' ').strip()

for news in news:
    news['title'] = clean_text(news['title'])
    news['description'] = clean_text(news['description'])
    news['content'] = clean_text(news['content'])

In [None]:
news