In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time

from tqdm import tqdm
import re

from urllib.parse import urljoin

In [9]:
def extract_kbcmw_page_article_links(url, base_url):
    """
    Collect the article links listed on a kbcmw index page.

    The page at ``url`` is downloaded, the first
    ``<div class="modular_classify_item2 clearfix">`` is located, and every
    ``<a>`` inside its ``<li class="newscontent">`` items is joined onto
    ``base_url`` with ``urljoin``. Anchors without a usable ``href`` are
    skipped.

    Args:
        url (str): The URL of the kbcmw webpage containing article links.
        base_url (str): Base that each extracted ``href`` is resolved against.

    Returns:
        dict:
            {
                "Links": List[str],   # resolved article URLs, in page order
                "Message": str,       # "Success" or a description of the failure
                "Response": int,      # 200, 408 (timeout), the HTTP status of a
                                      # failed request (500 if none is available),
                                      # 404 (ValueError while parsing), 500 (other)
                "source_url": str     # the ``url`` argument, echoed back
            }

    Exceptions raised while fetching or parsing are caught here and reported
    through "Message" / "Response" instead of propagating to the caller.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    final_response = {
        "Links": [],
        "Message": "Success",
        "Response": 200,
        "source_url": url
    }

    try:
        start_time = time.time()
        # (connect timeout, read timeout) in seconds.
        response = requests.get(url, headers=headers, timeout=(5, 55))
        response.raise_for_status()
        elapsed = time.time() - start_time

        if elapsed > 50:
            print(f"This ULR Took more then 50s: {url}")

        soup = BeautifulSoup(response.content, 'html.parser')

        all_links = []
        listing = soup.find("div", class_="modular_classify_item2 clearfix")
        if listing is not None:
            for item in listing.find_all("li", class_="newscontent"):
                for anchor in item.find_all("a"):
                    href = anchor.get("href")
                    # urljoin() returns base_url unchanged for a missing or
                    # empty href, which would add a bogus "article" link.
                    if not href:
                        continue
                    all_links.append(urljoin(base_url, href))

        final_response["Links"] = all_links
        return final_response

    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the webpage: {e}"
        # e.response is None when no HTTP response was received (e.g. a
        # connection error); fall back to 500 so "Response" is always an int.
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response
    except ValueError as e:
        final_response["Message"] = f"An error occurred while parsing the webpage: {e}"
        final_response["Response"] = 404
        return final_response
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response


In [11]:
# Smoke test: list the article links found on the "kbnews" index page.
# The hrefs scraped from the page are joined onto base_url by the extractor.
base_url = "http://ti.kbcmw.com"
url = "http://ti.kbcmw.com/html/news/kbnews/"
extract_kbcmw_page_article_links(url=url, base_url=base_url)

{'Links': ['http://ti.kbcmw.com/html/news/kbnews/kbnews/22558.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22557.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22549.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22548.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22535.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22534.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22533.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22532.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22531.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22528.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22523.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22522.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22508.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22507.html',
  'http://ti.kbcmw.com/html/news/kbnews/kbnews/22506.html',
  'http://ti.kbcmw.com/html/news/kbnews/22497.html',
  'http://ti.kbcmw.com/html/news/kbnew

In [7]:

def scrape_kbcmw_article_content(url, tags):
    """
    Fetch a kbcmw article page and extract its title, date and paragraphs.

    The article is expected inside the first
    ``<div class="modular_classify_item clearfix">``: the title is read from
    the ``<h3>`` and the date from the first ``<p>`` of its
    ``<div class="article_title">``, and the text from every ``<p>`` of its
    ``<div class="article_content">``.

    Args:
        url (str): URL of the article page.
        tags: Tag value for the article; stored wrapped in a one-element list
            under ``meta_data["Tags"]``.

    Returns:
        dict:
            {
                "data": {
                    "title": str,
                    "body": {"Audio": "", "Text": List[str]},
                    "meta_data": {"URL": url, "Author": "", "Date": str,
                                  "Tags": [tags]}
                },
                "Message": str,   # "Success" or a description of the failure
                "Response": int   # 200, 408 (timeout), the HTTP status of a
                                  # failed request (500 if none is available),
                                  # 404 (article container missing), 500 (other)
            }

    Exceptions raised while fetching or parsing are caught here and reported
    through "Message" / "Response" instead of propagating to the caller.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    final_response = {
        "data": {
            'title': "",
            'body': {"Audio": "", "Text": []},
            'meta_data': {'URL': url, 'Author': "", 'Date': "", 'Tags': [tags]}
        },
        "Message": "Success",
        "Response": 200
    }

    try:
        # Without a timeout requests may block indefinitely; use the same
        # (connect, read) limits as the link extractor in this notebook.
        response = requests.get(url, headers=headers, timeout=(5, 55))
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        full_body = soup.find("div", class_="modular_classify_item clearfix")

        if full_body is None:
            # Report the unexpected page layout explicitly rather than
            # returning "Success" with an error string as the article text.
            final_response["Message"] = "An error occurred while parsing the webpage: article container not found"
            final_response["Response"] = 404
            return final_response

        title_body = full_body.find("div", class_="article_title")
        if title_body is not None:
            title = title_body.find('h3')
            final_response['data']["title"] = title.get_text(strip=True) if title is not None else ""

            date = title_body.find('p')
            if date is not None:
                final_response['data']['meta_data']["Date"] = date.get_text(strip=True)

        body = full_body.find("div", class_="article_content")
        if body is not None:
            paragraphs = body.find_all("p")
            if paragraphs:
                final_response['data']['body']["Text"] = [para.get_text(strip=True) for para in paragraphs]
            else:
                # Content block present but without any <p> elements.
                final_response['data']['body']["Text"] = [""]

        return final_response
    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response

    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the article: {str(e)}"
        # e.response is None when no HTTP response was received.
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response




In [8]:
# Smoke test: scrape a single kbcmw article and display the parsed result.
# (A leftover URL from a different site was assigned to `url` here and
# immediately overwritten; that dead assignment has been removed.)
url = "http://ti.kbcmw.com/html/news/china/22542.html"
scrape_kbcmw_article_content(url, tags="དཔྱད་གཏམ།")

{'data': {'title': 'བོད་རྒྱ་སྐད་བརྡ་རིག་པའི་དབྱར་སྦྱོང་འཛིན་གྲྭ་སྐབས་བདུན་པ་དང་བོད་ཀྱི་སྐད་ཡིག་བགྲོ་གླེང་ཚོགས་འདུ་སྐབས་དྲུག་པ་ལེགས་འགྲུབ་བྱུང་བ།',
  'body': {'Audio': '',
   'Text': ['བོད་རྒྱ་སྐད་བརྡ་རིག་པའི་དབྱར་སྦྱོང་འཛིན་གྲྭ་སྐབས་བདུན་པ་དང་བོད་ཀྱི་སྐད་ཡིག་བགྲོ་གླེང་ཚོགས་འདུ་སྐབས་དྲུག་པ་སྤྱི་ཟླ་༨པའི་ཚེས་༡༡ནས་༢༣བར་ནན་ཁའེ་སློབ་ཆེན་དུ་ལེགས་འགྲུབ་བྱུང་། ཐེངས་འདིའི་དབྱར་སྦྱོང་འཛིན་གྲྭ་དང་བགྲོ་གླེང་ཚོགས་འདུ་ནི་གཉན་རིམས་བྱུང་བའི་རྗེས་ནས་ཐེངས་དང་པོར་ཡུལ་དངོས་སུ་འཚོགས་པ་ཡིན།',
    '',
    'བོད་རྒྱ་སྐད་བརྡ་རིག་པའི་དབྱར་སྦྱོང་འཛིན་གྲྭ་སྐབས་བདུན་པ་ (གཤམ་དུ་དབྱར་སྦྱོང་འཛིན་གྲྭ་ཞེས་འབོད་པར་བྱ) ནི་ལོ་གཉིས་ཅན་གྱི་སྦྱོང་བརྡར་སློབ་གཉེར་གྱི་ལས་གཞི་ཞིག་ཡིན་ལ། ལོ་གཅིག་ལ་སྐད་སྒྲ་དང་སྒྲ་ཁམས་རིག་པ་སྦྱོང་བརྡར་བྱེད་པ་དང་། ལོ་གཞན་དེར་བརྡ་སྤྲོད་སྦྱོང་བརྡར་བྱེད། འདི་ནི་ནན་ཁའེ་སློབ་ཆེན་རྩོམ་རིག་སློབ་གླིང་གི་འཛམ་གླིང་ནན་ཁའེ་ཞེས་པའི་རྒྱལ་སྤྱིའི་འཕེལ་རྒྱས་ཀྱི་ལས་གཞི་གལ་ཆེན་གྱི་གྲས་སུ་གཏོགས། ལས་གཞི་འདི་ད་བར་ཐེངས་བདུན་པར་སླེབས་ཡོད་ལ། རིག་གཞུང་སྡེ་ཁག་གི་དོ་སྣང་ཆེན་པོ་ཐོབ་བཞིན་ཡོད། དབྱར་སྦྱོང་འཛིན་གྲྭ་འདི་ཐེངས་རེར་བདུན་