In [4]:
import requests
from bs4 import BeautifulSoup
import json
import time

from tqdm import tqdm
import re

from urllib.parse import urljoin

In [12]:

def extract_all_tb_tibet_cn_page_article_links(url, base_url):
    """
    Extracts all article links from a given khabdha webpage.

    This function scrapes the provided URL and extracts links to individual articles
    found on the page.

    Args:
    url (str): The URL of the khabdha webpage containing article links.

    Returns:
        {
            "Links": List[],
            "Message": string,
            "Response": int,
            "source_url": string
        }
    Raises:
    requests.RequestException: If there's an error fetching the webpage.
    ValueError: If the expected HTML structure is not found on the page.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    final_response = {
        "Links": [],
        "Message": "Success",
        "Response": 200,
        "source_url": url
    }
    
    try:
        start_time = time.time()
        response = requests.get(url, headers=headers, timeout=(5, 60-5))
        response.raise_for_status()
        end_time = time.time()

        if end_time-start_time > 50:
            print(f"This ULR Took more then 50s: {url}")
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # # Getting all the links of articles 
        all_links = []

        # Extracting all the articles in the DIV
        article_block = soup.find("div", class_="block_L fl")
        if article_block: 
            all_article = article_block.find_all("a", target="_blank")
            if all_article:
                for each_head in all_article:
                    # article_links = each_head.find("a")
                    if each_head is not None:
                        # Construct the full URL
                        full_url = urljoin(base_url, each_head.get("href"))
                        all_links.append(full_url)

        # if len(all_links) < 4:
        #     all_article = soup.find_all("div", class_="w-post-elm post_image usg_post_image_1 has_ratio with_placeholder")
        #     if all_article:
        #         for each_head in all_article:
        #             article_links = each_head.find("a")
        #             if article_links is not None:
        #                 all_links.append(article_links.get("href"))


        final_response["Links"] = all_links
        return final_response
     
    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response
    except requests.RequestException as e:
        # print(f"An error occurred while fetching the webpage: {e}")
        final_response["Message"] = f"An error occurred while fetching the webpage: {e}"
        final_response["Response"] = getattr(e.response, 'status_code', None)
        return final_response
    except ValueError as e:
        # print(f"An error occurred while parsing the webpage: {e}")
        final_response["Message"] = f"An error occurred while parsing the webpage: {e}"
        final_response["Response"] = 404
        # getattr(e.response, 'status_code', None)
        return final_response
    except Exception as e:
        # print(f"An unexpected error occurred: {e}")
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response





In [14]:
base_url = "http://tb.tibet.cn/tb/review/"
url = "http://tb.tibet.cn/tb/review/index_1.html"
extract_all_tb_tibet_cn_page_article_links(url, base_url)

{'Links': ['http://tb.tibet.cn/tb/review/202211/t20221110_7306535.html',
  'http://tb.tibet.cn/tb/review/202211/t20221101_7298772.html',
  'http://tb.tibet.cn/tb/review/202210/t20221026_7296172.html',
  'http://tb.tibet.cn/tb/review/202210/t20221026_7296170.html',
  'http://tb.tibet.cn/tb/review/202209/t20220927_7281113.html',
  'http://tb.tibet.cn/tb/review/202209/t20220907_7270792.html',
  'http://tb.tibet.cn/tb/review/202209/t20220906_7270074.html',
  'http://tb.tibet.cn/tb/review/202208/t20220829_7265494.html',
  'http://tb.tibet.cn/tb/review/202208/t20220829_7265483.html',
  'http://tb.tibet.cn/tb/review/202208/t20220829_7265481.html',
  'http://tb.tibet.cn/tb/review/202208/t20220829_7265474.html',
  'http://tb.tibet.cn/tb/review/202208/t20220829_7265472.html',
  'http://tb.tibet.cn/tb/review/202208/t20220829_7265464.html',
  'http://tb.tibet.cn/tb/review/202208/t20220817_7259574.html',
  'http://tb.tibet.cn/tb/review/202208/t20220816_7258689.html',
  'http://tb.tibet.cn/tb/review

In [16]:

def scrape_tb_tibet__article_content(url, tags):
    """
    
    
    """


    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    final_response = {
        "data": {
            'title': "",
            'body': {"Audio": "", "Text": []},
            'meta_data': {'URL': url, 'Author': "", 'Date': "", 'Tags': [tags]}
        },
        "Message": "Success",
        "Response": 200
    }
    
    try:
        # Make the request to the URL
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        full_body = soup.find('div', id="contentK")
        # print(full_body)
        if full_body:
            # Extract title
            title = soup.find('h1')
            if title:
                title_text = title.get_text(strip=True)
            else:
                title_text = "no"
            final_response['data']["title"] = title_text
            
            metadata = soup.select('div.xinxi span')
            # Extracting Meta Data
            try:
                if len(metadata):
                    # Extract date
                    date = metadata[0].text.split()[-1]
                    # Extract author
                    author = metadata[1].text.split('། ', 1)[-1]
                    
                    final_response['data']['meta_data']["Date"] = date
                    final_response['data']['meta_data']["Author"] = author
               
            except AttributeError:
                final_response['data']['meta_data']["Author"] = "Error fetching author"
                final_response['data']['meta_data']["Date"] = "Error fetching date"
            
        # Extract body content
        try:
            body = full_body.find("div", class_="TRS_Editor")
            if body:
                paragraphs = body.find_all("font")
                if paragraphs:
                    # Extracting all <p> tags for text content
                    final_response['data']['body']["Text"] = [para.get_text(strip=True) for para in paragraphs]
                else:
                    final_response['data']['body']["Text"] = [""]

        except AttributeError as e:
            final_response['data']['body']["Text"] = [f"Error fetching body content{str(e)}"]
        
        return final_response
    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response
        
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the article: {str(e)}"
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response
    except Exception as e:
        # print(f"An unexpected error occurred: {e}")
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response




In [19]:
url = "https://www.khabdha.org/%e0%bd%82%e0%bd%9e%e0%bd%b4%e0%bd%84%e0%bc%8b%e0%bd%86%e0%bd%ba%e0%bd%93%e0%bc%8b%e0%bd%96%e0%bd%bc%e0%bd%91%e0%bc%8b%e0%bd%96%e0%bd%a6%e0%be%92%e0%be%b1%e0%bd%b4%e0%bd%a2%e0%bc%8b%e0%bd%a0%e0%bd%82/"
url = "http://tb.tibet.cn/tb/review/202208/t20220829_7265472.html"
scrape_tb_tibet__article_content(url, tags="དཔྱད་གཏམ།")

{'data': {'title': 'ཕྱི་ཚུལ་རིང་ལུགས་དང་དཔོན་ཉམས་རིང་ལུགས་ཀྱི་སྣང་ཚུལ་མེད་པར་བཟོ་དགོས།',
  'body': {'Audio': '',
   'Text': ['མིག་སྔར། བོད་ལྗོངས་ཀྱི་རིམས་ནད་སྔོན་འགོག་དང་ཚོད་འཛིན་གྱི་ལས་དོན་དཀའ་ལས་ཁག་ཤོས་ཀྱི་འགག་རྩའི་དུས་མཚམས་སུ་སླེབས་ཡོད་མོད། འོན་ཀྱང་ས་ཆ་འགའ་ཞིག་གི་ལས་དོན་ལ་ཕྱི་ཚུལ་རིང་ལུགས་དང་དཔོན་ཉམས་རིང་ལུགས་ཀྱི་སྣང་ཚུལ་ཐོན་ཡོད། བོད་རང་སྐྱོང་ལྗོངས་ཏང་ཨུད་ཀྱི་ཧྲུའུ་ཅི་ཝང་ཅུན་ཀྲེང་གིས་རིམས་ནད་སྔོན་འགོག་དང་ཚོད་འཛིན་གྱི་ལས་དོན་ལ་ལྟ་སྐུལ་མཛུབ་ཁྲིད་དང་ཞིབ་བཤེར་གནང་སྐབས། རྣག་ཐོག་གཙག་འཁེལ་དང་རྨོངས་གཉིད་ལས་སད་དགོས་པའི་ནན་ཚིག་གསུངས་ཡོད། ཁོང་གིས་ཕྱི་ཚུལ་རིང་ལུགས་དང་དཔོན་ཉམས་རིང་ལུགས་ལ་ངོ་རྒོལ་མཐའ་གཅིག་ཏུ་བྱས་ནས་ལས་ཀའི་བྱ་ཐབས་ཁག་དོན་འཁྱོལ་ལ་དམ་འཛིན་ནན་པོ་བྱས་ཏེ་དམ་པ་དོན་གཉེར་གྱི་སྙིང་སྟོབས་ལ་བརྟེན་ནས་རིམས་ནད་སྔོན་འགོག་དང་ཚོད་འཛིན་ཡག་པོ་བྱེད་དགོས་པའི་ནན་བཤད་གནང་བ་རེད།',
    'ཕྱི་ཚུལ་རིང་ལུགས་དང་། དཔོན་ཉམས་རིང་ལུགས་ཀྱི་སྣང་ཚུལ་མེད་པར་བཟོ་དགོས། རིམས་ནད་སྔོན་འགོག་དང་ཚོད་འཛིན་གྱི་གཡུལ་ས་འདིར་མི་ཚང་མ་འཐབ་འཛིང་བ་རེད།',
    'རིམས་འགོག་ལས་དོན་སྤེལ་རིང་གི་ཕྱི་ཚུལ་རིང་ལུགས་དང་དཔོན་ཉམས་རིང་ལུགས་ནི་མི་དམངས

In [6]:
# The HTML content
html_content = """
<span class="posted-by byline"> <svg class="svg-icon" width="24" height="24" aria-hidden="true" role="img" focusable="false" viewBox="0.0 0 1408.0 2048" xmlns="http://www.w3.org/2000/svg"><path d=",534,1088,640z"></path></svg> <a href="https://www.khabdha.org/author/editor/"> <span class="screen-reader-text">By </span>ཁ་བརྡ་ </a> </span>
"""

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Find the <a> tag and extract the Tibetan text
tibetan_text = soup.find('a').text.strip()

# Print the extracted Tibetan text
print(tibetan_text)

By ཁ་བརྡ་
