In [11]:
import requests
from bs4 import BeautifulSoup
import json
import time

from tqdm import tqdm
import re

In [2]:

def extract_all_khabdha_page_article_links(url: str):
    """
    Extracts all article links from a given khabdha webpage.

    The page is fetched with a browser-like User-Agent and every
    <h2 class="entry-title"> heading is inspected; the href of the first
    <a> inside each heading is collected.

    Args:
    url (str): The URL of the khabdha webpage containing article links.

    Returns:
        {
            "Links": List[str],    # hrefs found, empty on failure
            "Message": string,     # "Success" or a description of the failure
            "Response": int|None,  # 200 on success; 408 on timeout; the HTTP
                                   # status (or None when there was no HTTP
                                   # response) on other request errors;
                                   # 404 on ValueError; 500 otherwise
            "source_url": string   # the url argument, echoed back
        }

    Notes:
    This function does not raise: every exception is caught and reported
    through the "Message" and "Response" fields of the returned dict.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    final_response = {
        "Links": [],
        "Message": "Success",
        "Response": 200,
        "source_url": url
    }

    try:
        # monotonic() is unaffected by system clock adjustments, so the
        # measured duration cannot go negative or jump.
        start_time = time.monotonic()
        # (connect timeout, read timeout) in seconds: 60s budget in total.
        response = requests.get(url, headers=headers, timeout=(5, 60 - 5))
        response.raise_for_status()
        elapsed = time.monotonic() - start_time

        # Flag requests that came close to the read timeout.
        if elapsed > 50:
            print(f"This URL took more than 50s: {url}")

        soup = BeautifulSoup(response.content, 'html.parser')

        # Each article on the listing page is headed by <h2 class="entry-title">
        # wrapping the link to the article.
        all_links = []
        for heading in soup.find_all("h2", class_="entry-title"):
            anchor = heading.find("a")
            if anchor is None:
                continue
            href = anchor.get("href")
            # Skip anchors without an href so "Links" never contains None.
            if href:
                all_links.append(href)

        final_response["Links"] = all_links
        return final_response

    except requests.Timeout:
        # Must precede RequestException: Timeout is a subclass of it.
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the webpage: {e}"
        # e.response is None for connection-level failures, giving None here.
        final_response["Response"] = getattr(e.response, 'status_code', None)
        return final_response
    except ValueError as e:
        final_response["Message"] = f"An error occurred while parsing the webpage: {e}"
        final_response["Response"] = 404
        return final_response
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response





In [3]:
# Try the listing scraper on the site's /page/2/ URL.
url = "https://www.khabdha.org/page/2/"

# Bare last expression: the notebook displays the returned response dict.
extract_all_khabdha_page_article_links(url)

{'Links': ['https://www.khabdha.org/%e0%bd%96%e0%bd%99%e0%bd%93%e0%bc%8b%e0%bd%96%e0%be%b1%e0%bd%bc%e0%bd%a3%e0%bc%8b%e0%bd%96%e0%bd%bc%e0%bd%91%e0%bc%8b%e0%bd%98%e0%bd%b2%e0%bd%a0%e0%bd%b2%e0%bc%8b%e0%bd%98%e0%bd%84%e0%bc%8b/',
  'https://www.khabdha.org/%e0%bd%96%e0%bd%bc%e0%bd%a2%e0%bc%8b%e0%bd%9f%e0%bd%b2%e0%bd%93%e0%bc%8b%e0%bd%94%e0%bd%a0%e0%bd%b2%e0%bc%8b%e0%bd%a2%e0%be%92%e0%be%b1%e0%bd%a3%e0%bc%8b%e0%bd%82%e0%bd%85%e0%bd%ba%e0%bd%a6%e0%bc%8b/',
  'https://www.khabdha.org/%e0%bd%98%e0%bc%8b%e0%bd%8e%e0%bd%b2%e0%bc%8b%e0%bd%a2%e0%be%a1%e0%bd%bc%e0%bc%8b%e0%bd%95%e0%bd%b4%e0%bd%84%e0%bc%8b%e0%bc%8d/',
  'https://www.khabdha.org/%e0%bd%89%e0%bd%b2%e0%bd%93%e0%bc%8b%e0%bd%a4%e0%bd%a6%e0%bc%8b%e0%bd%a6%e0%be%94%e0%bd%bc%e0%bd%93%e0%bc%8b%e0%bd%a2%e0%be%92%e0%be%b1%e0%bc%8b%e0%bd%82%e0%bd%a2%e0%bc%8b%e0%bd%82%e0%be%b1%e0%bd%b2/',
  'https://www.khabdha.org/%e0%bd%89%e0%bd%ba%e0%bc%8b%e0%bd%a3%e0%bd%98%e0%bc%8b%e0%bd%90%e0%bd%ba%e0%bc%8b%e0%bd%9d%e0%bd%93%e0%bc%8b%e0%bd%82%e0%be%b1%e0

In [23]:

def scrape_khabdha_article_content(url):
    """
    Scrapes the title, metadata and body text of a single khabdha article.

    Args:
    url (str): The URL of the khabdha article page.

    Returns:
        {
            "data": {
                "title": string,           # <h1 class="entry-title"> text, "" if absent
                "body": {
                    "Audio": string,       # always "" (never filled in here)
                    "Text": List[string]   # one entry per <p> in div.pb-content;
                                           # [""] if that div has no <p>;
                                           # [] if the div itself is absent
                },
                "meta_data": {
                    "URL": string,         # the url argument, echoed back
                    "Author": string,      # first run of Tibetan characters in the
                                           # byline link, or "Error fetching author"
                    "Date": string,        # text of the <time> tag, or "No date"
                    "Tags": List[string]   # category link texts, [] if absent
                }
            },
            "Message": string,             # "Success" or a description of the failure
            "Response": int                # 200, 408 on timeout, the HTTP status
                                           # (500 if unavailable) on request errors,
                                           # 500 on unexpected errors
        }

    Notes:
    This function does not raise: failures are reported through the
    "Message" and "Response" fields of the returned dict. Each metadata
    field is extracted independently, so a missing author no longer
    discards a date that was found.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    final_response = {
        "data": {
            'title': "",
            'body': {"Audio": "", "Text": []},
            'meta_data': {'URL': url, 'Author': "", 'Date': "", 'Tags': []}
        },
        "Message": "Success",
        "Response": 200
    }
    meta_data = final_response['data']['meta_data']

    try:
        # Without a timeout requests waits indefinitely and the Timeout
        # handler below could never fire. (connect, read) in seconds.
        response = requests.get(url, headers=headers, timeout=(5, 60 - 5))
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Title
        title = soup.find('h1', class_="entry-title")
        final_response['data']["title"] = title.get_text(strip=True) if title is not None else ""

        # Date: the <time> tag carries one of two class strings; try them in order.
        date_text = "No date"
        for date_class in ("entry-date published", "entry-date published updated"):
            date_tag = soup.find("time", class_=date_class)
            if date_tag is not None:
                date_text = date_tag.get_text(strip=True)
                break
        meta_data["Date"] = date_text

        # Author: the byline link text also contains a screen-reader "By "
        # label, so keep only the first run of Tibetan characters.
        author_text = "Error fetching author"
        byline = soup.find('span', class_="posted-by byline")
        byline_link = byline.find('a') if byline is not None else None
        if byline_link is not None:
            tibetan_match = re.search(r'[\u0F00-\u0FFF]+', byline_link.get_text())
            if tibetan_match is not None:
                author_text = tibetan_match.group()
        meta_data["Author"] = author_text

        # Tags: category links inside span.cat-links; none if the span is absent.
        category = soup.find("span", class_="cat-links")
        category_tags = category.find_all('a', rel="category tag") if category is not None else []
        meta_data["Tags"] = [tag.text for tag in category_tags]

        # Body: one text entry per paragraph of the content container.
        body = soup.find('div', class_='pb-content')
        if body is not None:
            paragraphs = body.find_all('p')
            if paragraphs:
                final_response['data']['body']["Text"] = [para.get_text(strip=True) for para in paragraphs]
            else:
                final_response['data']['body']["Text"] = [""]

        return final_response

    except requests.Timeout:
        # Must precede RequestException: Timeout is a subclass of it.
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response

    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the article: {str(e)}"
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response

    except Exception as e:
        # Report unexpected parsing failures in the response dict instead of
        # letting them escape to the caller.
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response




In [26]:
# Sample article used to try out the article scraper. Only one URL is assigned:
# an earlier assignment to `url` in this cell was overwritten before it was used.
url = "https://www.khabdha.org/%e0%bd%a6%e0%be%a4%e0%be%b1%e0%bd%b2%e0%bc%8b%e0%bd%a0%e0%bd%90%e0%bd%b4%e0%bd%a6%e0%bc%8b%e0%bd%a6%e0%be%90%e0%bd%96%e0%bd%a6%e0%bc%8b%e0%bd%96%e0%bd%85%e0%bd%b4%e0%bc%8b%e0%bd%96%e0%bd%91%e0%bd%b4-4/"
# Bare last expression: the notebook displays the returned response dict.
scrape_khabdha_article_content(url)

{'data': {'title': 'སྤྱི་འཐུས་སྐབས་བཅུ་བདུན་གྱི་གྲོས་ཚོགས་ཚོགས་དུས་དྲུག་པའི་ལས་རིམ་ལ་དཔྱད་པའི་གཏམ།',
  'body': {'Audio': '',
   'Text': ['གཞུང་པ་ཚེ་དབང་བསོད་ནམས།',
    'སྤྱི་འཐུས་སྐབས་བཅུ་བདུན་གྱི་གྲོས་ཚོགས་ཚོགས་དུས་དྲུག་པ་དེ་འདི་ཚེས་བཅུ་དགུ་ཉིན་དབུ་འཛུགས་གནང་ནས་ཚོགས་དུས་འདིར་འཛིན་སྐྱོང་གིས་ལས་བསྡོམས་ཐོག་བགྲོ་གླེང་གནང་རྒྱུ་དང་གཞན་ཡང་ལས་རིམ་ཁག་བཅས་ཉིན་དང་པོར་གདུང་སེམས་མཉམ་སྐྱེད་ཐོག་བགྲོ་གླེང་འགྲོ་བའི་སྐབས་སྤྱི་འཐུས་གཅིག་ནས་གྲོས་ཚོགས་ཀྱིས་ལས་རིམ་དང་འབྲེལ་བ་མེད་པའི་སྐྱབས་རྗེ་ཟམ་གདོང་རིན་པོ་ཆེ་ལ་གཞི་མེད་སྐྱོན་འཛུགས་གཏམ་བཤད་སྤེལ་སྐབས་ཚོགས་གཙོའི་བཏང་འཛིན་མ་གནང་བ་ནས་སྤྱི་འཐུས་གཞན་ཞིག་ནས་ལས་རིམ་རྣམ་བཞག་ལ་འགན་འཁུར་ནས་སྤྱི་འཐུས་དེ་ལ་རྒྱ་མིའི་སོ་པ་འཁྲིད་ནས་ཀ་རྡོར་གང་གསུམ་ལ་འགྲོ་མཁན་དེ་ཁྱོད་མིན་ན་སུ་རེད་ཟེར་བ་ནས་སྤྱི་འཐུས་གཉིས་བར་གཅིག་གི་གཅིག་ལ་ཚིིག་ངན་བེད་སྤྱོད་འོག་ནས་གྲོས་ཚོགས་ལས་རིམ་དེ་སྤྱི་འཐུས་སྒེར་གྱི་ཆབ་སྲིད་འཐབ་རྩོད་ལ་ཁ་ཕྱོགས་ནས་སྤྱི་འཐུས་ནང་ཁུལ་ཕྱོགས་གཏོགས་ལ་འགན་འཁུར་ནས་བདེ་སྲུང་ལས་ཁུང་མར་དྲུད་ནས་འདི་ཚེས་༢༣ ནས་གྲོས་ཚོགས་ཚོགས་མ་ཐུབ་པ་ནི་འདས་པའི་མི་ལོ་དྲུག་བཅུ་ལྷག་ཙམ་རིང་བཙན་བྱོལ་བོད་མིའི་ག

In [6]:
# Sample byline markup: a <span class="posted-by byline"> holding an icon and
# the author link (presumably copied from a khabdha.org article page).
html_content = """
<span class="posted-by byline"> <svg class="svg-icon" width="24" height="24" aria-hidden="true" role="img" focusable="false" viewBox="0.0 0 1408.0 2048" xmlns="http://www.w3.org/2000/svg"><path d=",534,1088,640z"></path></svg> <a href="https://www.khabdha.org/author/editor/"> <span class="screen-reader-text">By </span>ཁ་བརྡ་ </a> </span>
"""

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Take the full text of the first <a> tag. NOTE: this includes the nested
# screen-reader "By " label, so the result is not only the Tibetan name.
tibetan_text = soup.find('a').text.strip()

# Print the extracted link text (label plus Tibetan name)
print(tibetan_text)

By ཁ་བརྡ་
