In [4]:
import requests
from bs4 import BeautifulSoup
import json
import time

from tqdm import tqdm
import re

from urllib.parse import urljoin

In [2]:

def _collect_article_links(soup, base_url):
    """Return the absolute URLs of the anchors listed inside ``<div id="article">``.

    Anchors without a usable ``href`` are skipped and every URL is reported
    once, in first-seen order.
    """
    container = soup.find("div", id="article")
    if container is None:
        return []

    seen = set()
    links = []
    # find_all("ul") also returns nested lists, so the same anchor can be
    # reached through several <ul> ancestors; `seen` keeps the result unique.
    for list_block in container.find_all("ul"):
        for anchor in list_block.find_all("a"):
            href = anchor.get("href")
            if not href:
                # urljoin(base_url, None) returns base_url itself, which would
                # add the site root to the result as if it were an article.
                continue
            full_url = urljoin(base_url, href)
            if full_url not in seen:
                seen.add(full_url)
                links.append(full_url)
    return links


def extract_all_tibetcm_page_article_links(url, base_url):
    """
    Extracts all article links from a given tibetcm listing page.

    The page at ``url`` is downloaded and every ``<a>`` found inside a ``<ul>``
    of ``<div id="article">`` is resolved against ``base_url``.

    Args:
    url (str): The URL of the tibetcm webpage containing article links.
    base_url (str): The URL each ``href`` is joined onto (``urljoin``) to build
        an absolute link.

    Returns:
        {
            "Links": List[str],   # unique absolute URLs, in page order
            "Message": string,    # "Success" or a description of the failure
            "Response": int,      # 200 on success, otherwise an error code
            "source_url": string  # the ``url`` argument
        }

    This function does not raise: every failure is reported through
    "Message"/"Response" with an empty "Links" list.
        - request timeout            -> 408
        - other request failure      -> the HTTP status code when the server
                                        answered, otherwise 500
        - ValueError                 -> 404
        - any other exception        -> 500
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    final_response = {
        "Links": [],
        "Message": "Success",
        "Response": 200,
        "source_url": url
    }

    try:
        start_time = time.time()
        # (connect timeout, read timeout) in seconds.
        response = requests.get(url, headers=headers, timeout=(5, 60-5))
        response.raise_for_status()
        elapsed = time.time() - start_time

        if elapsed > 50:
            print(f"This ULR Took more then 50s: {url}")

        soup = BeautifulSoup(response.content, 'html.parser')
        final_response["Links"] = _collect_article_links(soup, base_url)
        return final_response

    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the webpage: {e}"
        # e.response is None when no HTTP answer was received (DNS failure,
        # refused connection, ...); report 500 so "Response" is always an int.
        status_code = getattr(e.response, 'status_code', None)
        final_response["Response"] = 500 if status_code is None else status_code
        return final_response
    except ValueError as e:
        final_response["Message"] = f"An error occurred while parsing the webpage: {e}"
        final_response["Response"] = 404
        return final_response
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response


In [34]:
# Smoke test: list the article links found on page 2 of the tibetcm news index.
# base_url is passed separately because each href is joined onto it.
base_url = "https://www.tibetcm.com"
url = "https://www.tibetcm.com/news/index_2.html"
# Left as the last expression so the notebook displays the returned dict.
extract_all_tibetcm_page_article_links(url, base_url)

{'Links': ['https://www.tibetcm.com/news/2020-04-13/8876.html',
  'https://www.tibetcm.com/news/2020-03-21/8867.html',
  'https://www.tibetcm.com/news/2020-02-13/8846.html',
  'https://www.tibetcm.com/news/2020-02-08/8842.html',
  'https://www.tibetcm.com/news/2019-12-27/8831.html',
  'https://www.tibetcm.com/news/2019-12-25/8829.html',
  'https://www.tibetcm.com/news/2019-12-20/8827.html',
  'https://www.tibetcm.com/news/2019-11-19/8815.html',
  'https://www.tibetcm.com/news/2019-10-11/8808.html',
  'https://www.tibetcm.com/news/2019-09-30/8803.html',
  'https://www.tibetcm.com/news/2019-05-10/8779.html',
  'https://www.tibetcm.com/news/2019-04-26/8772.html',
  'https://www.tibetcm.com/news/2019-04-15/8766.html',
  'https://www.tibetcm.com/news/2019-03-27/8761.html',
  'https://www.tibetcm.com/news/2018-12-25/8734.html',
  'https://www.tibetcm.com/news/2018-10-21/8702.html',
  'https://www.tibetcm.com/news/2018-10-01/8694.html',
  'https://www.tibetcm.com/news/2018-09-25/8691.html',
 

In [13]:

def _text_or_default(node, default):
    """Return the stripped text of ``node``, or ``default`` when the node was not found."""
    if node is None:
        return default
    return node.get_text(strip=True)


def scrape_tibetcm_article_content(url, tags, timeout=(5, 55)):
    """
    Downloads one tibetcm article page and extracts its title, text and metadata.

    Args:
    url (str): The URL of the article page.
    tags: Fallback tag for the article. It is stored as ``[tags]`` and is
        replaced by the link texts of ``<div id="guide">`` when the page
        provides any.
    timeout: Passed to ``requests.get``; ``(connect, read)`` seconds. Defaults
        to ``(5, 55)``.

    Returns:
        {
            "data": {
                "title": string,                      # text of the first <h1>
                "body": {"Audio": "", "Text": List[str]},  # one entry per <p>
                "meta_data": {"URL": string, "Author": string,
                              "Date": string, "Tags": List}
            },
            "Message": string,   # "Success" or a description of the failure
            "Response": int      # 200 on success, otherwise an error code
        }

    This function does not raise. Request failures are reported through
    "Message"/"Response" (408 for a timeout, the HTTP status code or 500 for
    other request errors, 500 for anything unexpected). Missing page parts are
    reported inside "data" with a 200 response:
        - no info block / date span / author span -> "Error fetching date" /
          "Error fetching author" for the field(s) that could not be read
        - no ``<div id="article">`` -> "Text" holds a single
          "Error fetching body content..." entry
        - content block without paragraphs -> "Text" is ``[""]``
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    final_response = {
        "data": {
            'title': "",
            'body': {"Audio": "", "Text": []},
            'meta_data': {'URL': url, 'Author': "", 'Date': "", 'Tags': [tags]}
        },
        "Message": "Success",
        "Response": 200
    }
    data = final_response["data"]
    meta_data = data["meta_data"]

    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Tags found on the page take precedence over the caller's fallback.
        guide = soup.find("div", id="guide")
        if guide is not None:
            page_tags = [link.get_text(strip=True) for link in guide.find_all("a")]
            if page_tags:
                meta_data["Tags"] = page_tags

        full_body = soup.find('div', id="article")
        if full_body is not None:
            data["title"] = _text_or_default(soup.find('h1'), "")

            info = full_body.find('div', id="info")
            if info is None:
                meta_data["Author"] = "Error fetching author"
                meta_data["Date"] = "Error fetching date"
            elif len(info):
                # An empty info block leaves the blank defaults untouched.
                # Each field is resolved on its own so a missing author does
                # not discard a date that was found (and vice versa).
                meta_data["Date"] = _text_or_default(
                    info.find('span', id="date"), "Error fetching date"
                )
                meta_data["Author"] = _text_or_default(
                    info.find('span', id="author"), "Error fetching author"
                )

        # Extract body content. When the article container is missing,
        # full_body is None and the AttributeError below records that in "Text".
        try:
            body = full_body.find("div", id="content")
            if body is not None:
                paragraphs = body.find_all("p")
                if paragraphs:
                    data['body']["Text"] = [para.get_text(strip=True) for para in paragraphs]
                else:
                    data['body']["Text"] = [""]

        except AttributeError as e:
            data['body']["Text"] = [f"Error fetching body content{str(e)}"]

        return final_response
    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response

    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the article: {str(e)}"
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response




In [14]:
# Smoke test: scrape a single tibetcm article. The tag passed here is only a
# fallback; it is replaced when the page itself lists tags.
# (A leftover khabdha.org URL assignment that was immediately overwritten by
# the line below has been dropped.)
url = "https://www.tibetcm.com/news/2019-12-20/8827.html"
scrape_tibetcm_article_content(url, tags="དཔྱད་གཏམ།")

{'data': {'title': 'རྒྱལ་ཡོངས་བོད་ཀྱི་སྒྲུང་ཐུང་འགྲན་སྡུར་སྐབས་གཉིས་པའི་བྱ་དགའ་སྟེར་བའི་མཛད་སྒོ་ཟི་ལིང་དུ་བསྡུས་པ།',
  'body': {'Audio': '',
   'Text': ['བོད་ཀྱི་སྒྲུང་ཐུང་གསར་རྩོམ་གྱི་རྩོམ་པ་པོའི་དཔུང་སྡེ་གསོ་སྐྱོང་དང་རྩོམ་པ་པོ་གསར་པ་སྐྱེད་སྲིང་བྱེད་པ་དང་། བོད་ཀྱི་རྩོམ་རིག་ལས་དོན་གོང་འཕེལ་དུ་གཏོང་བ། སྒྲུང་ཐུང་གསར་རྩོམ་ལ་མོས་པའི་རྩོམ་མཁན་ཀུན་ལ་འདྲ་མཉམ་གྱི་སྟེགས་བུ་ཞིག་བསྐྲུན་པ་བཅས་ལ་དམིགས་ནས། མཚོ་སྔོན་བོད་ཡིག་གསར་འགྱུར་ཁང་དང་མཚོ་སྔོན་ཞིང་ཆེན་རྩོམ་པ་པོ་མཐུན་ཚོགས། མཚོ་སྔོན་ཞིང་ཆེན་མི་རིགས་རྩོམ་རིག་ལོ་ཙཱ་མཐུན་ཚོགས་བཅས་ཀྱིས་རྒྱལ་ཡོངས་བོད་ཀྱི་སྒྲུང་ཐུང་འགྲན་སྡུར་སྐབས་གཉིས་པ་སྤེལ་བ་རེད།',
    'ཐེངས་འདིའི་རྩོམ་ཡིག་འགྲན་སྡུར་དེ2019ལོའི་ཟླ4པའི་ཚེས1ཉིན་ནས2019ལོའི་ཟླ11པའི་ཚེས30ཉིན་བར་ཏེ་བསྡོམས་པས་དུས་ཡུན་ཟླ་ངོ་བརྒྱད་ཀྱི་རིང་ལ་བོད་ལྗོངས་ཉིན་རེའི་ཚགས་པར་དང་། ཀྲུང་གོ་བོད་ཀྱི་དྲ་བ།མཚོ་སྔོན་བོད་སྐད་དྲ་ལམ་རྒྱང་སྒྲོག་བརྙན་འཕྲིན་ཁང་། བདེ་ཆེན་བོད་ཡིག་སྨྱན་སྦྱོར། ཁམས་པའི་དྲ་བ། མཚོ་སྔོན་བོད་ཡིག་གསར་འགྱུར་སོགས་བོད་ཀྱི་སྨྱན་སྦྱོར་ཆེ་ཁག་དང་། དེ་མིན་སྐད་འཕྲིན་དང་པོད་ཕྲན་སོགས་ཀྱི་བར་སྟོང་དུ་བརྒྱུད་སྐུར་བྱས་ཏེ་ཀུ

By ཁ་བརྡ་
