In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time

from tqdm import tqdm
import re

from urllib.parse import urljoin

In [4]:
# 

In [2]:
def extract_xizang_news_page_article_links(url, base_url):
    """
    Collect the article links found on one listing page.

    The page at ``url`` is downloaded, every ``<div id="lcqwJbzJW">`` container
    is located, and the ``href`` of each ``<a>`` inside those containers is
    resolved against ``base_url``. Anchors without a usable ``href`` are
    skipped, because ``urljoin(base_url, None)`` would otherwise return
    ``base_url`` itself and add the site root as a bogus article link.

    NOTE(review): the function name says "xizang_news", but the sample URLs in
    this notebook point at www.tbmgar.com; the name is presumably left over
    from another scraper and is kept so existing callers keep working.

    Args:
        url (str): URL of the listing page to fetch.
        base_url (str): Base URL used to turn relative ``href`` values into
            absolute URLs.

    Returns:
        {
            "Links": List[str],   # absolute article URLs, in page order
            "Message": string,    # "Success" or a description of the failure
            "Response": int,      # 200, 408, the HTTP error status, 404 or 500
            "source_url": string  # the ``url`` argument, echoed back
        }

    This function does not raise. Failures are reported through the result:
        * request timeout -> ``Response`` 408
        * other ``requests`` errors -> the HTTP status code when a response
          was received, otherwise 500
        * ``ValueError`` while processing the page -> ``Response`` 404
        * any other exception -> ``Response`` 500
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    final_response = {
        "Links": [],
        "Message": "Success",
        "Response": 200,
        "source_url": url
    }

    try:
        # Monotonic clock: the elapsed time cannot be skewed by system clock changes.
        started = time.monotonic()
        # (connect timeout, read timeout) in seconds.
        response = requests.get(url, headers=headers, timeout=(5, 55))
        response.raise_for_status()
        elapsed = time.monotonic() - started

        # Flag pages that came close to the read timeout.
        if elapsed > 50:
            print(f"This ULR Took more then 50s: {url}")

        soup = BeautifulSoup(response.content, 'html.parser')

        # Gather every anchor inside the article-list containers.
        all_links = []
        for article_block in soup.find_all("div", id="lcqwJbzJW"):
            for anchor in article_block.find_all("a"):
                href = anchor.get("href")
                if not href or not href.strip():
                    # No target to resolve; skip instead of emitting base_url.
                    continue
                all_links.append(urljoin(base_url, href))

        final_response["Links"] = all_links
        return final_response

    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the webpage: {e}"
        # e.response is None when no HTTP response was received (e.g. a
        # connection error); report 500 so "Response" is always an int.
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response
    except ValueError as e:
        final_response["Message"] = f"An error occurred while parsing the webpage: {e}"
        final_response["Response"] = 404
        return final_response
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response


In [3]:
# Smoke-test the listing scraper on one tbmgar.com listing page.
base_url = "http://www.tbmgar.com/"
url = "http://www.tbmgar.com/lcqwbw.asp?id=282#ndbwNdCam"
# Call the function under the name it is defined with in this notebook;
# `extract_tbmgar_page_article_links` is not defined anywhere, so the old call
# raised NameError on a fresh kernel.
extract_xizang_news_page_article_links(url, base_url)

{'Links': ['http://www.tbmgar.com/zwndbw.asp?id=5020&Zhg=001&lcqwid=282&lcqbID=70&NdRak_ID=ZamqowLc#ndbwNdCam',
  'http://www.tbmgar.com/zwndbw.asp?id=4534&Zhg=001&lcqwid=282&lcqbID=70&NdRak_ID=ZamqowLc#ndbwNdCam',
  'http://www.tbmgar.com/zwndbw.asp?id=4516&Zhg=001&lcqwid=282&lcqbID=70&NdRak_ID=ZamqowLc#ndbwNdCam',
  'http://www.tbmgar.com/zwndbw.asp?id=4490&Zhg=001&lcqwid=282&lcqbID=70&NdRak_ID=ZamqowLc#ndbwNdCam',
  'http://www.tbmgar.com/zwndbw.asp?id=4482&Zhg=001&lcqwid=282&lcqbID=70&NdRak_ID=ZamqowLc#ndbwNdCam',
  'http://www.tbmgar.com/zwndbw.asp?id=4462&Zhg=001&lcqwid=282&lcqbID=70&NdRak_ID=ZamqowLc#ndbwNdCam',
  'http://www.tbmgar.com/zwndbw.asp?id=4443&Zhg=001&lcqwid=282&lcqbID=70&NdRak_ID=ZamqowLc#ndbwNdCam',
  'http://www.tbmgar.com/zwndbw.asp?id=4438&Zhg=001&lcqwid=282&lcqbID=70&NdRak_ID=ZamqowLc#ndbwNdCam',
  'http://www.tbmgar.com/zwndbw.asp?id=4406&Zhg=001&lcqwid=282&lcqbID=70&NdRak_ID=ZamqowLc#ndbwNdCam',
  'http://www.tbmgar.com/zwndbw.asp?id=4377&Zhg=001&lcqwid=282&l

In [4]:
# custom_url = 'https://ti.zangdiyg.com/category/index/id/61.html'
# print(custom_url.replace(".html", ""))

https://ti.zangdiyg.com/category/index/id/61


In [28]:

def scrape_tbmgar_article_content(url, tags):
    """
    Fetch one tbmgar article page and extract its title, metadata and body text.

    Args:
        url (str): URL of the article page.
        tags: Fallback tag for the article. It is stored as ``[tags]`` and is
            replaced by the tags read from the page when the page's tag cell
            (``td.xczdNzd``) contains links.

    Returns:
        {
            "data": {
                "title": string,
                "body": {"Audio": string, "Text": List[str]},
                "meta_data": {"URL": string, "Author": string,
                              "Date": string, "Tags": List}
            },
            "Message": string,   # "Success" or a description of the failure
            "Response": int      # 200, 404, 408, the HTTP error status or 500
        }

        ``Text`` holds the non-empty, whitespace-stripped strings of the body
        table (``table#Nd_mni``) with consecutive duplicates collapsed, or
        ``[""]`` when no body was found. ``Date`` is the first
        ``YYYY/M/D``-shaped match in the second metadata cell, or ``""``.

    This function does not raise. Failures are reported through the result:
        * request timeout -> ``Response`` 408
        * other ``requests`` errors -> the HTTP status code when a response
          was received, otherwise 500
        * article container (``table.ndbwTT``) missing -> ``Response`` 404
        * any other exception -> ``Response`` 500
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    final_response = {
        "data": {
            'title': "",
            'body': {"Audio": "", "Text": []},
            'meta_data': {'URL': url, 'Author': "", 'Date': "", 'Tags': [tags]}
        },
        "Message": "Success",
        "Response": 200
    }

    try:
        # Without a timeout the request can block forever and the
        # requests.Timeout handler below could never fire. Same
        # (connect, read) budget in seconds as the listing scraper.
        response = requests.get(url, headers=headers, timeout=(5, 55))
        response.raise_for_status()

        # NOTE(review): the recorded output of this notebook shows bs4's
        # "Some characters could not be decoded" warning for this site, so the
        # detected encoding may not fit every byte - confirm the page charset.
        soup = BeautifulSoup(response.content, 'html.parser')

        # Tags listed on the page take precedence over the caller's fallback.
        tags_body = soup.find("td", class_="xczdNzd")
        if tags_body:
            tag_list = tags_body.find_all("a")
            if tag_list:
                page_tags = [tag.get_text(strip=True) for tag in tag_list]
                final_tag = tags_body.find("font")
                if final_tag:
                    page_tags.append(final_tag.get_text(strip=True))
                final_response['data']['meta_data']["Tags"] = page_tags

        full_body = soup.find('table', class_="ndbwTT")
        if full_body is None:
            # Report the missing container through the status fields instead
            # of storing an exception message as the article text.
            final_response['data']['body']["Text"] = [""]
            final_response["Message"] = "Article container (table.ndbwTT) not found on the page"
            final_response["Response"] = 404
            return final_response

        # Title
        title = soup.find('font', class_='NdbwKx')
        final_response['data']["title"] = title.get_text(strip=True) if title else ""

        # Metadata: first cell is read as the author, second as the date text.
        metadata = full_body.find('table', class_="ND_zzbSYg")
        if metadata:
            meta_cells = metadata.find_all("td")
            if meta_cells:
                final_response['data']['meta_data']["Author"] = meta_cells[0].get_text(strip=True)
            # Guard the second cell: a single-cell table must not abort the scrape.
            if len(meta_cells) > 1:
                date = re.search(r'\d{4}/\d{1,2}/\d{1,2}', meta_cells[1].get_text(strip=True))
                if date:
                    final_response['data']['meta_data']["Date"] = date.group()

        # Body text
        body = full_body.find("table", id="Nd_mni")
        if body:
            # Every string node inside the body table, stripped, empties dropped.
            filtered_text = [text.strip() for text in body.find_all(string=True) if text.strip()]

            # Collapse consecutive duplicate lines.
            final_text = []
            for line in filtered_text:
                if not final_text or line != final_text[-1]:
                    final_text.append(line)

            final_response['data']['body']["Text"] = final_text
        else:
            final_response['data']['body']["Text"] = [""]

        return final_response
    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response

    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the article: {str(e)}"
        # e.response is None when no HTTP response was received; fall back to 500.
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response

In [30]:
# Try the article scraper on a single known article page; the second argument
# is the fallback tag stored in meta_data when the page lists no tags.
# Alternate sample page:
# url = "http://www.tbmgar.com/zwndbw.asp?id=4886&NdRak_ID=ZamqowLc#ndbwNdCam"
url = "http://www.tbmgar.com/zwndbw.asp?id=5026&NdRak_ID=ZamqowLc#ndbwNdCam"
scrape_tbmgar_article_content(url, "དཔྱད་གཏམ།")

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


{'data': {'title': 'སི་ཏུ་རིན་པོ་ཆེས་སྡེ་དགེའི་བཀའ་འགྱུར་ཞུ་དག་མཛད་སྟངས།',
  'body': {'Audio': '',
   'Text': ['བོད་འདིར་སྔ་བའི་སྣར་ཐང་བཀའ་འགྱུར་དང་། །སི་ཏུ་དགེ་བློའི་ཚལ་པ་བཀའ་འགྱུར་དང་། །རྒྱལ་རྩེ་ཐེམས་སྤང་གྲགས་པའི་བཀའ་འགྱུར་རྒྱུན། །རྒྱ་ནག་ཏཱ་མིང་རྒྱལ་པོའི་བཀའ་འགྱུར་རྒྱུན། །འཇང་ཡུལ་རྒྱལ་པོས་བཞེངས་པའི་བཀའ་འགྱུར་པར། །དེང་སང་ལི་ཐང་བཞུགས་པ་འདི་ཉིད་དང་། །ཨ་གཉེན་པཀྵིའི་ཐུགས་དམ་བཀའ་འགྱུར་དང་། །རྒྱལ་དབང་ལྔ་པའི་དགོངས་བཀོད་ལྷོ་རྫོང་གི། །བཀའ་འགྱུར་བཅས་དང་འདུལ་མདོ་ཤེར་ཕྱིན་དང་། །གསང་སྔགས་རྒྱུད་འབུམ་བཅས་པ་སོ་སོ་ཡི། །དཔེ་ཁུངས་བཙུན་པ་རྣམས་དང་གོ་བསྡུར་ཞིང་། །ཁྱད་པར་སྙིགས་དུས་བསྟན་པའི་སྒྲོན་མེ་མཆོག །མཁས་པ་ཆེན་པོ་བུ་སྟོན་རིན་ཆེན་གྲུབ། །གང་གིས་དཀར་ཆག་བཏབ་པ་ཚད་མར་འཛིན། །གཞན་ཡང་བྲིས་པ་ལས་འཛིན་ལེ་ལོ་ཡིས། །ནོར་འཁྲུལ་སོར་ཆུད་ལོག་དཔྱོད་བསྒྱུར་ཉེས་རྣམས། །འཕགས་ཡུལ་རྒྱ་དཔེའི་སྟེང་ནས་བསྒྱུར་ཞིང་ཞུས། །གཏན་ལ་ཕབ་ཅིང་ཐེ་ཚོམ་བྱུང་བ་རྣམས། །རྒྱ་འགྲེལ་དང་བསྟུན་ཚད་མ་ཉིད་དུ་བཅོས། །སྔགས་བཏུ་ཡོད་པ་དེ་ཉིད་དག་དང་བསྟུན། །ལེགས་པར་ལེགས་སྦྱར་སཾ་སྐྲྀ་ཏ་ཡི་སྐད། །འགྲོ་ལྡིང་སྐད་དང་པི་ཤཱ་ཙི་ཡི་སྐད། །ཟུར་ཆག་ཨ་བ་བྷྲཾ་ཤའི་སྐད་དང་ནི། །གསང་བ་