In [3]:
import requests
from bs4 import BeautifulSoup
import json
import time

from tqdm import tqdm
import re

from urllib.parse import urljoin

In [4]:
# sertha

In [5]:
def extract_sertha_page_article_links(url,):
    """
    Collect the article links listed on a sertha category page.

    The page at ``url`` is fetched, the article-list column is located by its
    Elementor class string, and the ``href`` of the first anchor inside every
    ``h3.elementor-post__title`` heading in that column is collected.
    Anchors without an ``href`` are skipped, and each link is resolved against
    the URL of the fetched page so relative links come back as full URLs.

    This function does not raise: every failure is reported through the
    returned dictionary.

    Args:
        url (str): The URL of the sertha webpage containing article links.

    Returns:
        dict: {
            "Links": list of article URL strings (empty on failure or when
                the expected page structure is not found),
            "Message": "Success" or a description of the failure,
            "Response": 200 on success; 408 on a request timeout; the HTTP
                status of the failed response for other request errors
                (None when there is no response); 404 for a ValueError;
                500 for any other exception,
            "source_url": the ``url`` argument
        }
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    final_response = {
        "Links": [],
        "Message": "Success",
        "Response": 200,
        "source_url": url
    }

    try:
        # Monotonic clock: the elapsed time cannot be skewed by system clock changes.
        start_time = time.monotonic()
        # (connect timeout, read timeout) in seconds.
        response = requests.get(url, headers=headers, timeout=(5, 60-5))
        response.raise_for_status()
        elapsed = time.monotonic() - start_time

        if elapsed > 50:
            print(f"This ULR Took more then 50s: {url}")

        soup = BeautifulSoup(response.content, 'html.parser')

        # Column of the page layout that holds the list of article cards.
        all_links = []
        link_column = soup.find("div", class_="elementor-column elementor-col-50 elementor-top-column elementor-element elementor-element-c800525")
        if link_column:
            # Resolve links against the URL that was actually fetched.
            base_url = response.url or url
            for heading in link_column.find_all("h3", class_="elementor-post__title"):
                anchor = heading.find("a")
                href = anchor.get("href") if anchor else None
                if not href:
                    # No anchor, or an anchor without href: nothing to record
                    # (previously a None entry ended up in the list).
                    continue
                all_links.append(urljoin(base_url, href.strip()))

        final_response["Links"] = all_links
        return final_response

    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the webpage: {e}"
        # e.response is None when no HTTP response was received, which yields None here.
        final_response["Response"] = getattr(e.response, 'status_code', None)
        return final_response
    except ValueError as e:
        final_response["Message"] = f"An error occurred while parsing the webpage: {e}"
        final_response["Response"] = 404
        return final_response
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response


In [7]:
# Two sertha.net category listing pages. Only `url` is passed to the scraper
# below; `URL` (a percent-encoded category path) is defined but not used in
# this cell.
url = "https://sertha.net/category/hhdl/"
URL = "https://sertha.net/category/%e0%bd%98%e0%bd%b2%e0%bc%8b%e0%bd%a6%e0%be%a3%e0%bc%8b%e0%bd%84%e0%bd%bc%e0%bc%8b%e0%bd%a6%e0%be%a4%e0%be%b2%e0%bd%bc%e0%bd%91%e0%bc%8d/"
# Last expression of the cell: the returned result dict is shown as the cell output.
extract_sertha_page_article_links(url)

{'Links': ['https://sertha.net/2024/04/04/%e0%bd%a6%e0%bd%84%e0%bd%a6%e0%bc%8b%e0%bd%a2%e0%be%92%e0%be%b1%e0%bd%a6%e0%bc%8b%e0%bd%80%e0%be%b1%e0%bd%b2%e0%bc%8b%e0%bd%a2%e0%bd%b2%e0%bd%84%e0%bc%8b%e0%bd%96%e0%bd%a6%e0%be%b2%e0%bd%ba%e0%bd%a3/',
  'https://sertha.net/2023/07/06/%e0%bd%a0%e0%bd%81%e0%be%b2%e0%bd%b4%e0%bd%84%e0%bd%a6%e0%bc%8b%e0%bd%a6%e0%be%90%e0%bd%a2%e0%bc%8b%e0%bd%96%e0%bd%a6%e0%bd%b4%e0%bc%8b%e0%bd%96%e0%bd%a0%e0%bd%b2%e0%bc%8b%e0%bd%a4%e0%bd%b2%e0%bd%a6/',
  'https://sertha.net/2023/03/30/%e0%bd%91%e0%bd%82%e0%be%b2%e0%bc%8b%e0%bd%96%e0%bd%bc%e0%bc%8b%e0%bd%a3%e0%bd%a0%e0%bd%84%e0%bc%8b%e0%bd%98%e0%bd%9b%e0%bd%a0%e0%bc%8b%e0%bd%9e%e0%bd%b2%e0%bd%84%e0%bc%8b%e0%bd%96%e0%bd%a2%e0%be%a9/',
  'https://sertha.net/2023/02/16/%e0%bd%81%e0%bd%bc%e0%bd%a2%e0%bc%8b%e0%bd%a1%e0%bd%b4%e0%bd%82%e0%bc%8b%e0%bd%a6%e0%be%90%e0%bd%bc%e0%bd%a2%e0%bc%8b%e0%bd%82%e0%be%b1%e0%bd%b2%e0%bc%8b%e0%bd%96%e0%bd%80%e0%bd%a0%e0%bc%8b%e0%bd%a6/',
  'https://sertha.net/2023/02/05/%e0%bd%96%e0%bd%bc

In [4]:
# custom_url = 'https://ti.zangdiyg.com/category/index/id/61.html'
# print(custom_url.replace(".html", ""))

https://ti.zangdiyg.com/category/index/id/61


In [15]:

import requests
from bs4 import BeautifulSoup
import re
import time
import random
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from datetime import datetime, timedelta

def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """
    Return a requests session whose HTTP and HTTPS adapters retry requests.

    Args:
        retries: Retry budget used for the total, read and connect counters
            of the retry policy.
        backoff_factor: Backoff factor handed to the retry policy.
        status_forcelist: Status codes handed to the retry policy as its
            ``status_forcelist``.
        session: Existing session to configure; a new ``requests.Session``
            is created when this is falsy.

    Returns:
        The session, with one retrying adapter mounted on both URL schemes.
    """
    if not session:
        session = requests.Session()

    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
    )
    # The same adapter instance serves both schemes.
    for scheme_prefix in ('http://', 'https://'):
        session.mount(scheme_prefix, retrying_adapter)
    return session


def scrape_sertha_article_content(url, tags, timeout=(5, 55)):
    """
    Scrape the title, tags, date, author and paragraph text of a sertha article.

    The request is sent after a random 1-3 second pause, through a retrying
    session, with redirects disabled. A redirect response is reported instead
    of being followed. This function does not raise: every failure is
    reported through the returned dictionary.

    Args:
        url (str): URL of the article page.
        tags: Fallback tag, stored as the single-element list ``[tags]``; it
            is replaced by the tags found on the page when the page lists any.
        timeout: Timeout passed to the HTTP request, as accepted by requests
            (a number, or a ``(connect, read)`` tuple in seconds). Without a
            timeout the request could block indefinitely.

    Returns:
        dict: {
            "data": {
                "title": str,
                "body": {"Audio": "", "Text": list of paragraph strings},
                "meta_data": {"URL": url, "Author": str, "Date": str,
                              "Tags": list}
            },
            "Message": "Success", the redirect target, or an error description,
            "Response": 200 on success; the redirect status code; 408 when
                requests reports a timeout; the failed response's status code
                (500 when there is none) for other request errors; 500 for
                any other exception
        }
    """
    # Browser-like request headers.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        # Only advertise encodings that can always be decoded here. Brotli and
        # zstd bodies are decoded only when optional packages are installed;
        # otherwise the parser would receive still-compressed bytes.
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'en-US,en;q=0.9,en-IN;q=0.8',
        'cache-control': 'max-age=0',
        'priority': 'u=0, i',
        'referer': 'https://sertha.net/category/hhdl/%e0%bd%98%e0%bd%9b%e0%bd%91%e0%bc%8b%e0%bd%a2%e0%be%a3%e0%bd%98%e0%bc%8d-%e0%be%8b%e0%bd%82%e0%bd%bc%e0%bd%84%e0%bc%8b%e0%bd%a6%e0%bc%8b%e0%bd%98%e0%bd%86%e0%bd%bc%e0%bd%82/',
        'sec-ch-ua': '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0'
        }

    final_response = {
        "data": {
            'title': "",
            'body': {"Audio": "", "Text": []},
            'meta_data': {'URL': url, 'Author': "", 'Date': "", 'Tags': [tags]}
        },
        "Message": "Success",
        "Response": 200
    }

    try:
        # Add a random delay before making the request
        time.sleep(random.uniform(1, 3))

        # The session is created per call, so close it (and its connection
        # pool) as soon as the response has been read.
        with requests_retry_session() as session:
            response = session.get(
                url, headers=headers, allow_redirects=False, timeout=timeout
            )
        response.raise_for_status()

        # Redirects are not followed (allow_redirects=False); report the target instead.
        if response.is_redirect:
            final_response["Message"] = f"Redirected to: {response.headers['Location']}"
            final_response["Response"] = response.status_code
            return final_response

        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Tags listed on the page take precedence over the caller's fallback tag.
        tags_body = soup.find("span", class_="elementor-icon-list-text elementor-post-info__item elementor-post-info__item--type-terms")
        if tags_body:
            page_tags = [tag.get_text(strip=True) for tag in tags_body.find_all("a")]
            if page_tags:
                final_response['data']['meta_data']["Tags"] = page_tags

        full_body = soup.find('section', class_="elementor-section elementor-top-section elementor-element elementor-element-f29ac48 elementor-section-boxed elementor-section-height-default elementor-section-height-default")
        if full_body:
            # Extract title
            title = full_body.find('h1', class_="elementor-heading-title elementor-size-large")
            if title:
                final_response['data']["title"] = title.get_text(strip=True)

            # Extract date and author from the post-info list
            metadata = full_body.find('ul', class_="elementor-inline-items elementor-icon-list-items elementor-post-info")
            try:
                if metadata:
                    date_box = metadata.find("span", class_="elementor-icon-list-text elementor-post-info__item elementor-post-info__item--type-date")
                    if date_box:
                        final_response['data']['meta_data']["Date"] = date_box.get_text(strip=True)

                    author_box = metadata.find("span", class_="elementor-icon-list-text elementor-post-info__item elementor-post-info__item--type-author")
                    if author_box:
                        final_response['data']['meta_data']["Author"] = author_box.get_text(strip=True)
            except AttributeError:
                final_response['data']['meta_data']["Author"] = "Error fetching author"
                final_response['data']['meta_data']["Date"] = "Error fetching date"

            # Extract body content: one string per <p> of the post-content widget.
            try:
                body = full_body.find("div", class_="elementor-element elementor-element-3979e3e elementor-widget elementor-widget-theme-post-content")
                if body:
                    paragraphs = body.find_all("p")
                    if paragraphs:
                        final_response['data']['body']["Text"] = [para.get_text(strip=True) for para in paragraphs]
                    else:
                        # Content widget present but without paragraphs.
                        final_response['data']['body']["Text"] = [""]
            except AttributeError as e:
                final_response['data']['body']["Text"] = [f"Error fetching body content: {str(e)}"]

        return final_response
    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the article: {str(e)}"
        # Yields 500 when e.response is None (no HTTP response was received).
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response




In [17]:
# Two sample article URLs. The first assignment is overwritten immediately,
# so only the second (2024/06/03) article is scraped by the call below.
url = "https://sertha.net/2022/12/04/%e0%bd%a6%e0%be%b2%e0%bd%b2%e0%bd%91%e0%bc%8b%e0%bd%94%e0%bd%a0%e0%bd%b2%e0%bc%8b%e0%bd%82%e0%be%b2%e0%bd%bc%e0%bd%84%e0%bc%8b%e0%bd%81%e0%be%b1%e0%bd%ba%e0%bd%a2%e0%bc%8b%e0%bd%93%e0%bd%a6%e0%bc%8b/"
url = "https://sertha.net/2024/06/03/%e0%bd%86%e0%bd%bc%e0%bd%a3%e0%bc%8b%e0%bd%82%e0%bd%a6%e0%bd%b4%e0%bd%98%e0%bc%8b%e0%bd%a3%e0%bd%98%e0%bc%8b%e0%bd%a1%e0%bd%b2%e0%bd%82/"
# Last expression of the cell: the returned result dict is shown as the cell
# output. `tags` is the fallback tag used when the page lists none.
scrape_sertha_article_content(url, tags="དཔྱད་གཏམ།")

{'data': {'title': 'ཆོལ་གསུམ་ལམ་ཡིག',
  'body': {'Audio': '',
   'Text': ['མི་ལ་ཙི་ཙི།སྔོན་གྱི་གཏམ།',
    '2015ལོར། གུས་པས་རྐང་འཁོར་བཞོན་ཏེ་དུས་ཡུན་ཟླ་བ་ལྔ་ལྷག་གི་རིང་། སྨད་མདོ་ཁམས་ནས་བར་དབུས་གཙང་། དེ་ནས་སྟོད་མངའ་རིས་བར། ཁྱོན་ལམ་ཐག་སྤྱི་ལེ་ཆིག་ཁྲི་ཉིས་སྟོང་ལྷག་ཅིག་ཡོད་པའི་འགྲུལ་བཞུད་ཐེངས་གཅིག་བྱས་པ་ཡིན། དེ་རིང་ཕྱིར་དྲན་བྱས་ཚེ། སྐབས་དེའི་དཀའ་ཚེགས་ཡོངས་སུ་བརྗེད་དེ། སྣང་བ་བག་བྲོ་པོ་འབའ་ཞིག་ལ་ཡོང་། ལམ་སྐབས་ཀྱི་ཟིན་བྲིས་རྨང་གཞིར་བྱས་ཏེ་དེབ་ཆུང་འདི་འབྲི་བའི་ཁྲོད་ཀྱང་། རྟོག་པའི་རྟ་ཕོ་འཚེར་སྐད་སིང་སིང་སྲབ་ལས་ཤོར་ཏེ་རི་ཆུ་རབ་འབྱམས་དེའི་ཀློང་ལ་ཡང་ཡང་འཆོར། མ་ཟད། ཀྱེ་མ—ད་རུང་དེ་འདྲའི་སྐལ་བ་ཞིག་ལྡན་ན་ཅི་མ་རུང་སྙམ་དུས་ཤིན་ཏུ་མི་ཉུང་།',
    'ཁ་འགྲོ་ལ་ཐུག འཕོང་རྡོ་ལ་ཐོགས། གཏམ་དཔེ་པྲ་ཆལ་འདིས་ཏག་ཏག་དེའི་ཡར་སྔོན་གྱི་ཁོ་བོ་མཚོན་དུ་བཏུབ། འགྲུལ་བཞུད་འདི་འདྲ་ཞིག་དྲན་ཏེ་ལོ་མང་ཐལ་རུང་ཡང་ཡང་ཡལ་ཡོལ་ཕར་འགྱངས་ལ་ལུས། ལོ་དེར་ཏག་ཏག་འཚོ་བ་དང་སེམས་ཁམས་ཀྱི་འབྲག་ལ་དུས་ཚོད་ཀྱི་གས་གཤོང་ཆེ་ཙམ་ཞིག་ཤོར་བ་གྲོགས་སུ་ཤར་ཏེ་གཞི་ནས་རྡོག་འཐོན་རྒྱུ་བྱུང་། དེར་མ་ཟད། རྐང་འཁོར་ལྟ་བུའི་འགྲུལ་ཆས་སྟབས་བདེ་སྤྱད་དེ་གཅིག་པུར་ལམ་དུ་ཐེག་ཚུལ་འདི

In [9]:
import re

def extract_metadata(text):
    """
    Extract the source, author and publication date from a metadata line.

    Each field is located by its label in ``text``. Source and author values
    run lazily up to the next ``|`` separator or the end of the string and
    are stripped of surrounding whitespace; the date must have the form
    ``YYYY-MM-DD``. A field whose label is not found stays an empty string.

    Args:
        text (str): Metadata text in which labelled fields are separated
            by ``|``.

    Returns:
        dict: ``{"Source": str, "Author": str, "Date": str}``. When ``text``
        cannot be searched at all (for example it is ``None``), every field
        holds its "Error fetching ..." marker instead of raising.
    """
    metadata = {
        "Source": "",
        "Author": "",
        "Date": ""
    }

    # (result key, pattern) pairs; group 1 of each pattern is the field value.
    field_patterns = (
        ("Source", r'ཡོང་ཁུངས།\s*(.+?)\s*(?:\||$)'),
        ("Date", r'སྤེལ་དུས།\s*(\d{4}-\d{2}-\d{2})'),
        ("Author", r'རྩོམ་པ་པོ།\s*(.+?)\s*(?:\||$)'),
    )

    try:
        for field, pattern in field_patterns:
            match = re.search(pattern, text)
            if match:
                metadata[field] = match.group(1).strip()
    except (AttributeError, TypeError):
        # re.search raises TypeError for non-string input such as None.
        # Catching only AttributeError let that escape and left this
        # fallback unreachable.
        metadata["Source"] = "Error fetching source"
        metadata["Author"] = "Error fetching author"
        metadata["Date"] = "Error fetching date"

    return metadata

# Test cases: two sample metadata lines with "|"-separated labelled fields.
# test1 carries source, author and date; test2 has no author field.
test1 = "ཡོང་ཁུངས།  ཀྲུང་གོའི་བོད་ཀྱི་གསར་འགྱུར་དྲ་བ།   |     རྩོམ་པ་པོ།  ཚེ་སྐྱིད། དབྱངས་ཅན་སྒྲོལ་མ།   |     སྤེལ་དུས།  2024-06-19"
test2 = "ཡོང་ཁུངས།  ཀྲུང་གོའི་བོད་ཀྱི་གསར་འགྱུར་དྲ་བ།   |     སྤེལ་དུས།  2024-07-15"

# Process test cases: print every extracted field of each sample, numbered
# from 1, with a blank line after each sample.
for i, test in enumerate([test1, test2], 1):
    result = extract_metadata(test)
    print(f"Test {i} results:")
    for key, value in result.items():
        print(f"{key}: {value}")
    print()

Test 1 results:
Source: ཀྲུང་གོའི་བོད་ཀྱི་གསར་འགྱུར་དྲ་བ།
Author: ཚེ་སྐྱིད། དབྱངས་ཅན་སྒྲོལ་མ།
Date: 2024-06-19

Test 2 results:
Source: ཀྲུང་གོའི་བོད་ཀྱི་གསར་འགྱུར་དྲ་བ།
Author: 
Date: 2024-07-15

