In [8]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import json
import time

## Extract All Artical Links from all page 

In [55]:
def extract_all_article(url: str):
    """
    Extracts all article links from a given VOT (Voice of Tibet) webpage.

    This function scrapes the provided URL and extracts links to individual articles
    found on the page.

    Args:
    url (str): The URL of the VOT webpage containing article links.

    Returns:
    List[str]: A list of URLs to individual articles.

    Raises:
    requests.RequestException: If there's an error fetching the webpage.
    ValueError: If the expected HTML structure is not found on the page.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    final_response = {
        "Links": [],
        "Message": "Success",
        "Response": 200
    }
    
    try:
        start_time = time.time()
        response = requests.get(url, headers=headers, timeout=(5, 60-5))
        response.raise_for_status()
        end_time = time.time()
        if end_time-start_time > 50:
            print(f"This ULR Took more then 50s: {url}")
            
        soup = BeautifulSoup(response.content, 'html.parser')
        # Extracting all the articles in the DIV
        all_article = soup.find("div", class_="td_block_inner tdb-block-inner td-fix-index")
        if not all_article:
            raise ValueError("Could not find the main article container on the page.")
        
        # Getting all the links of articles 
        article_links = all_article.find_all("a", class_="td-image-wrap")
        all_links = [link.get("href") for link in article_links if link.get("href")]
        final_response["Links"] = all_links
        return final_response
    
    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response
    except requests.RequestException as e:
        # print(f"An error occurred while fetching the webpage: {e}")
        final_response["Message"] = f"An error occurred while fetching the webpage: {e}"
        final_response["Response"] = getattr(e.response, 'status_code', None)
        return final_response
    except ValueError as e:
        # print(f"An error occurred while parsing the webpage: {e}")
        final_response["Message"] = f"An error occurred while parsing the webpage: {e}"
        final_response["Response"] = getattr(e.response, 'status_code', None)
        return final_response
    except Exception as e:
        # print(f"An unexpected error occurred: {e}")
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response


## Getting all links of གོང་ས་མཆོག from VOT
Base link : "https://vot.org/category/%e0%bc%b8%e0%bd%82%e0%bd%bc%e0%bd%84%e0%bc%8b%e0%bd%a6%e0%bc%8b%e0%bd%98%e0%bd%86%e0%bd%bc%e0%bd%82/" + i + "/"

In [18]:

initial_url = "https://vot.org/category/%E0%BC%B8%E0%BD%82%E0%BD%BC%E0%BD%84%E0%BC%8B%E0%BD%A6%E0%BC%8B%E0%BD%98%E0%BD%86%E0%BD%BC%E0%BD%82/"
# All_url_link = []
found_url_link = extract_all_article(initial_url)
All_url_link = {}
key = "Page གོང་ས་མཆོག " + str(1)
All_url_link[key] = found_url_link
found_url_link

{'Links': ['https://vot.org/%e0%bc%b8%e0%bd%82%e0%bd%bc%e0%bd%84%e0%bc%8b%e0%bd%a6%e0%bc%8b%e0%bd%98%e0%bd%86%e0%bd%bc%e0%bd%82%e0%bc%8b%e0%bd%91%e0%bd%ba%e0%bc%8b%e0%bd%a2%e0%bd%b2%e0%bd%84%e0%bc%8b%e0%bd%95%e0%be%b1%e0%bd%b2/',
  'https://vot.org/%e0%bc%b8%e0%bd%82%e0%bd%bc%e0%bd%84%e0%bc%8b%e0%bd%a6%e0%bc%8b%e0%bd%98%e0%bd%86%e0%bd%bc%e0%bd%82%e0%bc%8b%e0%bd%82%e0%bd%b2%e0%bd%a6%e0%bc%8b-swiss-%e0%bd%91%e0%bd%84%e0%bc%8b-liechtenstein/',
  'https://vot.org/%e0%bd%a8%e0%bc%8b%e0%bd%a2%e0%bd%b2%e0%bd%a0%e0%bd%b2%e0%bc%8b%e0%bd%93%e0%bd%b2%e0%bd%a0%e0%bd%b4%e0%bc%8b%e0%bd%a1%e0%bd%bc%e0%bd%82%e0%bc%8b%e0%bd%82%e0%bd%b2%e0%bc%8b-ubs-arena-%e0%bd%a2/',
  'https://vot.org/vot%e0%bd%82%e0%bd%84%e0%bd%a6%e0%bd%98%e0%bd%86%e0%bd%82%e0%bd%a3%e0%bd%96%e0%bd%a2%e0%bd%93%e0%bd%96%e0%bd%9e-2/',
  'https://vot.org/wisdom-of-happiness-%e0%bd%9e%e0%bd%ba%e0%bd%a6%e0%bc%8b%e0%bc%b8%e0%bd%82%e0%bd%bc%e0%bd%84%e0%bc%8b%e0%bd%a6%e0%bc%8b%e0%bd%98%e0%bd%86%e0%bd%bc%e0%bd%82%e0%bc%8b%e0%bd%91%e0%bd%84%e0%

In [None]:
custom_url = "https://vot.org/category/%e0%bc%b8%e0%bd%82%e0%bd%bc%e0%bd%84%e0%bc%8b%e0%bd%a6%e0%bc%8b%e0%bd%98%e0%bd%86%e0%bd%bc%e0%bd%82/page/"
for i in tqdm(range(2,214)):
    final_url = custom_url + str(i) +"/"
    found_url_link = extract_all_article(final_url)
    key = "Page གོང་ས་མཆོག " + str(i)
    All_url_link[key] = found_url_link
    # All_url_link.extend(found_url_link)

  3%|▎         | 6/212 [00:05<03:03,  1.13it/s]


KeyboardInterrupt: 

In [None]:
All_url_link

In [None]:
print(f"Total page processed: {len(All_url_link)}")

## Saving it as Json File

In [None]:
with open("ALL_link_གོང་ས་མཆོག.json", "w") as outfile:
    json.dump(All_url_link , outfile, indent=4)

In [None]:
with open('ALL_link_གོང་ས་མཆོག.json', 'r') as openfile:
    # Reading from json file
    Loaded_All_url_link = json.load(openfile)

## Getting all links of བོད། from VOT
Base link : "https://vot.org/category/%e0%bd%96%e0%bd%bc%e0%bd%91%e0%bc%8d/page/" + i + "/"

Total page = 174


In [21]:
custom_url_བོད = "https://vot.org/category/%e0%bd%96%e0%bd%bc%e0%bd%91%e0%bc%8d/page/"
All_url_link = {}
total_page = 174 + 1
for i in tqdm(range(1,total_page)):
    final_url = custom_url_བོད + str(i) +"/"
    found_url_link = extract_all_article(final_url)
    key = "Page བོད " + str(i)
    All_url_link[key] = found_url_link

100%|██████████| 174/174 [12:44<00:00,  4.40s/it]


In [22]:
print(f"Total page in བོད།: {len(All_url_link)}")

Total page in བོད།: 174


## Saving it as Json File

In [23]:
with open("ALL_link_བོད།.json", "w") as outfile:
    json.dump(All_url_link , outfile, indent=4)

In [24]:
with open('ALL_link_བོད།.json', 'r') as openfile:
    # Reading from json file
    Loaded_All_url_link = json.load(openfile)

## Getting all links of བཙན་བྱོལ། from VOT
Base link : "https://vot.org/category/%e0%bd%96%e0%bd%99%e0%bd%93%e0%bc%8b%e0%bd%96%e0%be%b1%e0%bd%bc%e0%bd%a3%e0%bc%8d/page/" + i + "/"

Total page = 118

In [25]:
custom_url_བཙན = "https://vot.org/category/%e0%bd%96%e0%bd%99%e0%bd%93%e0%bc%8b%e0%bd%96%e0%be%b1%e0%bd%bc%e0%bd%a3%e0%bc%8d/page/"
All_url_link = {}
total_page = 118 + 1
for i in tqdm(range(1, total_page)):
    final_url = custom_url_བཙན + str(i) +"/"
    found_url_link = extract_all_article(final_url)
    key = "Page བཙན་བྱོལ " + str(i)
    All_url_link[key] = found_url_link

100%|██████████| 118/118 [04:15<00:00,  2.17s/it]


In [26]:
print(f"Total page in བཙན་བྱོལ: {len(All_url_link)}")

Total page in བཙན་བྱོལ: 118


## Saving it as Json File

In [27]:
with open("ALL_link_བཙན.json", "w") as outfile:
    json.dump(All_url_link , outfile, indent=4)

In [28]:
with open('ALL_link_བཙན.json', 'r') as openfile:
    # Reading from json file
    Loaded_All_url_link = json.load(openfile)

## Getting all links of རྒྱ་ནག from VOT
Base link : "https://vot.org/category/%e0%bd%a2%e0%be%92%e0%be%b1%e0%bc%8b%e0%bd%93%e0%bd%82/page/" + i + "/"

Total page = 123

In [29]:
custom_url_རྒྱནག = "https://vot.org/category/%e0%bd%a2%e0%be%92%e0%be%b1%e0%bc%8b%e0%bd%93%e0%bd%82/page/"
All_url_link = {}
total_page = 123 + 1
for i in tqdm(range(1, total_page)):
    final_url = custom_url_རྒྱནག + str(i) +"/"
    found_url_link = extract_all_article(final_url)
    key = "Page རྒྱནག " + str(i)
    All_url_link[key] = found_url_link

100%|██████████| 123/123 [04:32<00:00,  2.22s/it]


In [30]:
print(f"Total page in རྒྱནག: {len(All_url_link)}")

Total page in རྒྱནག: 123


## Saving it as Json File

In [31]:
with open("ALL_link_རྒྱནག.json", "w") as outfile:
    json.dump(All_url_link , outfile, indent=4)

In [32]:
with open('ALL_link_རྒྱནག.json', 'r') as openfile:
    # Reading from json file
    Loaded_All_url_link = json.load(openfile)

## Getting all Articles links of རྒྱལ་སྤྱི from VOT
Base link : "https://vot.org/category/%e0%bd%a2%e0%be%92%e0%be%b1%e0%bd%a3%e0%bc%8b%e0%bd%a6%e0%be%a4%e0%be%b1%e0%bd%b2%e0%bc%8d/page/" + i + "/"

Total page = 218

In [46]:
custom_url_རྒྱལསྤྱི = "https://vot.org/category/%e0%bd%a2%e0%be%92%e0%be%b1%e0%bd%a3%e0%bc%8b%e0%bd%a6%e0%be%a4%e0%be%b1%e0%bd%b2%e0%bc%8d/page/"
All_url_link = {}
total_page = 218 + 1
for i in tqdm(range(1, total_page)):
    final_url = custom_url_རྒྱལསྤྱི + str(i) +"/"
    found_url_link = extract_all_article(final_url)
    key = "Page རྒྱལསྤྱི " + str(i)
    All_url_link[key] = found_url_link

100%|██████████| 218/218 [09:52<00:00,  2.72s/it]


In [47]:
print(f"Total page in རྒྱལསྤྱི: {len(All_url_link)}")

Total page in རྒྱལསྤྱི: 218


### checking error

In [53]:
error_counter = 0
for page_id in range(1, len(All_url_link)):
    page_key = "Page རྒྱལསྤྱི " + str(page_id)
    try:
        All_url_link.get(page_key)
        if  All_url_link.get(page_key)["Response"]!= 200:
            error_counter += 1
    except Exception as e:
        print(page_key, e)

print(f"Total error in རྒྱལསྤྱི: {error_counter}")

Page རྒྱལསྤྱི 72 'NoneType' object is not subscriptable
Total error in རྒྱལསྤྱི: 11


## Saving it as Json File

In [50]:
with open("ALL_link_རྒྱལསྤྱི.json", "w") as outfile:
    json.dump(All_url_link , outfile, indent=4)

In [51]:
with open('ALL_link_རྒྱལསྤྱི.json', 'r') as openfile:
    # Reading from json file
    Loaded_All_url_link = json.load(openfile)

### Fixing error in data fitching 

In [56]:
def failed_article_link(file_json, original_URL):
    """
    it re-runs the failed URL from json file and add the success web-scraped 
    
    """
    for page_id in range(1, len(All_url_link)):
        page_key = "Page རྒྱལསྤྱི " + str(page_id)
        try:
            All_url_link.get(page_key)
            if  All_url_link.get(page_key)["Response"]!= 200:
                failed_url = original_URL + str(page_id) +"/"
                found_url_link = extract_all_article(failed_url)
                file_json[page_key] = found_url_link
                
        except Exception as e:
            print(page_key, e)
            failed_url = original_URL + str(page_id) +"/"
            found_url_link = extract_all_article(failed_url)
            file_json[page_key] = found_url_link
    return file_json


In [57]:
with open("ALL_link_རྒྱལསྤྱི.json", "w") as outfile:
    json.dump(All_url_link , outfile, indent=4)

custom_url_རྒྱལསྤྱི = "https://vot.org/category/%e0%bd%a2%e0%be%92%e0%be%b1%e0%bd%a3%e0%bc%8b%e0%bd%a6%e0%be%a4%e0%be%b1%e0%bd%b2%e0%bc%8d/page/"

New_All_url_link = failed_article_link(All_url_link, custom_url_རྒྱལསྤྱི)


Page རྒྱལསྤྱི 72 'NoneType' object is not subscriptable


In [60]:
error_counter = 0
for page_id in range(1, len(New_All_url_link)):
    page_key = "Page རྒྱལསྤྱི " + str(page_id)
    try:
        if  New_All_url_link.get(page_key)["Response"]!= 200:
            error_counter += 1
    except Exception as e:
        print(page_key, e)

print(f"Total error in རྒྱལསྤྱི: {error_counter}")

Total error in རྒྱལསྤྱི: 0


In [59]:
len(New_All_url_link)

218

In [61]:
with open("ALL_link_རྒྱལསྤྱི.json", "w") as outfile:
    json.dump(All_url_link , outfile, indent=4)

In [7]:
custom_url_བོད = "https://vot.org/category/%e0%bd%96%e0%bd%bc%e0%bd%91%e0%bc%8d/page/14/"
found_url_link = extract_all_article(custom_url_བོད)
found_url_link


{'Links': ['https://vot.org/%e0%bd%a2%e0%be%92%e0%be%b1%e0%bc%8b%e0%bd%82%e0%bd%9e%e0%bd%b4%e0%bd%84%e0%bc%8b%e0%bd%82%e0%bd%b2%e0%bd%a6%e0%bc%8b%e0%bd%9a%e0%bd%bc%e0%bd%84%e0%bc%8b%e0%bd%9f%e0%bd%bc%e0%bd%82%e0%bc%8b%e0%bd%81/',
  'https://vot.org/%e0%bd%96%e0%bd%bc%e0%bd%91%e0%bc%8b%e0%bd%98%e0%bd%b2%e0%bc%8b%e0%bd%a6%e0%be%90%e0%bd%91%e0%bc%8b%e0%bd%a1%e0%bd%b2%e0%bd%82%e0%bc%8b%e0%bd%a3%e0%bd%a6%e0%bc%8b%e0%bd%a0%e0%bd%82%e0%bd%b4%e0%bd%a3-2/',
  'https://vot.org/%e0%bd%a2%e0%be%94%e0%bc%8b%e0%bd%96%e0%bc%8b%e0%bd%81%e0%bd%b4%e0%bd%a3%e0%bc%8b%e0%bd%91%e0%bd%b4%e0%bc%8b%e0%bc%b8%e0%bd%a6%e0%be%90%e0%be%b1%e0%bd%96%e0%bd%a6%e0%bc%8b%e0%bd%a2%e0%be%97%e0%bd%ba/',
  'https://vot.org/%e0%bd%81%e0%bd%98%e0%bd%a6%e0%bd%91%e0%bd%80%e0%bd%a2%e0%bd%98%e0%bd%9b%e0%bd%a6%e0%bd%81%e0%bd%a3%e0%bd%91/',
  'https://vot.org/%e0%bd%96%e0%bd%91%e0%bd%93%e0%bd%84%e0%bd%82%e0%bd%96%e0%bd%91%e0%bd%98%e0%bd%91%e0%bd%a2%e0%bd%98%e0%bd%a6%e0%bd%a3/',
  'https://vot.org/%e0%bd%91%e0%bd%96%e0%bd%93%e0%bd%a6