In [2]:
import requests
from bs4 import BeautifulSoup
from typing import Dict, Any, List
import time
from tqdm import tqdm 
import json

In [1]:
import VOATibetan_utils

## with the help of VOATibetan_utils.py we can use two main function
1. **extract_all_VOATibetan_article_links: Extracts all article links from a given VOATibetan_utils webpage.**


------------
-----------
------------
------------
------------

In [39]:
def loop_article_page(total_page, custom_url, key_code):
    """
    
    """
    return_file = {
        "Data": [],
        "message": "success",
        "response": 200
    }
    All_url_links = {}
    
    try:
        for i in tqdm(range(0, total_page)):
            final_url = custom_url + str(i) 
            found_url_links, load_more = VOATibetan_utils.extract_all_VOATibetan_article_links(final_url)
            key = key_code + str(i)
            All_url_links[key] = found_url_links
            if load_more==False:
                print(f"Final page number: {i}")
                break
        return_file["Data"] = All_url_links
        # print(final_url)
        return return_file
    
    except Exception as e:
        return_file["Data"] = All_url_links
        return_file["message"] = e
        return_file["response"] = 404
        return return_file

In [14]:
def check_error_in_links(All_url_link, page_code, print_each_error=False):
    """
    
    """

    error_counter = 0
    for page_id in range(1, len(All_url_link)):
        page_key = page_code + str(page_id)
        try:
            All_url_link.get(page_key)
            if  All_url_link.get(page_key)["Response"]!= 200:
                error_counter += 1
                if print_each_error:
                    print(page_key, All_url_link.get(page_key)["message"])
        except Exception as e:
            print(page_key, e)

    print(f"Total error in {page_code}: {error_counter}")

In [15]:
def save_json(path, file_name, data):
    """
    
    """
    with open(path+file_name, "w") as outfile:
        json.dump(data, outfile, indent=4)
        print(f"Successfully saved: {file_name}")

------------
------------
------------



# A. Extracting all Article links from བོད། 
- Base url: https://www.voatibetan.com/z/2253?p=0
- Custom URL: https://www.voatibetan.com/z/2253?p= + str(i) 
- Total page: unknown

In [6]:
total_page = 1000 # for custom check
custom_url= "https://www.voatibetan.com/z/2253?p="
article_tag = "བོད།"
key_code = "Page " + article_tag + " "
print(f"Page code: {key_code}")

all_links = loop_article_page(total_page, custom_url, key_code)


Page code: Page བོད། 


  0%|          | 0/1000 [00:00<?, ?it/s]

 10%|█         | 100/1000 [01:35<14:18,  1.05it/s]

Final page number: 100





In [7]:
print(f"Total page in {article_tag}: {len(all_links['Data'])}")

Total page in བོད།: 101


In [8]:
check_error_in_links(all_links['Data'], key_code, print_each_error=True)

Total error in Page བོད། : 0


In [9]:
# Saving the final file
path = "./data/"
file_name = f"VOATibetan_ALL_link_{article_tag}.json"
save_json(path, file_name, all_links['Data'])

Successfully saved: VOATibetan_ALL_link_བོད།.json


------------
------------
------------



# B. Extracting all Article links from ཨ་རི། 
- Base url: https://www.voatibetan.com/z/2252?p=1
- Custom URL: https://www.voatibetan.com/z/2252?p= + str(i) 
- Total page: unknown

In [21]:
total_page = 1000 # for custom check
custom_url= "https://www.voatibetan.com/z/2252?p="
article_tag = "ཨ་རི།"
key_code = "Page " + article_tag + " "
print(f"Page code: {key_code}")

all_links = loop_article_page(total_page, custom_url, key_code)


Page code: Page ཨ་རི། 


 10%|█         | 100/1000 [01:22<12:25,  1.21it/s]

Final page number: 100





In [22]:
print(f"Total page in {article_tag}: {len(all_links['Data'])}")

Total page in ཨ་རི།: 101


In [23]:
check_error_in_links(all_links['Data'], key_code, print_each_error=True)

Total error in Page ཨ་རི། : 0


In [24]:
# Saving the final file
path = "./data/"
file_name = f"VOATibetan_ALL_link_{article_tag}.json"
save_json(path, file_name, all_links['Data'])

Successfully saved: VOATibetan_ALL_link_ཨ་རི།.json


------------
------------
------------



# C. Extracting all Article links from རྒྱ་ནག 
- Base url: https://www.voatibetan.com/z/2244?p=1
- Custom URL: https://www.voatibetan.com/z/2244?p= + str(i) 
- Total page: unknown

In [25]:
total_page = 1000 # for custom check
custom_url= "https://www.voatibetan.com/z/2244?p="
article_tag = "རྒྱ་ནག"
key_code = "Page " + article_tag + " "
print(f"Page code: {key_code}")

all_links = loop_article_page(total_page, custom_url, key_code)


Page code: Page རྒྱ་ནག 


 10%|█         | 100/1000 [01:25<12:53,  1.16it/s]

Final page number: 100





In [26]:
print(f"Total page in {article_tag}: {len(all_links['Data'])}")

Total page in རྒྱ་ནག: 101


In [27]:
check_error_in_links(all_links['Data'], key_code, print_each_error=True)

Total error in Page རྒྱ་ནག : 0


In [29]:
# Saving the final file
path = "./data/"
file_name = f"VOATibetan_ALL_link_{article_tag}.json"
save_json(path, file_name, all_links['Data'])

Successfully saved: VOATibetan_ALL_link_རྒྱ་ནག.json


------------
------------
------------



# D. Extracting all Article links from འཛམ་གླིང་། 
- Base url: https://www.voatibetan.com/z/2254?p=1
- Custom URL: https://www.voatibetan.com/z/2254?p= + str(i) 
- Total page: unknown

In [40]:
total_page = 1000 # for custom check
custom_url= "https://www.voatibetan.com/z/2254?p="
article_tag = "འཛམ་གླིང་།"
key_code = "Page " + article_tag + " "
print(f"Page code: {key_code}")

all_links = loop_article_page(total_page, custom_url, key_code)


Page code: Page འཛམ་གླིང་། 


 10%|█         | 100/1000 [01:28<13:20,  1.12it/s]

Final page number: 100





In [41]:
print(f"Total page in {article_tag}: {len(all_links['Data'])}")

Total page in འཛམ་གླིང་།: 101


In [42]:
check_error_in_links(all_links['Data'], key_code, print_each_error=True)

Total error in Page འཛམ་གླིང་། : 0


In [43]:
# Saving the final file
path = "./data/"
file_name = f"VOATibetan_ALL_link_{article_tag}.json"
save_json(path, file_name, all_links['Data'])

Successfully saved: VOATibetan_ALL_link_འཛམ་གླིང་།.json


In [37]:
all_links

{'Data': {'Page འཛམ་གླིང་། 0': {'Links': [None,
    'https://www.voatibetan.com/a/we-re-going-nowhere---thai-opposition-figure-says-court-interventions-must-stop/7750908.html',
    'https://www.voatibetan.com/a/japan-s-governing-party-to-choose-its-head-who-will-also-be-the-new-pm-on-sept-27-/7750903.html',
    'https://www.voatibetan.com/a/mongolia-excludes-russia-china-pipeline-from-its-national-action-program/7750893.html',
    'https://www.voatibetan.com/a/ukrainian-president-says-the-push-into-russia-s-kursk-region-is-to-create-a-buffer-zone-there/7749423.html',
    'https://www.voatibetan.com/a/indian-medics-refuse-to-end-protests-over-doctor-s-rape-and-murder-/7749404.html',
    'https://www.voatibetan.com/a/ukraine-claims-destruction-of-second-bridge-in-russia-s-kursk-region-/7748214.html'],
   'Message': 'Success',
   'Response': 200,
   'source_url': 'https://www.voatibetan.com/z/2254?p=0'}},
 'message': AttributeError("'ValueError' object has no attribute 'response'"),
 'res