In [2]:
import requests
from bs4 import BeautifulSoup
from typing import Dict, Any, List
import time
from tqdm import tqdm 
import json

In [3]:
import RFA_utils

## with the help of RFA_utils we can use two main function
1. **extract_all_RFA_article_links: Extracts all article links from a given RFA webpage.**
2. scrape_rfa_article: Scrapes an article from the RFA website.


------------
-----------
------------
------------
------------

In [4]:
def loop_article_page(total_page, custom_url, key_code):
    """
    
    """
    return_file = {
        "Data": [],
        "message": "success",
        "response": 200
    }
    All_url_links = {}
    
    try:
        for i in tqdm(range(0, total_page)):
            final_url = custom_url + str(i*15) 
            found_url_links = RFA_utils.extract_all_RFA_article_links(final_url)
            key = key_code + str(i)
            All_url_links[key] = found_url_links
        return_file["Data"] = All_url_links
        return return_file
    
    except Exception as e:
        return_file["Data"] = All_url_links
        return_file["message"] = e
        return_file["response"] = 404
        return return_file

In [5]:
def check_error_in_links(All_url_link, page_code, print_each_error=False):
    """
    
    """

    error_counter = 0
    for page_id in range(1, len(All_url_link)):
        page_key = page_code + str(page_id)
        try:
            All_url_link.get(page_key)
            if  All_url_link.get(page_key)["Response"]!= 200:
                error_counter += 1
                if print_each_error:
                    print(page_key, All_url_link.get(page_key)["message"])
        except Exception as e:
            print(page_key, e)

    print(f"Total error in {page_code}: {error_counter}")

In [6]:
def save_json(path, file_name, data):
    """
    
    """
    with open(path+file_name, "w") as outfile:
        json.dump(data, outfile, indent=4)
        print(f"Successfully saved: {file_name}")

------------
------------
------------



# A. Extracting all Article links from ༸གོང་ས་མཆོག 
- Base url: https://www.rfa.org/tibetan/dalai-lama/story_archive?b_start:int=15
- Custom URL: https://www.rfa.org/tibetan/dalai-lama/story_archive?b_start:int= + str(i)
- Total page:116

In [19]:
total_page = 116 + 1
custom_url= "https://www.rfa.org/tibetan/dalai-lama/story_archive?b_start:int="
article_tag = "གོང་ས་མཆོག"
key_code = "Page " + article_tag + " "
print(f"Page code: {key_code}")

all_links = loop_article_page(total_page, custom_url, key_code)


Page code: Page གོང་ས་མཆོག 


 99%|█████████▉| 116/117 [02:29<00:01,  1.29s/it]


In [20]:
print(f"Total page in {article_tag}: {len(all_links['Data'])}")

Total page in གོང་ས་མཆོག: 116


In [21]:
check_error_in_links(all_links['Data'], key_code, print_each_error=True)

Total error in Page གོང་ས་མཆོག : 0


In [22]:
# Saving the final file
path = "./data/"
file_name = f"RFA_ALL_link_{article_tag}.json"
save_json(path, file_name, all_links['Data'])

Successfully saved: RFA_ALL_link_གོང་ས་མཆོག.json


------------
------------
------------



# B. Extracting all Article links from བོད།
- Base url: https://www.rfa.org/tibetan/tibet/story_archive?b_start:int=15
- Custom URL: https://www.rfa.org/tibetan/tibet/story_archive?b_start:int= + str(i)
- Total page:159

In [24]:
total_page = 159 + 1
custom_url= "https://www.rfa.org/tibetan/tibet/story_archive?b_start:int="
article_tag = "བོད།"
key_code = "Page " + article_tag + " "
print(f"Page code: {key_code}")

all_links = loop_article_page(total_page, custom_url, key_code)

Page code: Page བོད། 


 99%|█████████▉| 159/160 [03:06<00:01,  1.17s/it]


In [25]:
print(f"Total page in {article_tag}: {len(all_links['Data'])}")

Total page in བོད།: 159


In [26]:
check_error_in_links(all_links['Data'], key_code, print_each_error=True)

Page བོད། 54 'message'
Total error in Page བོད། : 1


In [27]:
# Saving the final file
path = "./data/"
file_name = f"RFA_ALL_link_{article_tag}.json"
save_json(path, file_name, all_links['Data'])

Successfully saved: RFA_ALL_link_བོད།.json


------------
------------
------------



# C. Extracting all Article links from བཙན་བྱོལ།
- Base url: https://www.rfa.org/tibetan/exile/story_archive?b_start:int=15
- Custom URL: https://www.rfa.org/tibetan/exile/story_archive?b_start:int= + str(i)
- Total page:257

In [16]:
total_page = 257 + 1
custom_url= "https://www.rfa.org/tibetan/exile/story_archive?b_start:int="
article_tag = "བཙན་བྱོལ།"
key_code = "Page " + article_tag + " "
print(f"Page code: {key_code}")

all_links = loop_article_page(total_page, custom_url, key_code)


Page code: Page བཙན་བྱོལ། 


100%|████████████████████████████████████████▊| 257/258 [00:48<00:00,  5.35it/s]


In [17]:
print(f"Total page in {article_tag}: {len(all_links['Data'])}")

Total page in བཙན་བྱོལ།: 257


In [18]:
check_error_in_links(all_links['Data'], key_code, print_each_error=True)

Total error in Page བཙན་བྱོལ། : 0


In [19]:
# Saving the final file
path = "./data/"
file_name = f"RFA_ALL_link_{article_tag}.json"
save_json(path, file_name, all_links['Data'])

Successfully saved: RFA_ALL_link_བཙན་བྱོལ།.json


------------
------------
------------



# D. Extracting all Article links from འཛམ་གླིང༌།
- Base url: https://www.rfa.org/tibetan/world/story_archive?b_start:int=15
- Custom URL: https://www.rfa.org/tibetan/world/story_archive?b_start:int= + str(i)
- Total page:199

In [20]:
total_page = 199 + 1
custom_url= "https://www.rfa.org/tibetan/world/story_archive?b_start:int="
article_tag = "འཛམ་གླིང༌།"
key_code = "Page " + article_tag + " "
print(f"Page code: {key_code}")

all_links = loop_article_page(total_page, custom_url, key_code)


Page code: Page འཛམ་གླིང༌། 


 99%|████████████████████████████████████████▌| 198/200 [01:55<00:01,  1.71it/s]


In [21]:
print(f"Total page in {article_tag}: {len(all_links['Data'])}")

Total page in འཛམ་གླིང༌།: 198


In [22]:
check_error_in_links(all_links['Data'], key_code, print_each_error=True)

Total error in Page འཛམ་གླིང༌། : 0


In [23]:
# Saving the final file
path = "./data/"
file_name = f"RFA_ALL_link_{article_tag}.json"
save_json(path, file_name, all_links['Data'])


Successfully saved: RFA_ALL_link_འཛམ་གླིང༌།.json


In [None]:
------------
------------
------------



# E. Extracting all Article links from རྒྱ་དཀར་ནག
- Base url: https://www.rfa.org/tibetan/indiaandchina/story_archive?b_start:int=15
- Custom URL: https://www.rfa.org/tibetan/indiaandchina/story_archive?b_start:int= + str(i)
- Total page:133

In [32]:
total_page = 133 + 1
custom_url= "https://www.rfa.org/tibetan/indiaandchina/story_archive?b_start:int="
article_tag = "རྒྱ་དཀར་ནག"
key_code = "Page " + article_tag + " "
print(f"Page code: {key_code}")

all_links = loop_article_page(total_page, custom_url, key_code)


Page code: Page རྒྱ་དཀར་ནག 


 99%|████████████████████████████████████████▋| 133/134 [02:08<00:00,  1.03it/s]


In [33]:
print(f"Total page in {article_tag}: {len(all_links['Data'])}")

Total page in རྒྱ་དཀར་ནག: 133


In [34]:
check_error_in_links(all_links['Data'], key_code, print_each_error=True)

Total error in Page རྒྱ་དཀར་ནག : 0


In [36]:
# Saving the final file
path = "./data/"
file_name = f"RFA_ALL_link_{article_tag}.json"
save_json(path, file_name, all_links['Data'])


Successfully saved: RFA_ALL_link_རྒྱ་དཀར་ནག.json


In [None]:
------------ 
------------
------------



# F. Extracting all Article links from སྤྱི་ཚོགས།
- Base url: https://www.rfa.org/tibetan/society/story_archive?b_start:int=15
- Custom URL: https://www.rfa.org/tibetan/society/story_archive?b_start:int= + str(i)
- Total page:241

In [42]:
total_page = 241 + 1
custom_url= "https://www.rfa.org/tibetan/society/story_archive?b_start:int="
article_tag = "སྤྱི་ཚོགས།"
key_code = "Page " + article_tag + " "
print(f"Page code: {key_code}")

all_links = loop_article_page(total_page, custom_url, key_code)


Page code: Page སྤྱི་ཚོགས། 


100%|████████████████████████████████████████▊| 241/242 [04:43<00:01,  1.17s/it]


In [43]:
print(f"Total page in {article_tag}: {len(all_links['Data'])}")

Total page in སྤྱི་ཚོགས།: 241


In [44]:
check_error_in_links(all_links['Data'], key_code, print_each_error=True)

Total error in Page སྤྱི་ཚོགས། : 0


In [45]:
file_name = f"RFA_ALL_link_{article_tag}.json"
save_json(path, file_name, all_links['Data'])

Successfully saved: RFA_ALL_link_སྤྱི་ཚོགས།.json


------------
------------
------------



# G. Extracting all Article links from གསར་འགྱུར།
- Base url: https://www.rfa.org/tibetan/sargyur/story_archive?b_start:int=15
- Custom URL: https://www.rfa.org/tibetan/sargyur/story_archive?b_start:int= + str(i)
- Total page:1941

In [48]:
total_page = 1941 + 1
custom_url= "https://www.rfa.org/tibetan/sargyur/story_archive?b_start:int="
article_tag = "གསར་འགྱུར།"
key_code = "Page " + article_tag + " "
print(f"Page code: {key_code}")

all_links = loop_article_page(total_page, custom_url, key_code)


Page code: Page གསར་འགྱུར། 


100%|███████████████████████████████████████| 1942/1942 [18:45<00:00,  1.73it/s]


In [49]:
print(f"Total page in {article_tag}: {len(all_links['Data'])}")

Total page in གསར་འགྱུར།: 1942


In [50]:
check_error_in_links(all_links['Data'], key_code, print_each_error=True)

Page གསར་འགྱུར། 441 'message'
Page གསར་འགྱུར། 442 'message'
Page གསར་འགྱུར། 443 'message'
Page གསར་འགྱུར། 444 'message'
Page གསར་འགྱུར། 445 'message'
Page གསར་འགྱུར། 446 'message'
Page གསར་འགྱུར། 447 'message'
Page གསར་འགྱུར། 448 'message'
Page གསར་འགྱུར། 449 'message'
Page གསར་འགྱུར། 450 'message'
Page གསར་འགྱུར། 451 'message'
Page གསར་འགྱུར། 452 'message'
Page གསར་འགྱུར། 453 'message'
Page གསར་འགྱུར། 454 'message'
Page གསར་འགྱུར། 455 'message'
Page གསར་འགྱུར། 456 'message'
Page གསར་འགྱུར། 457 'message'
Page གསར་འགྱུར། 458 'message'
Page གསར་འགྱུར། 459 'message'
Page གསར་འགྱུར། 460 'message'
Page གསར་འགྱུར། 461 'message'
Page གསར་འགྱུར། 462 'message'
Page གསར་འགྱུར། 463 'message'
Page གསར་འགྱུར། 464 'message'
Page གསར་འགྱུར། 465 'message'
Page གསར་འགྱུར། 466 'message'
Page གསར་འགྱུར། 467 'message'
Page གསར་འགྱུར། 468 'message'
Page གསར་འགྱུར། 469 'message'
Page གསར་འགྱུར། 470 'message'
Page གསར་འགྱུར། 471 'message'
Page གསར་འགྱུར། 472 'message'
Page གསར་འགྱུར། 473 'message'
Page གསར་འ

In [51]:
file_name = f"RFA_ALL_link_{article_tag}.json"
save_json(path, file_name, all_links['Data'])

Successfully saved: RFA_ALL_link_གསར་འགྱུར།.json
