In [2]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import json

from typing import Dict, Any

  from pandas.core import (


In [60]:
def scrape_vot_article(url: str) -> Dict[str, Any]:
    """
    Scrapes an article from the VOT (Voice of Tibet) website.

    Args:
    url (str): The URL of the VOT article to scrape.

    Returns:
    Dict[str, Any]: A dictionary containing the scraped information and status details:
        {
            'data': {
                'title': str,
                'body': {
                    'Audio': str,
                    'Text': List[str]
                },
                'meta_data': {
                    'Author': str,
                    'Date': str,
                    'Tags': List[str],
                    'URL': str
                }
            },
            'Message': str,
            'Response': int
        }
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    final_response = {
        "data": {
            'title': "",
            'body': {"Audio": "", "Text": []},
            'meta_data': {'URL': url, 'Author': "", 'Date': "", 'Tags': []}
        },
        "Message": "Success",
        "Response": 200
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=120)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract title
        title = soup.find('h1', class_='tdb-title-text')
        final_response['data']['title'] = title.text.strip() if title else "Title not found"

        # Extracting Meta Data
        try:
            meta_data_body = soup.find('div', class_="vc_column_inner tdi_85 wpb_column vc_column_container tdc-inner-column td-pb-span6")
            if meta_data_body:
                author_name = meta_data_body.find('a', class_="tdb-author-name")
                final_response['data']['meta_data']["Author"] = author_name.get_text() if author_name else "Author not found"
                
                date_time = meta_data_body.find('time', class_="entry-date updated td-module-date")
                final_response['data']['meta_data']["Date"] = date_time.get_text() if date_time else "Date not found"
        except AttributeError:
            final_response['data']['meta_data']["Author"] = "Error fetching author"
            final_response['data']['meta_data']["Date"] = "Error fetching date"

        # Getting tag meta data 
        try:
            tag_meta = soup.find('ul', class_='tdb-tags')
            if tag_meta:
                tag_meta = tag_meta.select('li a')
                final_response['data']['meta_data']["Tags"] = [tag.text for tag in tag_meta]
        except AttributeError:
            final_response['data']['meta_data']["Tags"] = []

        # Extract body content
        try:
            body = soup.find('div', class_='td_block_wrap tdb_single_content tdi_100 td-pb-border-top td_block_template_1 td-post-content tagdiv-type')
            if body:
                # Extracting all <p> tags for text content
                paragraphs = body.find_all('p')
                final_response['data']['body']["Text"] = [para.get_text(strip=True) for para in paragraphs]

                # Find the audio tag and get its src attribute
                audio = body.find('figure', class_='wp-block-audio')
                if audio:
                    audio_tag = audio.find('audio')
                    if audio_tag:
                        final_response['data']['body']["Audio"] = audio_tag.get('src', "No audio source found")
        except AttributeError:
            final_response['data']['body']["Text"] = ["Error fetching body content"]
        
        return final_response
    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response
        
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the article: {str(e)}"
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response


### Test Code for Extracting the Article content

In [3]:
# Usage
url = "https://vot.org/%e0%bd%96%e0%bd%bc%e0%bd%91%e0%bc%8b%e0%bd%80%e0%be%b1%e0%bd%b2%e0%bc%8b%e0%bd%82%e0%bd%9e%e0%bd%bc%e0%bd%93%e0%bc%8b%e0%bd%93%e0%bd%b4%e0%bc%8b%e0%bd%a3%e0%be%b7%e0%bd%93%e0%bc%8b%e0%bd%9a-20/"
article = scrape_vot_article(url)

article

{'data': {'title': 'བོད་ཀྱི་གཞོན་ནུ་ལྷན་ཚོགས་ཀྱི་འགན་འཛིན་ལྷན་ཚོགས་ཐེངས་ ༥༤ པ་མཇུག་བསྒྲིལ་འདུག',
  'body': {'Audio': 'https://vot.org/wp-content/uploads/2024/08/Gangtok-TYC-54-Working-Committee-Meet-Concludes.mp3',
   'Text': ['ཁ་སྔོན་ཕྱི་ཚེས་ ༥ ནས་འགོ་འཛུགས་བྱུང་བའི་བོད་ཀྱི་གཞོན་ནུ་ལྷན་ཚོགས་ཀྱི་འགན་འཛིན་ལྷན་ཚོགས་ཐེངས་ ༥༤ དེ་ཕྱི་ཚེས་ ༡༠ ཉིན་མཇུག་བསྒྲིལ་ཡོད་པ་དང་།\xa0\xa0 བོད་ཀྱི་གཞོན་ནུ་ལྷན་ཚོགས་ཀྱི་ངོས་ནས་ད་རེས་ཀྱི་འགན་འཛིན་ལྷན་ཚོགས་འདིའི་བརྒྱུད།\xa0 རྒྱ་ནག་གཞུང་གིས་བོད་མིའི་ངོ་བོ་རྩ་མེད་བཟོ་བའི་སྲིད་བྱུས་འོག\xa0\xa0 བོད་ཀྱི་ཆོས་དང་རིག་གཞུང་སྐད་ཡིག་བཅས་པར་ཕྱོགས་གང་ས་ནས་དམ་དྲག་ཇེ་ཆེར་བྱེད་བཞིན་པ་དེར་ངོ་རྒོལ་གྱིས་བསྒྲགས་གཏམ་ཞིག་སྤེལ་ཡོད་པ་རེད།',
    'ད་རེས་ཀྱི་འགན་འཛིན་ལྷན་ཚོགས་ཐེངས་ ༥༤ པ་དེའི་ཐོག\xa0\xa0 རྒྱ་གར་དང་བལ་ཡུལ་བཅས་ས་གནས་ཁག་ ༣༨ ནས་ཚོགས་བཅར་བ་མི་གྲངས་\xa0 ༢༠༠ ཙམ་ཞིག་འདུ་འཛོམས་བྱུང་ཡོད་པ་དང་།\xa0 ཉིན་གྲངས་ལྔའི་རིང་ཚོགས་བཅར་བ་ཕན་ཚུན་བཀའ་བསྡུར་དང་འབྲེལ་གྲོས་ཆོད་དོན་ཚན་བརྒྱད་གཏན་འབེབས་གནང་ཐུབ་པ་མ་ཟད།\xa0 སྤྱི་མོས་གྲོས་ཆོད་ཞིག་གཏན་འབེབས་བྱུང་ཡོད་པ་དེའི་ནང་བོད་ཀྱི་སྐད་ཡིག་མི་ཉམས་རྒྱ

----------
----------
------------
--------
------------

## Extracting all the article content from VOT >> གོང་ས་མཆོག page

In [12]:
with open('ALL_link_གོང་ས་མཆོག.json', 'r') as openfile:
    # Reading from json file
    Loaded_All_VOT_CAT1_url_link = json.load(openfile)

In [32]:
Total_lenght = len(Loaded_All_VOT_CAT1_url_link)

print(f"Total count in the differnt article in གོང་ས་མཆོག: {Total_lenght}")

Total count in the differnt article in གོང་ས་མཆོག: 213


### running batch of 50 pages to prevent runtime error 
#### A. First 50 batches


In [26]:
# running batch of 50 pages 
n = 50 
all_article = {}
for i in tqdm(range(1, n)):
    page_key = "Page གོང་ས་མཆོག " + str(i)
    all_link_page = Loaded_All_VOT_CAT1_url_link[page_key]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content


  0%|          | 0/49 [00:00<?, ?it/s]

100%|██████████| 49/49 [17:35<00:00, 21.53s/it]


In [27]:
print(f"Total Article extarcted from གོང་ས་མཆོག: {len(all_article)}")

Total Article extarcted from གོང་ས་མཆོག: 441


#### Checking Failure count in གོང་ས་མཆོག in first 50 batch

In [28]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the གོང་ས་མཆོག article: {Failure_count}")

total Failure in the གོང་ས་མཆོག article: 0


#### Running next bacth of གོང་ས་མཆོག 
#### B. next 50 batches


In [29]:
start = n
n = n + 50
for i in tqdm(range(start, n)):
    page_key = "Page གོང་ས་མཆོག " + str(i)
    all_link_page = Loaded_All_VOT_CAT1_url_link[page_key]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 50/50 [18:08<00:00, 21.78s/it]


In [30]:
print(f"Total Article extarcted from གོང་ས་མཆོག: {len(all_article)}")

Total Article extarcted from གོང་ས་མཆོག: 891


#### Checking Failure count in གོང་ས་མཆོག next 50 batch

In [None]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the གོང་ས་མཆོག article: {Failure_count}")

total Failure in the གོང་ས་མཆོག article: 0


### Saving first 100 process 

In [31]:
with open("ALL_content_གོང་ས་མཆོག.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running next bacth of གོང་ས་མཆོག 
#### C. Next 50 batches total of 150 process

In [33]:
start = n
n = n + 50
for i in tqdm(range(start, n)):
    page_key = "Page གོང་ས་མཆོག " + str(i)
    all_link_page = Loaded_All_VOT_CAT1_url_link[page_key]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 50/50 [21:22<00:00, 25.66s/it]


In [34]:
print(f"Total Article extarcted from གོང་ས་མཆོག: {len(all_article)}")

Total Article extarcted from གོང་ས་མཆོག: 1341


#### Checking Failure count in གོང་ས་མཆོག next 50 batch

In [35]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the གོང་ས་མཆོག article: {Failure_count}")

total Failure in the གོང་ས་མཆོག article: 0


#### D. Running next bacth of གོང་ས་མཆོག (Total of 200 process)


In [36]:
start = n
n = n + 50
for i in tqdm(range(start, n)):
    page_key = "Page གོང་ས་མཆོག " + str(i)
    all_link_page = Loaded_All_VOT_CAT1_url_link[page_key]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 50/50 [21:17<00:00, 25.55s/it]


In [37]:
print(f"Total Article extarcted from གོང་ས་མཆོག: {len(all_article)}")

Total Article extarcted from གོང་ས་མཆོག: 1791


#### Checking Failure count in གོང་ས་མཆོག next 50 batch

In [38]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the གོང་ས་མཆོག article: {Failure_count}")

total Failure in the གོང་ས་མཆོག article: 0


### Saving next 100 process total 200 process 

In [39]:
with open("ALL_content_གོང་ས་མཆོག.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running the last 13 remaning process total of 213 process

In [41]:
start = n
n = n + 14
for i in tqdm(range(start, n)):
    page_key = "Page གོང་ས་མཆོག " + str(i)
    all_link_page = Loaded_All_VOT_CAT1_url_link[page_key]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

 43%|████▎     | 6/14 [01:46<02:07, 15.92s/it]

In [None]:
print(f"Total Article extarcted from གོང་ས་མཆོག: {len(all_article)}")

In [None]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the གོང་ས་མཆོག article: {Failure_count}")

### Saving next 13 process total 213 process 

In [None]:
with open("ALL_content_གོང་ས་མཆོག.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

----------
-------------
-----------
--------------

## Extracting all the article content from VOT >> བོད page

In [100]:
with open('ALL_link_བོད།.json', 'r') as openfile:
    # Reading from json file
    Loaded_All_VOT_CAT2_url_link = json.load(openfile)

In [101]:
Total_lenght = len(Loaded_All_VOT_CAT2_url_link)
print(f"Total count in the differnt article in བོད།: {Total_lenght}")

Total count in the differnt article in བོད།: 174


### running batch of 50 pages to prevent runtime error 
#### A. First 50 batches


In [107]:
# running batch of 50 pages 
start = 1
if Total_lenght - start > 50:
    n = 50
else:
    n = Total_lenght - start
all_article = {}

for i in tqdm(range(start, n)):
    page_key = "Page བོད " + str(i)
    all_link_page = Loaded_All_VOT_CAT2_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 49/49 [18:32<00:00, 22.70s/it]


In [108]:
print(f"Total Article extarcted from བོད: {len(all_article)}")

Total Article extarcted from བོད: 441


#### Checking Failure count in བོད in first 50 batch

In [109]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the བོད article: {Failure_count}")

total Failure in the བོད article: 0


### Saving First 50 batch total 50 process 


In [110]:
with open("ALL_content_བོད.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running next bacth of བོད
#### B. Next 50 batches total of 100 process

In [111]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start
    
for i in tqdm(range(start, n)):
    page_key = "Page བོད " + str(i)
    all_link_page = Loaded_All_VOT_CAT2_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 50/50 [17:41<00:00, 21.23s/it]


In [112]:
print(f"Total Article extarcted from བོད: {len(all_article)}")

Total Article extarcted from བོད: 891


#### Checking Failure count in བཙན་བྱོལ། in next 50 batch

In [113]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the བོད article: {Failure_count}")

total Failure in the བོད article: 0


### Saving next 50 process total 100 process 

In [None]:
with open("ALL_content_བོད.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running next bacth of བོད 
#### C. Running the next batch 150 process

In [117]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start
for i in tqdm(range(start, n)):
    page_key = "Page བོད " + str(i)
    all_link_page = Loaded_All_VOT_CAT2_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 50/50 [17:39<00:00, 21.20s/it]


In [118]:
print(f"Total Article extarcted from བོད: {len(all_article)}")

Total Article extarcted from བོད: 1341


#### Checking Failure count in བོད in next 50 batch

In [119]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the བོད article: {Failure_count}")

total Failure in the བོད article: 0


### Saving next 50 process total 150 process 

In [120]:
with open("ALL_content_བོད.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running Final bacth of བོད 
#### D. Running the last 34 remaning process total of 174 process

In [121]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start
for i in tqdm(range(start, n)):
    page_key = "Page བོད " + str(i)
    all_link_page = Loaded_All_VOT_CAT2_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 24/24 [08:17<00:00, 20.72s/it]


In [122]:
print(f"Total Article extarcted from བོད: {len(all_article)}")

Total Article extarcted from བོད: 1557


#### Checking Failure count in བོད in next 50 batch

In [123]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the བོད article: {Failure_count}")

total Failure in the བོད article: 0


### Saving next 24 process total 174 process 

In [124]:
with open("ALL_content_བོད.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

------------
------------
------------
------------
## Extracting all the article content from VOT >> རྒྱ་ནག page


In [None]:
with open('ALL_link_རྒྱནག.json', 'r') as openfile:
    # Reading from json file
    Loaded_All_VOT_CAT4_url_link = json.load(openfile)

In [None]:
Total_lenght = len(Loaded_All_VOT_CAT4_url_link)
print(f"Total count in the differnt article in རྒྱ་ནག: {Total_lenght}")

Total count in the differnt article in རྒྱ་ནག: 123


### running batch of 50 pages to prevent runtime error 
#### A. First 50 batches


In [None]:
# running batch of 50 pages 
start = 1
if Total_lenght - start > 50:
    n = 50
else:
    n = n + Total_lenght - start
all_article = {}
for i in tqdm(range(start, n)):
    page_key = "Page རྒྱནག " + str(i)
    all_link_page = Loaded_All_VOT_CAT4_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 49/49 [18:27<00:00, 22.60s/it]


In [None]:
print(f"Total Article extarcted from རྒྱ་ནག: {len(all_article)}")

Total Article extarcted from རྒྱ་ནག: 441


#### Checking Failure count in རྒྱ་ནག in first 50 batch

In [None]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the རྒྱ་ནག article: {Failure_count}")

total Failure in the རྒྱ་ནག article: 0


### Saving First 50 batch total 50 process 

In [None]:
with open("ALL_content_རྒྱནག.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running next bacth of རྒྱནག 
#### B. Next 50 batches total of 100 process

In [None]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start

for i in tqdm(range(start, n)):
    page_key = "Page རྒྱནག " + str(i)
    all_link_page = Loaded_All_VOT_CAT4_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 50/50 [20:01<00:00, 24.04s/it]


In [None]:
print(f"Total Article extarcted from རྒྱནག: {len(all_article)}")

Total Article extarcted from རྒྱནག: 891


#### Checking Failure count in རྒྱནག in next 50 batch

In [None]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the རྒྱནག article: {Failure_count}")

total Failure in the རྒྱནག article: 0


### Saving next 50 process total 100 process 

In [None]:
with open("ALL_content_རྒྱནག.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running Final bacth of རྒྱནག
#### C. Running the last 23 remaning process total of 123 process

In [None]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start

for i in tqdm(range(start, n)):
    page_key = "Page རྒྱནག " + str(i)
    all_link_page = Loaded_All_VOT_CAT4_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 23/23 [07:59<00:00, 20.87s/it]


In [None]:
print(f"Total Article extarcted from རྒྱནག: {len(all_article)}")

#### Checking Failure count in རྒྱནག in next 50 batch

In [None]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the རྒྱནག article: {Failure_count}")

### Saving next 18 process total 118 process 

In [None]:
with open("ALL_content_རྒྱནག.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

------------
------------
------------
------------
## Extracting all the article content from VOT >> བཙན་བྱོལ། page


In [57]:
with open('ALL_link_བཙན.json', 'r') as openfile:
    # Reading from json file
    Loaded_All_VOT_CAT3_url_link = json.load(openfile)

In [58]:
Total_lenght = len(Loaded_All_VOT_CAT3_url_link)
print(f"Total count in the differnt article in བཙན་བྱོལ།: {Total_lenght}")

Total count in the differnt article in བཙན་བྱོལ།: 118


### running batch of 50 pages to prevent runtime error 
#### A. First 50 batches


In [61]:
# running batch of 50 pages 
start = 1
if Total_lenght - start > 50:
    n = 50
else:
    n = n + Total_lenght - start
all_article = {}
for i in tqdm(range(start, n)):
    page_key = "Page བཙན་བྱོལ " + str(i)
    all_link_page = Loaded_All_VOT_CAT3_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 49/49 [17:59<00:00, 22.04s/it]


In [62]:
print(f"Total Article extarcted from བཙན་བྱོལ།: {len(all_article)}")

Total Article extarcted from བཙན་བྱོལ།: 441


#### Checking Failure count in བཙན་བྱོལ། in first 50 batch

In [63]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"]:
        if all_article[each_article_key]["Response"] != 200:
            print(each_article_key)
            Failure_count += 1
    else: 
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the བཙན་བྱོལ། article: {Failure_count}")

total Failure in the བཙན་བྱོལ། article: 0


### Saving First 50 batch total 50 process 

In [64]:
with open("ALL_content_བཙན་བྱོལ།.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running next bacth of བཙན་བྱོལ 
#### B. Next 50 batches total of 100 process

In [65]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start
    
for i in tqdm(range(start, n)):
    page_key = "Page བཙན་བྱོལ " + str(i)
    all_link_page = Loaded_All_VOT_CAT3_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 50/50 [16:10<00:00, 19.40s/it]


In [66]:
print(f"Total Article extarcted from བཙན་བྱོལ།: {len(all_article)}")

Total Article extarcted from བཙན་བྱོལ།: 891


#### Checking Failure count in བཙན་བྱོལ། in next 50 batch

In [67]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the བཙན་བྱོལ། article: {Failure_count}")

total Failure in the བཙན་བྱོལ། article: 0


### Saving next 50 process total 100 process 

In [72]:
with open("ALL_content_བཙན་བྱོལ།.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running Final bacth of བཙན་བྱོལ 
#### C. Running the last 18 remaning process total of 118 process

In [73]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start
for i in tqdm(range(start, n)):
    page_key = "Page བཙན་བྱོལ " + str(i)
    all_link_page = Loaded_All_VOT_CAT3_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

0it [00:00, ?it/s]


In [74]:
print(f"Total Article extarcted from བཙན་བྱོལ།: {len(all_article)}")

Total Article extarcted from བཙན་བྱོལ།: 1053


#### Checking Failure count in བཙན་བྱོལ། in next 50 batch

In [75]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the བཙན་བྱོལ། article: {Failure_count}")

total Failure in the བཙན་བྱོལ། article: 0


### Saving next 18 process total 118 process 

In [76]:
with open("ALL_content_བཙན་བྱོལ།.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

------------
------------
------------
------------
## Extracting all the article content from VOT >> རྒྱ་ནག page


In [77]:
with open('ALL_link_རྒྱནག.json', 'r') as openfile:
    # Reading from json file
    Loaded_All_VOT_CAT4_url_link = json.load(openfile)

In [78]:
Total_lenght = len(Loaded_All_VOT_CAT4_url_link)
print(f"Total count in the differnt article in རྒྱ་ནག: {Total_lenght}")

Total count in the differnt article in རྒྱ་ནག: 123


### running batch of 50 pages to prevent runtime error 
#### A. First 50 batches


In [79]:
# running batch of 50 pages 
start = 1
if Total_lenght - start > 50:
    n = 50
else:
    n = n + Total_lenght - start
all_article = {}
for i in tqdm(range(start, n)):
    page_key = "Page རྒྱནག " + str(i)
    all_link_page = Loaded_All_VOT_CAT4_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 49/49 [18:27<00:00, 22.60s/it]


In [80]:
print(f"Total Article extarcted from རྒྱ་ནག: {len(all_article)}")

Total Article extarcted from རྒྱ་ནག: 441


#### Checking Failure count in རྒྱ་ནག in first 50 batch

In [81]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the རྒྱ་ནག article: {Failure_count}")

total Failure in the རྒྱ་ནག article: 0


### Saving First 50 batch total 50 process 

In [82]:
with open("ALL_content_རྒྱནག.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running next bacth of རྒྱནག 
#### B. Next 50 batches total of 100 process

In [83]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start

for i in tqdm(range(start, n)):
    page_key = "Page རྒྱནག " + str(i)
    all_link_page = Loaded_All_VOT_CAT4_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 50/50 [20:01<00:00, 24.04s/it]


In [84]:
print(f"Total Article extarcted from རྒྱནག: {len(all_article)}")

Total Article extarcted from རྒྱནག: 891


#### Checking Failure count in རྒྱནག in next 50 batch

In [85]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the རྒྱནག article: {Failure_count}")

total Failure in the རྒྱནག article: 0


### Saving next 50 process total 100 process 

In [86]:
with open("ALL_content_རྒྱནག.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running Final bacth of རྒྱནག
#### C. Running the last 23 remaning process total of 123 process

In [96]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start

for i in tqdm(range(start, n)):
    page_key = "Page རྒྱནག " + str(i)
    all_link_page = Loaded_All_VOT_CAT4_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 23/23 [07:59<00:00, 20.87s/it]


In [97]:
print(f"Total Article extarcted from རྒྱནག: {len(all_article)}")

Total Article extarcted from རྒྱནག: 1098


#### Checking Failure count in རྒྱནག in next 50 batch

In [98]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the རྒྱནག article: {Failure_count}")

total Failure in the རྒྱནག article: 0


### Saving next 18 process total 118 process 

In [99]:
with open("ALL_content_རྒྱནག.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

------------
------------
------------
------------
## Extracting all the article content from VOT >> རྒྱལསྤྱི page


In [143]:
with open('ALL_link_རྒྱལསྤྱི.json', 'r') as openfile:
    # Reading from json file
    Loaded_All_VOT_CAT5_url_link = json.load(openfile)

In [144]:
Total_lenght = len(Loaded_All_VOT_CAT5_url_link)
print(f"Total count in the differnt article in རྒྱལསྤྱི: {Total_lenght}")

Total count in the differnt article in རྒྱལསྤྱི: 218


### running batch of 50 pages to prevent runtime error 
#### A. First 50 batches


In [145]:
# running batch of 50 pages 
start = 1
if Total_lenght - start > 50:
    n = 50
else:
    n = n + Total_lenght - start
all_article = {}
for i in tqdm(range(start, n)):
    page_key = "Page རྒྱལསྤྱི " + str(i)
    all_link_page = Loaded_All_VOT_CAT5_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 49/49 [21:19<00:00, 26.10s/it]


In [146]:
print(f"Total Article extarcted from རྒྱལསྤྱི: {len(all_article)}")

Total Article extarcted from རྒྱལསྤྱི: 441


#### Checking Failure count in རྒྱལསྤྱི in first 50 batch

In [148]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the རྒྱལསྤྱི article: {Failure_count}")

total Failure in the རྒྱལསྤྱི article: 0


### Saving First 50 batch total 50 process 

In [149]:
with open("ALL_content_རྒྱལསྤྱི.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running next bacth of རྒྱལསྤྱི 
#### B. Next 50 batches total of 100 process

In [150]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start

for i in tqdm(range(start, n)):
    page_key = "Page རྒྱལསྤྱི " + str(i)
    all_link_page = Loaded_All_VOT_CAT5_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 50/50 [19:26<00:00, 23.34s/it]


In [151]:
print(f"Total Article extarcted from རྒྱལསྤྱི: {len(all_article)}")

Total Article extarcted from རྒྱལསྤྱི: 891


#### Checking Failure count in རྒྱལསྤྱི in next 50 batch

In [152]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the རྒྱལསྤྱི article: {Failure_count}")

total Failure in the རྒྱལསྤྱི article: 0


### Saving next 50 process total 100 process 

In [153]:
with open("ALL_content_རྒྱལསྤྱི.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

In [154]:
len(all_article)

891

#### Running Final bacth of རྒྱལསྤྱི
#### C. Running the next 50 remaning process total of 150 process

In [155]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start

for i in tqdm(range(start, n)):
    page_key = "Page རྒྱལསྤྱི " + str(i)
    all_link_page = Loaded_All_VOT_CAT5_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 50/50 [20:56<00:00, 25.12s/it]


In [156]:
print(f"Total Article extarcted from རྒྱལསྤྱི: {len(all_article)}")

Total Article extarcted from རྒྱལསྤྱི: 1341


#### Checking Failure count in རྒྱནག in next 50 batch

In [157]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the རྒྱལསྤྱི article: {Failure_count}")

total Failure in the རྒྱལསྤྱི article: 0


### Saving next 50 process total 150 process 

In [158]:
with open("ALL_content_རྒྱལསྤྱི.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running next bacth of རྒྱལསྤྱི
#### D. Running the next 50 remaning process total of 200 process

In [159]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start

for i in tqdm(range(start, n)):
    page_key = "Page རྒྱལསྤྱི " + str(i)
    all_link_page = Loaded_All_VOT_CAT5_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 50/50 [18:56<00:00, 22.73s/it]


In [160]:
print(f"Total Article extarcted from རྒྱལསྤྱི: {len(all_article)}")

Total Article extarcted from རྒྱལསྤྱི: 1791


#### Checking Failure count in རྒྱལསྤྱི in next 50 batch

In [161]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the རྒྱལསྤྱི article: {Failure_count}")

total Failure in the རྒྱལསྤྱི article: 0


### Saving next 50 process total 200 process 

In [162]:
with open("ALL_content_རྒྱལསྤྱི.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)

#### Running Final bacth of རྒྱལསྤྱི
#### E. Running the next 18 remaning process total of 218 process

In [163]:
# running batch of 50 pages 
start = n
if Total_lenght - start > 50:
    n = n + 50
else:
    n = n + Total_lenght - start

for i in tqdm(range(start, n)):
    page_key = "Page རྒྱལསྤྱི " + str(i)
    all_link_page = Loaded_All_VOT_CAT5_url_link[page_key]["Links"]
    page_article_index = 1
    for each_URL in all_link_page:
        article_content = scrape_vot_article(each_URL)
        article_key = page_key + "_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 18/18 [06:56<00:00, 23.13s/it]


In [164]:
print(f"Total Article extarcted from རྒྱལསྤྱི: {len(all_article)}")

Total Article extarcted from རྒྱལསྤྱི: 1953


#### Checking Failure count in རྒྱནག in next 50 batch

In [165]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the རྒྱལསྤྱི article: {Failure_count}")

total Failure in the རྒྱལསྤྱི article: 0


### Saving next 18 process total 218 process 

In [166]:
with open("ALL_content_རྒྱལསྤྱི.json", "w") as outfile:
    json.dump(all_article , outfile, indent=5)