In [1]:
import json
from bs4 import Tag
from tqdm import tqdm
import sertha_utils
import os
from concurrent.futures import ThreadPoolExecutor, as_completed


In [2]:
# Site identifier embedded in every article key and in the output file names
# built by get_content().
file_name_code = "sertha"

In [3]:

class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that never rejects a value.

    BeautifulSoup ``Tag`` objects are written as their text content; every
    other object the standard encoder cannot handle is written as ``str(obj)``.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``."""
        return obj.get_text() if isinstance(obj, Tag) else str(obj)

def save_json(path, file_name, data):
    """Write ``data`` as indented UTF-8 JSON to ``path + file_name``.

    Args:
        path: Directory prefix. It is concatenated with ``file_name`` as-is,
            so it is expected to end with a path separator.
        file_name: Name of the file to create (also used in log messages).
        data: Object to serialise; values the standard encoder rejects are
            converted by ``CustomJSONEncoder``.

    Returns:
        None. Success or failure is reported on stdout; errors are not
        re-raised so that one failed write does not abort a batch run.
    """
    try:
        target = path + file_name
        # Create the destination directory on demand. Without this, a missing
        # output folder makes open() fail and the scraped data is lost after
        # all the network work has already been done.
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(target, "w", encoding='utf-8') as outfile:
            json.dump(data, outfile, indent=4, ensure_ascii=False, cls=CustomJSONEncoder)
        print(f"Successfully saved: {file_name}")
    except Exception as e:
        print(f"Error saving {file_name}: {str(e)}")

# def read_json(path, file_name):
#     try:
#         with open(path+file_name, 'r', encoding='utf-8') as openfile:
#             Loaded_file = json.load(openfile)
#             print(f"Successfully loaded: {file_name}")
#         return Loaded_file
#     except Exception as e:
#         print(f"Error loading {file_name}: {str(e)}")
#         return None

def scrape_article(url, page_key_code):
    """Scrape a single article and return its content, or ``None`` on error.

    Args:
        url: Address of the article to fetch.
        page_key_code: Forwarded to ``sertha_utils`` as the ``tags`` argument.

    Returns:
        Whatever ``sertha_utils.scrape_sertha_article_content`` returns, or
        ``None`` when that call raises (the error is printed, not re-raised).
    """
    try:
        content = sertha_utils.scrape_sertha_article_content(url, tags=page_key_code)
    except Exception as exc:
        print(f"Error scraping {url}: {str(exc)}")
        content = None
    return content

def _is_failed_article(article):
    """Return True when ``article["Response"]`` is not 200 or cannot be read."""
    try:
        return article["Response"] != 200
    except (KeyError, TypeError, IndexError):
        # A malformed record counts as a failure instead of aborting the
        # whole category before anything has been saved.
        return True


def get_content(All_links_data, Total_lenght, page_key_code, page_key_list,
                max_workers=10, output_dir="./data/parallel_content/"):
    """Scrape every article linked from one category and save them as JSON.

    Args:
        All_links_data: Mapping from page key (``page_key_code`` followed by
            the page number) to a dict whose ``"Links"`` entry is an iterable
            of article URLs.
        Total_lenght: Number of pages; pages ``1..Total_lenght`` are read.
        page_key_code: Page-key prefix; also forwarded to ``scrape_article``.
        page_key_list: Parts of a page key. Element ``1`` is used as the
            category name in messages and in the output file name.
        max_workers: Size of the scraping thread pool.
        output_dir: Directory prefix handed to ``save_json``.

    Raises:
        IndexError: if ``page_key_list`` has fewer than two elements.
        KeyError: if an expected page key or its ``"Links"`` entry is missing.

    Articles are keyed ``"<page_key>_<file_name_code>_Article_<n>"`` where
    ``n`` counts the successfully scraped articles of that page in link order,
    so the keys are the same on every run regardless of which download
    finishes first.
    """
    # Resolve the category up front so a malformed key fails before any
    # network work is started.
    category = page_key_list[1]

    future_to_url = {}
    results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for i in range(1, Total_lenght + 1):
            page_key = page_key_code + str(i)
            for url in All_links_data[page_key]["Links"]:
                future = executor.submit(scrape_article, url, page_key_code)
                future_to_url[future] = (page_key, url)

        for future in tqdm(as_completed(future_to_url), total=len(future_to_url), desc=category):
            page_key, url = future_to_url[future]
            try:
                results[future] = future.result()
            except Exception as e:
                print(f"Error processing {url}: {str(e)}")

    # Assemble in submission order with an exact per-page counter. A prefix
    # test such as startswith("Page X 1") would also match "Page X 10..." keys
    # and skew the numbering.
    all_article = {}
    articles_per_page = {}
    for future, (page_key, url) in future_to_url.items():
        article_content = results.get(future)
        if not article_content:
            continue  # failed or empty scrape: nothing to store
        article_number = articles_per_page.get(page_key, 0) + 1
        articles_per_page[page_key] = article_number
        all_article[f"{page_key}_{file_name_code}_Article_{article_number}"] = article_content

    # "Response" is presumably the HTTP status recorded by sertha_utils; verify there.
    Failure_count = sum(1 for article in all_article.values() if _is_failed_article(article))
    print(f"Total Failure in the {category} article: {Failure_count}")

    save_file_name = f"{file_name_code}_ALL_content_{category}.json"
    print(save_file_name)
    save_json(output_dir, save_file_name, all_article)

def process_json_file(file_path):
    """Load one link file and scrape all the articles it lists.

    The file is expected to hold a JSON object whose keys look like
    ``"Page <category> <number>"``. The category is taken from the last key
    and may itself contain spaces.

    Args:
        file_path: Path of the JSON link file.

    Returns:
        None. Problems (unreadable file, invalid JSON, unexpected key format,
        errors raised while scraping) are reported on stdout and not re-raised,
        so one bad file does not stop the others.
    """
    base_name = os.path.basename(file_path)
    try:
        # Only hold the file open while parsing it, not for the whole scrape.
        with open(file_path, 'r', encoding='utf-8') as file:
            All_links_data = json.load(file)

        Total_lenght = len(All_links_data)
        print(f"Total page in {base_name}: {Total_lenght}")
        if Total_lenght == 0:
            print(f"Error processing file {base_name}: no pages found")
            return

        last_key = list(All_links_data.keys())[-1]
        print(f"page key name: {last_key}")

        # Split off the leading word and the trailing page number; whatever
        # lies between is the category, which may contain spaces.
        prefix, _, remainder = last_key.partition(" ")
        category, _, page_number = remainder.rpartition(" ")
        if not category:
            print(f"Error processing file {base_name}: unexpected page key format {last_key!r}")
            return
        page_key_list = [prefix, category, page_number]

        page_key_code = "Page " + category + " "
        print(f"Page key code: {page_key_code}")
        get_content(All_links_data, Total_lenght, page_key_code, page_key_list)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in file {base_name}: {str(e)}")
    except Exception as e:
        print(f"Error processing file {base_name}: {str(e)}")

def get_json_files(directory, max_workers=10):
    """Process every ``.json`` link file found directly in ``directory``.

    Args:
        directory: Folder containing the link files.
        max_workers: Number of files processed concurrently. Each file in
            turn starts its own scraping thread pool inside ``get_content``,
            so the total number of live threads is a multiple of this value.

    Raises:
        OSError: if ``directory`` cannot be listed.
    """
    # Sorted so files are submitted in the same order on every run
    # (os.listdir returns entries in arbitrary order).
    json_files = sorted(
        os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.json')
    )

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # list() drains the lazy map so the progress bar advances and every
        # file has been handled before the pool is closed.
        list(tqdm(executor.map(process_json_file, json_files), total=len(json_files)))

    print(f"Processed {len(json_files)} files")


In [4]:
%%time
# Scrape every category whose link file (*.json) sits in ./data/links/;
# get_content() writes one JSON file of results per category.
directory_path = './data/links/'
get_json_files(directory_path)

Total page in sertha_ALL_link_༧གོང་ས་མཆོག.json: 3
page key name: Page ༧གོང་ས་མཆོག 3
Page key code: Page ༧གོང་ས་མཆོག 
Total page in sertha_ALL_link_རྩོམ་རིག.json: 5
page key name: Page རྩོམ་རིག 5
Page key code: Page རྩོམ་རིག 
Total page in sertha_ALL_link_ལོ་རྒྱུས།.json: 2
page key name: Page ལོ་རྒྱུས། 2
Page key code: Page ལོ་རྒྱུས། 
Total page in sertha_ALL_link_སློབ་གསོ།.json: 1
page key name: Page སློབ་གསོ། 1
Page key code: Page སློབ་གསོ། 
Total page in sertha_ALL_link_གསར་རྩོམ།.json: 1
page key name: Page གསར་རྩོམ། 1
Page key code: Page གསར་རྩོམ། 
Total page in sertha_ALL_link_གསར་འགྱུར།.json: 2
page key name: Page གསར་འགྱུར། 2
Page key code: Page གསར་འགྱུར། 
Total page in sertha_ALL_link_མི་སྣ་ངོ་སྤྲོད།.json: 1
page key name: Page མི་སྣ་ངོ་སྤྲོད། 1
Page key code: Page མི་སྣ་ངོ་སྤྲོད། 
Total page in sertha_ALL_link_སྲོལ་རྒྱུན།.json: 1
page key name: Page སྲོལ་རྒྱུན། 1
Page key code: Page སྲོལ་རྒྱུན། 


  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s][A[A


  0%|          | 0/7 [00:00<?, ?it/s][A[A[A
  0%|          | 0/8 [00:00<?, ?it/s][A



  0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A




  0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A[A





  0%|          | 0/29 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|          | 0/18 [00:00<?, ?it/s][A[A[A[A[A[A[A







  0%|          | 0/60 [00:00<?, ?it/s][A[A[A[A[A[A[A[A





  3%|▎         | 1/29 [00:06<03:14,  6.94s/it][A[A[A[A[A[A



  5%|▌         | 1/20 [00:07<02:19,  7.32s/it][A[A[A[A







  2%|▏         | 1/60 [00:07<07:12,  7.34s/it][A[A[A[A[A[A[A[A



 10%|█         | 2/20 [00:07<00:55,  3.10s/it][A[A[A[A


 14%|█▍        | 1/7 [00:08<00:48,  8.03s/it][A[A[A







  5%|▌         | 3/60 [00:08<02:03,  2.16s/it][A[A[A[A[A[A[A[A


 29%|██▊       | 2/7 [00:08<00:16,  3.40s/it][A[A[A





  7%|▋         | 2/29 [00:08<01:38,  3.65s/it][A

Total Failure in the སྲོལ་རྒྱུན། article: 0
sertha_ALL_content_སྲོལ་རྒྱུན།.json
Successfully saved: sertha_ALL_content_སྲོལ་རྒྱུན།.json









 67%|██████▋   | 6/9 [00:12<00:03,  1.02s/it]][A[A[A[A[A[A[A



 45%|████▌     | 9/20 [00:12<00:11,  1.07s/it][A[A[A[A






 39%|███▉      | 7/18 [00:13<00:09,  1.11it/s][A[A[A[A[A[A[A





 24%|██▍       | 7/29 [00:13<00:24,  1.13s/it][A[A[A[A[A[A







 10%|█         | 6/60 [00:13<01:36,  1.78s/it][A[A[A[A[A[A[A[A


 78%|███████▊  | 7/9 [00:13<00:01,  1.11it/s][A[A[A




 62%|██████▎   | 5/8 [00:13<00:05,  1.98s/it][A[A[A[A[A



 50%|█████     | 10/20 [00:13<00:09,  1.02it/s][A[A[A[A






 44%|████▍     | 8/18 [00:13<00:08,  1.21it/s][A[A[A[A[A[A[A





 28%|██▊       | 8/29 [00:13<00:20,  1.03it/s][A[A[A[A[A[A







 89%|████████▉ | 8/9 [00:13<00:00,  1.36it/s]][A[A[A[A[A[A[A[A




100%|██████████| 9/9 [00:13<00:00,  1.49s/it][A[A[A[A[A


Total Failure in the སློབ་གསོ། article: 0
sertha_ALL_content_སློབ་གསོ།.json
Successfully saved: sertha_ALL_content_སློབ་གསོ།.json





100%|██████████| 7/7 [00:14<00:00,  2.00s/it][A[A[A








 15%|█▌        | 9/60 [00:14<00:46,  1.10it/s][A[A[A[A[A[A[A[A






Total Failure in the གསར་རྩོམ། article: 0
sertha_ALL_content_གསར་རྩོམ།.json


100%|██████████| 8/8 [00:14<00:00,  1.03it/s][A[A[A[A[A





 31%|███       | 9/29 [00:14<00:19,  1.01it/s][A[A[A[A[A[A







 17%|█▋        | 10/60 [00:14<00:36,  1.35it/s][A[A[A[A[A[A[A[A






 56%|█████▌    | 10/18 [00:14<00:05,  1.39it/s][A[A[A[A[A[A[A



 55%|█████▌    | 11/20 [00:14<00:09,  1.06s/it][A[A[A[A





100%|██████████| 8/8 [00:14<00:00,  1.81s/it]s][A[A[A[A[A[A


Successfully saved: sertha_ALL_content_གསར་རྩོམ།.json
Total Failure in the མི་སྣ་ངོ་སྤྲོད། article: 0
sertha_ALL_content_མི་སྣ་ངོ་སྤྲོད།.json
Successfully saved: sertha_ALL_content_མི་སྣ་ངོ་སྤྲོད།.json










 18%|█▊        | 11/60 [00:14<00:34,  1.43it/s][A[A[A[A[A[A[A[A







 20%|██        | 12/60 [00:15<00:26,  1.84it/s][A[A[A[A[A[A[A[A





 38%|███▊      | 11/29 [00:15<00:14,  1.28it/s][A[A[A[A[A[A



 60%|██████    | 12/20 [00:15<00:08,  1.03s/it][A[A[A[A



 65%|██████▌   | 13/20 [00:15<00:05,  1.32it/s][A[A[A[A



 70%|███████   | 14/20 [00:16<00:04,  1.46it/s][A[A[A[A





 41%|████▏     | 12/29 [00:16<00:14,  1.14it/s][A[A[A[A[A[A






 61%|██████    | 11/18 [00:16<00:07,  1.08s/it][A[A[A[A[A[A[A



 80%|████████  | 16/20 [00:16<00:02,  1.85it/s][A[A[A[A



 85%|████████▌ | 17/20 [00:17<00:01,  2.03it/s][A[A[A[A





 45%|████▍     | 13/29 [00:17<00:13,  1.17it/s][A[A[A[A[A[A






 67%|██████▋   | 12/18 [00:17<00:06,  1.03s/it][A[A[A[A[A[A[A







 23%|██▎       | 14/60 [00:17<00:40,  1.13it/s][A[A[A[A[A[A[A[A






 72%|███████▏  | 13/18 [00:17<00:03,  1.26it/s][A[A[A[A[A[A[A



 90%|█

Total Failure in the ལོ་རྒྱུས། article: 0
sertha_ALL_content_ལོ་རྒྱུས།.json
Successfully saved: sertha_ALL_content_ལོ་རྒྱུས།.json








 62%|██████▏   | 18/29 [00:22<00:11,  1.04s/it][A[A[A[A[A[A





 69%|██████▉   | 20/29 [00:22<00:05,  1.62it/s][A[A[A[A[A[A







 33%|███▎      | 20/60 [00:22<00:34,  1.16it/s][A[A[A[A[A[A[A[A



100%|██████████| 20/20 [00:22<00:00,  1.46s/it][A[A[A[A





100%|██████████| 20/20 [00:22<00:00,  1.14s/it][A[A[A[A[A[A








 40%|████      | 24/60 [00:22<00:13,  2.74it/s][A[A[A[A[A[A[A[A

Total Failure in the གསར་འགྱུར། article: 0
sertha_ALL_content_གསར་འགྱུར།.json
Successfully saved: sertha_ALL_content_གསར་འགྱུར།.json








 83%|████████▎ | 24/29 [00:23<00:01,  3.23it/s][A[A[A[A[A[A







 42%|████▏     | 25/60 [00:23<00:11,  3.15it/s][A[A[A[A[A[A[A[A





 90%|████████▉ | 26/29 [00:27<00:02,  1.06it/s][A[A[A[A[A[A







 43%|████▎     | 26/60 [00:27<00:41,  1.21s/it][A[A[A[A[A[A[A[A







 45%|████▌     | 27/60 [00:28<00:34,  1.05s/it][A[A[A[A[A[A[A[A





 93%|█████████▎| 27/29 [00:29<00:02,  1.10s/it][A[A[A[A[A[A







 47%|████▋     | 28/60 [00:29<00:35,  1.12s/it][A[A[A[A[A[A[A[A







 48%|████▊     | 29/60 [00:29<00:27,  1.14it/s][A[A[A[A[A[A[A[A







 52%|█████▏    | 31/60 [00:29<00:16,  1.77it/s][A[A[A[A[A[A[A[A







 53%|█████▎    | 32/60 [00:30<00:16,  1.69it/s][A[A[A[A[A[A[A[A





100%|██████████| 29/29 [00:30<00:00,  1.06s/it][A[A[A[A[A[A








 55%|█████▌    | 33/60 [00:30<00:13,  1.98it/s][A[A[A[A[A[A[A[A
 12%|█▎        | 1/8 [00:30<03:36, 30.94s/it][A

Total Failure in the ༧གོང་ས་མཆོག article: 0
sertha_ALL_content_༧གོང་ས་མཆོག.json
Successfully saved: sertha_ALL_content_༧གོང་ས་མཆོག.json










 57%|█████▋    | 34/60 [00:31<00:12,  2.07it/s][A[A[A[A[A[A[A[A







 58%|█████▊    | 35/60 [00:31<00:10,  2.38it/s][A[A[A[A[A[A[A[A







 60%|██████    | 36/60 [00:32<00:13,  1.82it/s][A[A[A[A[A[A[A[A







 63%|██████▎   | 38/60 [00:32<00:07,  2.92it/s][A[A[A[A[A[A[A[A







 67%|██████▋   | 40/60 [00:32<00:04,  4.19it/s][A[A[A[A[A[A[A[A







 68%|██████▊   | 41/60 [00:38<00:26,  1.37s/it][A[A[A[A[A[A[A[A







 73%|███████▎  | 44/60 [00:38<00:13,  1.20it/s][A[A[A[A[A[A[A[A







 75%|███████▌  | 45/60 [00:39<00:12,  1.25it/s][A[A[A[A[A[A[A[A







 78%|███████▊  | 47/60 [00:39<00:07,  1.78it/s][A[A[A[A[A[A[A[A







 80%|████████  | 48/60 [00:40<00:05,  2.03it/s][A[A[A[A[A[A[A[A







 82%|████████▏ | 49/60 [00:40<00:05,  1.95it/s][A[A[A[A[A[A[A[A







 83%|████████▎ | 50/60 [00:41<00:05,  1.99it/s][A[A[A[A[A[A[A[A







 85%|████████▌ | 51/60 [00:43<00:08,  1.07it

Total Failure in the རྩོམ་རིག article: 0
sertha_ALL_content_རྩོམ་རིག.json
Successfully saved: sertha_ALL_content_རྩོམ་རིག.json
Processed 8 files
CPU times: user 24.6 s, sys: 657 ms, total: 25.2 s
Wall time: 49.7 s





In [None]:
# 14:44

In [None]:
# List the link files found in directory_path. The previous version referenced
# `directory`, which only exists as a parameter inside get_json_files() and
# raises NameError at notebook level.
json_files = sorted(
    os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith('.json')
)
json_files