In [1]:
import json
from bs4 import Tag
from tqdm import tqdm
import kangbatv_utils
import os
from concurrent.futures import ThreadPoolExecutor, as_completed


In [2]:
file_name_code= "kangbatv"

In [10]:

class CustomJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, Tag):
            return obj.get_text()
        return str(obj)

def save_json(path, file_name, data):
    try: 
        with open(path + file_name, "w", encoding='utf-8') as outfile:
            json.dump(data, outfile, indent=4, ensure_ascii=False, cls=CustomJSONEncoder)
        print(f"Successfully saved: {file_name}")
    except Exception as e:
        print(f"Error saving {file_name}: {str(e)}")

def read_json(path, file_name):
    try:
        with open(path+file_name, 'r', encoding='utf-8') as openfile:
            Loaded_file = json.load(openfile)
            print(f"Successfully loaded: {file_name}")
        return Loaded_file
    except Exception as e:
        print(f"Error loading {file_name}: {str(e)}")
        return None

def scrape_article(url, page_key_code):
    try:
        article_content = kangbatv_utils.scrape_kangbatv_article_content(url, tags=page_key_code)
        return article_content
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return None

def get_content(All_links_data, Total_lenght, page_key_code, page_key_list):
    all_article = {}
    
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {}
        start = 1
        for i in range(start, Total_lenght+1):
            page_key = page_key_code + str(i)
            all_link_page = All_links_data[page_key]["Links"]
            
            for url in all_link_page:
                future = executor.submit(scrape_article, url, page_key_code)
                future_to_url[future] = (page_key, url)
        
        for future in tqdm(as_completed(future_to_url), total=len(future_to_url)):
            page_key, url = future_to_url[future]
            try:
                article_content = future.result()
                if article_content:
                    article_key = f"{page_key}_scrape_{file_name_code}_Article_{len([k for k in all_article if k.startswith(page_key)]) + 1}"
                    all_article[article_key] = article_content
            except Exception as e:
                print(f"Error processing {url}: {str(e)}")
    
    Failure_count = sum(1 for article in all_article.values() if article["Response"] != 200)
    print(f"Total Failure in the {page_key_list[1]} article: {Failure_count}")
    
    save_file_name = f"scrape_{file_name_code}_ALL_content_{page_key_list[1]}.json"
    print(save_file_name)
    path = "./data/parallel_content/"
    save_json(path, save_file_name, all_article)

def process_json_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            All_links_data = json.load(file)
            Total_lenght = len(All_links_data)
            print(f"Total page in {os.path.basename(file_path)}: {Total_lenght}")
        
            print(f"page key name: {list(All_links_data.keys())[-1]}")
            page_key_list = list(All_links_data.keys())[-1].split(" ")
            
            page_key_code = "Page "+page_key_list[1]+" "
            print(f"Page key code: {page_key_code}")
            get_content(All_links_data, Total_lenght, page_key_code, page_key_list)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in file {os.path.basename(file_path)}: {str(e)}")
    except Exception as e:
        print(f"Error processing file {os.path.basename(file_path)}: {str(e)}")

def get_json_files(directory):
    json_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.json')]
    print(f"Total Files {len(json_files)}")
    with ThreadPoolExecutor(max_workers=10) as executor:
        list(tqdm(executor.map(process_json_file, json_files), total=len(json_files)))
    
    print(f"Processed {len(json_files)} files")


In [6]:
%%time
# Example usage
directory_path = './data/links/'
# directory_path = './data/test_link/'
get_json_files(directory_path)

Total Files 83
Total page in kangbatv_ALL_link_ཀྲུང་དབྱང་གསར་འགྱུར་མཉམ་བསྒྲགས།.json: 20
page key name: Page ཀྲུང་དབྱང་གསར་འགྱུར་མཉམ་བསྒྲགས། 20
Page key code: Page ཀྲུང་དབྱང་གསར་འགྱུར་མཉམ་བསྒྲགས། 
Total page in kangbatv_ALL_link_ཁམས་པའི་བརྙན་འཕྲིན་གསར་འགྱུར།.json: 20
page key name: Page ཁམས་པའི་བརྙན་འཕྲིན་གསར་འགྱུར། 20
Page key code: Page ཁམས་པའི་བརྙན་འཕྲིན་གསར་འགྱུར། 
Total page in kangbatv_ALL_link_རྒྱལ་ནང་གསར་འགྱུར།.json: 20
page key name: Page རྒྱལ་ནང་གསར་འགྱུར། 20
Page key code: Page རྒྱལ་ནང་གསར་འགྱུར། 
Total page in kangbatv_ALL_link_རྒྱལ་སྤྱིའི་གསར་འགྱུར།.json: 20
page key name: Page རྒྱལ་སྤྱིའི་གསར་འགྱུར། 20
Page key code: Page རྒྱལ་སྤྱིའི་གསར་འགྱུར། 
Total page in kangbatv_ALL_link_བོད་ཁུལ་གསར་འགྱུར།.json: 20
page key name: Page བོད་ཁུལ་གསར་འགྱུར། 20
Page key code: Page བོད་ཁུལ་གསར་འགྱུར། 
Total page in kangbatv_ALL_link_གློག་བརྙན།.json: 6
page key name: Page གློག་བརྙན། 6
Page key code: Page གློག་བརྙན། 
Total page in kangbatv_ALL_link_ཐད་གཏོང་།.json: 1
page key name: Page ཐད་གཏ

0it [00:00, ?it/s]


Total Failure in the ཐད་གཏོང་། article: 0
scrape_kangbatv_ALL_content_ཐད་གཏོང་།.json
Successfully saved: scrape_kangbatv_ALL_content_ཐད་གཏོང་།.json
Total page in kangbatv_ALL_link_འཆད་ཁྲིད་སྒྲོན་མེ།.json: 11
page key name: Page འཆད་ཁྲིད་སྒྲོན་མེ། 11
Page key code: Page འཆད་ཁྲིད་སྒྲོན་མེ། 
Total page in kangbatv_ALL_link_བྱམས་པའི་ཁ་བརྡ།.json: 10
page key name: Page བྱམས་པའི་ཁ་བརྡ། 10
Page key code: Page བྱམས་པའི་ཁ་བརྡ། 
Total page in kangbatv_ALL_link_ཁམས་པའི་དགའ་ཚལ།.json: 20
page key name: Page ཁམས་པའི་དགའ་ཚལ། 20
Page key code: Page ཁམས་པའི་དགའ་ཚལ། 
Total page in kangbatv_ALL_link_དྲི་ཟའི་གླུ་དབྱངས།.json: 10
page key name: Page དྲི་ཟའི་གླུ་དབྱངས། 10
Page key code: Page དྲི་ཟའི་གླུ་དབྱངས། 


  0%|          | 0/83 [00:00<?, ?it/s]
  0%|          | 0/217 [00:00<?, ?it/s][A

  0%|          | 0/378 [00:00<?, ?it/s][A[A


  0%|          | 0/378 [00:00<?, ?it/s][A[A[A



  0%|          | 0/760 [00:00<?, ?it/s][A[A[A[A




  0%|          | 0/413 [00:00<?, ?it/s][A[A[A[A[A





  0%|          | 0/500 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|          | 0/500 [00:00<?, ?it/s][A[A[A[A[A[A[A







  0%|          | 0/800 [00:00<?, ?it/s][A[A[A[A[A[A[A[A









  0%|          | 0/760 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A








  0%|          | 0/500 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A
  0%|          | 1/217 [00:06<23:29,  6.53s/it][A
  1%|          | 2/217 [00:06<09:51,  2.75s/it][A
  1%|▏         | 3/217 [00:07<05:58,  1.68s/it][A




  0%|          | 1/413 [00:06<48:03,  7.00s/it][A[A[A[A[A






  0%|          | 1/500 [00:07<58:19,  7.01s/it][A[A[A[A[A[A[A



  0%|          | 1/760 [00:07<1:29:16,  7.06s/it][A

Total Failure in the གློག་བརྙན། article: 0
scrape_kangbatv_ALL_content_གློག་བརྙན།.json
Successfully saved: scrape_kangbatv_ALL_content_གློག་བརྙན།.json
Total page in kangbatv_ALL_link_དགའ་སྤྲོའི་ཕོ་ཉ།.json: 6
page key name: Page དགའ་སྤྲོའི་ཕོ་ཉ། 6
Page key code: Page དགའ་སྤྲོའི་ཕོ་ཉ། 



  0%|          | 0/208 [00:00<?, ?it/s][A

 59%|█████▊    | 222/378 [02:53<02:04,  1.25it/s][A[A


 58%|█████▊    | 221/378 [02:53<02:49,  1.08s/it][A[A[A







 27%|██▋       | 219/800 [02:53<08:08,  1.19it/s][A[A[A[A[A[A[A[A



 29%|██▊       | 218/760 [02:53<07:26,  1.22it/s][A[A[A[A







 28%|██▊       | 220/800 [02:53<06:08,  1.57it/s][A[A[A[A[A[A[A[A




 53%|█████▎    | 217/413 [02:53<02:27,  1.33it/s][A[A[A[A[A






 42%|████▏     | 208/500 [02:54<04:53,  1.00s/it][A[A[A[A[A[A[A






 42%|████▏     | 209/500 [02:54<03:41,  1.31it/s][A[A[A[A[A[A[A









 28%|██▊       | 215/760 [02:54<05:47,  1.57it/s][A[A[A[A[A[A[A[A[A[A


 59%|█████▊    | 222/378 [02:54<02:40,  1.03s/it][A[A[A






 42%|████▏     | 210/500 [02:54<03:07,  1.54it/s][A[A[A[A[A[A[A



 29%|██▉       | 219/760 [02:54<07:01,  1.28it/s][A[A[A[A





 42%|████▏     | 209/500 [02:54<05:48,  1.20s/it][A[A[A[A[A[A


 59%|█████▉    | 223/

In [None]:
# 14:44

In [8]:
pwd()

'/workspace/web_scrab/new_news_Articles/kangbatv'

In [9]:
directory_parallel = './data/parallel_content/'
json_files = [os.path.join(directory_parallel, f) for f in os.listdir(directory_parallel) if f.endswith('.json')]
json_files

['./data/parallel_content/scrape_kangbatv_ALL_content_ཐད་གཏོང་།.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_གློག་བརྙན།.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_བྱམས་པའི་ཁ་བརྡ།.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_དྲི་ཟའི་གླུ་དབྱངས།.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_འཆད་ཁྲིད་སྒྲོན་མེ།.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_དགའ་སྤྲོའི་ཕོ་ཉ།.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_བོད་ཁུལ་གསར་འགྱུར།.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_རྒྱལ་སྤྱིའི་གསར་འགྱུར།.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_རྒྱལ་ནང་གསར་འགྱུར།.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_སྐལ་བཟང་མེ་ཏོག.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_ཁམས་པའི་བརྙན་འཕྲིན་གསར་འགྱུར།.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_ཁམས་པའི་དགའ་ཚལ།.json',
 './data/parallel_content/scrape_kangbatv_ALL_content_

## fixing the missing file

In [16]:
file_name = "kangbatv_ALL_link_多彩文明 荣耀光影：首届金熊猫奖.json"

In [17]:
All_links_data = read_json(directory_path, file_name)

Successfully loaded: kangbatv_ALL_link_多彩文明 荣耀光影：首届金熊猫奖.json


In [21]:

Total_lenght = len(All_links_data)
print(f"Total page in {file_name}: {Total_lenght}")

print(f"page key name: {list(All_links_data.keys())[-1]}")
page_key_list = list(All_links_data.keys())[-1].split(" ")

page_key_code = "Page "+"多彩文明 荣耀光影：首届金熊猫奖"+" "
print(f"Page key code: {page_key_code}")
# get_content(All_links_data, Total_lenght, page_key_code, page_key_list)

Total page in kangbatv_ALL_link_多彩文明 荣耀光影：首届金熊猫奖.json: 16
page key name: Page 多彩文明 荣耀光影：首届金熊猫奖 16
Page key code: Page 多彩文明 荣耀光影：首届金熊猫奖 


In [23]:
start = 1
all_article = {}

for i in tqdm(range(start, Total_lenght+1)):
    page_key = page_key_code + str(i)
    all_link_page = All_links_data[page_key]["Links"]
    page_article_index = 1
    
    for each_URL in all_link_page:
        article_content = kangbatv_utils.scrape_kangbatv_article_content(each_URL, tags=page_key_code)
        article_key = page_key + "_kangbatv_Article_" + str(page_article_index)
        page_article_index += 1
        all_article[article_key] = article_content

100%|██████████| 16/16 [48:15<00:00, 180.99s/it]


In [24]:
Failure_count = 0
for each_article_key in all_article:
    if all_article[each_article_key]["Response"] != 200:
        print(each_article_key)
        Failure_count += 1

print(f"total Failure in the {page_key_list[1]} article: {Failure_count}")

total Failure in the 多彩文明 article: 0


In [26]:
save_file_name = f"kangbatv_ALL_content_多彩文明 荣耀光影：首届金熊猫奖.json"
save_json(directory_parallel, save_file_name, all_article)

Successfully saved: kangbatv_ALL_content_多彩文明 荣耀光影：首届金熊猫奖.json
