In [None]:
import os
import json
from itertools import chain
import time
from hanlp_restful import HanLPClient
# Shared client for the HanLP RESTful API, configured for Chinese ('zh').
# NOTE(review): `auth` is an empty string here — presumably the API key was
# stripped before sharing; confirm a valid key is supplied, since the
# processing loop below is written against a 60 requests/minute quota.
HanLP = HanLPClient('https://www.hanlp.com/api', auth="", language='zh')
# Root directory that is walked for .txt files; the name of the folder that
# directly contains each file is recorded as that file's group label.
parent_folder = 'Lin'

# === Helpers: split text into fixed-size blocks and flatten nested lists ===
def split_into_blocks(text, segment_size=150):
    """Cut ``text`` into consecutive slices of at most ``segment_size`` items.

    Slicing is purely positional, so a slice boundary can fall anywhere in
    the text. Every slice has exactly ``segment_size`` items except possibly
    the last one, which holds the remainder.

    Args:
        text: Any sliceable sequence (a ``str`` in this script).
        segment_size: Length of each slice; used as the ``range`` step.

    Returns:
        A list of slices in their original order; empty when ``text`` is
        empty.
    """
    blocks = []
    for start in range(0, len(text), segment_size):
        stop = start + segment_size
        blocks.append(text[start:stop])
    return blocks

def join(list_of_lists):
    """Flatten exactly one level of nesting into a new list.

    Args:
        list_of_lists: An iterable whose elements are themselves iterable.

    Returns:
        A list containing the items of every inner iterable, in order.
    """
    return [item for inner in list_of_lists for item in inner]

# === Data storage ===
# One record per processed file. Keyed by the file's full path rather than
# its bare name, so two same-named files in different subfolders do not
# overwrite each other. Only the values are written to the output JSON.
results_dict = {}
# Monotonic-clock times of the most recent HanLP requests (at most 60 kept),
# used to throttle calls.
query_timestamps = []

# === Traverse all .txt files under subfolders ===
for root, dirs, files in os.walk(parent_folder):
    # Sorting in place makes os.walk descend into subfolders in a stable
    # order, so the output order is reproducible across platforms.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith('.txt'):
            continue

        full_path = os.path.join(root, file)
        fname = os.path.splitext(file)[0]
        group_label = os.path.basename(root)  # subfolder name as group

        print(f"📄 Processing: {fname} (Group: {group_label})")
        with open(full_path, 'r', encoding='utf-8') as f:
            text = f.read()

        all_tokens = []
        all_pos_tags = []
        all_ner_entities = []

        # Each 14000-character big block becomes ONE API request that carries
        # its 150-character pieces as a list of inputs.
        # NOTE(review): 14000 is presumably chosen to stay under a per-request
        # size limit of the service — confirm.
        big_blocks = split_into_blocks(text, 14000)
        for big_block in big_blocks:
            small_blocks = split_into_blocks(big_block, 150)

            # Handle HanLP rate limit: 60 requests/minute
            # If the 60th-most-recent request is less than 70 s old (60 s
            # plus a safety margin), wait until it is. A monotonic clock is
            # used so system clock adjustments cannot distort the interval.
            if len(query_timestamps) >= 60:
                gap = time.monotonic() - query_timestamps[-60]
                if gap < 70:
                    wait_time = 70 - gap
                    print(f"⏳ Sleeping for {wait_time:.1f} seconds to respect rate limit")
                    time.sleep(wait_time)
            query_timestamps.append(time.monotonic())
            # Only the last 60 timestamps are ever consulted; drop the rest.
            del query_timestamps[:-60]

            # === HanLP: POS + NER ===
            data = HanLP.parse(small_blocks, tasks=['pos/ctb', 'ner/msra'])
            tokens = data['tok/fine']
            pos_tags = data['pos/ctb']
            ner_entities = data['ner/msra']

            # Tokens and POS tags are flattened across the small blocks.
            all_tokens += join(tokens)
            all_pos_tags += join(pos_tags)
            # NER output is appended as returned (not flattened), i.e. one
            # entry per small block.
            # NOTE(review): each entry is presumably a list of entity records
            # whose offsets are relative to that block's tokens — confirm
            # against the HanLP response format before flattening.
            all_ner_entities += ner_entities

        # === Save result for one file ===
        results_dict[full_path] = {
            "filename": fname,
            "group": group_label,
            "tokens": all_tokens,
            "pos": all_pos_tags,
            "ner": all_ner_entities
        }

# === Save all to JSON ===
output_file = "all_Lin_Shu_token_pos_ner_by_subfolder.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(list(results_dict.values()), f, ensure_ascii=False, indent=2)

print(f"✅ Done. Tokenized and tagged {len(results_dict)} files. Saved to: {output_file}")


📄 Processing: Huya yu xi lu (Group: Original Novels)
📄 Processing: Jie wai tanhua (Group: Original Novels)
📄 Processing: Jinghua bi xue lu (Group: Original Novels)
📄 Processing: Jinguo Yangqiu (Group: Original Novels)
📄 Processing: Jinling qiu (Group: Original Novels)
📄 Processing: Yuan hai ling guang (Group: Original Novels)
📄 Processing: Aiji jinta poushi ji (Group: Translated Novels)
📄 Processing: Aisilan qingxia zhuan (Group: Translated Novels)
📄 Processing: Chan chao ji shang (Group: Translated Novels)
📄 Processing: Chan chao ji xu (Group: Translated Novels)
📄 Processing: Feizhou yanshui chou cheng lu (Group: Translated Novels)
📄 Processing: Gu gui yi jin ji (Group: Translated Novels)
📄 Processing: Haiwai xuanqv lu (Group: Translated Novels)
📄 Processing: Haoshi shu lie (Group: Translated Novels)
📄 Processing: Lubinxun piaoliuji xuji (Group: Translated Novels)
📄 Processing: Lubinxun piaoliuji (Group: Translated Novels)
📄 Processing: Manhuang zhiyi (Group: Translated Novels)
📄 Proc