In [2]:
import json
# Read the saved tagging results from disk (UTF-8 JSON) into results_dict.
with open("results_dict_ner_tag_trans.json", mode="r", encoding="utf-8") as json_file:
    results_dict = json.load(json_file)

In [4]:
from collections import Counter

# Per-file tallies of the target phrases, keyed by file name.
phrase_freq_dict = {}

# Phrases whose occurrences are counted.
phrases_we_care = {"羅馬"}

# An entry is only counted when its tag is one of these.
tags_we_care = {'LOC', 'NORP', 'GPE'}

for file_name, file_data in results_dict.items():
    # A file with no 'all_tags' key is treated as having no tagged entries.
    all_tags = file_data.get('all_tags', [])

    # Every entry is unpacked as four fields; only the first two (phrase, tag)
    # take part in the filter.
    matched_phrases = [
        phrase
        for phrase, tag, _, _ in all_tags
        if tag in tags_we_care and phrase in phrases_we_care
    ]
    phrase_counter = Counter(matched_phrases)

    # The total is the number of matched entries, i.e. the sum of all counts.
    phrase_freq_dict[file_name] = {
        'phrase_freq': dict(phrase_counter),
        'total_freq': len(matched_phrases),
    }

# Rank files from most to fewest matches; the sort is stable, so files with
# equal totals keep the order in which they were inserted above.
sorted_files = list(phrase_freq_dict.items())
sorted_files.sort(key=lambda item: item[1]['total_freq'], reverse=True)

# Report one section per file, in ranked order.
for file_name, data in sorted_files:
    per_phrase = data['phrase_freq']
    print(f"File: {file_name}")
    print("Phrase Frequencies:")
    for phrase in phrases_we_care:
        # Phrases that never matched in this file are reported as 0.
        freq = per_phrase.get(phrase, 0)
        print(f"  {phrase}: {freq}")
    print(f"Total Frequency: {data['total_freq']}")
    print("-" * 40)  # visual separator between files

File: 埃及金塔剖尸记.txt
Phrase Frequencies:
  羅馬: 120
Total Frequency: 120
----------------------------------------
File: 航海复仇记.txt
Phrase Frequencies:
  羅馬: 23
Total Frequency: 23
----------------------------------------
File: 澳洲历险记.txt
Phrase Frequencies:
  羅馬: 20
Total Frequency: 20
----------------------------------------
File: 三千年艳尸记.txt
Phrase Frequencies:
  羅馬: 14
Total Frequency: 14
----------------------------------------
File: 侠女郎.txt
Phrase Frequencies:
  羅馬: 14
Total Frequency: 14
----------------------------------------
File: 续侠隐记.txt
Phrase Frequencies:
  羅馬: 12
Total Frequency: 12
----------------------------------------
File: 太平洋遇险记.txt
Phrase Frequencies:
  羅馬: 7
Total Frequency: 7
----------------------------------------
File: 鲁滨孙飘流续记.txt
Phrase Frequencies:
  羅馬: 7
Total Frequency: 7
----------------------------------------
File: 撷兰记.txt
Phrase Frequencies:
  羅馬: 6
Total Frequency: 6
----------------------------------------
File: 锺乳髑艛.txt
Phrase Frequencies:
  羅馬: 5
Total 

In [6]:
from collections import Counter

# Per-file tallies of the target phrases, keyed by file name.
phrase_freq_dict = {}

# Phrases to count, listed in the order they should appear in the report.
# An ordered tuple drives all reporting because iterating a set of strings
# yields an order that can change between kernel sessions (string hashing is
# randomized per process), which made the printed report non-reproducible.
phrase_order = (
    "中國", "支那", "華", "震旦", "漢",
    "中華", "支那帝國", "中華民國", "清帝國", "清國",
)

# Set form of the same phrases, used for membership tests while counting.
phrases_we_care = set(phrase_order)

# An entry is only counted when its tag is one of these.
tags_we_care = {'LOC', 'NORP', 'GPE'}

# Totals across all files, pre-seeded with 0 so that phrases which never
# occur are still reported. Built from phrase_order so the dict's insertion
# order (and therefore the summary printout) is deterministic.
total_freq_across_files = {phrase: 0 for phrase in phrase_order}

# Iterate over each file in results_dict
for file_name, file_data in results_dict.items():
    # A file with no 'all_tags' key is treated as having no tagged entries.
    all_tags = file_data.get('all_tags', [])

    # Every entry is unpacked as four fields; only the first two (phrase, tag)
    # take part in the filter.
    phrase_counter = Counter(
        phrase for phrase, tag, _, _ in all_tags
        if tag in tags_we_care and phrase in phrases_we_care
    )

    # Store the per-phrase counts and their sum for this file.
    phrase_freq_dict[file_name] = {
        'phrase_freq': dict(phrase_counter),
        'total_freq': sum(phrase_counter.values())  # Sum of all phrase frequencies
    }

    # Fold this file's counts into the totals across all files.
    for phrase, freq in phrase_counter.items():
        total_freq_across_files[phrase] += freq

# Sort files by total frequency, highest first. sorted() is stable, so files
# with equal totals keep the order in which they appear in results_dict.
sorted_files = sorted(phrase_freq_dict.items(), key=lambda x: x[1]['total_freq'], reverse=True)

# Print the results for each file
for file_name, data in sorted_files:
    print(f"File: {file_name}")
    print("Phrase Frequencies:")
    for phrase in phrase_order:  # fixed order -> identical layout on every run
        freq = data['phrase_freq'].get(phrase, 0)  # Get frequency or 0 if phrase not found
        print(f"  {phrase}: {freq}")
    print(f"Total Frequency: {data['total_freq']}")
    print("-" * 40)  # Separator for readability

# Print the sum of frequencies across all files (in phrase_order)
print("Sum of Frequencies Across All Files:")
for phrase, total_freq in total_freq_across_files.items():
    print(f"  {phrase}: {total_freq}")

File: 鲁滨孙飘流续记.txt
Phrase Frequencies:
  漢: 2
  支那: 23
  清國: 0
  中華: 0
  震旦: 0
  清帝國: 0
  支那帝國: 0
  華: 4
  中華民國: 0
  中國: 25
Total Frequency: 54
----------------------------------------
File: 新舞台三.txt
Phrase Frequencies:
  漢: 0
  支那: 0
  清國: 0
  中華: 0
  震旦: 0
  清帝國: 0
  支那帝國: 0
  華: 0
  中華民國: 0
  中國: 40
Total Frequency: 40
----------------------------------------
File: 鲁滨孙漂流记(大陆报).txt
Phrase Frequencies:
  漢: 0
  支那: 24
  清國: 0
  中華: 0
  震旦: 0
  清帝國: 0
  支那帝國: 0
  華: 0
  中華民國: 0
  中國: 5
Total Frequency: 29
----------------------------------------
File: 北京巴黎間競車日記.txt
Phrase Frequencies:
  漢: 0
  支那: 0
  清國: 0
  中華: 1
  震旦: 0
  清帝國: 0
  支那帝國: 0
  華: 1
  中華民國: 0
  中國: 25
Total Frequency: 27
----------------------------------------
File: 寰球旅行记.txt
Phrase Frequencies:
  漢: 1
  支那: 2
  清國: 0
  中華: 1
  震旦: 0
  清帝國: 0
  支那帝國: 0
  華: 2
  中華民國: 0
  中國: 14
Total Frequency: 20
----------------------------------------
File: 新舞台二.txt
Phrase Frequencies:
  漢: 0
  支那: 17
  清國: 0
  中華: 0
  震旦: 0
  清帝國: 0