# Evaluate Term Dispersion Scores on the GENIA Corpus Data and Reproduce Results Reported in the Manuscript

Authors: Samuel Sarria Hurtado, Uyen "Rachel" Lai, and Paul Sheridan

Last update: 2025-06-03

Description: Evaluate the following term dispersion scores on the GENIA corpus data:
- Inverse Document Frequency (IDF)
- Inverse Collection Frequency (ICF)
- Chi-square
- Church and Gale (CG)
- Irvine and Callison-Burch (ICB)
- Deviation of Proportions (DoP)
- Residual ICF (RICF)

Calculate average P@k scores for each scoring function using the GENIA terms as ground truth. Also, evaluate scoring functions for their ability to filter out stopwords. 

## Preliminaries

In [None]:
# Imports
import sys
import json
import pandas as pd
sys.path.append('../0-base-functions/')
import wordstats
from sklearn.feature_extraction.text import CountVectorizer
import random
import numpy as np
import scipy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from io import StringIO
from numpy import nan
from tqdm import tqdm
import rbo

## Load the GENIA Corpus Data

In particular, we load the
- preprocessed GENIA corpus documents,
- and gold standard biological terms (i.e., lexical units) and their associated semantic classes (i.e., sems).

In [2]:
# Read the preprocessed GENIA corpus documents from JSON
genia_corpus_path = '../1-preprocessing/GENIAcorpus3.02-preprocessed.json'

with open(genia_corpus_path, "r") as corpus_file:
    genia_corpus = json.load(corpus_file)

# Read the gold standard terms from the tab-separated keywords file
# NOTE(review): pandas' default NA parsing may turn terms spelled like "NA" or
# "null" into NaN -- confirm this cannot affect the keyword list.
genia_keywords_path = '../1-preprocessing/GENIAcorpus3.02-keywords.tsv'

with open(genia_keywords_path, "r") as keywords_file:
    genia_lexical_units_and_sems = pd.read_csv(keywords_file, sep='\t')

# Keep the 'lex' column (lexical units) as a NumPy array
genia_lexical_units = genia_lexical_units_and_sems['lex'].to_numpy()

## Prepare the GENIA Corpus Data for Analysis

Prepare the corpus vocabulary.

In [3]:
# Compile the GENIA corpus vocabulary
# Tokenize every document on whitespace
pre_vocab = [document.split() for document in genia_corpus]

# Deduplicate the tokens across all documents and sort them
vocab = sorted({token for tokens in pre_vocab for token in tokens})

Convert the GENIA documents into a term-in-document matrix of token counts.

In [4]:
# Helper function to ensure that CountVectorizer doesn't ignore any terms
def analyzer_custom(doc):
    """Return the whitespace-separated tokens of *doc* as a list.

    Passed to CountVectorizer as its ``analyzer`` so that tokenization is a
    plain ``str.split()`` on the document text.
    """
    tokens = doc.split()
    return tokens

# Generate term-in-document counts over the fixed, pre-compiled vocabulary
counter = CountVectorizer(
    analyzer=analyzer_custom,
    vocabulary=vocab,
    lowercase=False,
)
collection = counter.transform(genia_corpus)

## Evaluate Term Dispersion Scores for Selected Measures

Calculate bag-of-words model word statistics and related quantities.

In [5]:
# Calculate word statistics and related quantities
# NOTE(review): the wordstats helpers are defined outside this notebook; the
# per-line descriptions marked "presumably" are inferred from the helper names
# and should be confirmed against wordstats.
m = len(counter.get_feature_names_out()) # vocab size (number of vectorizer features)
d = collection.shape[0] # collection size (number of rows in the count matrix)
N_i = wordstats.get_Ni(collection) # presumably the total count of each term
N_j = wordstats.get_Nj(collection) # presumably the total token count of each document
N = wordstats.get_N(N_j) # presumably the total token count of the collection
B_ij = wordstats.get_Bij(collection) # presumably term-in-document presence indicators
B_i = wordstats.get_Bi(B_ij) # presumably the number of documents containing each term
B_j = wordstats.get_Bj(B_ij) # presumably the number of distinct terms in each document
DF = wordstats.get_DF(B_i, d) # document frequency, derived from B_i and d
CF = wordstats.get_CF(N_i) # collection frequency, derived from N_i
nij_by_nj = wordstats.get_nij_by_nj(collection, N_j)
# Candidate theta grid: k / N for k = 1, ..., max(N_i)
thetas = np.array(range(1, max(N_i.A[0]) + 1))/N
opt_thetas = wordstats.get_opt_thetas(N, m, d, N_i, N_j, B_i, thetas)

Evaluate word dispersion scores.

In [None]:
# Calculate word dispersion scores according to the various measures used in this study.
# Each score is computed by a wordstats helper from the statistics prepared above.
IDF = wordstats.get_IDF(DF)  # Inverse Document Frequency
ICF = wordstats.get_ICF(CF)  # Inverse Collection Frequency
Chisq = wordstats.get_Chisq(collection)  # Chi-square
CG = wordstats.get_CG(N_i, B_i)  # Church and Gale
ICB = wordstats.get_ICB(nij_by_nj, B_i)  # Irvine and Callison-Burch
DoP = wordstats.get_DoP(collection, N_i, N_j, N)  # DoP
RICF = wordstats.get_RICF(opt_thetas, N, ICF)  # Residual ICF

Arrange term dispersion scores into a data frame.

In [7]:
# Assemble the term dispersion scores: one row per vocabulary term and one
# column per measure (column order fixes the measure order used downstream)
score_columns = {
    'term': counter.get_feature_names_out(),
    'IDF': IDF.A[0],
    'ICF': ICF.A[0],
    'Chi-sq': Chisq,
    'CG': CG.A[0],
    'ICB': ICB.A[0],
    'DoP': DoP.A[0],
    'RICF': RICF.A[0],
}
term_scores_df = pd.DataFrame(data=score_columns)

In [None]:
# Show the term dispersion scores table in the notebook output
display(term_scores_df)

In [9]:
# Write the term dispersion scores to a tab-separated file.
# NOTE(review): no `index=False` here, unlike the table CSVs written later in
# this notebook, so the row index is written as the first column -- confirm
# this is intended.
term_scores_df.to_csv('term-dispersion-scores.tsv', sep='\t')

## Compile GENIA Corpus Summary Statistics

This is the result of Table 3 from the manuscript.

In [10]:
# Collect lexical units belonging to a given high-level semantic class
def get_high_level_semantic_class_words(high_level_class_lst):
    """Return the lexical units whose semantic class is in *high_level_class_lst*.

    Reads the module-level ``lex_sem_dct`` mapping (lexical unit -> semantic
    class) and keeps, in the mapping's iteration order, every key whose value
    appears in the given list of semantic class labels.
    """
    return [
        lexical_unit
        for lexical_unit, semantic_class in lex_sem_dct.items()
        if semantic_class in high_level_class_lst
    ]

In [11]:
# Hardcode the semantic classes according to their high-level designations.
# Each *_sems list holds the semantic class labels grouped under one high-level class.
amino_acid_sems = ['G#amino_acid_monomer', 'G#peptide', 'G#protein_N/A',
              'G#protein_complex', 'G#protein_domain_or_region',
              'G#protein_family_or_group', 'G#protein_molecule',
              'G#protein_substructure', 'G#protein_subunit',
              'G#other_organic_compound', 'G#organic', 'G#inorganic', 'G#atom',
              'G#carbohydrate', 'G#lipid']
nucleotide_sems = ['G#nucleotide', 'G#polynucleotide', 'G#DNA_N/A',
        'G#DNA_domain_or_region', 'G#DNA_family_or_group', 'G#DNA_molecule',
        'G#DNA_substructure', 'G#RNA_N/A', 'G#RNA_domain_or_region',
        'G#RNA_family_or_group', 'G#RNA_molecule', 'G#RNA_substructure']
multi_cell_sems = ['G#virus', 'G#mono_cell', 'G#multi_cell', 'G#body_part', 'G#tissue']
cell_sems = ['G#cell_type', 'G#cell_component', 'G#cell_line', 'G#other_artificial_source']
other_sems = ['G#other_name']
# Names of the five high-level classes (there is no 'all' entry in this list)
high_level_semantic_class_names = ['amino_acid', 'nucleotide', 'multi_cell', 'cell', 'other']
# NOTE(review): despite its name, this list mixes the array of lexical units
# (first element) with lists of semantic class labels (remaining elements), and
# it is not referenced in the rest of the visible notebook -- confirm whether it
# is still needed.
high_level_semantic_class_lex_units = [genia_lexical_units, amino_acid_sems, nucleotide_sems, multi_cell_sems, cell_sems, other_sems]

In [12]:
# Create dictionary of lexical units and their associated semantic classes
# (if a lexical unit occurs more than once, its last semantic class wins)
sem = np.array(genia_lexical_units_and_sems['sem'])
lex = np.array(genia_lexical_units_and_sems['lex'])
lex_sem_dct = dict(zip(lex, sem))

# Count number of semantic classes in each high-level class
sub_class = [len(amino_acid_sems), len(nucleotide_sems), len(multi_cell_sems), len(cell_sems), len(other_sems)]

# Gather the distinct lexical units in each high-level semantic class
amino_acid = get_high_level_semantic_class_words(amino_acid_sems)
nucleotide = get_high_level_semantic_class_words(nucleotide_sems)
multi_cell = get_high_level_semantic_class_words(multi_cell_sems)
cell = get_high_level_semantic_class_words(cell_sems)
other = get_high_level_semantic_class_words(other_sems)

# Count number of lexical units used as annotations for each high-level
# semantic class: sum the corpus counts (N_i) of the vocabulary terms that are
# lexical units of the class. The first entry covers all gold standard terms.
high_level_class_words = [genia_lexical_units, amino_acid, nucleotide, multi_cell, cell, other]
term_counts = N_i.A[0]  # aligned with vocab, as in the original index-based loop
high_level_class_words_counter = []
for class_words in high_level_class_words:
    # A set makes each membership test O(1); the original scanned a
    # list/array once per vocabulary term.
    class_word_set = set(class_words)
    high_level_class_words_counter.append(
        sum(count for term, count in zip(vocab, term_counts) if term in class_word_set)
    )

In [13]:
# Create dictionary of lexical units and their associated semantic classes
# (if a lexical unit occurs more than once, its last semantic class wins)
sem = np.array(genia_lexical_units_and_sems['sem'])
lex = np.array(genia_lexical_units_and_sems['lex'])
lex_sem_dct = dict(zip(lex, sem))

# Count number of semantic subclasses in each high-level class
subclass_counts = [len(amino_acid_sems), len(nucleotide_sems), len(multi_cell_sems), len(cell_sems), len(other_sems)]

# Gather distinct lexical units for each high-level semantic class
amino_acid = get_high_level_semantic_class_words(amino_acid_sems)
nucleotide = get_high_level_semantic_class_words(nucleotide_sems)
multi_cell = get_high_level_semantic_class_words(multi_cell_sems)
cell = get_high_level_semantic_class_words(cell_sems)
other = get_high_level_semantic_class_words(other_sems)

# Count number of lexical units used as annotations for each high-level semantic class.
# Six entries: all gold standard terms first, then the five high-level classes.
# The previous version tested `vocab[j] in <class name string>`, i.e. whether a
# token is a substring of e.g. 'amino_acid', and only visited five of the six
# counters; membership is now tested against each class's lexical units.
# (The misspelled variable name is kept so existing references keep working.)
term_counts = N_i.A[0]  # aligned with vocab
lex_by_semamtic_class_counts = []
for class_words in [genia_lexical_units, amino_acid, nucleotide, multi_cell, cell, other]:
    class_word_set = set(class_words)
    lex_by_semamtic_class_counts.append(
        sum(count for term, count in zip(vocab, term_counts) if term in class_word_set)
    )

In [14]:
# Define the data
# Map each row label ('all' plus the five high-level classes) to its annotation count
high_level_semantic_classes = ['all', 'amino_acid', 'nucleotide', 'multi_cell', 'cell', 'other']
annotations_count_zip = zip(high_level_semantic_classes, high_level_class_words_counter)
annotations_dict = dict(annotations_count_zip)

# Per-class counts of distinct lexical units and of semantic subclasses
lex_unit_counts = [len(words) for words in (amino_acid, nucleotide, multi_cell, cell, other)]
subclass_counts = [
    len(sems)
    for sems in (amino_acid_sems, nucleotide_sems, multi_cell_sems, cell_sems, other_sems)
]

# Annotation counts for the five classes reported in the table ('all' is left out)
annotations = [
    annotations_dict[class_name]
    for class_name in ('amino_acid', 'nucleotide', 'multi_cell', 'cell', 'other')
]

# Create a data frame
summary_columns = {
    'Semantic class': ['amino_acid', 'nucleotide', 'multi_cell', 'cell', 'other'],
    'Sub-class': subclass_counts,
    'Unique terms': lex_unit_counts,
    "Annotations": annotations,
}
genia_summary_stats_df = pd.DataFrame(summary_columns)

In [None]:
# Show the GENIA summary statistics table in the notebook output
display(genia_summary_stats_df)

In [16]:
# Write the summary statistics to CSV without the row index.
# Assumes the 'table-3/' directory already exists -- TODO confirm.
genia_summary_stats_df.to_csv('table-3/table-3.csv', index=False)

## Terminology Extraction Task Experiment

Here we reproduce the result of Table 4 from the manuscript.

In [50]:
def top_k(dct, k):
    """Return a new dict mapping each key of *dct* to the first *k* items of its value.

    Values are truncated with ``value[:k]``, so any sliceable sequence works and
    key order is preserved.
    """
    return {key: sequence[:k] for key, sequence in dct.items()}
    
def resort(term_scores_df):
    """Rank the terms under every measure, breaking score ties at random.

    The first column of *term_scores_df* is skipped (it holds the terms); every
    other column is treated as a measure. For each measure, terms are ordered by
    descending score, and equal scores are ordered by a fresh uniform random
    key drawn from NumPy's global random state.

    Returns a dict mapping each measure name to an array of terms in rank order.
    """
    bursty_measure_names = list(term_scores_df.columns[1:])
    n_terms = len(term_scores_df)

    rankings = []
    for measure in bursty_measure_names:
        # One random draw per term per measure, used only as the tie-breaker
        keyed_df = term_scores_df[['term', measure]].assign(random=np.random.rand(n_terms))
        ordered_df = keyed_df.sort_values(by=[measure, 'random'], ascending=[False, True])
        rankings.append(np.array(ordered_df['term']))

    # Stack into one array so each dict value is a row of that array
    return dict(zip(bursty_measure_names, np.array(rankings)))
    
def _aggregate_runs(nested_list, reducer):
    """Apply *reducer* element-wise across replicates of aligned data frames.

    *nested_list* is a list of replicates, each a list of data frames; frame
    ``i`` of every replicate is expected to share the columns and index of
    frame ``i`` of the first replicate. *reducer* receives, per column, the list
    of that column's value arrays (one per replicate) and returns one array.
    """
    if not nested_list:
        # Nothing to aggregate; the original raised IndexError here
        return []

    result = []
    for position, template in enumerate(nested_list[0]):
        reduced = {
            column: reducer([run[position][column].values for run in nested_list])
            for column in template.columns
        }
        result.append(pd.DataFrame(reduced, index=template.index))
    return result


def calculate_means(nested_list):
    """Return the element-wise mean of the corresponding data frames.

    Parameters: *nested_list* -- list of replicates, each a list of data frames.
    Returns: a list with one data frame per position, holding the mean over
    replicates; an empty list if *nested_list* is empty.
    """
    return _aggregate_runs(nested_list, lambda values: np.mean(values, axis=0))


# Function to calculate the standard deviation of the corresponding data frames
def calculate_sds(nested_list):
    """Return the element-wise sample standard deviation (ddof=1) of the frames.

    Parameters: *nested_list* -- list of replicates, each a list of data frames.
    Returns: a list with one data frame per position; an empty list if
    *nested_list* is empty. With a single replicate the result is NaN, because
    ddof=1 leaves zero degrees of freedom.
    """
    return _aggregate_runs(nested_list, lambda values: np.std(values, axis=0, ddof=1))

def create_p_k(v, sorted_measures, k_values=None):
    """Compute precision at k (P@k) for every measure.

    Parameters:
        v: collection of gold standard (relevant) terms.
        sorted_measures: dict mapping measure name -> terms in rank order.
        k_values: cut-offs k to evaluate; defaults to the module-level
            ``at_values`` when omitted (the original behaviour).

    Returns a data frame indexed by k with one column per measure, where each
    cell is (number of the top-k terms found in *v*) / k.
    """
    cutoffs = at_values if k_values is None else k_values
    relevant = set(v)  # O(1) membership instead of scanning v for every term

    # One column per measure, however many there are (the original pre-built
    # exactly ten lists, silently dropping any measure beyond the tenth).
    p_k_dct = {
        measure: [
            sum(1 for term in ranked_terms[:k] if term in relevant) / k
            for k in cutoffs
        ]
        for measure, ranked_terms in sorted_measures.items()
    }
    return pd.DataFrame(p_k_dct, index=cutoffs)

def create_r_k(v, sorted_measures, k_values=None):
    """Compute recall at k (R@k) for every measure.

    Parameters:
        v: collection of gold standard (relevant) terms.
        sorted_measures: dict mapping measure name -> terms in rank order.
        k_values: cut-offs k to evaluate; defaults to the module-level
            ``at_values`` when omitted.

    Returns a data frame indexed by k with one column per measure, where each
    cell is (number of the top-k terms found in *v*) / (number of distinct terms
    in *v*). Gold terms that never appear in a ranking still count in the
    denominator. If *v* is empty, recall is reported as 0.0.

    NOTE(review): the previous body was a verbatim copy of the precision
    computation (it divided by k); confirm this recall definition matches the
    intended one.
    """
    cutoffs = at_values if k_values is None else k_values
    relevant = set(v)
    n_relevant = len(relevant)

    def recall(ranked_terms, k):
        if n_relevant == 0:
            return 0.0
        hits = sum(1 for term in ranked_terms[:k] if term in relevant)
        return hits / n_relevant

    r_k_dct = {
        measure: [recall(ranked_terms, k) for k in cutoffs]
        for measure, ranked_terms in sorted_measures.items()
    }
    return pd.DataFrame(r_k_dct, index=cutoffs)

def create_f1_k(v, sorted_measures, k_values=None):
    """Compute the F1 score at k for every measure.

    Parameters:
        v: collection of gold standard (relevant) terms.
        sorted_measures: dict mapping measure name -> terms in rank order.
        k_values: cut-offs k to evaluate; defaults to the module-level
            ``at_values`` when omitted.

    Returns a data frame indexed by k with one column per measure. With
    h = number of the top-k terms found in *v* and n = number of distinct terms
    in *v*, precision is h/k and recall is h/n, so their harmonic mean
    simplifies to F1 = 2h / (k + n). When k + n is zero the score is 0.0.

    NOTE(review): the previous body was a verbatim copy of the precision
    computation; confirm this F1 definition matches the intended one.
    """
    cutoffs = at_values if k_values is None else k_values
    relevant = set(v)
    n_relevant = len(relevant)

    def f1(ranked_terms, k):
        denominator = k + n_relevant
        if denominator == 0:
            return 0.0
        hits = sum(1 for term in ranked_terms[:k] if term in relevant)
        return 2 * hits / denominator

    f1_k_dct = {
        measure: [f1(ranked_terms, k) for k in cutoffs]
        for measure, ranked_terms in sorted_measures.items()
    }
    return pd.DataFrame(f1_k_dct, index=cutoffs)

def create_RBO(v, sorted_measures, k_values=None):
    """Compare every measure's top-k ranking with the RICF top-k ranking.

    Parameters:
        v: unused; kept so the signature stays parallel to the other
            ``create_*`` functions and existing calls keep working.
        sorted_measures: dict mapping measure name -> terms in rank order; must
            contain the key "RICF", which serves as the reference ranking.
        k_values: cut-offs k to evaluate; defaults to the module-level
            ``at_values`` when omitted.

    Returns a data frame indexed by k with one column per measure, where each
    cell is ``rbo.RankingSimilarity(top_k_of_measure, top_k_of_RICF).rbo()``.
    """
    cutoffs = at_values if k_values is None else k_values
    reference = sorted_measures["RICF"]

    # One column per measure, however many there are (the original pre-built
    # exactly ten lists, silently dropping any measure beyond the tenth).
    rbo_dct = {
        measure: [
            rbo.RankingSimilarity(ranked_terms[:k], reference[:k]).rbo()
            for k in cutoffs
        ]
        for measure, ranked_terms in sorted_measures.items()
    }
    return pd.DataFrame(rbo_dct, index=cutoffs)



def count_words(lst, imp_words):
    """Return how many items of *lst* are contained in *imp_words*.

    Repeated items of *lst* are counted each time they occur.
    """
    return sum(1 for item in lst if item in imp_words)

In [None]:
# Initialize a random seed to ensure results can be replicated
np.random.seed(641369)

# Cut-offs k at which the rankings are evaluated
at_values = np.array([10, 50, 100, 500, 1000, 5000])
# Gold standard term sets: all GENIA terms plus one set per high-level semantic class
categories = {'all': genia_lexical_units , 'amino_acid': amino_acid, 'nucleotide': nucleotide, 'multi_cell': multi_cell, 'cell': cell, 'other': other}
all_pk_scores = []
sds_dfs = []
all_rbo_scores = []
# R = 100
# NOTE(review): with a single replicate the ddof=1 standard deviations computed
# by calculate_sds are NaN.
R = 1 # For testing purposes; change back to 100 for final results
for r in tqdm(range(R)):
    # Each replicate re-ranks the terms with fresh random tie-breaking
    sorted_measures = resort(term_scores_df)

    # One P@k data frame per category, in the order of `categories`
    pk_scores = [create_p_k(v, sorted_measures) for v in categories.values()]

    # create_RBO ignores its first argument, so it is computed once per replicate
    # (the original recomputed it for every category and kept only the last).
    rbo_scores = create_RBO(categories['all'], sorted_measures)

    all_pk_scores.append(pk_scores)
    all_rbo_scores.append(rbo_scores)
    

In [None]:
# Scratch cell: preview the ten highest-ranked terms under each measure for the
# most recent replicate.
# NOTE(review): the original cell ended in a `for` statement with no body and
# called the non-existent `dict.key()`, so it could not run; the intended loop
# body is unknown -- confirm or delete this cell. It also rebound the global
# `RICF` (the score matrix used to build term_scores_df), which is no longer done.
for measure_name, ranked_terms in sorted_measures.items():
    print(measure_name, list(ranked_terms[:10]))

In [None]:
# Display the resulting data frames: one table of mean P@k scores per category,
# in the order the categories were evaluated.
print("Mean P@k scores:")
with pd.option_context('display.precision', 4):
    # Aggregate once instead of recomputing the means for every table
    for mean_pk_df in calculate_means(all_pk_scores):
        display(mean_pk_df)

In [None]:
# Print the raw RBO data frames, one per replicate
print(all_rbo_scores)

# NOTE(review): the commented-out code below would not work as written --
# all_rbo_scores holds one data frame per replicate, whereas calculate_means
# indexes nested_list[replicate][i][column], i.e. expects a list of data frames
# per replicate.
# with pd.option_context('display.precision', 4):
#     display(calculate_means(all_rbo_scores)[0])
#     display(calculate_means(all_rbo_scores)[1])
#     display(calculate_means(all_rbo_scores)[2])
#     display(calculate_means(all_rbo_scores)[3])
#     display(calculate_means(all_rbo_scores)[4])
#     display(calculate_means(all_rbo_scores)[5])

In [None]:
# Display one table of P@k standard deviations per category, in the order the
# categories were evaluated.
print("P@k scores standard deviations:")
with pd.option_context('display.precision', 4):
    # Aggregate once instead of recomputing the deviations for every table
    for sd_pk_df in calculate_sds(all_pk_scores):
        display(sd_pk_df)

In [22]:
# Calculate mean P@k scores and their standard deviations once, then write one
# pair of CSV files per category.
# File-name labels follow the category order used in the experiment; note that
# the multi_cell category is written as 'multicell'.
table_4_labels = ['all', 'amino_acid', 'nucleotide', 'multicell', 'cell', 'other']
table_4_means = calculate_means(all_pk_scores)
table_4_sds = calculate_sds(all_pk_scores)

# NOTE(review): index=False drops the k cut-offs (the row index) from the
# files -- confirm that readers of these CSVs do not need them.
for label, mean_pk_df, sd_pk_df in zip(table_4_labels, table_4_means, table_4_sds):
    mean_pk_df.to_csv(f'table-4/table-4-{label}-means.csv', index=False)
    sd_pk_df.to_csv(f'table-4/table-4-{label}-sds.csv', index=False)

## Top 10 Ranked Terms Example

Here we reproduce the result of Table 5 from the manuscript.

In [23]:
# Initialize a random seed to ensure results can be replicated
# (resort() draws its tie-breaking keys from NumPy's global random state)
np.random.seed(641369)

In [None]:
# Rank the terms under every measure; despite its name, ranked_terms_df is the
# dict returned by resort() (measure name -> terms in rank order).
ranked_terms_df = resort(term_scores_df)
# Keep the ten highest-ranked terms per measure, one column per measure
top_10_ranked_terms_df = pd.DataFrame(top_k(ranked_terms_df, 10))
display(top_10_ranked_terms_df)

In [25]:
# Write the top 10 ranked terms per measure to CSV without the row index.
# Assumes the 'table-5/' directory already exists -- TODO confirm.
top_10_ranked_terms_df.to_csv('table-5/table-5.csv', index=False)

## Stopwords Exploratory Analysis

Here we reproduce the result of Table 6 from the manuscript.

In [None]:
import pandas as pd
from nltk.corpus import stopwords

# Ensure you have the stopwords downloaded
import nltk
nltk.download('stopwords')

def getrank(sorted_measures):
    """Build a table of 1-based ranks: one row per term, one column per measure.

    Parameters:
        sorted_measures: dict mapping measure name -> terms in rank order.

    Returns a data frame indexed by the sorted union of all terms. A term that
    is missing from a measure's ranking gets rank ``number of unique terms + 1``.
    If a term is repeated within one ranking, its last position is used.

    Ranks are built as integer columns in one step per measure, replacing
    per-cell writes into an object-dtype frame followed by ``fillna``.
    """
    unique_terms = sorted({term for terms in sorted_measures.values() for term in terms})
    unranked = len(unique_terms) + 1  # sentinel rank for terms a measure did not rank

    rank_columns = {}
    for measure, terms in sorted_measures.items():
        ranks = pd.Series(np.arange(1, len(terms) + 1), index=list(terms))
        # Keep the last occurrence, matching the overwrite order of the
        # original cell-by-cell assignment
        ranks = ranks[~ranks.index.duplicated(keep='last')]
        rank_columns[measure] = ranks.reindex(unique_terms, fill_value=unranked)

    return pd.DataFrame(rank_columns, index=unique_terms, columns=list(sorted_measures.keys()))

# Function to filter stopwords from the ranking data frame
def filter_stopwords(ranking_df):
    """Return the rows of *ranking_df* whose index label is an NLTK English stopword.

    The stopword list is read from ``stopwords.words('english')`` on every call.
    """
    english_stopwords = set(stopwords.words('english'))

    # Boolean mask over the row labels: True where the term is a stopword
    is_stopword = ranking_df.index.isin(english_stopwords)
    return ranking_df[is_stopword]

In [None]:
# Initialize a random seed to ensure results can be replicated
np.random.seed(641369)

# Generate term dispersion ranks for R different versions of the data
# (each version differs only in the random tie-breaking done by resort())
R = 100
quantile_levels = [0, 0.25, 0.5, 0.75, 1]
all_quantiles_df = []
for r in tqdm(range(R)):
    sorted_measures = resort(term_scores_df)
    rank = getrank(sorted_measures)
    stopwords_ranks_df = filter_stopwords(rank)

    # One Series of rank quantiles per measure; iterate the column labels
    # directly rather than iterating an empty `head(0)` frame.
    bursty_measure_names = list(stopwords_ranks_df.columns)
    quantiles = [
        stopwords_ranks_df[bursty_measure_name].quantile(quantile_levels)
        for bursty_measure_name in bursty_measure_names
    ]

    # Rows are measures, columns are the quantile levels
    quantiles_df = pd.DataFrame(quantiles)
    all_quantiles_df.append(quantiles_df)

In [28]:
# Take the row and column labels from the first replicate's quantile table
template_df = all_quantiles_df[0]
columns = template_df.columns
index = template_df.index

# Empty data frames that receive the element-wise summaries
mean_df = pd.DataFrame(index=index, columns=columns)
std_df = pd.DataFrame(index=index, columns=columns)

# For every cell, gather that cell's value from each replicate and summarize it.
# NOTE(review): np.std is called with its default ddof here, whereas
# calculate_sds uses ddof=1 -- confirm the difference is intended.
for idx in index:
    for col in columns:
        values = [matrix.at[idx, col] for matrix in all_quantiles_df]
        mean_df.at[idx, col] = np.mean(values)
        std_df.at[idx, col] = np.std(values)

In [None]:
# Display the resulting data frames: mean and standard deviation of the
# stopword rank quantiles across replicates, shown with 4 digits of precision
print("Mean values:")
with pd.option_context('display.precision', 4):
    display(mean_df)
print("\nStandard deviations:")
with pd.option_context('display.precision', 4):
    display(std_df)

In [30]:
# Write the Table 6 summaries to CSV; the row index (measure names) is kept.
# Assumes the 'table-6/' directory already exists -- TODO confirm.
mean_df.to_csv('table-6/table-6-means.csv')
std_df.to_csv('table-6/table-6-sds.csv')