# Evaluate Term Burstiness Scores on the GENIA Corpus Data

Authors: Samuel Sarria Hurtado, Uyen "Rachel" Lai, and Paul Sheridan

Last update: 2024-05-29

Description: Evaluate the following term burstiness scores on the GENIA corpus data:
- Inverse Document Frequency (IDF)
- Inverse Collection Frequency (ICF)
- Chi-square
- Church and Gale (CG)
- Irvine and Callison-Burch (ICB)
- Deviation of Proportions (DoP)
- Residual ICF (RICF)

Calculate P@k scores for each scoring function using the GENIA terms as ground truth. Also, evaluate scoring functions for their ability to filter out stop words.

## Preliminaries

In [1]:
# Imports
import sys
import json
import pandas as pd
imports_path = '../0-base-functions/'
sys.path.append(imports_path)
import word_stats
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from io import StringIO
from numpy import nan
from tqdm import tqdm

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


## Load the GENIA Corpus Data

In particular, we load the
- preprocessed GENIA corpus documents,
- and gold standard biological terms (i.e., lexical units) and their associated semantic classes (i.e., sems).

In [2]:
# Read the preprocessed GENIA corpus documents from their JSON file
genia_corpus_path = '../1-preprocessing/GENIAcorpus3.02-preprocessed.json'

with open(genia_corpus_path, "r") as corpus_file:
  genia_corpus = json.load(corpus_file)

In [3]:
# Read the gold standard terms (lexical units) and their semantic classes
genia_keywords_path = '../1-preprocessing/GENIAcorpus3.02-keywords.tsv'

with open(genia_keywords_path, "r") as keywords_file:
  genia_lexical_units_and_sems = pd.read_csv(keywords_file, sep='\t')

# Keep the lexical units on their own as a NumPy array
genia_lexical_units = genia_lexical_units_and_sems['lex'].to_numpy()

## Prepare the GENIA Corpus Data for Analysis

Prepare the corpus vocabulary.

In [4]:
# Compile the GENIA corpus vocabulary: split every document on whitespace,
# then keep the distinct tokens in sorted order
pre_vocab = [document.split() for document in genia_corpus]

vocab = sorted({token for tokens in pre_vocab for token in tokens})

Convert GENIA documents into term-in-document matrix of token counts.

In [5]:
# Whitespace-only analyzer so the Count vectorizer won't ignore any words
def analyzer_custom(doc):
  """Split a document string into its whitespace-separated tokens."""
  tokens = doc.split()
  return tokens

# Generate term-in-document counts over the fixed corpus vocabulary
counter = CountVectorizer(
  analyzer=analyzer_custom,
  vocabulary=vocab,
  lowercase=False,
)
collection = counter.transform(genia_corpus)

## Evaluate Term Burstiness Scores for Selected Measures

Calculate bag-of-words model word statistics and related quantities.

In [6]:
# Word statistics and related quantities
# NOTE(review): the word_stats helpers live in ../0-base-functions/ and are not
# visible here; their exact definitions should be confirmed against that module.
m = len(counter.get_feature_names_out()) # vocab size
d = collection.shape[0] # collection size
N_i = word_stats.get_Ni(collection)
N_j = word_stats.get_Nj(collection)
N = word_stats.get_N(N_j)
B_ij = word_stats.get_Bij(collection)
B_i = word_stats.get_Bi(B_ij)
B_j = word_stats.get_Bj(B_ij)
DF = word_stats.get_DF(B_i, d)
CF = word_stats.get_CF(N_i)
nij_by_nj = word_stats.get_nij_by_nj(collection, N_j)
# Candidate theta values k/N for k = 1, ..., max(N_i.A[0])
thetas = np.array(range(1, max(N_i.A[0]) + 1))/N
opt_thetas = word_stats.get_opt_thetas(N, m, d, N_i, N_j, B_i, thetas)

Evaluate term burstiness scores.

In [7]:
# Term burstiness scores for the various burstiness measures used in this study
# Each score comes from a word_stats helper applied to the statistics computed
# in the previous cell (or, for Chi-square, to the count matrix itself).
IDF = word_stats.get_IDF(DF)
ICF = word_stats.get_ICF(CF)
Chisq = word_stats.get_Chisq(collection)
CG = word_stats.get_CG(N_i, B_i)
ICB = word_stats.get_ICB(nij_by_nj, B_i)
DoP = word_stats.get_DoP(collection, N_i, N_j, N)
RICF = word_stats.get_RICF(opt_thetas, N, ICF)

  return -np.log(chisq_values)


Arrange term burstiness scores into a data frame.

In [8]:
# Gather the per-term scores column by column, then build the data frame
term_scores = {
  'term': counter.get_feature_names_out(),
  'IDF': IDF.A[0],
  'ICF': ICF.A[0],
  'Chi-sq': Chisq,
  'CG': CG.A[0],
  'ICB': ICB.A[0],
  'DoP': DoP.A[0],
  'RICF': RICF.A[0],
}
term_scores_df = pd.DataFrame(data=term_scores)

In [9]:
# Show the term burstiness scores as a rendered table in the notebook
display(term_scores_df)

Unnamed: 0,term,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
0,'aged'_lymphocyte_lex,7.600902,12.800954,0.701595,1.0,244.0,-0.000673,-0.000251
1,'converted'_TCEd_motif_lex,7.600902,12.800954,0.701595,1.0,195.0,-0.000538,-0.000251
2,'latency_I'_lex,7.600902,12.800954,0.701595,1.0,206.0,-0.000568,-0.000251
3,'latency_II'_lex,7.600902,12.800954,0.701595,1.0,206.0,-0.000568,-0.000251
4,'master_regulator_lex,7.600902,12.800954,0.701595,1.0,63.0,-0.000174,-0.000251
...,...,...,...,...,...,...,...,...
40799,zymogen_plasma_factor_X_lex,7.600902,12.800954,0.701595,1.0,355.0,-0.000979,-0.000251
40800,zymogen_plasma_factors_VII_lex,7.600902,12.800954,0.701595,1.0,355.0,-0.000979,-0.000251
40801,zymography_lex,7.600902,12.800954,0.701595,1.0,191.0,-0.000527,-0.000251
40802,zymosan-treated_cell_lex,7.600902,12.800954,0.701595,1.0,225.0,-0.000621,-0.000251


In [10]:
# Write to TSV (tab-separated; the row index is written as the first column,
# since no index=False is passed)
term_scores_df.to_csv('term_burstiness_scores.tsv', sep='\t')

## Compile GENIA Corpus Summary Statistics

Here we reproduce the result of Table 3 from the paper.

In [11]:
# Collect lexical units belonging to a given high-level semantic class
def get_high_level_semantic_class_words(high_level_class_lst, lex_sem_dct=None):
  """Return the lexical units whose semantic class is in `high_level_class_lst`.

  Args:
    high_level_class_lst: semantic class labels that make up one high-level class.
    lex_sem_dct: optional mapping from lexical unit to its semantic class label.
      When omitted, the notebook-level `lex_sem_dct` is looked up at call time,
      which is what the function always did before this parameter existed.

  Returns:
    A list of the matching lexical units (mapping keys), in mapping order.

  Raises:
    NameError: if no mapping is passed and no notebook-level `lex_sem_dct` exists.
  """
  if lex_sem_dct is None:
    # Fall back to the global so existing one-argument calls keep working
    try:
      lex_sem_dct = globals()['lex_sem_dct']
    except KeyError:
      raise NameError("name 'lex_sem_dct' is not defined") from None
  return [lex_unit for lex_unit, sem_class in lex_sem_dct.items()
          if sem_class in high_level_class_lst]

In [12]:
# Hardcode the semantic classes according to their high-level designations.
# Each list holds the GENIA semantic class labels ('G#...') grouped under one
# high-level class; the lists are matched against the 'sem' column values.
amino_acid_sems = ['G#amino_acid_monomer', 'G#peptide', 'G#protein_N/A',
              'G#protein_complex', 'G#protein_domain_or_region',
              'G#protein_family_or_group', 'G#protein_molecule',
              'G#protein_substructure', 'G#protein_subunit',
              'G#other_organic_compound', 'G#organic', 'G#inorganic', 'G#atom',
              'G#carbohydrate', 'G#lipid']
nucleotide_sems = ['G#nucleotide', 'G#polynucleotide', 'G#DNA_N/A',
        'G#DNA_domain_or_region', 'G#DNA_family_or_group', 'G#DNA_molecule',
        'G#DNA_substructure', 'G#RNA_N/A', 'G#RNA_domain_or_region',
        'G#RNA_family_or_group', 'G#RNA_molecule', 'G#RNA_substructure']
multi_cell_sems = ['G#virus', 'G#mono_cell', 'G#multi_cell', 'G#body_part', 'G#tissue']
cell_sems = ['G#cell_type', 'G#cell_component', 'G#cell_line', 'G#other_artificial_source']
other_sems = ['G#other_name']
# Names of the five high-level classes, in the same order as the lists above
high_level_semantic_class_names = ['amino_acid', 'nucleotide', 'multi_cell', 'cell', 'other']
# NOTE(review): despite its name, this list holds the full array of lexical
# units followed by the five lists of class *labels*, not lexical units per
# class; it does not appear to be used in the cells shown here -- confirm.
high_level_semantic_class_lex_units = [genia_lexical_units, amino_acid_sems, nucleotide_sems, multi_cell_sems, cell_sems, other_sems]

In [13]:
# Create dictionary of lexical units and their associated semantic classes
sem = np.array(genia_lexical_units_and_sems['sem'])
lex = np.array(genia_lexical_units_and_sems['lex'])
lex_sem_dct = dict(zip(lex, sem))

# Count number of semantic classes in each high-level class
sub_class = [len(amino_acid_sems), len(nucleotide_sems), len(multi_cell_sems), len(cell_sems), len(other_sems)]

# Gather the distinct lexical units in each high-level semantic class
amino_acid = get_high_level_semantic_class_words(amino_acid_sems)
nucleotide = get_high_level_semantic_class_words(nucleotide_sems)
multi_cell = get_high_level_semantic_class_words(multi_cell_sems)
cell = get_high_level_semantic_class_words(cell_sems)
other = get_high_level_semantic_class_words(other_sems)

# Count number of lexical units used as annotations for each high-level semantic class.
# The first entry covers all gold standard terms; the rest follow the class order above.
high_level_class_words = [genia_lexical_units, amino_acid, nucleotide, multi_cell, cell, other]
term_counts = N_i.A[0]  # one count per vocabulary term, aligned with `vocab`
high_level_class_words_counter = []
for class_words in high_level_class_words:
  # A set makes each membership test O(1); scanning the list/array for every
  # vocabulary term was quadratic and gives exactly the same totals.
  class_word_set = set(class_words)
  high_level_class_words_counter.append(
    sum(term_counts[idx] for idx, term in enumerate(vocab) if term in class_word_set))

In [14]:
# Create dictionary of lexical units and their associated semantic classes
sem = np.array(genia_lexical_units_and_sems['sem'])
lex = np.array(genia_lexical_units_and_sems['lex'])
lex_sem_dct = dict(zip(lex, sem))

# Count number of semantic subclasses in each high-level class
subclass_counts = [len(amino_acid_sems), len(nucleotide_sems), len(multi_cell_sems), len(cell_sems), len(other_sems)]

# Gather distinct lexical units for each high-level semantic class
amino_acid = get_high_level_semantic_class_words(amino_acid_sems)
nucleotide = get_high_level_semantic_class_words(nucleotide_sems)
multi_cell = get_high_level_semantic_class_words(multi_cell_sems)
cell = get_high_level_semantic_class_words(cell_sems)
other = get_high_level_semantic_class_words(other_sems)

# Count number of lexical units used as annotations for each high-level semantic class.
# Six entries: all gold standard terms first, then the five high-level classes.
# The previous loop compared each vocabulary term against the class *name*
# string (a substring test) instead of against the class's lexical units, and
# walked five names while the result list had six slots.
lex_units_by_class = [genia_lexical_units, amino_acid, nucleotide, multi_cell, cell, other]
term_counts = N_i.A[0]  # one count per vocabulary term, aligned with `vocab`
# The variable name keeps its original spelling so existing references still resolve
lex_by_semamtic_class_counts = [
  sum(term_counts[idx] for idx, term in enumerate(vocab) if term in class_terms)
  for class_terms in map(set, lex_units_by_class)
]

In [15]:
# Pair every high-level class (plus 'all') with its annotation count
high_level_semantic_classes = ['all', 'amino_acid', 'nucleotide', 'multi_cell', 'cell', 'other']
annotations_count_zip = zip(high_level_semantic_classes, high_level_class_words_counter)
annotations_dict = dict(annotations_count_zip)

# Columns of the summary table; the 'all' entry is left out of the table
summary_class_names = ['amino_acid', 'nucleotide', 'multi_cell', 'cell', 'other']
lex_unit_counts = [len(words) for words in (amino_acid, nucleotide, multi_cell, cell, other)]
subclass_counts = [
    len(sems)
    for sems in (amino_acid_sems, nucleotide_sems, multi_cell_sems, cell_sems, other_sems)
]
annotations = [annotations_dict[class_name] for class_name in summary_class_names]

# Assemble the summary data frame
genia_summary_stats_df = pd.DataFrame({
    'Semantic class': summary_class_names,
    'Sub-class': subclass_counts,
    'Unique terms': lex_unit_counts,
    "Annotations": annotations
})

In [16]:
# Show the GENIA summary statistics as a rendered table in the notebook
display(genia_summary_stats_df)

Unnamed: 0,Semantic class,Sub-class,Unique terms,Annotations
0,amino_acid,15,10155,42478
1,nucleotide,12,5574,11619
2,multi_cell,5,1444,5247
3,cell,4,4051,11626
4,other,1,10560,19999


In [17]:
# Write the summary statistics to CSV without the row index
genia_summary_stats_df.to_csv('table-3/table-3.csv', index=False)

## Terminology Extraction Task Experiment

Here we reproduce the result of Table 4 from the paper.

In [18]:
def top_k(dct, k):
  """Return a dict with the keys of `dct`, each value cut to its first `k` items."""
  return {key: ranked[:k] for key, ranked in dct.items()}
    
def resort(term_scores_df):
    """Rank the terms under every burstiness measure, breaking ties at random.

    Args:
        term_scores_df: data frame with a 'term' column in first position,
            followed by one score column per burstiness measure.

    Returns:
        Dict mapping each measure name to a NumPy array of terms sorted by
        descending score. Terms with equal scores are ordered by a uniform
        random key drawn from NumPy's global generator, so calling
        `np.random.seed` beforehand makes the ranking reproducible.
    """
    bursty_measure_names = term_scores_df.columns.values.tolist()[1:]

    sorted_measures = {}
    for measure in bursty_measure_names:
        # Only the two needed columns are taken, with a fresh tie-breaking key.
        # One draw of len(df) values per measure, in column order, matches the
        # random stream of the earlier copy-the-whole-frame implementation.
        keyed_df = term_scores_df[['term', measure]].assign(
            random=np.random.rand(len(term_scores_df)))

        # Highest score first; the random key decides among equal scores
        sorted_df = keyed_df.sort_values(by=[measure, 'random'], ascending=[False, True])
        sorted_measures[measure] = np.array(sorted_df['term'])

    return sorted_measures
    
def calculate_means(nested_list):
    """Average corresponding data frames element-wise across repetitions.

    `nested_list[r][i]` is the i-th data frame of repetition r. The result is
    a list with one data frame per position i, holding the element-wise mean
    over all repetitions and reusing the index and columns of repetition 0.
    """
    template_frames = nested_list[0]
    averaged = []

    for position, template in enumerate(template_frames):
        column_means = {
            column: np.mean([run[position][column].values for run in nested_list], axis=0)
            for column in template.columns
        }
        averaged.append(pd.DataFrame(column_means, index=template.index))
    return averaged

# Function to calculate the standard deviation of the corresponding data frames
def calculate_sds(nested_list):
    """Element-wise sample standard deviation of data frames across repetitions.

    `nested_list[r][i]` is the i-th data frame of repetition r. The result is
    a list with one data frame per position i, holding the element-wise
    standard deviation (ddof=1) over all repetitions and reusing the index
    and columns of repetition 0.
    """
    template_frames = nested_list[0]
    spreads = []

    for position, template in enumerate(template_frames):
        column_sds = {
            column: np.std([run[position][column].values for run in nested_list], axis=0, ddof=1)
            for column in template.columns
        }
        spreads.append(pd.DataFrame(column_sds, index=template.index))
    return spreads

def create_p_k(v, sorted_measures, k_values=None):
  """Compute precision-at-k for every burstiness measure.

  Args:
    v: collection of ground-truth terms counted as hits.
    sorted_measures: dict mapping measure name to its ranked sequence of terms.
    k_values: optional cut-offs k. When omitted, the notebook-level `at_values`
      is used, exactly as before this parameter existed.

  Returns:
    Data frame indexed by k with one column per measure; each cell is the
    fraction of the measure's top-k terms that appear in `v`.
  """
  if k_values is None:
    k_values = at_values  # resolved at call time from the notebook namespace

  # Hash the ground truth once so each membership test is O(1), not a scan
  try:
    relevant_terms = v if isinstance(v, (set, frozenset)) else frozenset(v)
  except TypeError:
    relevant_terms = v  # unhashable items: keep the original container

  # One column per measure, however many there are. The earlier fixed list of
  # ten accumulators silently dropped any measure beyond the tenth.
  p_k_dct = {}
  for measure, ranked_terms in sorted_measures.items():
    p_k_dct[measure] = [
      sum(1 for term in ranked_terms[:k] if term in relevant_terms) / k
      for k in k_values
    ]

  result = pd.DataFrame(p_k_dct)
  result.index = k_values
  return result
    
def count_words(lst, imp_words):
  """Count how many items of `lst` occur in `imp_words`.

  Repeated items in `lst` are counted every time they appear.

  Args:
    lst: sequence of terms to check.
    imp_words: collection of terms that count as hits.

  Returns:
    Number of items of `lst` found in `imp_words`.
  """
  try:
    # Hash the reference collection once; scanning a long list or array for
    # every item made each call O(len(lst) * len(imp_words)).
    lookup = imp_words if isinstance(imp_words, (set, frozenset)) else frozenset(imp_words)
    return sum(1 for x in lst if x in lookup)
  except TypeError:
    # Unhashable items: fall back to plain container membership
    return sum(1 for x in lst if x in imp_words)

In [19]:
# Initialize a random seed to ensure results can be replicated
np.random.seed(641369)

# Cut-offs k and the ground-truth term collections, one per category of
# domain-specific words, for which P@k is computed
at_values = np.array([10, 50, 100, 500, 1000, 5000])
categories = {
    'all': genia_lexical_units,
    'amino_acid': amino_acid,
    'nucleotide': nucleotide,
    'multi_cell': multi_cell,
    'cell': cell,
    'other': other,
}
all_pk_scores = []
sds_dfs = []
R = 100
# Each repetition re-ranks the terms with fresh random tie-breaking and scores
# every category against that ranking
for r in tqdm(range(R)):
    sorted_measures = resort(term_scores_df)
    pk_scores = [create_p_k(gold_terms, sorted_measures) for gold_terms in categories.values()]
    all_pk_scores.append(pk_scores)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [29:08<00:00, 17.49s/it]


In [20]:
# Display the resulting data frames, one per category in `categories` order
print("Mean P@k scores:")
# Average once; the means were previously recomputed for every displayed table
mean_pk_scores = calculate_means(all_pk_scores)
with pd.option_context('display.precision', 4):
    for mean_pk_df in mean_pk_scores:
        display(mean_pk_df)

Mean P@k scores:


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.855,0.818,0.952,1.0,1.0,0.9,1.0
50,0.8582,0.8506,0.954,0.96,1.0,0.7714,1.0
100,0.863,0.8498,0.9537,0.98,0.98,0.78,1.0
500,0.8611,0.848,0.9527,0.9834,0.974,0.8252,0.9916
1000,0.8593,0.8502,0.9528,0.9813,0.9635,0.8537,0.9854
5000,0.8601,0.8503,0.915,0.928,0.8973,0.8753,0.9315


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.255,0.247,0.562,1.0,0.8,0.169,1.0
50,0.2654,0.2422,0.5312,0.7552,0.7,0.2154,0.7902
100,0.2636,0.2418,0.5397,0.82,0.7448,0.1316,0.83
500,0.2652,0.244,0.5384,0.6878,0.622,0.22,0.692
1000,0.2652,0.2457,0.5379,0.6443,0.5901,0.2528,0.6435
5000,0.2651,0.2468,0.4293,0.4282,0.4109,0.2801,0.4294


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.156,0.142,0.146,0.0,0.1,0.0,0.0
50,0.1576,0.152,0.1542,0.1,0.12,0.12,0.1
100,0.1592,0.1559,0.1477,0.1,0.1,0.1444,0.11
500,0.1563,0.1538,0.1521,0.1418,0.146,0.1363,0.1426
1000,0.1557,0.1534,0.1527,0.1347,0.1395,0.1401,0.1364
5000,0.1556,0.1547,0.1527,0.1553,0.1483,0.1528,0.1557


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.04,0.022,0.042,0.0,0.0,0.0,0.0
50,0.0358,0.031,0.0406,0.04,0.04,0.08,0.04
100,0.0358,0.0337,0.0427,0.02,0.02,0.09,0.02
500,0.0349,0.0359,0.0416,0.0334,0.032,0.0561,0.0336
1000,0.0361,0.0361,0.0411,0.0368,0.042,0.053,0.0358
5000,0.0371,0.0358,0.0438,0.0439,0.0413,0.0468,0.0441


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.119,0.1,0.062,0.0,0.0,0.07,0.0
50,0.1172,0.1136,0.0806,0.0048,0.0,0.156,0.0098
100,0.1113,0.1099,0.0821,0.01,0.02,0.12,0.01
500,0.1113,0.1107,0.0816,0.039,0.06,0.09,0.0399
1000,0.1101,0.1111,0.0819,0.0597,0.0689,0.092,0.0598
5000,0.1105,0.1104,0.0956,0.1002,0.0987,0.0947,0.1006


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.285,0.307,0.14,0.0,0.1,0.661,0.0
50,0.2822,0.3118,0.1474,0.06,0.14,0.2,0.06
100,0.2931,0.3085,0.1415,0.03,0.0952,0.294,0.03
500,0.2933,0.3036,0.1389,0.0815,0.114,0.3228,0.0834
1000,0.292,0.3039,0.1392,0.1058,0.1229,0.3158,0.11
5000,0.2918,0.3026,0.1937,0.2004,0.1981,0.301,0.2016


In [21]:
# Display the P@k standard deviations, one data frame per category
print("P@k scores standard deviations:")
# Compute once; the standard deviations were previously recomputed for every table
sd_pk_scores = calculate_sds(all_pk_scores)
with pd.option_context('display.precision', 4):
    for sd_pk_df in sd_pk_scores:
        display(sd_pk_df)

P@k scores standard deviations:


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.1077,0.1192,0.0717,0.0,0.0,7.8107e-16,0.0
50,0.0537,0.0462,0.0318,1.5621e-15,0.0,0.0099514,0.0
100,0.0342,0.0331,0.0207,4.4633e-16,4.4633e-16,7.8107e-16,0.0
500,0.0159,0.0145,0.008,0.00099676,2.2316e-16,0.00098862,0.0009
1000,0.0114,0.011,0.0049,0.0017708,0.0010581,0.00046883,0.0014
5000,0.0042,0.0042,0.0006,0.00072288,0.00013052,0.00011192,0.0003


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.1452,0.1352,0.1536,0.0,1.5621e-15,0.046482,0.0
50,0.0678,0.0553,0.0707,0.0085847,1.339e-15,0.0084591,0.010048
100,0.0458,0.0378,0.0463,1.2274e-15,0.0050212,0.0036845,1.2274e-15
500,0.0195,0.0191,0.0202,0.0044485,0.0,1.1158e-16,0.0042211
1000,0.0123,0.0131,0.012,0.0049247,0.0006966,0.00061266,0.0038599
5000,0.0053,0.0057,0.001,0.0010892,0.00023132,0.00016783,0.00051591


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.1008,0.1148,0.1105,0.0,1.9527e-16,0.0,0.0
50,0.053,0.0431,0.0501,1.9527e-16,1.9527e-16,1.9527e-16,1.9527e-16
100,0.0361,0.0357,0.0372,1.9527e-16,1.9527e-16,0.0049889,5.5791e-17
500,0.0159,0.0158,0.0156,0.002728,3.3474e-16,0.000676,0.0028069
1000,0.0107,0.0113,0.0089,0.0031968,0.00050242,0.00071202,0.0028387
5000,0.0046,0.004,0.0008,0.00077905,9.8041e-05,0.00020416,0.00031703


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.0569,0.044,0.0572,0.0,0.0,0.0,0.0
50,0.027,0.0247,0.0273,2.7895e-17,2.7895e-17,5.5791e-17,2.7895e-17
100,0.0192,0.0186,0.0164,1.3948e-17,1.3948e-17,8.3686e-17,1.3948e-17
500,0.0075,0.0075,0.0083,0.0015277,2.0922e-17,0.00051286,0.0017146
1000,0.0053,0.0053,0.0052,0.0018297,0.00065897,4.1843e-17,0.0013437
5000,0.0022,0.0023,0.0005,0.00053068,9.8041e-05,8.3686e-17,0.00026285


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.095,0.0985,0.0776,0.0,0.0,0.046057,0.0
50,0.0409,0.0461,0.042,0.0085847,0.0,0.0080403,0.010048
100,0.0264,0.0342,0.0295,6.9739e-18,1.3948e-17,1.9527e-16,6.9739e-18
500,0.0155,0.0138,0.0107,0.0022472,9.7634e-17,8.3686e-17,0.0025485
1000,0.0102,0.0096,0.0069,0.0024872,0.00073711,5.5791e-17,0.0018051
5000,0.0044,0.0037,0.0005,0.00069588,0.0001432,9.5219e-05,0.00035607


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.1438,0.1423,0.1044,0.0,1.9527e-16,0.054855,0.0
50,0.0632,0.0673,0.0471,9.7634e-17,1.6737e-16,3.9054e-16,9.7634e-17
100,0.0438,0.0493,0.0317,4.8817e-17,0.0050212,0.0049237,4.8817e-17
500,0.019,0.02,0.0135,0.0023719,1.8132e-16,0.00098041,0.0027012
1000,0.0122,0.014,0.0086,0.0036145,0.00062109,0.00066203,0.0029404
5000,0.0051,0.0062,0.001,0.0011052,0.0002496,0.00019258,0.00043001


In [22]:
# File-name labels for Table 4, in the same order as the P@k categories
# (note the file label 'multicell' for the 'multi_cell' category)
table_4_labels = ['all', 'amino_acid', 'nucleotide', 'multicell', 'cell', 'other']

# Calculate mean P@k scores once and write each category to CSV.
# NOTE(review): index=False means the k cut-offs (the row index) are not
# written to the files -- confirm this is intended.
mean_pk_tables = calculate_means(all_pk_scores)
for label, mean_pk_df in zip(table_4_labels, mean_pk_tables):
    mean_pk_df.to_csv(f'table-4/table-4-{label}-means.csv', index=False)

# Calculate standard deviations for P@k scores once and write each category to CSV
sd_pk_tables = calculate_sds(all_pk_scores)
for label, sd_pk_df in zip(table_4_labels, sd_pk_tables):
    sd_pk_df.to_csv(f'table-4/table-4-{label}-sds.csv', index=False)

## Top 10 Ranked Terms Example

Here we reproduce the result of Table 5 from the paper.

In [23]:
# Initialize a random seed to ensure results can be replicated
# (resort draws its tie-breaking keys from NumPy's global generator)
np.random.seed(641369)

In [24]:
# Rank all terms under each measure; despite its name, ranked_terms_df is the
# dict (measure name -> ranked terms) returned by resort, not a data frame
ranked_terms_df = resort(term_scores_df)
# Keep the ten highest-ranked terms per measure, one column per measure
top_10_ranked_terms_df = pd.DataFrame(top_k(ranked_terms_df, 10))
display(top_10_ranked_terms_df)

Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
0,recessive,CLE0_element_lex,P_sequence_lex,Bcl-6_lex,Bcl-6_lex,guardian,Bcl-6_lex
1,helix,RAP30_lex,K_protein_lex,SMX_lex,TCRzeta_lex,distinct_function_lex,v-erbA_lex
2,P3A2_lex,12-O-tetradecanoylphorbol-13-acetate_(TPA)_res...,UAS2_lex,v-erbA_lex,ML-9_lex,interleukin-15_lex,SMX_lex
3,p0005,AIDS_IBLP_tumor_lex,Bik_lex,ML-9_lex,AITL_lex,thymus-derived_T-cell_homeostasis_lex,SHP1_lex
4,obese_patient_lex,beta-like_globin_cluster_lex,sesquiterpene_lactone_lex,SHP1_lex,SHP1_lex,extrathymic_development_lex,ML-9_lex
5,human_activated_monocyte_lex,stromal-derived_cytokine_interleukin-7_lex,TS_lex,beta-casein_lex,beta-casein_lex,B-lymphocyte_differentiation_control_lex,beta-casein_lex
6,putative_chicken_Shc_homologue_lex,inductive_stimulation_lex,MTBE_lex,EBNA-2_lex,A-myb_lex,B-lymphocyte_differentiation_lex,I_kappaB_lex
7,DNA_ploidy_lex,Myc_lex,Tax_lex,DM_lex,I_kappaB_lex,plasma_cell_pathway_lex,DM_lex
8,mature_organ_lex,N-terminal_c-Jun_kinase_lex,variants,TCRzeta_lex,SMX_lex,B-cell_commitment_lex,TCRzeta_lex
9,small_GTP-binding_protein_Rho_lex,PKR_expression_lex,glial_cell_lex,I_kappaB_lex,Rap1_protein_lex,memory_B_cells_lex,p95vav_lex


In [25]:
# Write the top-10 ranked terms to CSV without the row index
top_10_ranked_terms_df.to_csv('table-5/table-5.csv', index=False)

## Stopwords Exploratory Analysis

Here we reproduce the result of Table 6 from the paper.

In [26]:
import pandas as pd
from nltk.corpus import stopwords

# Ensure you have the stopwords downloaded
import nltk
nltk.download('stopwords')

def getrank(sorted_measures):
    """Build a table of 1-based ranks for every term under every measure.

    Args:
        sorted_measures: dict mapping measure name to its ranked sequence of
            terms, best first.

    Returns:
        Data frame indexed by the sorted union of all terms, with one column
        per measure. A term missing from a measure's ranking gets the rank
        `number of distinct terms + 1`.
    """
    unique_terms = set()
    for terms in sorted_measures.values():
        unique_terms.update(terms)
    unique_terms = sorted(unique_terms)

    # Placeholder rank for terms a measure did not rank at all
    unranked = len(unique_terms) + 1

    # Build each column with dictionary lookups instead of writing cells one by
    # one into an object-dtype frame and filling the gaps afterwards.
    rank_columns = {}
    for measure, terms in sorted_measures.items():
        # If a term repeats within a ranking, its last position wins
        rank_of = {term: position for position, term in enumerate(terms, start=1)}
        rank_columns[measure] = [rank_of.get(term, unranked) for term in unique_terms]

    return pd.DataFrame(rank_columns, index=unique_terms)

# Function to filter stopwords from the ranking data frame
def filter_stopwords(ranking_df):
    """Return the rows of `ranking_df` whose index label is an NLTK English stop word."""
    english_stopwords = set(stopwords.words('english'))

    # Boolean mask over the index: True where the term is a stop word
    is_stopword = ranking_df.index.isin(english_stopwords)
    return ranking_df.loc[is_stopword]

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


In [27]:
# Initialize a random seed to ensure results can be replicated
np.random.seed(641369)

# Quantile levels of the stop word ranks reported for each measure
quantile_levels = [0, 0.25, 0.5, 0.75, 1]

# Generate term burstiness ranks for R different versions of the data
R = 100
all_quantiles_df = []
for r in tqdm(range(R)):
    sorted_measures = resort(term_scores_df)
    rank = getrank(sorted_measures)
    stopwords_ranks_df = filter_stopwords(rank)
    # Column labels are the measure names; read them from .columns directly
    # instead of iterating an empty head(0) slice of the frame
    bursty_measure_names = stopwords_ranks_df.columns
    quantiles = [
        stopwords_ranks_df[bursty_measure_name].quantile(quantile_levels)
        for bursty_measure_name in bursty_measure_names
    ]
    # One row per measure, one column per quantile level
    quantiles_df = pd.DataFrame(quantiles)
    all_quantiles_df.append(quantiles_df)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:03<00:00,  1.24s/it]


In [28]:
# Extract the column and index names from the first quantiles DataFrame
# (columns are the quantile levels, index entries are the measure names)
columns = all_quantiles_df[0].columns
index = all_quantiles_df[0].index

# Initialize empty data frames to store the mean and standard deviation values
mean_df = pd.DataFrame(index=index, columns=columns)
std_df = pd.DataFrame(index=index, columns=columns)

# Compute the mean and standard deviation of corresponding elements across all matrices
# NOTE(review): np.std is called with its default ddof=0 (population SD),
# whereas calculate_sds uses ddof=1 (sample SD) for the P@k tables -- confirm
# which convention the paper's Table 6 is meant to follow.
for col in columns:
    for idx in index:
        # Same (measure, quantile) cell taken from every repetition
        values = [matrix.at[idx, col] for matrix in all_quantiles_df]
        mean_df.at[idx, col] = np.mean(values)
        std_df.at[idx, col] = np.std(values)

In [29]:
# Show the mean and standard deviation tables with four digits of precision
with pd.option_context('display.precision', 4):
    print("Mean values:")
    display(mean_df)
    print("\nStandard deviations:")
    display(std_df)

Mean values:


Unnamed: 0,0.00,0.25,0.50,0.75,1.00
IDF,4184.62,39839.62,40595.55,40765.0,40804.0
ICF,4066.6,39749.07,40564.31,40760.0,40804.0
Chi-sq,947.7,7402.0,8077.0,8736.0,40803.0
CG,19.0,6318.0,7748.58,8619.0,39282.65
ICB,70.0,6042.0,12018.0,17067.0,36577.55
DoP,5059.86,39858.0,40602.0,40765.0,40804.0
RICF,2427.0,7804.0,8287.0,8844.0,40803.0



Standard deviations:


Unnamed: 0,0.00,0.25,0.50,0.75,1.00
IDF,3330.9626,8.6218,0.4975,0.0,0.0
ICF,3688.3649,5.8826,0.4625,0.0,0.0
Chi-sq,566.6572,0.0,0.0,0.0,0.0
CG,0.0,0.0,0.4936,0.0,1455.7712
ICB,0.0,0.0,0.0,0.0,52.6453
DoP,7.985,0.0,0.0,0.0,0.0
RICF,0.0,0.0,0.0,0.0,0.0


In [30]:
# Write to CSV (the row index, i.e. the measure names, is kept in both files)
mean_df.to_csv('table-6/table-6-means.csv')
std_df.to_csv('table-6/table-6-sds.csv')