# Evaluate Term Burstiness Scores on the GENIA Corpus Data

Authors: Samuel Sarria Hurtado and Paul Sheridan

Last update: 2024-05-17

Description: Evaluate the following word burstiness scores on the Genia corpus data:
- Inverse Document Frequency (IDF)
- Inverse Collection Frequency (ICF)
- Chi-square
- Church and Gale (CG)
- Irvine and Callison-Burch (ICB)
- Derivation of Proportions (DoP)
- Residual ICF (RICF)

Calculate P@k scores for each scoring function using the GENIA terms as ground truth. Also, evaluate scoring functions for their ability to filter out stop words.

## Imports

In [2]:
# Add path to Python function files to system path
import sys
import json
import pandas as pd
imports_path = '../0-base-functions/'
sys.path.append(imports_path)
import word_stats
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from io import StringIO
from numpy import nan

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


## Preliminaries

In [3]:
# Make results reproducible by setting the random seed
np.random.seed(641369)

## Read in Data

In [4]:
# Load the genia corpus, the genia keywords, the keybert rankings, and all the different lists of stopwords.
json_genia_path = '../1-preprocessing/GENIAcorpus3.02-preprocessed.json'

with open(json_genia_path, "r") as j:
  genia = json.loads(j.read())

keyword_genia_path = '../1-preprocessing/GENIAcorpus3.02-keywords.tsv'

with open(keyword_genia_path, "r") as c:
  key_words = pd.read_csv(c, sep='\t')
all_genia_words = key_words.lex.to_numpy()

terrier_path = '../0-data-raw/terrier-stopwords.txt'

terrier_stopwords = np.loadtxt(terrier_path, dtype=str)

myisam_path = '../0-data-raw/myisam-stopwords.txt'

with open(myisam_path, 'r') as t:
  myisam_txt = StringIO(t.read() + '  NA  NA')

myisam_stopwords = np.loadtxt(myisam_txt, dtype=str)
myisam_stopwords = np.reshape(myisam_stopwords, (545, ))
nas = np.where(myisam_stopwords == 'NA')[0]
myisam_stopwords = np.delete(myisam_stopwords, nas).tolist()

## Construct GENIA Corpus Vocabulary

This is the vocabulary input for the count vectorizer below.

In [6]:
# Custom function so the Count vectorizer won't ignore any words
def analyzer_custom(doc):
  return doc.split()

In [7]:
pre_vocab = []
for i in range(len(genia)):
  pre_vocab.append(genia[i].split())

vocab = []
for i in range(len(pre_vocab)):
  for j in range(len(pre_vocab[i])):
    vocab.append(pre_vocab[i][j])

vocab = list(set(vocab))

In [8]:
counter = CountVectorizer(lowercase=False, vocabulary=vocab, analyzer=analyzer_custom)
collection = counter.transform(genia)

In [9]:
print(counter.get_feature_names_out())

['gg' 'dihydroxyvitamin_D3_lex' 'olfactory_neuronal_cell_lex' ...
 'PWM-generated_B-cell_proliferation_lex' '13-cis_RA_lex'
 'diphenylhexatriene_lex']


In [10]:
print(vocab)

['gg', 'dihydroxyvitamin_D3_lex', 'olfactory_neuronal_cell_lex', '35-kDa_(alpha)_subunit_lex', 'IFN-gamma-induced_tyrosine_phosphorylation_lex', 'x507', 'Jun_lex', 'proximal_gamma-globin_promoter_lex', 'coactivating', 'colleagues', 'EMCV_IRES_fragment_lex', 'favor', 'cell-specific_pathway_lex', 'acute_adrenal_failure_lex', 'IL-2_receptor_(IL-2R)_alpha_lex', 'RARA_lex', 'adhesion', 'repressor_element_lex', 'Ig_beta_lex', 'divisions', 'TIMP-1-negative_BL_line_lex', 'elderly', 'keep', 'dl312_virus_lex', 'CD4_signaling_lex', 'existing', 'differentially', 'Gel-shift_analys_lex', 'protein_turnover-degradation_lex', 'human_lymphoblastic_Jurkat_T_cell_lex', 'against', 'DQA1*0201/DQB1*0201_lex', 'Rev_protein_lex', 'nearly', 'comparison', 'non-heat_shock_gene_lex', 'raising', '229-bp_region_lex', 'degranulated_mast_cell_lex', '518', '[3H]dexamethasone_lex', 'TSG101_lex', 'ICAM-1_molecule_lex', 'MyD88-/-_Th1-developing_cell_lex', 'purified_SR-BP_lex', 'hMNL_lex', 'human_T-cell_leukemia_virus_type

## Calculate Bag-of-words Model Word Statistics

In [11]:
m = len(counter.get_feature_names_out())
d = collection.shape[0]
N_i = word_stats.get_Ni(collection)
N_j = word_stats.get_Nj(collection)
N = word_stats.get_N(N_j)
B_ij = word_stats.get_Bij(collection)
B_i = word_stats.get_Bi(B_ij)
B_j = word_stats.get_Bj(B_ij)
DF = word_stats.get_DF(B_i, d)
CF = word_stats.get_CF(N_i)
nij_by_nj = word_stats.get_nij_by_nj(collection, N_j)
thetas = np.array(range(1, max(N_i.A[0]) + 1))/N
opt_thetas = word_stats.get_opt_thetas(N, m, d, N_i, N_j, B_i, thetas)

## Evaluate Term Burstiness Measures

In [12]:
IDF = word_stats.get_IDF(DF)
ICF = word_stats.get_ICF(CF)
Chisq = word_stats.get_Chisq(collection)
CG = word_stats.get_CG(N_i, B_i)
ICB = word_stats.get_ICB(nij_by_nj, B_i)
DoP = word_stats.get_DoP(collection, N_i, N_j, N)
RICF = word_stats.get_RICF(opt_thetas, N, ICF)

  return -np.log(chisq_values)


In [13]:
# testing (delete later)
print(IDF.A[0])
print(counter.get_feature_names_out())

[7.60090246 7.60090246 7.60090246 ... 7.60090246 7.60090246 7.60090246]
['gg' 'dihydroxyvitamin_D3_lex' 'olfactory_neuronal_cell_lex' ...
 'PWM-generated_B-cell_proliferation_lex' '13-cis_RA_lex'
 'diphenylhexatriene_lex']


## Compare Term Burstiness Measures

### Preliminary Dataframes

In [14]:
dta = {'term': counter.get_feature_names_out(), 'IDF': IDF.A[0], 'ICF': ICF.A[0], 'Chi-sq': Chisq, 'CG': CG.A[0], 'ICB': ICB.A[0], 'DoP': DoP.A[0], 'RICF': RICF.A[0]}
scores_df = pd.DataFrame(data=dta)

In [15]:
print(scores_df)
scores_df.to_csv('bursty_scores.tsv', sep='\t')

                                                 term       IDF        ICF  \
0                                                  gg  7.600902  12.107806   
1                             dihydroxyvitamin_D3_lex  7.600902  12.800954   
2                         olfactory_neuronal_cell_lex  7.600902  12.800954   
3                          35-kDa_(alpha)_subunit_lex  7.600902  12.800954   
4      IFN-gamma-induced_tyrosine_phosphorylation_lex  7.600902  12.800954   
...                                               ...       ...        ...   
40799                                  CD2_marker_lex  7.600902  12.800954   
40800                      cytokine-based_therapy_lex  7.600902  12.800954   
40801          PWM-generated_B-cell_proliferation_lex  7.600902  12.800954   
40802                                   13-cis_RA_lex  7.600902  12.800954   
40803                          diphenylhexatriene_lex  7.600902  12.800954   

           Chi-sq   CG    ICB       DoP      RICF  
0      311.

In [16]:
sorted_indices = []
cols = scores_df.columns.values.tolist()
for col in cols:
  if col == 'term':
    sorted_indices.append(np.array(scores_df['term']))
  else:
    a = np.array(scores_df[[col]])
    sorted_indices.append(len(a) - scipy.stats.rankdata(a, method='ordinal', nan_policy='omit').astype(int))

sorted_indices = np.array(sorted_indices)
m_t_pair = zip(cols, sorted_indices)
measures_indices = dict(m_t_pair)
measures_indices_df = pd.DataFrame(measures_indices)

In [17]:
st_words = stopwords.words('english')
all_stopwords = [st_words, terrier_stopwords, myisam_stopwords]
lst_stopwords = []
for i in range(len(all_stopwords)):
  for j in range(len(all_stopwords[i])):
    lst_stopwords.append(all_stopwords[i][j])

lst_stopwords = set(lst_stopwords)
vocab_st_words = list(set(vocab).intersection(set(lst_stopwords)))
vocab_st_words_in = []
for i in range(len(measures_indices_df['term'])):
  if measures_indices_df['term'][i] in vocab_st_words:
    vocab_st_words_in.append(i)

vocab_st_words_in = np.array(vocab_st_words_in)

table_of_stop_words = measures_indices_df[measures_indices_df['term'].isin(vocab_st_words)]

### Statistical Analysis of Measures

In [18]:
# Each measure has a unique ranking for a word in the corpus. The smaller the ranking
# the higher the burstiness. These are the quartiles of the rankings for all measures.
quantiles = []
for col in cols:
  if col == 'term': continue
  quantiles.append(table_of_stop_words[col].quantile([0, 0.25, 0.5, 0.75, 1]))

quantiles_df = pd.DataFrame(quantiles)
display(quantiles_df)

Unnamed: 0,0.00,0.25,0.50,0.75,1.00
IDF,1011.0,37028.0,39693.0,40549.0,40803.0
ICF,886.0,35597.0,39509.0,40515.0,40803.0
Chi-sq,34.0,8022.0,8796.0,39139.0,40802.0
CG,18.0,7750.0,8755.0,23000.0,40786.0
ICB,69.0,11310.0,16852.0,22435.0,40518.0
DoP,0.0,253.0,1111.0,3719.0,40497.0
RICF,2093.0,8122.0,8810.0,39312.0,40802.0


In [19]:
# This cell writes the previous pandas dataframe as a CSV (uncomment to write)
quantiles_df.to_csv('quartiles.csv')

In [20]:
sorted_terms = []
measures = cols[1:]
for measure in measures:
  sorted_terms.append(np.array(scores_df[['term', measure]].sort_values(measure, ascending=False)['term']))

sorted_terms = np.array(sorted_terms)
measure_term_pair = zip(measures, sorted_terms)
sorted_measures = dict(measure_term_pair)

In [21]:
def top_k(dct, k):
  keys = dct.keys()
  values = []
  for key in keys:
    values.append(dct[key][:k])
  keys_values_pair = zip(keys, values)
  return dict(keys_values_pair)

In [22]:
# These are the top 10 most bursty words as ranked by each measure
top_10_terms = pd.DataFrame(top_k(sorted_measures, 10))
top_10_terms

Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
0,gg,c-myc_protooncogene_mRNA_expression_lex,shear,Bcl-6_lex,Bcl-6_lex,of,Bcl-6_lex
1,blood_histamine_response_lex,strong_positive_regulatory_effect_lex,X-SCID_B_cell_lex,SMX_lex,TCRzeta_lex,the,SMX_lex
2,crossreacts,wellrecognized,DG_lex,v-erbA_lex,ML-9_lex,in,v-erbA_lex
3,z,human_GASd_element_lex,Trident_lex,SHP1_lex,AITL_lex,and,ML-9_lex
4,19224,Single_C_motif-1_(SCM-1)/lymphotactin_lex,SMX-NHOH_lex,ML-9_lex,SHP1_lex,to,SHP1_lex
5,protein_inhibitor_lex,DNA_affinity-purified_protein_lex,pim-1_gene_lex,beta-casein_lex,beta-casein_lex,a,beta-casein_lex
6,NF-AT1_transcription_factor_complex_lex,restricted_protein_kinase_C_(PKC)_isoform_lex,ethanol_lex,I_kappaB_lex,A-myb_lex,that,DM_lex
7,peroxisome_proliferator-activated_receptor_alp...,inflammatory_liver_disease_lex,RelB_lex,DM_lex,I_kappaB_lex,by,I_kappaB_lex
8,dominant-negative_form_of_Stat3_lex,EBV-infected_T_cell_lex,HL60_cell_lex,TCRzeta_lex,SMX_lex,with,TCRzeta_lex
9,VDRnuc_lex,nodular_lymphocyte_predominance_Hodgkin's_dise...,IL_6_lex,EBNA-2_lex,Rap1_protein_lex,we,p95vav_lex


In [23]:
# This cell writes the previous pandas dataframe as a CSV (uncomment to write)
top_10_terms.to_csv('top_10_terms.csv')

In [24]:
# This cell writes the full dataframe as a CSV (uncomment to write)
top_45000_terms = pd.DataFrame(top_k(sorted_measures, 45000))
top_45000_terms.to_csv('top_45000_terms.csv')

In [25]:
amino_acid_sems = ['G#amino_acid_monomer', 'G#peptide', 'G#protein_N/A',
              'G#protein_complex', 'G#protein_domain_or_region',
              'G#protein_family_or_group', 'G#protein_molecule',
              'G#protein_substructure', 'G#protein_subunit',
              'G#other_organic_compound', 'G#organic', 'G#inorganic', 'G#atom',
              'G#carbohydrate', 'G#lipid']
nucleotide_sems = ['G#nucleotide', 'G#polynucleotide', 'G#DNA_N/A',
        'G#DNA_domain_or_region', 'G#DNA_family_or_group', 'G#DNA_molecule',
        'G#DNA_substructure', 'G#RNA_N/A', 'G#RNA_domain_or_region',
        'G#RNA_family_or_group', 'G#RNA_molecule', 'G#RNA_substructure']
multi_cell_sems = ['G#virus', 'G#mono_cell', 'G#multi_cell', 'G#body_part', 'G#tissue']
cell_sems = ['G#cell_type', 'G#cell_component', 'G#cell_line',
         'G#other_artificial_source']
other_sems = ['G#other_name']

sem = np.array(key_words['sem'])
lex = np.array(key_words['lex'])
lex_sem_dct = dict(zip(lex, sem))

def get_color_words(lst_color):
  words = []
  for k, v in lex_sem_dct.items():
    if v in lst_color:
      words.append(k)
  return words

amino_acid = get_color_words(amino_acid_sems)
nucleotide = get_color_words(nucleotide_sems)
multi_cell = get_color_words(multi_cell_sems)
cell = get_color_words(cell_sems)
other = get_color_words(other_sems)

print('amino_acid words: ', len(amino_acid), 'nucleotide words: ', len(nucleotide),
      'multi_cell words: ', len(multi_cell), 'cell words: ', len(cell),
      'other words: ', len(other))

amino_acid words:  10155 nucleotide words:  5574 multi_cell words:  1444 cell words:  4051 other words:  10560


In [26]:
def count_words(lst, imp_words):
  counter = 0
  for x in lst:
    if x in imp_words:
      counter += 1
  return counter

def create_p_k(lst_words):
  measures = sorted_measures.keys()
  counts = [[], [], [], [], [], [], [], [], [], []]
  p_k_dct = dict(zip(measures, counts))
  for measure in p_k_dct.keys():
    for value in at_values:
      p_k_dct[measure].append(count_words(top_k(sorted_measures, value)[measure], lst_words)/value)
  result = pd.DataFrame(p_k_dct)
  result.index = at_values
  return result

In [27]:
# These are the p@k scores for the different categories of domain-specific words
at_values = np.array([10, 50, 100, 500, 1000, 5000])
highlights = {'all': all_genia_words, 'amino_acid': amino_acid, 'nucleotide': nucleotide, 'multi_cell': multi_cell, 'cell': cell, 'other': other}
dfs = []
for k, v in highlights.items():
  dfs.append(create_p_k(v))
  print(k)
  display(dfs[-1])

all


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.6,0.9,0.9,1.0,1.0,0.0,1.0
50,0.84,0.9,0.96,0.96,1.0,0.02,1.0
100,0.84,0.87,0.97,0.98,0.98,0.02,1.0
500,0.846,0.862,0.964,0.984,0.974,0.088,0.992
1000,0.863,0.861,0.955,0.981,0.964,0.142,0.984
5000,0.8664,0.856,0.9156,0.9286,0.8974,0.3754,0.931


amino_acid


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.4,0.3,0.6,1.0,0.8,0.0,1.0
50,0.26,0.32,0.58,0.74,0.7,0.02,0.78
100,0.25,0.29,0.52,0.82,0.75,0.01,0.83
500,0.256,0.276,0.558,0.682,0.622,0.048,0.692
1000,0.279,0.267,0.542,0.651,0.589,0.07,0.639
5000,0.2664,0.2462,0.4288,0.4252,0.411,0.163,0.4302


nucleotide


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.0,0.1,0.1,0.0,0.1,0.0,0.0
50,0.12,0.12,0.16,0.1,0.12,0.0,0.1
100,0.15,0.1,0.17,0.1,0.1,0.0,0.11
500,0.148,0.164,0.16,0.146,0.146,0.004,0.14
1000,0.142,0.157,0.158,0.133,0.14,0.014,0.141
5000,0.1526,0.1514,0.1536,0.1562,0.1482,0.0502,0.1556


multi_cell


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.1,0.0,0.04,0.04,0.04,0.0,0.04
100,0.06,0.01,0.05,0.02,0.02,0.0,0.02
500,0.036,0.032,0.028,0.032,0.032,0.006,0.036
1000,0.038,0.038,0.037,0.037,0.042,0.011,0.037
5000,0.037,0.037,0.0436,0.045,0.0414,0.0202,0.0442


cell


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.0,0.1,0.2,0.0,0.0,0.0,0.0
50,0.04,0.12,0.1,0.02,0.0,0.0,0.02
100,0.05,0.12,0.07,0.01,0.02,0.01,0.01
500,0.102,0.11,0.08,0.042,0.06,0.02,0.04
1000,0.113,0.102,0.082,0.058,0.07,0.026,0.059
5000,0.1116,0.1104,0.0954,0.1016,0.099,0.0514,0.1004


other


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.2,0.4,0.0,0.0,0.1,0.0,0.0
50,0.32,0.34,0.08,0.06,0.14,0.0,0.06
100,0.33,0.35,0.16,0.03,0.09,0.0,0.03
500,0.304,0.28,0.138,0.082,0.114,0.01,0.084
1000,0.291,0.297,0.136,0.102,0.123,0.021,0.108
5000,0.2988,0.311,0.1942,0.2006,0.1978,0.0906,0.2006


In [29]:
# This cell writes the p@k tables to file (uncomment to write).
dfs[0].to_csv('p_at_k/all_semantic_classes.csv')
dfs[1].to_csv('p_at_k/amino_acid_semantic_classes.csv')
dfs[2].to_csv('p_at_k/nucleotide_semantic_classes.csv')
dfs[3].to_csv('p_at_k/multi-cell_semantic_classes.csv')
dfs[4].to_csv('p_at_k/cell_semantic_classes.csv')
dfs[5].to_csv('p_at_k/other_semantic_classes.csv')

In [30]:
# This cell counts how many words of each color there are in the corpus
color_words = [all_genia_words, amino_acid, nucleotide, multi_cell, cell, other]
color_words_counter = [0, 0, 0, 0, 0, 0]
for i in range(len(color_words)):
  for j in range(len(vocab)):
    if vocab[j] in color_words[i]:
      color_words_counter[i] += N_i.A[0][j]

names_of_words = ['all', 'amino_acid', 'nucleotide', 'multi_cell', 'cell', 'other']
color_word_count_zip = zip(names_of_words, color_words_counter)
counter_dict = dict(color_word_count_zip)
counter_dict

{'all': 90969,
 'amino_acid': 42478,
 'nucleotide': 11619,
 'multi_cell': 5247,
 'cell': 11626,
 'other': 19999}

In [31]:
# This cell counts the number of unique words of each color in the corpus
color_words_counter = [0, 0, 0, 0, 0, 0]
for i in range(len(color_words)):
  for j in range(len(vocab)):
    if vocab[j] in color_words[i]:
      color_words_counter[i] += 1

unique_word_count_zip = zip(names_of_words, color_words_counter)
unique_word_num_dict = dict(unique_word_count_zip)
unique_word_num_dict

{'all': 31784,
 'amino_acid': 10155,
 'nucleotide': 5574,
 'multi_cell': 1444,
 'cell': 4051,
 'other': 10560}