# Evaluate Term Burstiness Scores on the Genia Corpus Data

Authors: Samuel Sarria Hurtado and Paul Sheridan

Last update: 2024-05-16

Description: Evaluate the following word burstiness scores on the Genia corpus data
- Chi-square
- Church and Gale (CG)
- Irvine and Callison-Burch (ICB)
- Derivation of Proportions (DoP)
- Residual ICF (RICF)

Calculate P@k scores for each scoring function using the GENIA terms as ground truth.

## Imports

In [1]:
# Add path to Python function files to system path
import sys
import json
import pandas as pd
imports_path = '../0-base-functions/'
sys.path.append(imports_path)
import word_stats
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from io import StringIO
from numpy import nan

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


## Read in Data

In [2]:
np.random.seed(641369)

In [3]:
# load the genia corpus, the genia keywords, the keybert rankings, and all the different lists of stopwords.
json_genia_path = '../0-data-preprocessed/GENIAcorpus3.02-preprocessed.json'

with open(json_genia_path, "r") as j:
  genia = json.loads(j.read())

keyword_genia_path = '../0-data-preprocessed/GENIAcorpus3.02-keywords.tsv'

with open(keyword_genia_path, "r") as c:
  key_words = pd.read_csv(c, sep='\t')
all_genia_words = key_words.lex.to_numpy()

terrier_path = '../0-data-raw/terrier-stopwords.txt'

terrier_stopwords = np.loadtxt(terrier_path, dtype=str)

myisam_path = '../0-data-raw/myisam-stopwords.txt'

with open(myisam_path, 'r') as t:
  myisam_txt = StringIO(t.read() + '  NA  NA')

myisam_stopwords = np.loadtxt(myisam_txt, dtype=str)
myisam_stopwords = np.reshape(myisam_stopwords, (545, ))
nas = np.where(myisam_stopwords == 'NA')[0]
myisam_stopwords = np.delete(myisam_stopwords, nas).tolist()

## Construct Vocabulary

This is the vocabulary input for the count vectorizer below.

In [4]:
pre_vocab = []
for i in range(len(genia)):
  pre_vocab.append(genia[i].split())

vocab = []
for i in range(len(pre_vocab)):
  for j in range(len(pre_vocab[i])):
    vocab.append(pre_vocab[i][j])

vocab = list(set(vocab))

## Vectorize the GENIA Corpus Documents

In [5]:
# Custom function so the Count vectorizer won't ignore any words
def analyzer_custom(doc):
  return doc.split()

In [6]:
counter = CountVectorizer(lowercase=False, vocabulary=vocab, analyzer=analyzer_custom)
collection = counter.transform(genia)

In [7]:
print(counter.get_feature_names_out())

['128_bp_of_upstream_sequence_lex' 'HIV_enhancer_activity_lex'
 'T_lymphotropic_virus_lex' ... 'B104_lex' 'cytokine_induction_lex'
 'apocynin_lex']


## Important Text Analysis Variables

In [8]:
m = len(counter.get_feature_names_out())
d = collection.shape[0]
N_i = word_stats.get_Ni(collection)
N_j = word_stats.get_Nj(collection)
N = word_stats.get_N(N_j)
B_ij = word_stats.get_Bij(collection)
B_i = word_stats.get_Bi(B_ij)
B_j = word_stats.get_Bj(B_ij)
DF = word_stats.get_DF(B_i, d)
CF = word_stats.get_CF(N_i)
nij_by_nj = word_stats.get_nij_by_nj(collection, N_j)
thetas = np.array(range(1, max(N_i.A[0]) + 1))/N
opt_thetas = word_stats.get_opt_thetas(N, m, d, N_i, N_j, B_i, thetas)

## Important Term Burstiness Measures

In [9]:
IDF = word_stats.get_IDF(DF)
ICF = word_stats.get_ICF(CF)
Chisq = word_stats.get_Chisq(collection)
CG = word_stats.get_CG(N_i, B_i)
ICB = word_stats.get_ICB(nij_by_nj, B_i)
DoP = word_stats.get_DoP(collection, N_i, N_j, N)
RICF = word_stats.get_RICF(opt_thetas, N, ICF)

  return -np.log(chisq_values)


In [12]:
# testing (deleted later)
print(IDF.A[0])
print(counter.get_feature_names_out())

[7.60090246 7.60090246 7.60090246 ... 7.60090246 4.23360663 7.60090246]
['virus_infectivity_lex'
 'vascular_cell_adhesion_molecule-1_(VCAM-1)_expression_lex' 'Pax-8_lex'
 ... 'Epstein-Barr_virus_immediate-early_antigen_lex'
 'nuclear_protein_lex' 'stably_infected_CD4+_cell_lex']


## Compare Term Burstiness Measures

### Preliminary Dataframes

In [10]:
dta = {'term': counter.get_feature_names_out(), 'IDF': IDF.A[0], 'ICF': ICF.A[0], 'Chi-sq': Chisq, 'CG': CG.A[0], 'ICB': ICB.A[0], 'DoP': DoP.A[0], 'RICF': RICF.A[0]}
scores_df = pd.DataFrame(data=dta)

In [11]:
print(scores_df)
scores_df.to_csv('../3-main-results/bursty_scores.tsv', sep='\t')

                                     term       IDF        ICF      Chi-sq  \
0                      effector_sites_lex  7.600902  12.800954    0.701595   
1               insulin_gene_promoter_lex  7.600902  12.107806  311.074036   
2                                  tester  7.600902  12.800954    0.701595   
3               p95vav_overexpression_lex  7.600902  12.800954    0.701595   
4                               prognosis  5.521461  10.498368   65.726451   
...                                   ...       ...        ...         ...   
40799        neuronal_differentiation_lex  7.600902  12.800954    0.701595   
40800          35-kDa_(alpha)_subunit_lex  7.600902  12.800954    0.701595   
40801  HSV-specific_CD4+_T-cell_clone_lex  7.600902  12.800954    0.701595   
40802              DNA-mobility_shift_lex  7.600902  12.800954    0.701595   
40803                     HLA-DRalpha_lex  7.600902  12.800954    0.701595   

         CG      ICB       DoP      RICF  
0      1.00  162.000

In [12]:
sorted_indices = []
cols = scores_df.columns.values.tolist()
for col in cols:
  if col == 'term':
    sorted_indices.append(np.array(scores_df['term']))
  else:
    a = np.array(scores_df[[col]])
    sorted_indices.append(len(a) - scipy.stats.rankdata(a, method='ordinal', nan_policy='omit').astype(int))

sorted_indices = np.array(sorted_indices)
m_t_pair = zip(cols, sorted_indices)
measures_indices = dict(m_t_pair)
measures_indices_df = pd.DataFrame(measures_indices)

In [13]:
st_words = stopwords.words('english')
all_stopwords = [st_words, terrier_stopwords, myisam_stopwords]
lst_stopwords = []
for i in range(len(all_stopwords)):
  for j in range(len(all_stopwords[i])):
    lst_stopwords.append(all_stopwords[i][j])

lst_stopwords = set(lst_stopwords)
vocab_st_words = list(set(vocab).intersection(set(lst_stopwords)))
vocab_st_words_in = []
for i in range(len(measures_indices_df['term'])):
  if measures_indices_df['term'][i] in vocab_st_words:
    vocab_st_words_in.append(i)

vocab_st_words_in = np.array(vocab_st_words_in)

table_of_stop_words = measures_indices_df[measures_indices_df['term'].isin(vocab_st_words)]

### Statistical Analysis of Measures

In [14]:
# Each measure has a unique ranking for a word in the corpus. The smaller the ranking
# the higher the burstiness. These are the quartiles of the rankings for all measures.
quantiles = []
for col in cols:
  if col == 'term': continue
  quantiles.append(table_of_stop_words[col].quantile([0, 0.25, 0.5, 0.75, 1]))

quantiles_df = pd.DataFrame(quantiles)
display(quantiles_df)

Unnamed: 0,0.00,0.25,0.50,0.75,1.00
IDF,691.0,37072.0,39692.0,40549.0,40803.0
ICF,626.0,35687.0,39494.0,40515.0,40803.0
Chi-sq,1282.0,8023.0,8796.0,39133.0,40802.0
CG,18.0,7750.0,8754.0,24023.0,40697.0
ICB,69.0,11310.0,16852.0,22435.0,40502.0
DoP,0.0,253.0,1111.0,3719.0,40487.0
RICF,2093.0,8122.0,8810.0,39487.0,40802.0


In [15]:
# This cell writes the previous pandas dataframe as a CSV (uncomment to write)
quantiles_df.to_csv('../3-main-results/measures_quartiles.csv')

In [16]:
sorted_terms = []
measures = cols[1:]
for measure in measures:
  sorted_terms.append(np.array(scores_df[['term', measure]].sort_values(measure, ascending=False)['term']))

sorted_terms = np.array(sorted_terms)
measure_term_pair = zip(measures, sorted_terms)
sorted_measures = dict(measure_term_pair)

In [17]:
def top_k(dct, k):
  keys = dct.keys()
  values = []
  for key in keys:
    values.append(dct[key][:k])
  keys_values_pair = zip(keys, values)
  return dict(keys_values_pair)

In [18]:
# These are the top 10 most bursty words as ranked by each measure
top_10_terms = pd.DataFrame(top_k(sorted_measures, 10))
top_10_terms

Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
0,effector_sites_lex,effector_sites_lex,FN_lex,Bcl-6_lex,Bcl-6_lex,of,Bcl-6_lex
1,IL-2-dependent_T_cell_proliferation_lex,early_inflammatory_response_lex,ICER_lex,v-erbA_lex,TCRzeta_lex,the,v-erbA_lex
2,total_cell_lysate_lex,NF-muNR_binding_lex,Egr-2_lex,SMX_lex,ML-9_lex,in,SMX_lex
3,ligand-inducible_phosphorylation_lex,transdimer,supine,SHP1_lex,AITL_lex,and,ML-9_lex
4,HS1_lex,major_mRNA_transcript_lex,LEF1/BCF1_lex,ML-9_lex,SHP1_lex,to,SHP1_lex
5,microg106,CD40_activation_lex,MNP-2_lex,beta-casein_lex,beta-casein_lex,a,beta-casein_lex
6,CD4-expressing_T_lymphocyte_lex,pp38_lex,RS_cell_lex,EBNA-2_lex,A-myb_lex,that,I_kappaB_lex
7,aminoacid_cys22-gly22_lex,functional_binding_site_lex,TCR_affinity_lex,I_kappaB_lex,I_kappaB_lex,by,p95vav_lex
8,hypertensives,noncytoxic_concentration_lex,Tax1_lex,DM_lex,SMX_lex,with,TCRzeta_lex
9,mammalian_histone_deacetylase_lex,Gal/ER_lex,peri-kappa_B_site_lex,p95vav_lex,Rap1_protein_lex,we,DM_lex


In [19]:
# This cell writes the previous pandas dataframe as a CSV (uncomment to write)
top_10_terms.to_csv('../3-main-results/top_10_terms.csv')

In [20]:
# This cell writes the full dataframe as a CSV (uncomment to write)
top_45000_terms = pd.DataFrame(top_k(sorted_measures, 45000))
top_45000_terms.to_csv('../3-main-results/top_45000_terms.csv')

In [21]:
amino_acid_sems = ['G#amino_acid_monomer', 'G#peptide', 'G#protein_N/A',
              'G#protein_complex', 'G#protein_domain_or_region',
              'G#protein_family_or_group', 'G#protein_molecule',
              'G#protein_substructure', 'G#protein_subunit',
              'G#other_organic_compound', 'G#organic', 'G#inorganic', 'G#atom',
              'G#carbohydrate', 'G#lipid']
nucleotide_sems = ['G#nucleotide', 'G#polynucleotide', 'G#DNA_N/A',
        'G#DNA_domain_or_region', 'G#DNA_family_or_group', 'G#DNA_molecule',
        'G#DNA_substructure', 'G#RNA_N/A', 'G#RNA_domain_or_region',
        'G#RNA_family_or_group', 'G#RNA_molecule', 'G#RNA_substructure']
multi_cell_sems = ['G#virus', 'G#mono_cell', 'G#multi_cell', 'G#body_part', 'G#tissue']
cell_sems = ['G#cell_type', 'G#cell_component', 'G#cell_line',
         'G#other_artificial_source']
other_sems = ['G#other_name']

sem = np.array(key_words['sem'])
lex = np.array(key_words['lex'])
lex_sem_dct = dict(zip(lex, sem))

def get_color_words(lst_color):
  words = []
  for k, v in lex_sem_dct.items():
    if v in lst_color:
      words.append(k)
  return words

amino_acid = get_color_words(amino_acid_sems)
nucleotide = get_color_words(nucleotide_sems)
multi_cell = get_color_words(multi_cell_sems)
cell = get_color_words(cell_sems)
other = get_color_words(other_sems)

print('amino_acid words: ', len(amino_acid), 'nucleotide words: ', len(nucleotide),
      'multi_cell words: ', len(multi_cell), 'cell words: ', len(cell),
      'other words: ', len(other))

amino_acid words:  10155 nucleotide words:  5574 multi_cell words:  1444 cell words:  4051 other words:  10560


In [22]:
def count_words(lst, imp_words):
  counter = 0
  for x in lst:
    if x in imp_words:
      counter += 1
  return counter

def create_p_k(lst_words):
  measures = sorted_measures.keys()
  counts = [[], [], [], [], [], [], [], [], [], []]
  p_k_dct = dict(zip(measures, counts))
  for measure in p_k_dct.keys():
    for value in at_values:
      p_k_dct[measure].append(count_words(top_k(sorted_measures, value)[measure], lst_words)/value)
  result = pd.DataFrame(p_k_dct)
  result.index = at_values
  return result

In [23]:
# These are the p@k scores for the different categories of domain-specific words
at_values = np.array([10, 50, 100, 500, 1000, 5000])
highlights = {'all': all_genia_words, 'amino_acid': amino_acid, 'nucleotide': nucleotide, 'multi_cell': multi_cell, 'cell': cell, 'other': other}
dfs = []
for k, v in highlights.items():
  dfs.append(create_p_k(v))
  print(k)
  display(dfs[-1])

all


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.8,0.9,0.9,1.0,1.0,0.0,1.0
50,0.78,0.84,0.96,0.96,1.0,0.02,1.0
100,0.85,0.84,0.96,0.98,0.98,0.02,1.0
500,0.858,0.828,0.946,0.984,0.974,0.088,0.992
1000,0.872,0.84,0.946,0.983,0.964,0.142,0.987
5000,0.865,0.855,0.9148,0.9286,0.8974,0.3754,0.931


amino_acid


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.5,0.3,0.6,1.0,0.8,0.0,1.0
50,0.3,0.26,0.6,0.76,0.7,0.02,0.78
100,0.32,0.22,0.57,0.82,0.75,0.01,0.83
500,0.264,0.24,0.532,0.686,0.622,0.048,0.696
1000,0.273,0.254,0.522,0.641,0.589,0.07,0.643
5000,0.268,0.2496,0.4294,0.4302,0.4112,0.163,0.43


nucleotide


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.0,0.2,0.1,0.0,0.1,0.0,0.0
50,0.1,0.12,0.1,0.1,0.12,0.0,0.1
100,0.12,0.1,0.11,0.1,0.1,0.0,0.11
500,0.13,0.112,0.136,0.142,0.146,0.004,0.144
1000,0.147,0.13,0.149,0.135,0.14,0.014,0.137
5000,0.151,0.149,0.1534,0.1552,0.1484,0.0502,0.156


multi_cell


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.02,0.04,0.02,0.04,0.04,0.0,0.04
100,0.04,0.05,0.04,0.02,0.02,0.0,0.02
500,0.046,0.034,0.054,0.034,0.032,0.006,0.032
1000,0.039,0.041,0.046,0.039,0.042,0.011,0.037
5000,0.0372,0.0342,0.044,0.0442,0.0412,0.0202,0.0442


cell


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.1,0.0,0.1,0.0,0.0,0.0,0.0
50,0.1,0.06,0.14,0.0,0.0,0.0,0.02
100,0.1,0.14,0.12,0.01,0.02,0.01,0.01
500,0.102,0.14,0.078,0.04,0.06,0.02,0.038
1000,0.097,0.116,0.087,0.062,0.07,0.026,0.06
5000,0.1074,0.1096,0.0942,0.0988,0.0986,0.0514,0.1


other


Unnamed: 0,IDF,ICF,Chi-sq,CG,ICB,DoP,RICF
10,0.2,0.4,0.1,0.0,0.1,0.0,0.0
50,0.26,0.36,0.1,0.06,0.14,0.0,0.06
100,0.27,0.33,0.12,0.03,0.09,0.0,0.03
500,0.316,0.302,0.146,0.082,0.114,0.01,0.082
1000,0.316,0.299,0.142,0.106,0.123,0.021,0.11
5000,0.3014,0.3126,0.1938,0.2002,0.198,0.0906,0.2008


In [25]:
# This cell writes the p@k tables to file (uncomment to write).
dfs[0].to_csv('../3-main-results/p@k/all_semantic_classes.csv')
dfs[1].to_csv('../3-main-results/p@k/amino_acid_semantic_classes.csv')
dfs[2].to_csv('../3-main-results/p@k/nucleotide_semantic_classes.csv')
dfs[3].to_csv('../3-main-results/p@k/multi-cell_semantic_classes.csv')
dfs[4].to_csv('../3-main-results/p@k/cell_semantic_classes.csv')
dfs[5].to_csv('../3-main-results/p@k/other_semantic_classes.csv')

In [52]:
# This cell counts how many words of each color there are in the corpus
color_words = [all_genia_words, amino_acid, nucleotide, multi_cell, cell, other]
color_words_counter = [0, 0, 0, 0, 0, 0]
for i in range(len(color_words)):
  for j in range(len(vocab)):
    if vocab[j] in color_words[i]:
      color_words_counter[i] += N_i.A[0][j]

names_of_words = ['all', 'amino_acid', 'nucleotide', 'multi_cell', 'cell', 'other']
color_word_count_zip = zip(names_of_words, color_words_counter)
counter_dict = dict(color_word_count_zip)
counter_dict

{'all': 90969,
 'amino_acid': 42478,
 'nucleotide': 11619,
 'multi_cell': 5247,
 'cell': 11626,
 'other': 19999}

In [53]:
# This cell counts the number of unique words of each color in the corpus
color_words_counter = [0, 0, 0, 0, 0, 0]
for i in range(len(color_words)):
  for j in range(len(vocab)):
    if vocab[j] in color_words[i]:
      color_words_counter[i] += 1

unique_word_count_zip = zip(names_of_words, color_words_counter)
unique_word_num_dict = dict(unique_word_count_zip)
unique_word_num_dict

{'all': 31784,
 'amino_acid': 10155,
 'nucleotide': 5574,
 'multi_cell': 1444,
 'cell': 4051,
 'other': 10560}