# Evaluate Term Burstiness Scores on the Genia Corpus Data

Authors: Samuel Sarria Hurtado and Paul Sheridan

Last update: 2023-10-03

Description: Evaluate the following word burstiness scores on the Genia corpus data
- Church and Gale (CG)
- Irvine and Callison-Burch (ICB)
- Deviation of Proportions (DOP)
- Chi-square
- Residual ICF (RICF)

Likewise evaluate KeyBERT word scores. Calculate P@k scores for each scoring function using the GENIA terms as ground truth.

## Imports

In [1]:
# Add path to Python function files to system path
import sys
import json
import pandas as pd
imports_path = '../0-base-functions/'
sys.path.append(imports_path)
import word_stats
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy
import nltk
from nltk.corpus import stopwords
from io import StringIO
from numpy import nan

## Read in Data

In [3]:
# Load the GENIA corpus, the GENIA keywords, the KeyBERT rankings, and the
# Terrier and MyISAM stopword lists.
json_genia_path = '../0-data-preprocessed/GENIAcorpus3.02-preprocessed.json'

# json.load parses directly from the open file handle.
with open(json_genia_path, "r") as j:
  genia = json.load(j)

keyword_genia_path = '../0-data-preprocessed/GENIAcorpus3.02-keywords.tsv'

with open(keyword_genia_path, "r") as c:
  key_words = pd.read_csv(c, sep='\t')
# The 'lex' column holds the ground-truth terms used for the P@k evaluation.
important_words = key_words.lex.to_numpy()

json_keybert_path = '../3-keybert/keybert-scores.json'

with open(json_keybert_path, 'r') as k:
  keybert_scores = json.load(k)
keybert = pd.DataFrame(keybert_scores)
keybert.columns = ['term', 'keybert']

terrier_path = '../0-data-raw/terrier-stopwords.txt'

terrier_stopwords = np.loadtxt(terrier_path, dtype=str)

myisam_path = '../0-data-raw/myisam-stopwords.txt'

# Read the MyISAM list as a flat sequence of whitespace-separated words.
# The previous loader padded the text with two 'NA' cells so np.loadtxt could
# parse it as a rectangular table and then reshaped to a hard-coded 545
# entries, which broke as soon as the file changed. Splitting on whitespace
# needs neither the padding nor the fixed size.
with open(myisam_path, 'r') as t:
  myisam_tokens = t.read().split()

# Literal 'NA' tokens are still dropped, exactly as the previous loader did.
myisam_stopwords = [token for token in myisam_tokens if token != 'NA']

## Construct Vocabulary

This is the vocabulary input for the count vectorizer below.

In [4]:
# Tokenise every document on whitespace; the vocabulary is the set of
# distinct tokens across the whole corpus.
pre_vocab = [doc.split() for doc in genia]

# Sort the vocabulary so that feature (column) order is the same on every run.
# A bare list(set(...)) follows string-hash order, which Python randomises per
# process, so column order and the tie-breaking of equal scores in the
# rankings below would otherwise change between runs.
vocab = sorted({token for tokens in pre_vocab for token in tokens})

## Vectorize the GENIA Corpus Documents

In [5]:
def analyzer_custom(doc):
  """Tokenise ``doc`` by splitting on whitespace only.

  Passed to CountVectorizer as its analyzer so that no token is filtered out
  or altered: every whitespace-delimited piece of the document is kept.
  """
  tokens = doc.split()
  return tokens

In [6]:
# Count term occurrences per document over the fixed vocabulary, using the
# whitespace-only analyzer and leaving the case of every token untouched.
counter = CountVectorizer(
  lowercase=False,
  vocabulary=vocab,
  analyzer=analyzer_custom,
)
# Document-term count matrix: one row per GENIA document.
collection = counter.transform(genia)

## Important Text Analysis Variables

In [7]:
# Corpus-level quantities shared by the burstiness measures computed below.
# NOTE(review): the word_stats helpers live outside this notebook; the
# descriptions of their outputs here are inferred from the helper names and
# should be confirmed against that module.

# Number of features (vocabulary terms) known to the vectorizer.
m = len(counter.get_feature_names_out())
# Number of rows in the document-term matrix, i.e. number of documents.
d = collection.shape[0]
# presumably total occurrences of each term across the corpus -- TODO confirm
N_i = word_stats.get_Ni(collection)
# presumably total number of tokens in each document -- TODO confirm
N_j = word_stats.get_Nj(collection)
# presumably the corpus-wide token total, derived from N_j -- TODO confirm
N = word_stats.get_N(N_j)
# presumably a binary term-presence matrix and its per-term / per-document
# sums -- TODO confirm
B_ij = word_stats.get_Bij(collection)
B_i = word_stats.get_Bi(B_ij)
B_j = word_stats.get_Bj(B_ij)
# presumably collection frequency and document frequency -- TODO confirm
CF = word_stats.get_cf(N_i)
DF = word_stats.get_df(B_i, d)
nij_by_nj = word_stats.get_nij_by_nj(collection, N_j)
# Candidate theta values k / N for k = 1 .. max(N_i), i.e. one per possible
# term count up to the largest observed count.
thetas = np.array(range(1, max(N_i.A[0]) + 1))/N
opt_thetas = word_stats.get_opt_thetas(N, m, d, N_i, N_j, B_i, thetas)

## Important Term Burstiness Measures

In [8]:
# Score every vocabulary term with each burstiness measure. All scoring
# functions come from the word_stats module (defined outside this notebook).
church = word_stats.get_church(N_i, B_i)
irvine = word_stats.get_irvine(nij_by_nj, B_i)
dop = word_stats.get_dop(collection, N_i, N_j, N)
chisqr = word_stats.get_chisq_score(collection)
# RICF is built from the optimised thetas, the corpus size and the ICF values.
ricf = word_stats.get_ricf(opt_thetas, N, word_stats.get_icf(CF))


## Compare Term Burstiness Measures

### Preliminary Dataframes

In [9]:
# One row per vocabulary term, one column per burstiness measure.
dta = {
  'term': counter.get_feature_names_out(),
  'church': church.A[0],
  'irvine': irvine.A[0],
  'dop': dop.A[0],
  'chisq_score': chisqr,
  'RICF': ricf.A[0],
}
df = pd.DataFrame(data=dta)
# Left join on 'term': every vocabulary term is kept, and terms without a
# KeyBERT score get NaN in the 'keybert' column.
all_scores = df.merge(
  keybert,
  how='left',
  left_on='term',
  right_on='term',
)

In [10]:
# Turn every score column into rank positions where a smaller position means
# a higher score: position = (number of rows) - (ordinal rank of the score).
#
# Missing scores (NaN, e.g. terms without a KeyBERT score) are omitted from
# the ranking and stay NaN. rankdata returns a float array in that case, so
# the result is deliberately NOT cast to int: casting NaN to an integer is
# undefined and yields platform-dependent garbage values.
#
# NOTE(review): positions are offset by the total row count, not by the number
# of non-missing scores, so in a column with missing values the best term gets
# position (rows - valid scores) rather than 0 -- confirm this is intended.
cols = all_scores.columns.values.tolist()
n_terms = len(all_scores)

sorted_indices = []
for col in cols:
  if col == 'term':
    # The term column is carried through unchanged as the row label.
    sorted_indices.append(np.array(all_scores['term']))
    continue
  ranks = scipy.stats.rankdata(
    np.array(all_scores[col]), method='ordinal', nan_policy='omit')
  sorted_indices.append(n_terms - ranks)

measures_indices = dict(zip(cols, sorted_indices))
measures_indices_df = pd.DataFrame(measures_indices)

# Kept as a 2-D object array (one row per column of all_scores) for any later
# cell that still reads it.
sorted_indices = np.array(sorted_indices, dtype=object)

In [12]:
# Fetch the NLTK English stopword list. This needs network access; if the
# download fails, stopwords.words() only works when a copy is already
# installed locally.
nltk.download('stopwords')
st_words = stopwords.words('english')
all_stopwords = [st_words, terrier_stopwords, myisam_stopwords]

# Union of the NLTK, Terrier and MyISAM stopword lists.
lst_stopwords = set()
for stopword_list in all_stopwords:
  lst_stopwords.update(stopword_list)

# Stopwords that actually occur in the corpus vocabulary.
vocab_st_words = list(set(vocab).intersection(lst_stopwords))

# A single boolean mask over the rank table replaces the per-row membership
# scan of a list; it yields both the row positions and the stopword sub-table.
is_stopword = measures_indices_df['term'].isin(vocab_st_words)
vocab_st_words_in = np.flatnonzero(is_stopword.to_numpy())

table_of_stop_words = measures_indices_df[is_stopword]

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


### Statistical Analysis of Measures

In [13]:
# Every measure assigns each word a rank position; the smaller the position,
# the higher the burstiness. Summarise, over the stopword rows only, the
# minimum, quartiles and maximum of those positions for each measure.
quantile_levels = [0, 0.25, 0.5, 0.75, 1]
quantiles = [
  table_of_stop_words[col].quantile(quantile_levels)
  for col in cols
  if col != 'term'
]

quantiles_df = pd.DataFrame(quantiles)
display(quantiles_df)

Unnamed: 0,0.00,0.25,0.50,0.75,1.00
church,18.0,7750.0,8755.0,22443.0,40548.0
irvine,69.0,11310.0,16852.0,22434.0,40515.0
dop,0.0,253.0,1111.0,3719.0,40494.0
chisq_score,14.0,8023.0,8796.0,39120.0,40802.0
RICF,2093.0,8122.0,8810.0,39344.0,40802.0
keybert,26467.0,34115.0,35869.0,38210.0,40800.0


In [90]:
# This cell writes the previous pandas dataframe as a csv, uncomment to rewrite
#quantiles_df.to_csv('../2-main-results/measures_quartiles.csv')

In [14]:
# For each measure (every column of all_scores except 'term'), list the terms
# ordered from the highest score to the lowest.
measures = cols[1:]
sorted_terms = np.array([
  np.array(
    all_scores[['term', measure]].sort_values(measure, ascending=False)['term']
  )
  for measure in measures
])

# Map measure name -> its array of terms in descending score order.
sorted_measures = dict(zip(measures, sorted_terms))

In [15]:
def top_k(dct, k):
  """Return a new dict holding the first ``k`` entries of every value.

  ``dct`` maps a name to a sliceable sequence; keys and their order are
  preserved, and each value is truncated with ``value[:k]``.
  """
  return {name: ranking[:k] for name, ranking in dct.items()}

In [16]:
# The ten highest-scoring (most bursty) words according to each measure,
# one column per measure.
top_10_by_measure = top_k(sorted_measures, 10)
top_10_terms = pd.DataFrame(top_10_by_measure)
top_10_terms

Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
0,Bcl-6_lex,Bcl-6_lex,of,hypercalciuria_lex,Bcl-6_lex,human_monocyte_IL-1_receptor_antagonist_(IL-1r...
1,v-erbA_lex,TCRzeta_lex,the,NO_lex,SMX_lex,cell-type-specific_interleukin-2_receptor_alph...
2,SMX_lex,ML-9_lex,in,C/EBPalpha_lex,v-erbA_lex,T_cell_IL3_gene_expression_lex
3,SHP1_lex,AITL_lex,and,DHS_lex,ML-9_lex,T-cell_growth_factor_interleukin_(IL)-2_lex
4,ML-9_lex,SHP1_lex,to,ringed_sideroblast_lex,SHP1_lex,cytosolic_signal_transducers_and_activators_of...
5,beta-casein_lex,beta-casein_lex,a,renin_lex,beta-casein_lex,mouse_interleukin-2_receptor_alpha_gene_expres...
6,TCRzeta_lex,A-myb_lex,that,IL-9-induced_cell_proliferation_lex,TCRzeta_lex,interleukin_3_(IL3)_gene_expression_lex
7,DM_lex,I_kappaB_lex,by,TSHR_lex,DM_lex,T_cell_interleukin-2_receptor_expression_lex
8,I_kappaB_lex,SMX_lex,with,bcl-6_protein_lex,p95vav_lex,IL-4-activated_transcription_factor_signal_tra...
9,p95vav_lex,Rap1_protein_lex,we,NE_lex,I_kappaB_lex,human_interleukin_(IL)_4_gene_lex


In [94]:
# This cell writes the previous pandas dataframe as a csv, uncomment to rewrite
#top_10_terms.to_csv('../2-main-results/top_10_terms.csv')

In [18]:
# Export the ranking of every measure, truncated to the first 45000 terms
# (presumably chosen to exceed the vocabulary size so nothing is cut off).
# NOTE(review): this writes to '../3-main-results/' while the commented-out
# export cells in this notebook point at '../2-main-results/' -- confirm
# which directory is intended.
full_ranking = top_k(sorted_measures, 45000)
top_45000_terms = pd.DataFrame(full_ranking)
top_45000_terms.to_csv('../3-main-results/top_45000_terms.csv')

In [21]:
# Groups of GENIA semantic-class labels ('G#...'), as they appear in the 'sem'
# column of the keyword table. Each group is named after a colour
# (presumably the highlight colour used for that group in a figure -- TODO
# confirm).

# Cell-related source classes.
green = ['G#cell_type', 'G#cell_component', 'G#cell_line',
         'G#other_artificial_source']
# Nucleotide, DNA and RNA classes.
blue = ['G#nucleotide', 'G#polynucleotide', 'G#DNA_N/A',
        'G#DNA_domain_or_region', 'G#DNA_family_or_group', 'G#DNA_molecule',
        'G#DNA_substructure', 'G#RNA_N/A', 'G#RNA_domain_or_region',
        'G#RNA_family_or_group', 'G#RNA_molecule', 'G#RNA_substructure']
# Amino-acid/protein classes plus other compound classes.
light_blue = ['G#amino_acid_monomer', 'G#peptide', 'G#protein_N/A',
              'G#protein_complex', 'G#protein_domain_or_region',
              'G#protein_family_or_group', 'G#protein_molecule',
              'G#protein_substructure', 'G#protein_subunit',
              'G#other_organic_compound', 'G#organic', 'G#inorganic', 'G#atom',
              'G#carbohydrate', 'G#lipid']
# Virus, organism, body-part and tissue classes.
yellow = ['G#virus', 'G#mono_cell', 'G#multi_cell', 'G#body_part', 'G#tissue']
# The catch-all 'other_name' class.
red = ['G#other_name']

# Map each lexicon term ('lex') to its semantic class ('sem'). Because this is
# a plain dict, a term listed more than once keeps only the class from its
# last row in the keyword table.
sem = np.array(key_words['sem'])
lex = np.array(key_words['lex'])
lex_sem_dct = dict(zip(lex, sem))

def get_color_words(lst_color):
  """Return the lexicon terms whose semantic class is listed in ``lst_color``.

  Reads the module-level ``lex_sem_dct`` (term -> semantic class) and keeps
  the terms in that dict's iteration order.
  """
  return [
    term
    for term, category in lex_sem_dct.items()
    if category in lst_color
  ]

# Collect the lexicon terms belonging to each colour group.
green_words, blue_words, light_blue_words, yellow_words, red_words = (
  get_color_words(group)
  for group in (green, blue, light_blue, yellow, red)
)

# Report how many terms fall into each group.
print(
  'green words: ', len(green_words),
  'blue words: ', len(blue_words),
  'light blue: ', len(light_blue_words),
  'yellow words: ', len(yellow_words),
  'red words: ', len(red_words),
)

green words:  4051 blue words:  5574 light blue:  10155 yellow words:  1444 red words:  10560


In [22]:
def count_words(lst, imp_words):
  """Count how many entries of ``lst`` occur in ``imp_words``.

  Every entry of ``lst`` is tested separately, so repeated entries are
  counted each time they appear.

  Args:
    lst: iterable of items to test (e.g. the top-k terms of a measure).
    imp_words: collection of reference items (list, set or NumPy array).

  Returns:
    The number of entries of ``lst`` found in ``imp_words``.
  """
  # Membership against a list or array is a linear scan per item, which makes
  # the whole count quadratic; a set gives constant-time lookups instead.
  try:
    lookup = set(imp_words)
  except TypeError:
    # Unhashable reference items: fall back to the collection's own `in`.
    lookup = imp_words
  return sum(1 for x in lst if x in lookup)

def create_p_k(lst_words):
  """Build the precision-at-k (P@k) table for one set of target words.

  For every measure in the module-level ``sorted_measures`` and every cutoff
  ``k`` in the module-level ``at_values``, P@k is the fraction of that
  measure's first ``k`` ranked terms that occur in ``lst_words``.

  Args:
    lst_words: collection of target (ground-truth) words.

  Returns:
    A DataFrame with one column per measure and one row per cutoff, indexed
    by ``at_values``.
  """
  # Build the lookup once so each membership test is constant time.
  targets = set(lst_words)
  # One column per measure, however many measures there are; the ranking is
  # sliced directly rather than re-truncating every measure for each cutoff.
  p_k_dct = {
    measure: [
      count_words(ranked_terms[:value], targets) / value
      for value in at_values
    ]
    for measure, ranked_terms in sorted_measures.items()
  }
  result = pd.DataFrame(p_k_dct)
  result.index = at_values
  return result

In [97]:
# P@k scores for the full set of domain-specific words and for each colour
# group, evaluated at the cutoffs in at_values.
at_values = np.array([10, 50, 100, 500, 1000, 5000])
highlights = {
  'important_words': important_words,
  'light_blue_words': light_blue_words,
  'blue_words': blue_words,
  'yellow_words': yellow_words,
  'green_words': green_words,
  'red_words': red_words,
}

# dfs keeps the tables in the same order as the keys of highlights.
dfs = []
for label, word_list in highlights.items():
  table = create_p_k(word_list)
  dfs.append(table)
  print(label)
  display(table)

important_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,1.0,1.0,0.0,1.0,1.0,1.0
50,0.96,1.0,0.02,0.94,1.0,1.0
100,0.98,0.98,0.02,0.94,1.0,1.0
500,0.984,0.974,0.088,0.968,0.992,1.0
1000,0.983,0.963,0.142,0.959,0.986,1.0
5000,0.928,0.8972,0.3754,0.9146,0.9322,1.0


light_blue_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,1.0,0.8,0.0,0.6,1.0,0.3
50,0.76,0.7,0.02,0.6,0.8,0.16
100,0.82,0.75,0.01,0.55,0.83,0.2
500,0.684,0.622,0.048,0.53,0.694,0.234
1000,0.639,0.59,0.07,0.536,0.646,0.278
5000,0.4284,0.411,0.163,0.4296,0.429,0.2716


blue_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,0.0,0.1,0.0,0.3,0.0,0.1
50,0.1,0.12,0.0,0.14,0.1,0.3
100,0.1,0.1,0.0,0.15,0.11,0.31
500,0.14,0.146,0.004,0.182,0.146,0.284
1000,0.137,0.14,0.014,0.172,0.132,0.224
5000,0.155,0.1482,0.0502,0.1534,0.1564,0.1494


yellow_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,0.0,0.0,0.0,0.0,0.0,0.0
50,0.04,0.04,0.0,0.0,0.04,0.0
100,0.02,0.02,0.0,0.03,0.02,0.0
500,0.036,0.032,0.006,0.032,0.034,0.0
1000,0.041,0.042,0.011,0.032,0.035,0.0
5000,0.0444,0.0414,0.0202,0.0432,0.0444,0.0072


green_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,0.0,0.0,0.0,0.1,0.0,0.0
50,0.0,0.0,0.0,0.06,0.0,0.0
100,0.01,0.02,0.01,0.06,0.01,0.0
500,0.04,0.06,0.02,0.078,0.038,0.044
1000,0.06,0.068,0.026,0.082,0.059,0.06
5000,0.1002,0.0988,0.0514,0.0958,0.101,0.1536


red_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,0.0,0.1,0.0,0.0,0.0,0.6
50,0.06,0.14,0.0,0.14,0.06,0.54
100,0.03,0.09,0.0,0.15,0.03,0.49
500,0.084,0.114,0.01,0.146,0.08,0.438
1000,0.106,0.123,0.021,0.137,0.114,0.438
5000,0.2,0.1978,0.0906,0.1926,0.2014,0.4182


In [98]:
# This cell writes the p@k tables to a folder, uncomment to rewrite.

#dfs[0].to_csv('../2-main-results/p@k/important_words.csv')
#dfs[1].to_csv('../2-main-results/p@k/light_blue_words.csv')
#dfs[2].to_csv('../2-main-results/p@k/blue_words.csv')
#dfs[3].to_csv('../2-main-results/p@k/yellow_words.csv')
#dfs[4].to_csv('../2-main-results/p@k/green_words.csv')
#dfs[5].to_csv('../2-main-results/p@k/red_words.csv')

In [None]:
# This cell counts how many words of each color there are in the corpus,
# i.e. it adds up the N_i entries of the vocabulary terms in each group.
color_words = [green_words, blue_words, light_blue_words, yellow_words, important_words, red_words]

# Per-term totals, aligned with vocab (entry j belongs to vocab[j]);
# extracted once instead of on every matching term.
term_totals = N_i.A[0]

color_words_counter = []
for words in color_words:
  # Set lookups replace a linear scan of the word list for every vocab term.
  members = set(words)
  color_words_counter.append(
    sum(total for term, total in zip(vocab, term_totals) if term in members))

names_of_words = ['green_words', 'blue_words', 'light_blue_words', 'yellow_words', 'all_important_words', 'red_words']
counter_dict = dict(zip(names_of_words, color_words_counter))
counter_dict

In [100]:
# This cell counts the number of unique words of each color in the corpus,
# i.e. how many vocabulary terms belong to each group.
color_words_counter = []
for words in color_words:
  # Set lookups replace a linear scan of the word list for every vocab term.
  members = set(words)
  color_words_counter.append(sum(1 for term in vocab if term in members))

unique_word_num_dict = dict(zip(names_of_words, color_words_counter))
unique_word_num_dict

{'green_words': 4051,
 'blue_words': 5574,
 'light_blue_words': 10155,
 'yellow_words': 1444,
 'all_important_words': 31784,
 'red_words': 10560}