# Evaluate Word Burstiness Scores on the Genia Corpus Data

Authors: Samuel Sarria Hurtado and Paul Sheridan

Goal: Evaluate the following word burstiness scores on the Genia corpus data
- Kwok
- Irvine and Callison-Burch
- Deviation of Proportions (DOP for short)
- Chi-square
- Naive Sarria Hurtado Mullen Sheridan
- Sarria Hurtado Mullen Sheridan tail probability

Likewise evaluate KeyBERT word scores. Calculate P@k scores for each scoring function using the Genia terms as ground truth.

## Preliminaries

In [1]:
# Add path to Python function files to system path
import sys
import json
import pandas as pd
imports_path = '../0-base-functions/'
sys.path.append(imports_path)
import word_stats
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy
import nltk
from nltk.corpus import stopwords
from io import StringIO
from numpy import nan

### Read in all necessary data

In [2]:
# Load the GENIA corpus, the GENIA keywords, the KeyBERT rankings, and the
# Terrier and MyISAM stopword lists.
json_genia_path = '../0-data-preprocessed/GENIAcorpus3.02-preprocessed.json'

with open(json_genia_path, "r") as j:
  genia = json.load(j)

keyword_genia_path = '../0-data-preprocessed/GENIAcorpus3.02-keywords.tsv'

with open(keyword_genia_path, "r") as c:
  key_words = pd.read_csv(c, sep='\t')
important_words = key_words.lex.to_numpy()

json_keybert_path = '../4-keybert/keybert-scores.json'

with open(json_keybert_path, 'r') as k:
  keybert_scores = json.load(k)
keybert = pd.DataFrame(keybert_scores)
keybert.columns = ['term', 'keybert']

terrier_path = '../0-data-raw/terrier-stopwords.txt'

terrier_stopwords = np.loadtxt(terrier_path, dtype=str)

myisam_path = '../0-data-raw/myisam-stopwords.txt'

# Two 'NA' tokens are appended before parsing and every 'NA' entry is removed
# again afterwards (presumably to pad a short final row so that np.loadtxt
# sees a rectangular table -- TODO confirm against the raw file).
with open(myisam_path, 'r') as t:
  myisam_txt = StringIO(t.read() + '  NA  NA')

myisam_stopwords = np.loadtxt(myisam_txt, dtype=str)
# Flatten with -1 instead of a hard-coded element count, so the cell keeps
# working if the stopword file gains or loses entries.
myisam_stopwords = np.reshape(myisam_stopwords, (-1, ))
nas = np.where(myisam_stopwords == 'NA')[0]
myisam_stopwords = np.delete(myisam_stopwords, nas).tolist()

## Constructing Vocabulary

This is the vocabulary input for the count vectorizer below.

In [3]:
# Split every document on whitespace.
pre_vocab = [document.split() for document in genia]

# Distinct tokens across all documents. They are sorted because the iteration
# order of a set of strings changes between interpreter sessions (string hash
# randomization); sorting makes everything that depends on the position of a
# term in `vocab` reproducible from run to run.
vocab = sorted({token for tokens in pre_vocab for token in tokens})

## Vectorizing the GENIA collection

In [4]:
# Whitespace-only analyzer, so that the count vectorizer keeps every token as-is
def analyzer_custom(doc):
  """Return the whitespace-separated tokens of ``doc`` as a list."""
  tokens = doc.split()
  return tokens

In [5]:
# Count vectorizer restricted to the precomputed vocabulary; lowercasing is
# switched off and tokenization is delegated to analyzer_custom.
counter = CountVectorizer(
  analyzer=analyzer_custom,
  vocabulary=vocab,
  lowercase=False,
)
# Vectorized GENIA collection.
collection = counter.transform(genia)

### Important Text Analysis Variables

In [12]:
# Sizes: m is the number of vectorizer features, d the number of rows of the
# vectorized collection.
m = len(counter.get_feature_names_out())
d = collection.shape[0]

# Count statistics supplied by the word_stats helpers.
N_i = word_stats.get_Ni(collection)
N_j = word_stats.get_Nj(collection)
N = word_stats.get_N(N_j)
B_ij = word_stats.get_Bij(collection)
B_i = word_stats.get_Bi(B_ij)
B_j = word_stats.get_Bj(B_ij)

# Frequency statistics derived from the counts above.
CF = word_stats.get_cf(N_i)
DF = word_stats.get_df(B_i, d)
nij_by_nj = word_stats.get_nij_by_nj(collection, N_j)

# Candidate theta grid: 1/N, 2/N, ..., (largest entry of N_i)/N.
largest_Ni = max(N_i.A[0])
thetas = np.arange(1, largest_Ni + 1)/N
opt_thetas = word_stats.get_opt_thetas(N, m, d, N_i, N_j, B_i, thetas)
#alpha_i = word_stats.get_alpha_i(collection, B_j, N_j)
#mu_alpha_i = word_stats.get_mu_alpha_i(alpha_i, d)
#sigma_alpha_i = word_stats.get_sigma_alpha_i(alpha_i, mu_alpha_i, d)

In [None]:
def eidf_icf_diff(theta, d, nj, bi_obs):
  """Return EIDF(theta) minus log(d / bi_obs).

  With s1 = sum((1 - theta)**nj) and s2 = sum((1 - theta)**(2*nj)), both
  accumulated in extended precision (np.longdouble):

    EDF  = 1 - s1/d
    EIDF = (s1 - s2) / (2 * d**2 * EDF**2) - log(EDF)

  Parameters
  ----------
  theta : scalar at which the expression is evaluated
  d : scalar divisor (also the numerator of the log(d / bi_obs) term)
  nj : NumPy array of exponents
  bi_obs : scalar denominator of the log(d / bi_obs) term
  """
  complement = 1 - theta
  s1 = np.power(complement, nj, dtype = np.longdouble).sum()
  s2 = np.power(complement, 2*nj, dtype = np.longdouble).sum()

  edf = 1 - (1/d)*s1
  correction = (s1 - s2)/((2*d**2)*(edf**2))
  eidf = correction - np.log(edf)

  target = np.log(d/bi_obs)
  return eidf - target


In [None]:
# For every term i, find the theta inside the grid range at which
# eidf_icf_diff(theta, d, N_j, B_i[i]) crosses zero (Brent's method).
thetas = np.array(range(1, max(N_i.A[0]) + 1))/N

# Loop-invariant inputs are extracted once; previously the matrix-to-array
# conversions were redone on every evaluation of the objective and the
# bracket ends were recomputed with Python-level min/max for every term.
nj_row = N_j.T.A[0]
bi_row = B_i.A[0]
theta_lo = thetas.min()
theta_hi = thetas.max()

opt_thetas = np.array([
  scipy.optimize.brentq(eidf_icf_diff, theta_lo, theta_hi,
                        args=(d, nj_row, bi_row[i]))
  for i in range(m)
])

In [None]:
def get_B1(thetas, vector_cf, N):
  """Return word_stats.get_eicf(thetas, N) minus word_stats.get_icf(vector_cf),
  i.e. the expected ICF minus the observed ICF."""
  return word_stats.get_eicf(thetas, N) - word_stats.get_icf(vector_cf)

In [None]:
# B1: expected ICF at the fitted thetas minus the observed ICF of CF (see get_B1)
B1 = get_B1(opt_thetas, CF, N)

In [None]:
# Bare expression so the notebook echoes the first row of B1 as an array
B1.A[0]

array([ 1.79150853e+00,  1.09836135e+00, -2.72500228e-03, ...,
        2.86051669e-01, -2.50942360e-04, -2.50942360e-04])

### Important Bursty Heuristics/Measures

In [13]:
# Compute each score with its word_stats helper; the results become the
# score columns of the comparison table built below.
church = word_stats.get_church(N_i, B_i)
irvine = word_stats.get_irvine(nij_by_nj, B_i)
dop = word_stats.get_dop(collection, N_i, N_j, N)
chisqr = word_stats.get_chisq_score(collection)
# RICF is built from the fitted thetas, N, and word_stats.get_icf(CF)
ricf = word_stats.get_ricf(opt_thetas, N, word_stats.get_icf(CF))

## Heuristics/Measures Comparison

### Preliminary Dataframes

In [14]:
# One row per vectorizer feature, one column per score.
dta = {
  'term': counter.get_feature_names_out(),
  'church': church.A[0],
  'irvine': irvine.A[0],
  'dop': dop.A[0],
  'chisq_score': chisqr,
  'RICF': ricf.A[0],
}
df = pd.DataFrame(data=dta)
# Left join: every term of df is kept, with or without a KeyBERT score.
all_scores = pd.merge(df, keybert, how='left', left_on='term', right_on='term')

In [15]:
# Convert every score column of all_scores into ranks. Ordinal ranking gives
# each entry a distinct rank; subtracting from the column length turns the
# largest score into rank 0. The 'term' column is carried over unchanged.
sorted_indices = []
cols = all_scores.columns.values.tolist()
for col in cols:
  if col == 'term':
    sorted_indices.append(np.array(all_scores['term']))
    continue
  a = np.array(all_scores[[col]])
  ranks = scipy.stats.rankdata(a, method='ordinal', nan_policy='omit')
  if col == 'keybert':
    # NOTE(review): 5000 is hard-coded; presumably it is the number of terms
    # that carry a KeyBERT score -- confirm against keybert-scores.json.
    # NaN scores stay NaN, so this column remains floating point.
    sorted_indices.append(5000 - ranks)
  else:
    sorted_indices.append(len(a) - ranks.astype(int))

sorted_indices = np.array(sorted_indices)
m_t_pair = zip(cols, sorted_indices)
measures_indices = dict(m_t_pair)
measures_indices_df = pd.DataFrame(measures_indices)

# Cast the non-missing KeyBERT ranks to int. The write goes through .at on
# the frame itself: the former chained form df['keybert'][i] = ... may write
# to a temporary and is a no-op under pandas Copy-on-Write.
for i, rank in enumerate(measures_indices_df['keybert'].tolist()):
  if not np.isnan(rank):
    measures_indices_df.at[i, 'keybert'] = int(rank)

In [17]:
nltk.download('stopwords')
st_words = stopwords.words('english')
all_stopwords = [st_words, terrier_stopwords, myisam_stopwords]

# Union of the NLTK, Terrier and MyISAM stopword lists.
lst_stopwords = set().union(*all_stopwords)

# Stopwords that actually occur in the corpus vocabulary.
vocab_st_words = list(set(vocab).intersection(lst_stopwords))

# Row positions of those stopwords in the rank table. Membership is tested
# against a set; scanning the list for every term was quadratic.
stopword_lookup = set(vocab_st_words)
vocab_st_words_in = np.array([
  i for i, term in enumerate(measures_indices_df['term'])
  if term in stopword_lookup
])

# Rank table restricted to the stopword rows.
table_of_stop_words = measures_indices_df[measures_indices_df['term'].isin(vocab_st_words)]

[nltk_data] Downloading package stopwords to /home/samuel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Statistical analysis of measures/heuristics

In [18]:
# Each measure gives every word in the corpus a unique rank; the smaller the
# rank, the higher the burstiness. The table below shows the minimum, the three
# quartiles and the maximum of the ranks that each measure (KeyBERT excluded)
# assigns to the stopwords.

rank_columns = [col for col in cols if col not in ('term', 'keybert')]
quantiles = [
  table_of_stop_words[col].quantile([0, 0.25, 0.5, 0.75, 1])
  for col in rank_columns
]

quantiles_df = pd.DataFrame(quantiles)
display(quantiles_df)

Unnamed: 0,0.00,0.25,0.50,0.75,1.00
church,18.0,7750.0,8754.0,23953.0,40680.0
irvine,69.0,11310.0,16852.0,22434.0,40504.0
dop,0.0,253.0,1111.0,3719.0,40487.0
chisq_score,1.0,1671.0,32007.0,32781.0,40323.0
RICF,2093.0,8122.0,8810.0,39418.0,40802.0


In [41]:
# This cell writes the previous pandas dataframe as a csv, uncomment to rewrite
#quantiles_df.to_csv('../2-main-results/measures_quartiles.csv')

In [20]:
# For every score column, list the terms from highest to lowest score.
measures = cols[1:]
sorted_terms = np.array([
  np.array(
    all_scores[['term', measure]].sort_values(measure, ascending=False)['term']
  )
  for measure in measures
])

# Map each measure name to its ordered array of terms.
measure_term_pair = zip(measures, sorted_terms)
sorted_measures = dict(measure_term_pair)

In [21]:
def top_k(dct, k):
  """Return a new dict that maps every key of ``dct`` to the first ``k``
  items of its value (``value[:k]``), keeping the key order of ``dct``."""
  return {key: ranked[:k] for key, ranked in dct.items()}

In [22]:
# These are the top 10 terms as ranked by each measure
# (one column per measure, highest score first).

top_10_terms = pd.DataFrame(top_k(sorted_measures, 10))
top_10_terms

Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
0,Bcl-6_lex,Bcl-6_lex,of,suggest,Bcl-6_lex,interleukin2
1,v-erbA_lex,TCRzeta_lex,the,here,v-erbA_lex,interleukin2independent
2,SMX_lex,ML-9_lex,in,indicate,SMX_lex,interleukin
3,ML-9_lex,AITL_lex,and,results,SHP1_lex,cytokinestimulated
4,SHP1_lex,SHP1_lex,to,however,ML-9_lex,cytokines
5,beta-casein_lex,beta-casein_lex,a,study,beta-casein_lex,kinases
6,EBNA-2_lex,A-myb_lex,that,previous,p95vav_lex,interleukin4
7,I_kappaB_lex,I_kappaB_lex,by,recent,DM_lex,kinase
8,DM_lex,SMX_lex,with,investigate,TCRzeta_lex,cytokine
9,TCRzeta_lex,Rap1_protein_lex,we,thus,I_kappaB_lex,interleukin1


In [42]:
# This cell writes the previous pandas dataframe as a csv, uncomment to rewrite
#top_10_terms.to_csv('../2-main-results/top_10_terms.csv')

In [24]:
# Groups of 'sem' labels from the GENIA keyword table, one list per colour.
# NOTE(review): the colour names presumably mirror a colour-coded figure of
# the GENIA ontology -- confirm with the authors.
green = ['G#cell_type', 'G#cell_component', 'G#cell_line',
         'G#other_artificial_source']
blue = ['G#nucleotide', 'G#polynucleotide', 'G#DNA_N/A',
        'G#DNA_domain_or_region', 'G#DNA_family_or_group', 'G#DNA_molecule',
        'G#DNA_substructure', 'G#RNA_N/A', 'G#RNA_domain_or_region',
        'G#RNA_family_or_group', 'G#RNA_molecule', 'G#RNA_substructure']
light_blue = ['G#amino_acid_monomer', 'G#peptide', 'G#protein_N/A',
              'G#protein_complex', 'G#protein_domain_or_region',
              'G#protein_family_or_group', 'G#protein_molecule',
              'G#protein_substructure', 'G#protein_subunit',
              'G#other_organic_compound', 'G#organic', 'G#inorganic', 'G#atom',
              'G#carbohydrate', 'G#lipid']
yellow = ['G#virus', 'G#mono_cell', 'G#multi_cell', 'G#body_part', 'G#tissue']
red = ['G#other_name']

# Map each 'lex' entry of the keyword table to its 'sem' label. If a 'lex'
# value occurs on several rows, the label of the last such row is kept.
sem = np.array(key_words['sem'])
lex = np.array(key_words['lex'])
lex_sem_dct = dict(zip(lex, sem))

def get_color_words(lst_color):
  """Return, in dictionary order, the keys of ``lex_sem_dct`` whose value
  is one of the labels in ``lst_color``."""
  return [word for word, label in lex_sem_dct.items() if label in lst_color]

# Keyword list for each colour group, built in the order green, blue,
# light blue, yellow, red.
green_words, blue_words, light_blue_words, yellow_words, red_words = (
  get_color_words(group) for group in (green, blue, light_blue, yellow, red)
)

# Report the size of every list on a single line.
size_report = [
  ('green words: ', green_words),
  ('blue words: ', blue_words),
  ('light blue: ', light_blue_words),
  ('yellow words: ', yellow_words),
  ('red words: ', red_words),
]
print(*(part for label, words in size_report for part in (label, len(words))))

green words:  4051 blue words:  5574 light blue:  10155 yellow words:  1444 red words:  10560


In [25]:
def count_words(lst, imp_words):
  """Count the items of ``lst`` that also occur in ``imp_words``.

  Repeated items of ``lst`` are counted every time they appear.

  Parameters
  ----------
  lst : iterable of items to check
  imp_words : collection of reference items (list, NumPy array, set, ...)

  Returns
  -------
  int : number of items of ``lst`` found in ``imp_words``
  """
  items = list(lst)
  try:
    # A set makes each membership test O(1) instead of a scan of imp_words.
    lookup = set(imp_words)
    return sum(1 for x in items if x in lookup)
  except TypeError:
    # Unhashable data on either side: fall back to the plain containment test.
    return sum(1 for x in items if x in imp_words)

def create_p_k(lst_words):
  """Build the P@k table of ``lst_words`` for every measure.

  For each measure in the module-level ``sorted_measures`` and each cut-off k
  in the module-level ``at_values``, the entry is the fraction of the first k
  terms of that measure's ordering that occur in ``lst_words``.

  Parameters
  ----------
  lst_words : collection of reference terms

  Returns
  -------
  pandas.DataFrame with one column per measure, indexed by ``at_values``.
  """
  # The dict is built straight from sorted_measures: the former fixed list of
  # ten empty lists silently dropped any measure beyond the tenth. Slicing
  # only the current measure also avoids re-slicing every measure per cell.
  p_k_dct = {}
  for measure, ranked_terms in sorted_measures.items():
    p_k_dct[measure] = [
      count_words(ranked_terms[:value], lst_words)/value
      for value in at_values
    ]
  result = pd.DataFrame(p_k_dct)
  result.index = at_values
  return result

In [26]:
# P@k scores for the different categories of domain-specific words:
# one table per category, printed under the category name.

at_values = np.array([10, 50, 100, 500, 1000, 5000])
highlights = {
  'green_words': green_words,
  'blue_words': blue_words,
  'light_blue_words': light_blue_words,
  'yellow_words': yellow_words,
  'important_words': important_words,
  'red_words': red_words,
}
dfs = []
for label, words in highlights.items():
  table = create_p_k(words)
  dfs.append(table)
  print(label)
  display(table)

green_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,0.0,0.0,0.0,0.0,0.0,0.0
50,0.0,0.0,0.0,0.0,0.0,0.02
100,0.01,0.02,0.01,0.0,0.01,0.08
500,0.04,0.06,0.02,0.012,0.044,0.094
1000,0.056,0.068,0.026,0.023,0.062,0.095
5000,0.099,0.099,0.0514,0.0628,0.1006,0.0922


blue_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,0.0,0.1,0.0,0.0,0.0,0.0
50,0.1,0.12,0.0,0.0,0.1,0.04
100,0.1,0.1,0.0,0.0,0.11,0.1
500,0.14,0.146,0.004,0.012,0.142,0.106
1000,0.134,0.14,0.014,0.021,0.141,0.121
5000,0.1544,0.1482,0.0502,0.0746,0.1562,0.1388


light_blue_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,1.0,0.8,0.0,0.0,1.0,0.0
50,0.76,0.7,0.02,0.02,0.8,0.06
100,0.82,0.74,0.01,0.01,0.83,0.15
500,0.684,0.622,0.048,0.022,0.692,0.252
1000,0.644,0.589,0.07,0.041,0.641,0.233
5000,0.4308,0.4108,0.163,0.1392,0.4292,0.2466


yellow_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,0.0,0.0,0.0,0.0,0.0,0.0
50,0.04,0.04,0.0,0.0,0.04,0.0
100,0.02,0.02,0.0,0.0,0.02,0.04
500,0.032,0.032,0.006,0.002,0.032,0.034
1000,0.036,0.043,0.011,0.006,0.035,0.043
5000,0.0438,0.0412,0.0202,0.0268,0.0442,0.0366


important_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,1.0,1.0,0.0,0.0,1.0,0.0
50,0.96,1.0,0.02,0.04,1.0,0.2
100,0.98,0.98,0.02,0.03,1.0,0.51
500,0.984,0.974,0.088,0.082,0.992,0.746
1000,0.978,0.963,0.142,0.162,0.987,0.758
5000,0.928,0.8972,0.3754,0.48,0.9306,0.7742


red_words


Unnamed: 0,church,irvine,dop,chisq_score,RICF,keybert
10,0.0,0.1,0.0,0.0,0.0,0.0
50,0.06,0.14,0.0,0.02,0.06,0.08
100,0.03,0.1,0.0,0.02,0.03,0.14
500,0.088,0.114,0.01,0.034,0.082,0.26
1000,0.108,0.123,0.021,0.071,0.108,0.266
5000,0.2,0.198,0.0906,0.1766,0.2004,0.26


In [45]:
# This cell writes the p@k tables to a folder, uncomment to rewrite.

#dfs[0].to_csv('../2-main-results/p@k/green_words.csv')
#dfs[1].to_csv('../2-main-results/p@k/blue_words.csv')
#dfs[2].to_csv('../2-main-results/p@k/light_blue_words.csv')
#dfs[3].to_csv('../2-main-results/p@k/yellow_words.csv')
#dfs[4].to_csv('../2-main-results/p@k/important_words.csv')
#dfs[5].to_csv('../2-main-results/p@k/red_words.csv')

In [33]:
# This cell counts how many words of each color there are in the corpus:
# for every vocabulary term that belongs to a word list, the term's entry
# of N_i is added to that list's total.
color_words = [green_words, blue_words, light_blue_words, yellow_words, important_words, red_words]

# N_i values in vocabulary order, extracted once instead of on every iteration.
n_i_row = N_i.A[0]

color_words_counter = [0] * len(color_words)
for i, words in enumerate(color_words):
  # Set membership is O(1); scanning each word list for every vocabulary
  # term was quadratic.
  members = set(words)
  for term, total in zip(vocab, n_i_row):
    if term in members:
      color_words_counter[i] += total

names_of_words = ['green_words', 'blue_words', 'light_blue_words', 'yellow_words', 'all_important_words', 'red_words']
color_word_count_zip = zip(names_of_words, color_words_counter)
counter_dict = dict(color_word_count_zip)
counter_dict

{'green_words': 11626,
 'blue_words': 11619,
 'light_blue_words': 42478,
 'yellow_words': 5247,
 'all_important_words': 90969,
 'red_words': 19999}

In [35]:
# This cell counts the number of unique words of each color in the corpus,
# i.e. how many vocabulary terms appear in each word list.

# Each word list is turned into a set once, so every membership test is O(1)
# instead of a scan of the list for every vocabulary term.
color_words_counter = [
  sum(1 for term in vocab if term in members)
  for members in map(set, color_words)
]

unique_word_count_zip = zip(names_of_words, color_words_counter)
unique_word_num_dict = dict(unique_word_count_zip)
unique_word_num_dict

{'green_words': 4051,
 'blue_words': 5574,
 'light_blue_words': 10155,
 'yellow_words': 1444,
 'all_important_words': 31784,
 'red_words': 10560}