# Tracking of sense gains and losses of words over time

Some setting up:

In [0]:
# Mount Google Drive when the notebook runs on Colab; anywhere else the
# import fails and we only report that Colab is not in use.
try:
    from google.colab import drive
    drive.mount("/content/gdrive")
except ModuleNotFoundError:
    print("This notebook is not currently using Google Colab.")


# Switches for the fine-tuning runs. The cells that load the tokenized data
# and build the data loaders are skipped while both are False.
do_train_bert, do_train_xlnet = False, False

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
% cd /content/gdrive/My\ Drive/UofT/CSC2611/Project

/content/gdrive/My Drive/UofT/CSC2611/Project


We will be fine-tuning a pretrained BERT model (DistilBERT) from Hugging Face's `transformers` repository in this investigation, so we will need to install it.

In [0]:
try:
  import transformers
except ModuleNotFoundError: 
  print("Module transformers not found, trying to install from cache...")
  try:
    % pip install --upgrade --force-reinstall `cat/content/gdrive/My\ Drive/colab_installed.txt`
    import transformers
  except ModuleNotFoundError:
    print("Cache not found, installing the package...")
    % pip install transformers
    % pip freeze --local > /content/gdrive/My\ Drive/colab_installed.txt
    import transformers

Import the packages used throughout the notebook:

In [0]:
import os
import sys
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pylab
from torch.utils.data import TensorDataset, DataLoader, \
    RandomSampler, SequentialSampler
import argparse
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from transformers import DistilBertForSequenceClassification, \
    get_linear_schedule_with_warmup, AdamW, XLNetForSequenceClassification, \
    DistilBertTokenizer
from collections import defaultdict
from sklearn.manifold import TSNE
from scipy.spatial.distance import jensenshannon
from scipy.stats import spearmanr, pearsonr, rankdata
import warnings
import contextlib

Functions for preprocessing data

In [0]:
def tokenize_inputs(tokenizer, sentences, add_special_tokens=True):
    """
    Encode every sentence into its list of token IDs.

    Each sentence is passed to ``tokenizer.encode`` together with the
    ``add_special_tokens`` flag; the encoded sentences are returned as a list
    in the same order as ``sentences``.
    """
    return [
        tokenizer.encode(text, add_special_tokens=add_special_tokens)
        for text in sentences
    ]


def pad_inputs(tokenized_sentences, padding_token=0):
    """
    Right-pad every tokenized sentence to the length of the longest one.

    tokenized_sentences: a sequence of token-ID lists.
    padding_token: the ID written into the padded positions (default 0).

    Returns a 2-D numpy array of dtype int64 with one row per sentence.
    An empty input yields an array of shape (0, 0) instead of raising.

    Implemented with numpy only, so it no longer depends on
    ``keras.preprocessing.sequence.pad_sequences``.
    """
    sentences = list(tokenized_sentences)
    max_len = max((len(sentence) for sentence in sentences), default=0)

    # start from a matrix full of padding and copy each sentence into the
    # left part of its row ("post" padding)
    padded = np.full((len(sentences), max_len), padding_token, dtype=np.int64)
    for row, sentence in enumerate(sentences):
        if len(sentence):
            padded[row, :len(sentence)] = sentence

    return padded


def get_padding_mask(padded_sentences, padding_token=0):
    """
    Return a boolean mask with the same shape as ``padded_sentences``.

    A position is True when it holds a real token and False when it holds
    ``padding_token``. The default of 0 matches the default padding value of
    ``pad_inputs``; pass the tokenizer's padding ID when it is not 0.

    ``padded_sentences`` may be anything that supports element-wise ``!=``
    against an integer (numpy arrays and torch tensors both do).
    """
    return padded_sentences != padding_token


def get_train_val_loader(inputs, masks, labels, batch_size, train_split=0.8):
    """
    Randomly split the examples and wrap both parts in data loaders.

    The first dimension of ``inputs``, ``masks`` and ``labels`` indexes the
    examples. A random permutation decides which examples go where:
    ``ceil(n * train_split)`` of them form the training set and the rest the
    validation set. The training loader samples randomly, the validation
    loader iterates sequentially.

    Returns the pair ``(train_dataloader, val_dataloader)``.
    """
    assert 0 < train_split < 1, \
        "train_split needs to be a fraction between 0 and 1 exclusive"

    num_examples = inputs.size(0)
    num_train = int(np.ceil(num_examples * train_split))
    num_val = int(num_examples - num_train)

    assert num_train and num_val, \
        f"the train_split given ({train_split}) resultted in either the " \
        f"number of training or validation examples being 0, which is " \
        f"invalid"

    shuffled = torch.randperm(num_examples)

    def build_loader(indices, sampler_cls):
        # select the same rows from all three tensors and batch them
        dataset = TensorDataset(
            inputs[indices], masks[indices], labels[indices])
        return DataLoader(dataset, batch_size, sampler=sampler_cls(dataset))

    train_dataloader = build_loader(shuffled[:num_train], RandomSampler)
    val_dataloader = build_loader(shuffled[num_train:], SequentialSampler)
    return train_dataloader, val_dataloader

def check_path(path):
    """
    Check that the directory which would contain ``path`` exists.

    ``path`` may be relative (resolved against the current working
    directory) or absolute, and may end with a path separator. The path
    itself does not need to exist, only its parent directory.

    Returns None when the parent directory exists.
    Raises FileNotFoundError when it does not, or when ``path`` is empty.
    """
    if not path:
        # an empty string used to crash with IndexError on path[-1]
        raise FileNotFoundError("parent path of an empty path not found")

    # abspath resolves relative paths and drops trailing separators, so
    # dirname then yields the real parent (including "/" for "/file")
    parent = os.path.dirname(os.path.abspath(path))
    if not os.path.isdir(parent):
        raise FileNotFoundError(f"parent path {parent} not found")


def set_seed(seed, modules=()):
    """
    Call every seeding function in ``modules`` with ``seed``.

    ``modules`` is an iterable of callables that each accept the seed as
    their only argument; they are invoked in order. Nothing is returned.
    """
    for seeder in modules:
        seeder(seed)

The fine-tuning architecture

In [0]:
# Maps the activation name used in the training configuration to the torch
# module class that the classifier heads instantiate.
name_to_activation = dict(relu=nn.ReLU, gelu=nn.GELU)


class DistilBertWordSenseDisambiguation(DistilBertForSequenceClassification):
    """
    DistilBERT sequence classifier used for word sense disambiguation.

    Adds ``reset_classifier``, which swaps the ``classifier`` attribute for a
    freshly initialised feed-forward stack.
    """

    def __init__(self, config):
        """
        Build the underlying sequence classifier and keep the config.
        """
        super().__init__(config)
        self.config_ = config

    def reset_classifier(self, num_layers, activation):
        """
        Replace ``self.classifier`` with a new, untrained head.

        The head is ``num_layers`` blocks of (square Linear layer, activation)
        followed by one Linear layer that projects to ``num_labels`` outputs.
        The hidden Linear layers get Xavier-uniform weights and zero biases;
        the output layer keeps torch's default initialisation.
        ``activation`` must be a key of ``name_to_activation``.
        """
        hidden_size = self.config_.hidden_size

        layers = []
        for _ in range(num_layers):
            hidden = nn.Linear(hidden_size, hidden_size)
            nn.init.xavier_uniform_(hidden.weight)
            nn.init.zeros_(hidden.bias)
            layers += [hidden, name_to_activation[activation]()]

        layers.append(nn.Linear(hidden_size, self.config.num_labels))
        self.classifier = nn.Sequential(*layers)


class XLNetWordSenseDisambiguation(XLNetForSequenceClassification):
    """
    XLNet sequence classifier used for word sense disambiguation.

    Adds ``reset_classifier``, which swaps the ``logits_proj`` attribute for
    a freshly initialised feed-forward stack.
    """

    def __init__(self, config):
        """
        Build the underlying sequence classifier and keep the config.
        """
        super().__init__(config)
        self.config_ = config

    def reset_classifier(self, num_layers, activation):
        """
        Replace ``self.logits_proj`` with a new, untrained head.

        The head is ``num_layers`` blocks of (square Linear layer, activation)
        followed by one Linear layer that projects to ``num_labels`` outputs.
        The hidden Linear layers get Xavier-uniform weights and zero biases;
        the output layer keeps torch's default initialisation.
        ``activation`` must be a key of ``name_to_activation``.
        """
        model_dim = self.config_.d_model

        layers = []
        for _ in range(num_layers):
            hidden = nn.Linear(model_dim, model_dim)
            nn.init.xavier_uniform_(hidden.weight)
            nn.init.zeros_(hidden.bias)
            layers += [hidden, name_to_activation[activation]()]

        layers.append(nn.Linear(model_dim, self.config.num_labels))
        self.logits_proj = nn.Sequential(*layers)

In [0]:
# our training configurations

# reproducibility: value handed to every seeding function
seed = 42

# classifier head: number of hidden (Linear + activation) blocks and the
# activation name (a key of name_to_activation)
num_layers = 2
activation = "gelu"

# optimisation loop
epochs = 1
batch_size = 8

# input data and the files the fine-tuned weights are written to
data_path = "data/semcor_training_data.txt"
bert_save_outputs_weights = "bert_WSD.tar"
xlnet_save_outputs_weights = "xlnet_WSD.tar"

# run on the GPU whenever one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
# The dataset was tokenized and padded in an earlier run; here we only load
# the saved tensor instead of tokenizing the entire dataset again.
padded_data_input = "finetune/tokenized_semcor_data.tar"

# Guard against pairing a model with the wrong tokenization. The assertion
# passes when BERT is trained on a file whose name lacks "xlnet", when XLNet
# is trained on a file whose name contains "xlnet", or when nothing is trained.
assert ((do_train_bert and "xlnet" not in padded_data_input) 
or (do_train_xlnet and "xlnet" in padded_data_input) 
or not (do_train_bert or do_train_xlnet))

# The tokenized data is only needed (and only loaded) when a model is trained.
if do_train_bert or do_train_xlnet:
  print("Loading data...")
  padded_data = torch.load(padded_data_input, map_location="cpu")

In [0]:
# Read the labels, derive the padding masks and build the data loaders.
# Only runs when a model is going to be trained.
if do_train_bert or do_train_xlnet:
  print("Getting labels...")
  labels = []
  # stream the file line by line: the handle is always closed and the whole
  # training file is never held in memory at once
  with open(data_path) as plain_data:
    for line in plain_data:
      # the label is expected after the last "@" of the line; splitting from
      # the right keeps sentences that themselves contain "@" intact
      _, label = line.strip().rsplit("@", 1)
      labels.append(int(label))

  labels = torch.tensor(labels)
  masks = get_padding_mask(padded_data)
  if do_train_xlnet:
    # XLNet is fed an integer mask instead of a boolean one
    masks = masks.int()

  print("Getting data loaders...")
  # from our data, labels, and masks, we create iterators for our
  # training loop
  train_dataloader, val_dataloader = get_train_val_loader(
      padded_data,
      masks,
      labels,
      batch_size
  )

  # delete these unused variables to save RAM
  del padded_data
  del masks
  del labels

The training and validation loop

In [0]:
# every per-step loss of every call to train(), kept for later inspection
losses = []
def train(model, optimizer, scheduler, device, name="bert", max_iter=None):
  """
  Train the model with the arguments given to the optimizer.
  The scheduler varies the learning rate. If max_iter is not None, stop the
  training (all epochs, not only the current one) once max_iter iterations
  have been run.

  Reads the notebook-level ``train_dataloader`` and ``epochs`` and appends
  every step's loss to the notebook-level ``losses`` list. The average loss
  is printed every 100 steps, and every 1000 steps the model weights are
  saved to ``checkpoints/<name>_checkpoint<step>.tar``.

  Returns the list of per-step losses of the last epoch that ran (an empty
  list when no epoch ran). Losses are stored as detached scalar tensors so
  they do not keep the autograd history of each step alive.
  """
  step = 1
  total_iters = len(train_dataloader) * epochs
  total_loss = 0
  train_losses = []  # defined up front so epochs == 0 returns []
  reached_max_iter = False
  for epoch in range(epochs):
    model.train()
    train_losses = []
    for inputs, masks, labels in train_dataloader:
      model.zero_grad()
      inputs = inputs.to(device)
      masks = masks.to(device)
      labels = labels.to(device)
      outputs = model(
          inputs,
          attention_mask=masks,
          labels=labels
      )

      loss = outputs[0]
      total_loss += loss.item()
      # detach before storing: keeping the attached tensor would retain the
      # graph of every step for the lifetime of the lists
      recorded_loss = loss.detach()
      train_losses.append(recorded_loss)
      losses.append(recorded_loss)
      if not (step % 100):
        print(f"At iteration {step}/{total_iters}; Avg Loss: {total_loss/100}")
        total_loss = 0
        if not (step % 1000):
          print(f"Saving checkpoint at step {step}...")
          # make sure the target directory exists before saving
          os.makedirs("checkpoints", exist_ok=True)
          torch.save(model.state_dict(), f"checkpoints/{name}_checkpoint{step}.tar")
          print("Checkpoint saved...")
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()
      scheduler.step()

      if max_iter is not None and step >= max_iter:
        reached_max_iter = True
        break

      step += 1

      # release per-step tensors before the next batch is moved to the device
      del loss
      del inputs
      del masks
      del labels
      del outputs
      torch.cuda.empty_cache()

    if reached_max_iter:
      # a bare inner `break` would let the next epoch run one more step
      break
  return train_losses


def validate(model, device, max_batches=None):
  """
  Validate the model by running a holdout dataset through the model and
  computing the accuracy. If max_batches is not None, we only validate using
  this number of batches.

  Reads the notebook-level ``val_dataloader``. Prints the accuracy and also
  returns it as a float; returns None (after printing a notice) when the
  loader yields no examples.
  """
  model.eval()
  correct = 0
  total = 0
  step = 1
  for inputs, masks, labels in val_dataloader:
    inputs = inputs.to(device)
    masks = masks.to(device)
    labels = labels.to(device)
    with torch.no_grad():
      outputs = model(
          inputs,
          attention_mask=masks,
          labels=labels
      )

    # with labels supplied the outputs start with (loss, logits)
    logits = outputs[1]

    preds = logits.argmax(dim=1).flatten()
    # accumulate a plain int so `correct` is usable even when no batch ran
    correct += (preds == labels).sum().item()
    total += preds.size(0)

    del logits
    del inputs
    del masks
    del labels
    del outputs
    torch.cuda.empty_cache()

    if max_batches is not None and step >= max_batches:
        break

    step += 1

  if total == 0:
    print("Validation accuracy: n/a (no validation examples)")
    return None

  accuracy = correct / total
  print(f"Validation accuracy: {accuracy}")
  return accuracy

In [0]:
# Seeding callables handed to set_seed: numpy and torch always, plus the
# CUDA generator when a GPU is available.
seed_functions = [np.random.seed, torch.manual_seed] + (
    [torch.cuda.manual_seed] if torch.cuda.is_available() else []
)

Fine-tuning BERT

In [0]:
# Name of the pretrained checkpoint that both the tokenizer and the model
# are created from.
pretrained_model = "distilbert-base-uncased"

print(f"Using pretrained model: {pretrained_model}")
bert_tokenizer = DistilBertTokenizer.from_pretrained(pretrained_model)
# Two output labels; attentions and hidden states are not returned by the
# forward pass.
model = DistilBertWordSenseDisambiguation.from_pretrained(
    pretrained_model,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)
# Swap the classifier for the head described by the training configuration
# (num_layers hidden blocks with the chosen activation).
model.reset_classifier(num_layers, activation)

Using pretrained model: distilbert-base-uncased


HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=546, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=267967963, style=ProgressStyle(description_…




In [0]:
# Load previously fine-tuned weights and, optionally, (continue to) train.
from_checkpoint = True
checkpoint = "checkpoints/bert_WSD.tar"
# optional cap on the number of training iterations (None = no cap)
max_iter = None

torch.cuda.empty_cache()
bert_model = model
if from_checkpoint:
  print("Loading checkpoint...")
  # load on the CPU first; the model is moved to the target device below
  bert_model.load_state_dict(torch.load(checkpoint, map_location="cpu"))
bert_model = bert_model.to(device)
do_train = False
if do_train:
  if "train_dataloader" not in globals():
    # the loaders are only built when a training flag is set at the top
    raise RuntimeError(
        "train_dataloader is not defined; set do_train_bert = True and "
        "re-run the data loading cells before training")
  set_seed(seed, seed_functions)
  total_iterations = len(train_dataloader) * epochs
  # only the word embeddings and the classifier head are trained; every
  # other parameter is frozen
  parameters = []
  for name, param in bert_model.named_parameters():
    if name == "distilbert.embeddings.word_embeddings.weight" or \
      name.startswith("classifier"):
        parameters.append(param)
    else:
      param.requires_grad_(False)
  parameters = nn.ParameterList(parameters)
  bert_optimizer = AdamW(parameters, lr=2e-5)
  bert_scheduler = get_linear_schedule_with_warmup(
      bert_optimizer, 0, total_iterations)

  # max_iter is forwarded so the cap configured above actually takes effect
  train_loss = train(bert_model, bert_optimizer, bert_scheduler, device,
                     max_iter=max_iter)
  torch.save(bert_model.state_dict(), bert_save_outputs_weights)

Loading checkpoint...


In [0]:
test_bert = True
if test_bert:
  # The validation loader only exists when the data loading cells ran (i.e.
  # a training flag was set); skip instead of failing with a NameError.
  if "val_dataloader" in globals():
    validate(bert_model, device, 1000)
  else:
    print("Skipping validation: val_dataloader has not been created.")

Fine-tuning XLNet

Due to limited memory on the GPU, this isn't feasible on Colab.

In [0]:
# pretrained_model = "xlnet-base-cased"
# print(f"Using pretrained model: {pretrained_model}")
# model = XLNetWordSenseDisambiguation.from_pretrained(
#     pretrained_model,
#     num_labels=2,
#     output_attentions=False,
#     output_hidden_states=False
# )
# model.reset_classifier(num_layers, activation)

In [0]:

# xlnet_model = model
# checkpoint_path = "checkpoints/xlnet_checkpoint6000.tar"
# from_checkpoint = True
# if from_checkpoint:
#   xlnet_model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))

# xlnet_model = xlnet_model.to(device)
# max_iter = 205741 - 6000

# if do_train_xlnet:
#   set_seed(seed, seed_functions)
#   torch.cuda.empty_cache()
#   total_iterations = len(train_dataloader) * epochs
#   xlnet_parameters = []
#   for name, param in model.named_parameters():
#     if name == "transformer.word_embedding.weight" or \
#       name.startswith("logits_proj"):
#         xlnet_parameters.append(param)
#     else:
#       param.requires_grad_(False)
#   xlnet_parameters = nn.ParameterList(xlnet_parameters)
#   xlnet_optimizer = AdamW(xlnet_parameters, lr=2e-5)
#   xlnet_scheduler = get_linear_schedule_with_warmup(
#       xlnet_optimizer, 0, total_iterations)

#   train_loss = train(xlnet_model, xlnet_optimizer, xlnet_scheduler, device, "xlnet", max_iter)
#   torch.save(xlnet_model.state_dict(), xlnet_save_outputs_weights)

In [0]:
def get_bert_sense_embedding(sentences, target, sense, model, tokenizer, device, return_sum=False):
  """
  Return the "sense embedding" of ``target`` for a set of example sentences.

  Every sentence is run through ``model.distilbert`` and the final hidden
  state at the position of the target word is collected. The collected
  vectors are averaged over the sentences, or summed when ``return_sum`` is
  True. The result is returned as a CPU tensor.

  The target is located by the first word piece of its encoding, at its
  first occurrence in each sentence; an AssertionError is raised when a
  sentence does not contain it. ``sense`` is accepted for the caller's
  bookkeeping but is not used in the computation.

  The forward pass runs under ``torch.no_grad()``, so the returned embedding
  carries no autograd history.
  """
  tokenized_sentences = tokenize_inputs(
      tokenizer, sentences, add_special_tokens=True)
  tokenized_target = tokenizer.encode(target, add_special_tokens=False)[0]
  assert all(tokenized_target in tokenized_sentence
              for tokenized_sentence in tokenized_sentences)
  # position of the target inside each (unpadded) sentence; padding is added
  # on the right, so the positions stay valid after padding
  target_indices = torch.tensor(
      [tokenized_sentence.index(tokenized_target)
       for tokenized_sentence in tokenized_sentences])

  padded_sentences = pad_inputs(tokenized_sentences, tokenizer.pad_token_id)
  padded_sentences = torch.tensor(padded_sentences)
  masks = get_padding_mask(padded_sentences)
  padded_sentences = padded_sentences.to(device)
  masks = masks.to(device)

  with torch.no_grad():
    # hidden_states: (num_sentences, sequence_length, hidden_dim)
    hidden_states = model.distilbert(padded_sentences, attention_mask=masks)[0]

  # pick, for every sentence, the hidden state at its target position
  sentence_indices = torch.arange(
      hidden_states.size(0), device=hidden_states.device)
  target_states = hidden_states[
      sentence_indices, target_indices.to(hidden_states.device)]

  if return_sum:
    sense_embedding = target_states.sum(dim=0)
  else:
    sense_embedding = target_states.mean(dim=0)

  return sense_embedding.cpu()

In [0]:
# the words whose sense usage is tracked across the two corpora
target_words = [
                "face", "part", "head", "record", "word", "edge", "land", 
                "circle", "relationship", "rag", "fiction", "ball", "plane", 
                "risk", "gas", "ounce", "bag", "prop", "bit", "tree", "twist", 
                "attack", "savage", "tip", "pin", "player", "contemplation", 
                "lane", "stroke", "thump", "stab", "chairman"
                ]

# word -> sense -> example sentences; opened in a `with` block so the file
# handle is closed as soon as the JSON has been parsed
with open("words_with_sentences_separated_by_senses_pos_big.json") as file:
  sentences_and_word_senses = json.load(file)

In [0]:
def cosine_similarity(possible, true, eps=1e-8):
  """
  Measure the cosine similarity between each vector in possible
  and the vector true.

  Possible should have shape (n, hidden_dim) and true should have shape
  (hidden_dim,), where n is the number of vectors. Returns a tensor of
  shape (n,).

  The product of the norms is clamped from below by ``eps`` so that a
  zero vector yields a similarity of 0 instead of NaN (a NaN would
  otherwise corrupt a later argmax over the similarities).
  """
  assert possible.dim() == 2 and true.dim() == 1
  dot_products = torch.einsum("ij,j->i", possible, true)
  norm_products = torch.norm(possible, dim=1) * torch.norm(true)

  return dot_products / norm_products.clamp_min(eps)

In [0]:
def count_sense_freq(corpus, sense_embeddings, device, has_pos=True):
  """
  For the corpus <corpus>, count the number of times each sense of a word is
  used. A sense of a target word in a sentence is the sense that has the
  highest cosine similarity between the sense embedding and the target word's
  word embedding.

  corpus: dictionary of word -> list of sentences containing that word.
  sense_embeddings: dictionary of
      word: sense: <path to saved tensor of sense embedding>
  has_pos: accepted for compatibility; not used by the computation.

  Reads the notebook-level ``target_words``, ``word_sense_to_id``,
  ``bert_tokenizer`` and ``bert_model``.

  Returns a dictionary of word: sense id: count. Target words that are not
  in the corpus, or that have no available senses, map to an empty dict.
  """
  # the dictionary of word: sense: count which we will populate and return
  word_sense_to_freq = {word: {} for word in target_words}

  for word in corpus:
    # we only want to process the subset of target words
    if word not in target_words:
      continue
    print("Getting embeddings for word", word)

    # the target words in the testing set has an associated POS tag, so when
    # construct sense embeddings, we eliminate senses that are not of the
    # correct POS, some words only have senses of a particular POS in the corpus
    # and by eliminating it, we might not have any sense left for the word
    # we omit/skip these words
    if not word_sense_to_id[word]:
      print(f"Skipping word {word} because it has no available senses")
      continue

    # load the saved sense embeddings once per word; sense_ids[k] remembers
    # which sense the k-th row of the embedding matrix belongs to, so the
    # argmax position is never mistaken for a sense id
    sense_ids = []
    embeddings = []
    for sense, sense_id in word_sense_to_id[word].items():
      word_sense_to_freq[word][sense_id] = 0
      sense_ids.append(sense_id)
      embeddings.append(torch.load(sense_embeddings[word][sense]).view(-1))
    possible = torch.stack(embeddings).to(device)

    # the target is located by the first word piece of its encoding
    word_id = bert_tokenizer.encode(word, add_special_tokens=False)[0]

    # now we do the counting for the senses of the word
    for sentence in corpus[word]:
      tokenized = bert_tokenizer.encode(sentence)

      # we get the index of the target word in the sentence
      index = tokenized.index(word_id)
      tokenized = torch.tensor(tokenized).to(device).view(1, -1)

      # the word embedding of the target word in this sentence is obtained by
      # accessing the final hidden layer after passing the sentence through
      # the model; no gradients are needed for counting
      with torch.no_grad():
        word_embedding = bert_model.distilbert(tokenized)[0][0][index]

      # the sense of the word is the sense with the highest similarity score
      similarity = cosine_similarity(possible, word_embedding)
      best_sense_id = sense_ids[similarity.argmax().item()]
      word_sense_to_freq[word][best_sense_id] += 1

    del embeddings
  return word_sense_to_freq

Sense frequencies analysis

Note: To avoid recomputing these values every time the notebook is restarted, the computed values are saved to disk and loaded back. If the saved files are missing, they cannot be loaded and the values have to be recomputed (set `get_frequencies = True`). Without a GPU, the recomputation might take quite a while.

In [0]:
# Set to True to recompute the sense embeddings and sense counts from
# scratch; with False the previously saved results are loaded instead.
get_frequencies = False

# Count the frequencies and save the results
if get_frequencies:
  # % rm sense_embeddings/*

  # word -> sense -> integer id; ids follow the order in which the senses
  # appear in sentences_and_word_senses
  word_sense_to_id = {word: {} for word in target_words}
  for word in sentences_and_word_senses:
    # setdefault: a word that has sense data but is missing from
    # target_words must not raise a KeyError
    sense_to_id = word_sense_to_id.setdefault(word, {})
    for i, sense in enumerate(sentences_and_word_senses[word]):
      sense_to_id[sense] = i

  # the embeddings are written below; make sure the directory exists
  os.makedirs("sense_embeddings", exist_ok=True)

  sense_embeddings = {word: {} for word in sentences_and_word_senses}
  for word in sentences_and_word_senses:
    print(f"Processing word {word}")
    for sense in sentences_and_word_senses[word]:
      sentences = sentences_and_word_senses[word][sense]
      num_sentences = len(sentences)
      # size the accumulator from the model instead of hard-coding 768
      embedding = torch.zeros(bert_model.config.hidden_size)
      # no_grad: otherwise the running sum could accumulate the autograd
      # history of every sentence
      with torch.no_grad():
        # one sentence per forward pass, summed here and averaged afterwards
        for sentence in sentences:
          batch_embedding = get_bert_sense_embedding(
              [sentence], word, sense, bert_model, bert_tokenizer, device, True)
          embedding += batch_embedding
          del batch_embedding
          torch.cuda.empty_cache()
      # max(..., 1) keeps a sense without sentences at zeros instead of NaN
      embedding /= max(num_sentences, 1)
      sense_id = word_sense_to_id[word][sense]
      path = f"sense_embeddings/{word}_{sense_id}_embedding_pos.tar"
      torch.save(embedding, path)
      sense_embeddings[word][sense] = path
      del embedding

  with open("sense_embeddings.json", "w") as file:
    json.dump(sense_embeddings, file)

  with open("target_sent_corpus1.json") as file:
    word_to_sentences_corpus1 = json.load(file)
  with open("target_sent_corpus2.json") as file:
    word_to_sentences_corpus2 = json.load(file)

  print("Analyzing corpus 1...")
  word_sense_to_freq_corpus1 = count_sense_freq(
      word_to_sentences_corpus1, sense_embeddings, device)
  print("Analyzing corpus 2...")
  word_sense_to_freq_corpus2 = count_sense_freq(
      word_to_sentences_corpus2, sense_embeddings, device)

  print(word_sense_to_freq_corpus1)
  with open("word_sense_to_id_pos.json", "w") as file:
    json.dump(word_sense_to_id, file)

  with open("word_sense_freq_corpus1_pos.json", "w") as file:
    json.dump(word_sense_to_freq_corpus1, file)

  with open("word_sense_freq_corpus2_pos.json", "w") as file:
    json.dump(word_sense_to_freq_corpus2, file)
else:    # Load the results instead of recomputing
  with open("word_sense_to_id_pos.json") as file:
    word_sense_to_id = json.load(file)

  with open("word_sense_freq_corpus1_pos.json") as file:
    word_sense_to_freq_corpus1 = json.load(file)

  with open("word_sense_freq_corpus2_pos.json") as file:
    word_sense_to_freq_corpus2 = json.load(file)

  with open("sense_embeddings.json") as file:
    sense_embeddings = json.load(file)

In [0]:
# Quick check whether the two corpora produced identical sense counts
# (displays False when they differ).
word_sense_to_freq_corpus1 == word_sense_to_freq_corpus2

False

In [0]:
# Inverse of word_sense_to_id: word -> sense id -> sense description.
word_id_to_sense = {
    word: {sense_id: sense for sense, sense_id in sense_to_id.items()}
    for word, sense_to_id in word_sense_to_id.items()
}

In [0]:
# Example lookup: the description stored for sense id 1 of "ball".
word_id_to_sense["ball"][1]

'round object that is hit or thrown or kicked in games'

In [0]:
def get_sense_of_word(sentence, word, sense_embeddings, device):
  """
  Get the sense of the word <word> in <sentence>.

  Sense embeddings should contain the embeddings for all possible
  senses of the word <word>, as a dictionary of
  word: sense: <path to saved tensor of sense embedding>.

  Reads the notebook-level ``word_sense_to_id``, ``word_id_to_sense``,
  ``bert_tokenizer`` and ``bert_model``. Returns the description of the
  sense whose embedding has the highest cosine similarity with the
  embedding of the word in the sentence.
  """
  # Here, we perform the same procedure as when we compute the
  # frequencies of senses.

  # sense_ids[k] remembers which sense the k-th row of the embedding matrix
  # belongs to, so the argmax position is never mistaken for a sense id
  sense_ids = []
  embeddings = []
  for sense, sense_id in word_sense_to_id[word].items():
    sense_ids.append(sense_id)
    embeddings.append(torch.load(sense_embeddings[word][sense]).view(-1))
  possible = torch.stack(embeddings).to(device)

  tokenized = bert_tokenizer.encode(sentence)
  # the target is located by the first word piece of its encoding
  word_id = bert_tokenizer.encode(word, add_special_tokens=False)[0]
  index = tokenized.index(word_id)
  tokenized = torch.tensor(tokenized).to(device).view(1, -1)
  # inference only, so no autograd history is needed
  with torch.no_grad():
    word_embedding = bert_model.distilbert(tokenized)[0][0][index]

  # At the end, we get the sense embedding with the highest similarity
  # then we use the dictionary of sense index to sense description
  # to return the sense of the word
  similarity = cosine_similarity(possible, word_embedding)
  best_sense_id = sense_ids[similarity.argmax().item()]
  return word_id_to_sense[word][best_sense_id]

In [0]:
# Example: which sense of "ball" is used in this sentence?
get_sense_of_word("Harry went to the Yule Ball in his fourth year.", "ball", 
                  sense_embeddings, device)

'the people assembled at a lavish formal dance'

In [0]:
def compute_sense_use_ratio(sense_frequencies):
  """
  Compute the frequency of all senses of all words by dividing the number of
  times a sense of a word is used by the number of times that word is used.

  sense_frequencies: dictionary of word: sense: count.

  Returns a dictionary of word: sense: ratio with the same keys. A word
  whose counts sum to 0 gets a ratio of 0.0 for every sense instead of
  raising ZeroDivisionError.
  """
  sense_use_ratio = {}
  for word, sense_to_freq in sense_frequencies.items():
    total = sum(sense_to_freq.values())
    sense_use_ratio[word] = {
        sense: (count / total if total else 0.0)
        for sense, count in sense_to_freq.items()
    }

  return sense_use_ratio

In [0]:
sense_use_ratio_corpus1 = compute_sense_use_ratio(word_sense_to_freq_corpus1)
sense_use_ratio_corpus2 = compute_sense_use_ratio(word_sense_to_freq_corpus2)

# word -> sense -> (ratio in corpus 1, ratio in corpus 2), following the
# words and senses of corpus 1
sense_use_ratio_compare = {
    word: {
        sense: (ratio_corpus1, sense_use_ratio_corpus2[word][sense])
        for sense, ratio_corpus1 in ratios_corpus1.items()
    }
    for word, ratios_corpus1 in sense_use_ratio_corpus1.items()
}

In [0]:
# List the senses of "attack" (id and description), then display the usage
# ratios of each sense in the two corpora. The sense keys are strings here,
# hence the int() conversion for the lookup.
for sense in sense_use_ratio_compare["attack"]:
  print(sense, word_id_to_sense["attack"][int(sense)])
sense_use_ratio_compare["attack"]

0 the act of attacking
1 (military) an offensive against an enemy (using weapons)
2 ideas or actions intended to deal with a problem or situation
3 intense adverse criticism
4 a decisive manner of beginning a musical tone or phrase
5 an offensive move in a sport or game
6 a sudden occurrence of an uncontrollable condition


{'0': (0.0022026431718061676, 0.006002400960384154),
 '1': (0.18061674008810572, 0.42016806722689076),
 '2': (0.022026431718061675, 0.006002400960384154),
 '3': (0.6475770925110133, 0.3169267707082833),
 '4': (0.0022026431718061676, 0.0012004801920768306),
 '5': (0.08590308370044053, 0.07563025210084033),
 '6': (0.05947136563876652, 0.17406962785114047)}

In [0]:
# Persist the comparison; the `with` block closes the file on exit, so no
# explicit close() is needed.
with open("sense_use_ratio_compare.json", "w") as file:
  json.dump(sense_use_ratio_compare, file)

In [0]:
# Load the gold labels: one "<word>\t<label>" pair per line. Blank lines are
# skipped and the file is closed as soon as it has been read.
with open("english.txt") as file:
  true_data = [line.split("\t") for line in file if line.strip()]

word_to_label = {}
for word, label in true_data:
  # drop the last three characters of the entry (presumably a POS suffix
  # such as "_nn" -- TODO confirm against english.txt)
  stripped_word = word[:-3]
  # keep only words for which we actually have sense ratios
  if stripped_word in sense_use_ratio_compare and sense_use_ratio_compare[stripped_word]:
    word_to_label[stripped_word] = int(label.strip())

word_to_label

{'attack': 1,
 'bag': 0,
 'ball': 0,
 'bit': 1,
 'chairman': 0,
 'circle': 1,
 'contemplation': 0,
 'edge': 1,
 'face': 0,
 'fiction': 0,
 'gas': 0,
 'head': 1,
 'land': 1,
 'lane': 0,
 'ounce': 0,
 'part': 0,
 'pin': 0,
 'plane': 1,
 'player': 1,
 'prop': 1,
 'rag': 1,
 'record': 1,
 'relationship': 0,
 'risk': 0,
 'stroke': 0,
 'tip': 1,
 'tree': 0,
 'twist': 0,
 'word': 0}

In [0]:
def get_accuracy(predicted, actual):
  """
  Return the fraction of words in ``predicted`` whose value equals the
  value stored for the same word in ``actual``.

  predicted: dictionary of word: predicted label.
  actual: dictionary of word: true label; must contain every predicted word
      (a missing word raises KeyError).

  The denominator is the number of predictions passed in, not the size of
  any notebook-level dictionary. Returns 0.0 when ``predicted`` is empty.
  """
  total = len(predicted)
  if total == 0:
    return 0.0

  correct = sum(1 for word in predicted if predicted[word] == actual[word])
  return correct / total

In [0]:
# Label every sense of every word as gained ("+"), lost ("-") or unchanged
# (0) between the two corpora. A sense counts as in use when its usage ratio
# reaches the threshold.
gain_loss = {}
threshold = 0.02
for word, sense_ratios in sense_use_ratio_compare.items():
  if not sense_ratios:
    # words without any sense ratios are left out entirely
    continue
  gain_loss[word] = {}
  for sense in sense_ratios:
    ratios = sense_ratios[sense]
    if ratios[0] < threshold <= ratios[1]:
      # below the threshold in corpus 1, at or above it in corpus 2
      gain_loss[word][sense] = "+"
    elif ratios[1] < threshold <= ratios[0]:
      # at or above the threshold in corpus 1, below it in corpus 2
      gain_loss[word][sense] = "-"
    else:
      gain_loss[word][sense] = 0

# a word has changed (1) as soon as one of its senses was gained or lost
changed = {word: int(any(gain_loss[word].values())) for word in gain_loss}

print(f"Accuracy: {get_accuracy(changed, word_to_label)}")

Accuracy: 0.6896551724137931


In [0]:
# using plain counts instead of normalizing them

# Same decision as above but on raw counts: a word has changed (1) when some
# sense goes from at most 2 uses in one corpus to at least 5 in the other.
changed_using_count = {}
for word, sense_counts in word_sense_to_freq_corpus1.items():
  if not sense_counts:
    # words without any counted senses are left out entirely
    continue
  for sense in sense_counts:
    count1 = sense_counts[sense]
    count2 = word_sense_to_freq_corpus2[word][sense]

    if (count1 <= 2 and count2 >= 5) or (count1 >= 5 and count2 <= 2):
      changed_using_count[word] = 1
      break
  else:
    # no sense crossed the count thresholds
    changed_using_count[word] = 0

print(f"Accuracy: {get_accuracy(changed_using_count, word_to_label)}")

Accuracy: 0.5862068965517241
