organize instructions
TalSchuster committed Mar 10, 2019
1 parent 76d2e81 commit a61e85f
Showing 3 changed files with 60 additions and 32 deletions.
40 changes: 31 additions & 9 deletions README.md
@@ -14,27 +14,49 @@ The following models were trained on Wikipedia and the second layer was aligned

| Language | Model weights | Aligning matrix |
| ------------- |:-------------:| :-----:|
| en | [weights.hdf5](https://www.dropbox.com/s/1h62kc1qdcuyy2u/en_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/nufj4pxxgv5838r/en_best_mapping.pth) |
| es | [weights.hdf5](https://www.dropbox.com/s/ygfjm7zmufl5gu2/es_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/6kqot8ssy66d5u0/es_best_mapping.pth) |
| fr | [weights.hdf5](https://www.dropbox.com/s/mm64goxb8wbawhj/fr_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/0zdlanjhajlgflm/fr_best_mapping.pth) |
| it | [weights.hdf5](https://www.dropbox.com/s/owfou7coi04dyxf/it_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/gg985snnhajhm5i/it_best_mapping.pth) |
| pt | [weights.hdf5](https://www.dropbox.com/s/ul82jsal1khfw5b/pt_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/skdfz6zfud24iup/pt_best_mapping.pth) |
| sv | [weights.hdf5](https://www.dropbox.com/s/boptz21zrs4h3nw/sv_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/o7v64hciyifvs8k/sv_best_mapping.pth) |
| de | [weights.hdf5](https://www.dropbox.com/s/2kbjnvb12htgqk8/de_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/u9cg19o81lpm0h0/de_best_mapping.pth) |


options file (for all models) - [options.json](https://www.dropbox.com/s/ypjuzlf7kj957g3/options262.json)

To download all of the ELMo models in the table, use `get_models.sh`.

To download all of the alignment matrices in the table, use `get_alignments.sh`.
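
The downloaded weights and the options file are standard ELMo checkpoints, so they can be loaded with AllenNLP's `ElmoEmbedder` (the same class `demo.py` and `gen_anchors.py` use). A minimal sketch, with hypothetical local paths that should point at wherever you saved the downloaded files:

```
from allennlp.commands.elmo import ElmoEmbedder

# hypothetical paths to the downloaded Spanish weights and the shared options file
elmo = ElmoEmbedder(
    options_file='models/options262.json',
    weight_file='models/es_weights.hdf5',
    cuda_device=-1,  # -1 runs on the CPU
)
layers = elmo.embed_sentence(['Hola', 'mundo'])  # numpy array of shape (3, 2, 1024)
```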

### Generating anchors

Use the `gen_anchors.py` script to generate your own anchors. You will need a trained ELMo model, text files with one sentence per line, and a vocab file with one token per line listing the tokens you want anchors for.
Run `gen_anchors.py -h` for more details.
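
The anchors are written to the output directory as one text file per layer (`avg_embeds_<layer>.txt`), plus a `norms.json` file with the average embedding norm of each layer. A minimal sketch for reading an anchor file back in, assuming (as in `save_embeds` below) a word2vec-style text file whose header line gives the token count and dimension and whose remaining lines hold a token followed by its space-separated vector:

```
import numpy as np

def load_anchors(path):
    '''Load the per-token anchors produced by gen_anchors.py.'''
    anchors = {}
    with open(path, encoding='utf-8') as f:
        n_tokens, emb_dim = map(int, f.readline().split())
        for line in f:
            parts = line.rstrip('\n').split(' ')
            anchors[parts[0]] = np.asarray(parts[1:emb_dim + 1], dtype=np.float32)
    return anchors

# hypothetical output path for layer 1 of a Spanish run
es_anchors = load_anchors('anchors_output/es/avg_embeds_1.txt')
```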

## Usage

### Generating aligned contextual embeddings

Given the output of a specific layer from ELMo (the contextual embeddings), run:
```
aligning = torch.load(aligning_matrix_path)
aligned_embeddings = np.matmul(embeddings, aligning.transpose())
```

An example can be seen in `demo.py`.
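
For instance, a minimal end-to-end sketch (hypothetical paths, following `demo.py`; layer index 1 of the embedder output is the second layer, which is the one the alignment was trained for):

```
import numpy as np
import torch
from allennlp.commands.elmo import ElmoEmbedder

elmo = ElmoEmbedder('models/options262.json', 'models/es_weights.hdf5')
aligning = torch.load('models/align/es_best_mapping.pth')

embeddings = elmo.embed_sentence(['Hola', 'mundo'])[1]  # second layer, shape (2, 1024)
aligned_embeddings = np.matmul(embeddings, aligning.transpose())
```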


### Using in a model

The models can be used with the [AllenNLP](https://allennlp.org) framework: take any model that uses ELMo embeddings and replace the ELMo paths in its configuration with the files provided above.

Each ELMo model was trained on the Wikipedia of its language. To align the embeddings, add the following code to your model:

Load the alignment matrix in the `__init__()` function:

```
aligning_matrix_path = ...  # path to the best_mapping.pth file for the language
self.aligning_matrix = torch.FloatTensor(torch.load(aligning_matrix_path))
self.aligning = torch.nn.Linear(self.aligning_matrix.shape[0], self.aligning_matrix.shape[1], bias=False)
self.aligning.weight = torch.nn.Parameter(self.aligning_matrix)
```
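
Then, in the model's `forward()`, run the contextual embeddings through the aligning layer before the rest of the network uses them. A minimal sketch, where `self.text_field_embedder` stands in for whatever ELMo-based embedder your model already has:

```
def forward(self, tokens):
    # contextual embeddings from the model's ELMo-based embedder
    embeddings = self.text_field_embedder(tokens)
    # project them into the shared, aligned space
    embeddings = self.aligning(embeddings)
    # ... the rest of the model is unchanged ...
```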
9 changes: 5 additions & 4 deletions demo.py
@@ -13,7 +13,7 @@
'--elmo_weights_path',
type=str,
default='models/$l_weights.hdf5',
help="Path to elmo weights files - use $l as a placeholder for language")
help="Path to elmo weights files - use $l as a placeholder for language.")
parser.add_argument(
'--elmo_options_path',
type=str,
@@ -23,7 +23,7 @@
'--align_path',
type=str,
default='models/align/$l_best_mapping.pth',
help="Path to elmo options file. n_characters in the file should be 262")
help="Path to the aligning matrix saved in a pyTorch format. Use $l as a placeholder for language.")
parser.add_argument(
'-l1',
'--language1',
@@ -75,8 +75,8 @@

def parse_config(args):
'''
Replaces $l for the two languages.
Prints the args
'''

new_args = copy.deepcopy(args)
@@ -150,6 +150,7 @@ def analyze_sents(embeds_l1, embeds_l2, sent1, sent2, w1_ind, w2_ind, k=5):
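    # load the aligning matrix for language 2 and map its embeddings into the shared space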
align2 = torch.load(args.align_path_l2)
s2_embeds_aligned = np.matmul(s2_embeds, align2.transpose())

# Analyse
print("--- Before alignment:")
analyze_sents(s1_embeds, s2_embeds, sent1_tokens, sent2_tokens, w1_ind, w2_ind)

43 changes: 24 additions & 19 deletions gen_anchors.py
@@ -3,6 +3,7 @@
import glob
import os
import json
import sys
from tqdm import tqdm

from allennlp.commands.elmo import ElmoEmbedder
@@ -15,7 +16,7 @@
'--elmo_weights_path',
type=str,
default='models/$l_weights.hdf5',
help="Path to elmo weight file. Can use $l as a placeholder for language")
help="Path to elmo weight file. Can use $l as a placeholder for language argument")
parser.add_argument(
'--elmo_options_path',
type=str,
@@ -32,7 +33,7 @@
type=str,
default='wiki_files/$l/dev*.txt',
help=
"Path to files with sentences (one per line). Can use $l as a placeholder for language"
"Path to files with sentences (one per line). Can use $l as a placeholder for language argument"
)
parser.add_argument(
'--vocab_file',
@@ -45,7 +46,7 @@
'--out_dir',
type=str,
default='anchors_output/$l',
help="Path to output directory. Can use $l as a placeholder for language")
help="Path to output directory. Can use $l as a placeholder for language argument")
parser.add_argument(
'--layers',
type=int,
@@ -57,13 +58,13 @@
parser.add_argument(
'-d', '--emb_dim', type=int, default=1024, help="Embeddings size")
parser.add_argument(
'-c', '--cuda_device', type=int, default=-1, help="Cuda device. Use -1 for cpu")
args = parser.parse_args()


def parse_config(args):
'''
replace $l with args.language
print args
'''

@@ -102,18 +103,18 @@ def run_elmo(txt_files, elmo_options_file, elmo_weights_file, vocab, layers,
batch_size - batch size
cuda_device - cuda device
Returns dicts of anchors and norm (per layer) and the list of occurrences per token (indices by vocab)
'''
print('Loading ELMo Embedder...')
elmo = ElmoEmbedder(elmo_options_file, elmo_weights_file, cuda_device)
num_occurrences = [0] * vocab.get_vocab_size()
anchors = {}
norms = {}
total_words = 0
for l in layers:
norms[l] = 0.0
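        # per-layer running mean of every vocab token's contextual embeddings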
anchors[l] = np.zeros(
shape=(vocab.get_vocab_size(), args.emb_dim))

oov_ind = vocab.get_token_index(vocab._oov_token)
@@ -133,28 +134,28 @@ def save_embeds(file_path, embeds, vocab, num_occurences, emb_dim):
if w_id == oov_ind:
continue

n = num_occurrences[w_id]
for l in layers:
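                    # incremental mean: fold this occurrence into the token's running anchor,
                    # and (below) into the running average embedding norm for this layer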
anchors[l][
w_id, :] = anchors[l][w_id, :] * (
n / (n + 1)) + em[l, j, :] / (n + 1)
norm = np.linalg.norm(em[l,j,:])
norms[l] = norms[l] * (total_words / (total_words +
1)) + norm / (total_words + 1)

total_words += 1
num_occurrences[w_id] += 1
f.close()

return anchors, norms, num_occurrences

def save_embeds(file_path, embeds, vocab, num_occurrences, emb_dim):
# Don't include words not in the text.
n_tokens = len(np.nonzero(num_occurrences)[0])
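    # word2vec-style text header: '<n_tokens> <emb_dim>' on the first line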
with open(file_path, 'w') as f:
f.write('%d %d\n' % (n_tokens, emb_dim))
for i in range(embeds.shape[0]):
if num_occurrences[i] == 0:
continue

token = vocab.get_token_from_index(i)
@@ -164,11 +165,15 @@ def save_embeds(file_path, embeds, vocab, num_occurences, emb_dim):

if __name__ == '__main__':
args = parse_config(args)
if os.path.exists(args.out_dir):
print("Output dir already exists: {}".format(args.out_dir))
sys.exit(1)

vocab = vocabulary.Vocabulary()
vocab.set_from_file(args.vocab_file, oov_token='<UNK>')
print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

anchors, norms, num_occurrences = run_elmo(
args.txt_files, args.elmo_options_path, args.elmo_weights_path, vocab,
args.layers, args.batch_size, args.cuda_device)

@@ -179,7 +184,7 @@ def save_embeds(file_path, embeds, vocab, num_occurences, emb_dim):
norm_key = 'avg_norm_layer_{}'.format(l)
norm_dict[norm_key] = norms[l]
file_path = os.path.join(args.out_dir, 'avg_embeds_{}.txt'.format(l))
save_embeds(file_path, anchors[l], vocab, num_occurrences, args.emb_dim)

file_path = os.path.join(args.out_dir, 'norms.json'.format(l))
json.dump(norm_dict, open(file_path, 'w'))
