organize instructions
TalSchuster committed Mar 10, 2019
1 parent 76d2e81 commit a61e85f
Showing 3 changed files with 60 additions and 32 deletions.
40 changes: 31 additions & 9 deletions README.md
@@ -14,27 +14,49 @@ The following models were trained on Wikipedia and the second layer was aligned

| Language | Model weights | Aligning matrix |
| ------------- |:-------------:| :-----:|
| en | [weights.hdf5](https://www.dropbox.com/s/1h62kc1qdcuyy2u/en_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/nufj4pxxgv5838r/en_best_mapping.pth) |
| es | [weights.hdf5](https://www.dropbox.com/s/ygfjm7zmufl5gu2/es_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/6kqot8ssy66d5u0/es_best_mapping.pth) |
| fr | [weights.hdf5](https://www.dropbox.com/s/mm64goxb8wbawhj/fr_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/0zdlanjhajlgflm/fr_best_mapping.pth) |
| it | [weights.hdf5](https://www.dropbox.com/s/owfou7coi04dyxf/it_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/gg985snnhajhm5i/it_best_mapping.pth) |
| pt | [weights.hdf5](https://www.dropbox.com/s/ul82jsal1khfw5b/pt_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/skdfz6zfud24iup/pt_best_mapping.pth) |
| sv | [weights.hdf5](https://www.dropbox.com/s/boptz21zrs4h3nw/sv_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/o7v64hciyifvs8k/sv_best_mapping.pth) |
| de | [weights.hdf5](https://www.dropbox.com/s/2kbjnvb12htgqk8/de_weights.hdf5) | [best_mapping.pth](https://www.dropbox.com/s/u9cg19o81lpm0h0/de_best_mapping.pth) |


options file (for all models) - [options.json](https://www.dropbox.com/s/ypjuzlf7kj957g3/options262.json)

To download all of the ELMo models in the table, use `get_models.sh`.

To download all of the alignment matrices in the table, use `get_alignments.sh`.
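
The downloaded weights and the options file are standard ELMo checkpoints, so they can be loaded with AllenNLP's `ElmoEmbedder` (the same class `demo.py` and `gen_anchors.py` use). A minimal sketch, with hypothetical local paths that should point at wherever you saved the downloaded files:

```
from allennlp.commands.elmo import ElmoEmbedder

# hypothetical paths to the downloaded Spanish weights and the shared options file
elmo = ElmoEmbedder(
    options_file='models/options262.json',
    weight_file='models/es_weights.hdf5',
    cuda_device=-1,  # -1 runs on the CPU
)
layers = elmo.embed_sentence(['Hola', 'mundo'])  # numpy array of shape (3, 2, 1024)
```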

### Generating anchors

Use the `gen_anchors.py` script to generate your own anchors. You will need a trained ELMo model, text files with one sentence per line, and a vocab file with one token per line listing the tokens you want anchors for.
Run `gen_anchors.py -h` for more details.
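
The anchors are written to the output directory as one text file per layer (`avg_embeds_<layer>.txt`), plus a `norms.json` file with the average embedding norm of each layer. A minimal sketch for reading an anchor file back in, assuming (as in `save_embeds` below) a word2vec-style text file whose header line gives the token count and dimension and whose remaining lines hold a token followed by its space-separated vector:

```
import numpy as np

def load_anchors(path):
    '''Load the per-token anchors produced by gen_anchors.py.'''
    anchors = {}
    with open(path, encoding='utf-8') as f:
        n_tokens, emb_dim = map(int, f.readline().split())
        for line in f:
            parts = line.rstrip('\n').split(' ')
            anchors[parts[0]] = np.asarray(parts[1:emb_dim + 1], dtype=np.float32)
    return anchors

# hypothetical output path for layer 1 of a Spanish run
es_anchors = load_anchors('anchors_output/es/avg_embeds_1.txt')
```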

## Usage

### Generating aligned contextual embeddings

Given the output of a specific layer from ELMo (the contextual embeddings), run:
```
aligning = torch.load(aligning_matrix_path)
aligned_embeddings = np.matmul(embeddings, aligning.transpose())
```

An example can be seen in `demo.py`.
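
For instance, a minimal end-to-end sketch (hypothetical paths, following `demo.py`; layer index 1 of the embedder output is the second layer, which is the one the alignment was trained for):

```
import numpy as np
import torch
from allennlp.commands.elmo import ElmoEmbedder

elmo = ElmoEmbedder('models/options262.json', 'models/es_weights.hdf5')
aligning = torch.load('models/align/es_best_mapping.pth')

embeddings = elmo.embed_sentence(['Hola', 'mundo'])[1]  # second layer, shape (2, 1024)
aligned_embeddings = np.matmul(embeddings, aligning.transpose())
```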


### Using in a model

The models can be used with the [AllenNLP](https://allennlp.org) framework: take any model that uses ELMo embeddings and replace the ELMo paths in its configuration with the files provided above.

Each ELMo model was trained on the Wikipedia of its language. To align the embeddings, add the following code to your model:

Load the alignment matrix in the `__init__()` function:

```
aligning_matrix_path = ...  # path to the best_mapping.pth file for the language
self.aligning_matrix = torch.FloatTensor(torch.load(aligning_matrix_path))
self.aligning = torch.nn.Linear(self.aligning_matrix.shape[0], self.aligning_matrix.shape[1], bias=False)
self.aligning.weight = torch.nn.Parameter(self.aligning_matrix)
```
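
Then, in the model's `forward()`, run the contextual embeddings through the aligning layer before the rest of the network uses them. A minimal sketch, where `self.text_field_embedder` stands in for whatever ELMo-based embedder your model already has:

```
def forward(self, tokens):
    # contextual embeddings from the model's ELMo-based embedder
    embeddings = self.text_field_embedder(tokens)
    # project them into the shared, aligned space
    embeddings = self.aligning(embeddings)
    # ... the rest of the model is unchanged ...
```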
9 changes: 5 additions & 4 deletions demo.py
@@ -13,7 +13,7 @@
'--elmo_weights_path',
type=str,
default='models/$l_weights.hdf5',
help="Path to elmo weights files - use $l as a placeholder for language")
help="Path to elmo weights files - use $l as a placeholder for language.")
parser.add_argument(
'--elmo_options_path',
type=str,
@@ -23,7 +23,7 @@
'--align_path',
type=str,
default='models/align/$l_best_mapping.pth',
help="Path to elmo options file. n_characters in the file should be 262")
help="Path to the aligning matrix saved in a pyTorch format. Use $l as a placeholder for language.")
parser.add_argument(
'-l1',
'--language1',
@@ -75,8 +75,8 @@

def parse_config(args):
'''
Replaces $l for the two languages.
Prints the args
'''

new_args = copy.deepcopy(args)
@@ -150,6 +150,7 @@ def analyze_sents(embeds_l1, embeds_l2, sent1, sent2, w1_ind, w2_ind, k=5):
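    # load the aligning matrix for language 2 and map its embeddings into the shared space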
align2 = torch.load(args.align_path_l2)
s2_embeds_aligned = np.matmul(s2_embeds, align2.transpose())

# Analyse
print("--- Before alignment:")
analyze_sents(s1_embeds, s2_embeds, sent1_tokens, sent2_tokens, w1_ind, w2_ind)

43 changes: 24 additions & 19 deletions gen_anchors.py
@@ -3,6 +3,7 @@
import glob
import os
import json
import sys
from tqdm import tqdm

from allennlp.commands.elmo import ElmoEmbedder
@@ -15,7 +16,7 @@
'--elmo_weights_path',
type=str,
default='models/$l_weights.hdf5',
help="Path to elmo weight file. Can use $l as a placeholder for language")
help="Path to elmo weight file. Can use $l as a placeholder for language argument")
parser.add_argument(
'--elmo_options_path',
type=str,
@@ -32,7 +33,7 @@
type=str,
default='wiki_files/$l/dev*.txt',
help=
"Path to files with sentences (one per line). Can use $l as a placeholder for language"
"Path to files with sentences (one per line). Can use $l as a placeholder for language argument"
)
parser.add_argument(
'--vocab_file',
@@ -45,7 +46,7 @@
'--out_dir',
type=str,
default='anchors_output/$l',
help="Path to output directory. Can use $l as a placeholder for language")
help="Path to output directory. Can use $l as a placeholder for language argument")
parser.add_argument(
'--layers',
type=int,
@@ -57,13 +58,13 @@
parser.add_argument(
'-d', '--emb_dim', type=int, default=1024, help="Embeddings size")
parser.add_argument(
'-c', '--cuda_device', type=int, default=-1, help="Cuda device. Use -1 for cpu")
args = parser.parse_args()


def parse_config(args):
'''
replace $l with args.language
print args
'''

@@ -102,18 +103,18 @@ def run_elmo(txt_files, elmo_options_file, elmo_weights_file, vocab, layers,
batch_size - batch size
cuda_device - cuda device
Returns dicts of anchors and norm (per layer) and the list of occurrences per token (indices by vocab)
'''
print('Loading ELMo Embedder...')
elmo = ElmoEmbedder(elmo_options_file, elmo_weights_file, cuda_device)
num_occurrences = [0] * vocab.get_vocab_size()
anchors = {}
norms = {}
total_words = 0
for l in layers:
norms[l] = 0.0
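        # per-layer running mean of every vocab token's contextual embeddings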
anchors[l] = np.zeros(
shape=(vocab.get_vocab_size(), args.emb_dim))

oov_ind = vocab.get_token_index(vocab._oov_token)
@@ -133,28 +134,28 @@ def save_embeds(file_path, embeds, vocab, num_occurences, emb_dim):
if w_id == oov_ind:
continue

n = num_occurrences[w_id]
for l in layers:
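                    # incremental mean: fold this occurrence into the token's running anchor,
                    # and (below) into the running average embedding norm for this layer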
anchors[l][
w_id, :] = anchors[l][w_id, :] * (
n / (n + 1)) + em[l, j, :] / (n + 1)
norm = np.linalg.norm(em[l,j,:])
norms[l] = norms[l] * (total_words / (total_words +
1)) + norm / (total_words + 1)

total_words += 1
num_occurrences[w_id] += 1
f.close()

return anchors, norms, num_occurrences

def save_embeds(file_path, embeds, vocab, num_occurrences, emb_dim):
# Don't include words not in the text.
n_tokens = len(np.nonzero(num_occurrences)[0])
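    # word2vec-style text header: '<n_tokens> <emb_dim>' on the first line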
with open(file_path, 'w') as f:
f.write('%d %d\n' % (n_tokens, emb_dim))
for i in range(embeds.shape[0]):
if num_occurrences[i] == 0:
continue

token = vocab.get_token_from_index(i)
@@ -164,11 +165,15 @@ def save_embeds(file_path, embeds, vocab, num_occurences, emb_dim):

if __name__ == '__main__':
args = parse_config(args)
if os.path.exists(args.out_dir):
print("Output dir already exists: {}".format(args.out_dir))
sys.exit(1)

vocab = vocabulary.Vocabulary()
vocab.set_from_file(args.vocab_file, oov_token='<UNK>')
print("Loaded vocabulary of size {}".format(vocab.get_vocab_size()))

anchors, norms, num_occurrences = run_elmo(
args.txt_files, args.elmo_options_path, args.elmo_weights_path, vocab,
args.layers, args.batch_size, args.cuda_device)

@@ -179,7 +184,7 @@ def save_embeds(file_path, embeds, vocab, num_occurences, emb_dim):
norm_key = 'avg_norm_layer_{}'.format(l)
norm_dict[norm_key] = norms[l]
file_path = os.path.join(args.out_dir, 'avg_embeds_{}.txt'.format(l))
save_embeds(file_path, anchors[l], vocab, num_occurrences, args.emb_dim)

file_path = os.path.join(args.out_dir, 'norms.json'.format(l))
json.dump(norm_dict, open(file_path, 'w'))
