In [15]:
from elmoformanylangs import Embedder
from gensim.models import KeyedVectors
import os

In [16]:
MODEL = "turkish"
DICTIONARY = MODEL + os.sep + "word.dic"

In [5]:
e = Embedder(MODEL)

"""
sents2elmo takes a list of lists of tokens as input and returns a list of numpy arrays.
[[t1, t2, t3], [t1, t2, t3, t4, t5]] -> [array(3, 1024), array(5, 1024)]
"""

sents = [['Dönülmez', 'akşamın', 'ufkundayım'],
['Gel', 'gitme', 'usulca', 'iyi', 'geceye']]
# the list of lists which store the sentences 
# after segment if necessary.

embeddings = e.sents2elmo(sents)
print(embeddings)
print([e.shape for e in embeddings])
# will return a list of numpy arrays 
# each with the shape=(seq_len, embedding_size)

2023-04-17 13:42:11,599 INFO: char embedding size: 2518
2023-04-17 13:42:12,968 INFO: word embedding size: 327303
2023-04-17 13:42:16,401 INFO: Model(
  (token_embedder): ConvTokenEmbedder(
    (word_emb_layer): EmbeddingLayer(
      (embedding): Embedding(327303, 100, padding_idx=3)
    )
    (char_emb_layer): EmbeddingLayer(
      (embedding): Embedding(2518, 50, padding_idx=2515)
    )
    (convolutions): ModuleList(
      (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
      (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
      (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
      (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
      (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
      (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
    )
    (highways): Highway(
      (_layers): ModuleList(
        (0-1): 2 x Linear(in_features=2048, out_features=4096, bias=True)
      )
    )
    (projection): Linear

[array([[-0.6306045 , -0.45687756, -0.09643789, ..., -0.8433609 ,
        -1.2803847 , -0.8771979 ],
       [-1.4264183 , -0.34265804,  1.159494  , ..., -0.7987824 ,
        -1.1419177 , -1.5314445 ],
       [ 0.20667028,  0.14181264, -0.02899539, ..., -1.6905688 ,
        -2.1933181 , -1.5642744 ]], dtype=float32), array([[-0.2922435 , -0.36825255,  0.15211569, ..., -0.76893157,
        -1.6767775 , -0.6212325 ],
       [-2.4404142 , -0.8221317 ,  0.12822676, ..., -2.2400658 ,
        -1.1370882 , -0.06700142],
       [ 0.7004752 , -0.04158038,  0.13688873, ..., -1.1650782 ,
        -1.0667309 , -1.0297538 ],
       [-0.91501826,  0.04276076, -0.8616883 , ..., -0.9971096 ,
        -0.60500205,  1.3842624 ],
       [ 0.9499776 , -0.6818509 ,  0.7994628 , ..., -2.4046743 ,
        -2.0230227 , -1.4721899 ]], dtype=float32)]
[(3, 1024), (5, 1024)]


In [12]:
emb = e.sents2elmo([["anne"]])
emb[0].shape

2023-04-17 13:44:14,498 INFO: 1 batches, avg len: 3.0


(1, 1024)

In [21]:
### Convert ELMo embeddings to word vectors for Gensim

# first 4 words are special tokens:
# <oov>, <bos>, <eos>, <pad>
with open(DICTIONARY, "r", encoding="utf-8") as f:
    words = [line.strip().split("\t")[0] for line in f.readlines()[4:]]

words[:10]

[',', '.', 've', '-', ':', 'bir', '(', ')', 'ki', 'için']

In [22]:
vectors = [e.sents2elmo([[word]]) for word in words]

2023-04-17 14:33:33,000 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:33,159 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:33,292 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:33,408 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:33,520 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:33,657 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:33,817 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:33,954 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:34,095 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:34,225 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:34,345 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:34,453 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:34,567 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:34,684 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:34,802 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:34,927 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:35,053 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:35,252 INFO: 1 batches, avg len: 3.0
2023-04-17 14:33:35,477 INFO

In [None]:
words.shape
vectors.shape

In [None]:
model = KeyedVectors(vectors.shape[1])
model.add(words, vectors)

In [None]:
model.save('elmo.wordvectors')