In [1]:
from elmoformanylangs import Embedder
from gensim.models import KeyedVectors
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the pretrained ELMo model files.
MODEL = "turkish"
# Vocabulary file that lives inside the model directory.
DICTIONARY = os.path.join(MODEL, "word.dic")

In [3]:
# Build the ELMo embedder from the model directory named by MODEL.
e = Embedder(MODEL)

# sents2elmo takes a list of tokenised sentences (a list of lists of tokens)
# and returns a list with one numpy array per sentence:
# [[t1, t2, t3], [t1, t2, t3, t4, t5]] -> [array(3, 1024), array(5, 1024)]

# Example input: two sentences, each already split into tokens
# (segment the text beforehand if it is not tokenised yet).
sents = [
    ['Dönülmez', 'akşamın', 'ufkundayım'],
    ['Gel', 'gitme', 'usulca', 'iyi', 'geceye'],
]

embeddings = e.sents2elmo(sents)
print(embeddings)
# One shape per sentence, each of the form (seq_len, embedding_size).
# The loop variable is deliberately not called `e`, to avoid confusion with
# the embedder defined above.
print([sentence_embedding.shape for sentence_embedding in embeddings])

2023-06-11 17:01:03,886 INFO: char embedding size: 2518
2023-06-11 17:01:04,942 INFO: word embedding size: 327303
2023-06-11 17:01:08,353 INFO: Model(
  (token_embedder): ConvTokenEmbedder(
    (word_emb_layer): EmbeddingLayer(
      (embedding): Embedding(327303, 100, padding_idx=3)
    )
    (char_emb_layer): EmbeddingLayer(
      (embedding): Embedding(2518, 50, padding_idx=2515)
    )
    (convolutions): ModuleList(
      (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
      (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
      (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
      (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
      (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
      (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
    )
    (highways): Highway(
      (_layers): ModuleList(
        (0): Linear(in_features=2048, out_features=4096, bias=True)
        (1): Linear(in_features=2048, out_fe

[array([[-0.63060707, -0.4568481 , -0.0964649 , ..., -0.843416  ,
        -1.2804095 , -0.87734586],
       [-1.4264383 , -0.34266123,  1.1595092 , ..., -0.79874307,
        -1.1418897 , -1.5314862 ],
       [ 0.20657365,  0.14184271, -0.02908047, ..., -1.690704  ,
        -2.193326  , -1.5649871 ]], dtype=float32), array([[-0.2922735 , -0.36826015,  0.15210158, ..., -0.7689497 ,
        -1.6767696 , -0.620944  ],
       [-2.4404452 , -0.8221226 ,  0.12824607, ..., -2.240035  ,
        -1.1370629 , -0.06703468],
       [ 0.7004862 , -0.0415719 ,  0.13695411, ..., -1.1650708 ,
        -1.0666986 , -1.0297165 ],
       [-0.9151085 ,  0.04273026, -0.86166364, ..., -0.9971321 ,
        -0.6050211 ,  1.3840925 ],
       [ 0.95000213, -0.6819268 ,  0.7994997 , ..., -2.4046612 ,
        -2.023009  , -1.4719483 ]], dtype=float32)]
[(3, 1024), (5, 1024)]


In [None]:
# Sanity check: embed one sentence consisting of a single token and inspect
# the shape of the array returned for that sentence.
probe_sentences = [["anne"]]
emb = e.sents2elmo(probe_sentences)
emb[0].shape

In [None]:
### Convert ELMo embeddings to word vectors for Gensim

# The first 4 lines of the dictionary file are special tokens
# (<oov>, <bos>, <eos>, <pad>); skip them. For every remaining line keep only
# the text before the first tab.
with open(DICTIONARY, "r", encoding="utf-8") as f:
    words = [
        entry.strip().split("\t")[0]
        for line_no, entry in enumerate(f)
        if line_no >= 4
    ]

# Preview the first few vocabulary entries.
words[:10]

In [None]:
# Embed every vocabulary word on its own, as a one-token sentence.
# Each call returns a list holding one array for that sentence, so the stacked
# result is presumably (len(words), 1, 1, embedding_size) -- the singleton axes
# are removed in a later cell.
# NOTE(review): this makes one sents2elmo call per word, which is likely slow
# for a large vocabulary.
vectors = np.array(
    [e.sents2elmo([[token]]) for token in words]
)

In [None]:
# Remove the two length-1 axes (positions 1 and 2) in one call.
# This rebinds `vectors`, so the cell is not re-runnable on its own: running it
# a second time without recomputing `vectors` raises an error.
vectors = np.squeeze(vectors, axis=(1, 2))

In [7]:
# Create an empty Gensim vector store whose dimensionality matches the second
# axis of the embedding matrix.
embedding_dim = vectors.shape[1]
model = KeyedVectors(vector_size=embedding_dim)

In [5]:
# Display the vocabulary list loaded from the dictionary file.
# NOTE(review): this renders the entire list as cell output; consider showing
# only a slice (e.g. the first few entries) or its length instead.
words

[',',
 '.',
 've',
 '-',
 ':',
 'bir',
 '(',
 ')',
 'ki',
 'için',
 'bu',
 '2',
 '"',
 'ile',
 '4',
 '?',
 '...',
 'de',
 'da',
 'Bu',
 'çok',
 'olarak',
 'olan',
 '1',
 '!',
 'en',
 '|',
 'daha',
 'dir',
 'li',
 ';',
 'lı',
 'gibi',
 'var',
 '3',
 '5',
 'sonra',
 'kadar',
 'ne',
 'veya',
 'dır',
 'önce',
 'lar',
 'her',
 'yorum',
 'iyi',
 'yer',
 'Bir',
 'bölgesinde',
 'arasında',
 'Tüm',
 'izle',
 'En',
 '–',
 'ama',
 'ise',
 'tarafından',
 '0',
 'yok',
 'o',
 'yeni',
 'Hotel',
 'ya',
 'siz',
 'göre',
 'Göster',
 '..',
 'ilgili',
 '2013',
 'km',
 'zaman',
 'büyük',
 'ilk',
 'Otel',
 '10',
 'yıl',
 '/',
 '7',
 'nasıl',
 '2015',
 'Yorum',
 'güzel',
 '6',
 'içinde',
 'gün',
 'hakkında',
 'ben',
 'son',
 '8',
 'olduğu',
 'Son',
 'TL',
 'iki',
 'Fiyatları',
 'diğer',
 'yakınlarında',
 '2012',
 'TripAdvisor',
 'fazla',
 'iş',
 'Yeni',
 '&',
 'tüm',
 'yakınında',
 'hem',
 'İstanbul',
 'Restoranları',
 'oteller',
 'şekilde',
 'Restoran',
 'önem',
 '15',
 'mı',
 'mi',
 'Türk',
 '20',
 '9',
 '

In [8]:
# Register every vocabulary word together with its vector, then export the
# table in binary word2vec format.
model.add_vectors(keys=words, weights=vectors)
model.save_word2vec_format(fname="elmo-Turkish-CoNLL17", binary=True)