In [1]:
import os
import gzip
import spacy
import gensim
from gensim.models import Word2Vec
import numpy as np

In [2]:
# Load the small German and English spaCy pipelines used by the tokenizers.
spacy_de, spacy_en = (
    spacy.load(model_name)
    for model_name in ('de_core_news_sm', 'en_core_web_sm')
)

# Tokenization functions
def tokenize_de(text):
    """Split German text into token strings and return them in reverse order.

    Only the tokenizer stage (``spacy_de.tokenizer``) is invoked instead of the
    full pipeline: this function reads nothing but ``tok.text``, so running the
    remaining pipeline components on every sentence is pure overhead.

    Args:
        text: The German string to tokenize.

    Returns:
        list[str]: The token texts, last token first.
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """Split English text into token strings, keeping the original order.

    Only the tokenizer stage (``spacy_en.tokenizer``) is invoked instead of the
    full pipeline: this function reads nothing but ``tok.text``, so running the
    remaining pipeline components on every sentence is pure overhead.

    Args:
        text: The English string to tokenize.

    Returns:
        list[str]: The token texts in sentence order.
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

# Reading the data
def read_gzip(file_path):
    """Read a gzip-compressed UTF-8 text file into a list of stripped lines.

    Args:
        file_path: Path to the ``.gz`` file.

    Returns:
        list[str]: One entry per line of the file, with leading and trailing
        whitespace removed. Blank lines are kept as empty strings.
    """
    stripped_lines = []
    with gzip.open(file_path, mode='rt', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

In [3]:
data_path = 'multi30k-dataset/data/task1/raw/'

# Load the data: German (.de) files are the source side, English (.en) files
# the target side, for the train / validation / test splits in that order.
train_src, train_trg, val_src, val_trg, test_src, test_trg = (
    read_gzip(os.path.join(data_path, file_name))
    for file_name in (
        'train.de.gz',
        'train.en.gz',
        'val.de.gz',
        'val.en.gz',
        'test_2016_flickr.de.gz',
        'test_2016_flickr.en.gz',
    )
)


In [4]:

# Preparing texts for training
def prepare_texts(texts, tokenizer):
    """Apply ``tokenizer`` to every text and collect the results in a list.

    Args:
        texts: Iterable of raw text items.
        tokenizer: Callable invoked once per item of ``texts``.

    Returns:
        list: The tokenizer's result for each text, in input order.
    """
    return list(map(tokenizer, texts))

# Training a Word2Vec model
def train_word2vec(texts, size=100, window=5, min_count=2, workers=4):
    """Train a gensim ``Word2Vec`` model on the given tokenized texts.

    Args:
        texts: Corpus handed to ``Word2Vec`` as its first argument
            (tokenized sentences).
        size: Dimensionality of the word vectors; forwarded to ``Word2Vec``
            as ``vector_size``.
        window: Forwarded to ``Word2Vec`` as ``window``.
        min_count: Forwarded to ``Word2Vec`` as ``min_count``.
        workers: Forwarded to ``Word2Vec`` as ``workers``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # gensim 4.x renamed the constructor keyword `size` to `vector_size`, and
    # passing `size=` raises TypeError there. The rest of this notebook already
    # uses `vector_size=`, so forward under the new name while keeping this
    # function's own `size` parameter for existing callers.
    return Word2Vec(
        texts,
        vector_size=size,
        window=window,
        min_count=min_count,
        workers=workers,
    )


In [6]:

# Tokenize the texts (train + validation + test splits of each language)
tokenized_de_texts = prepare_texts(train_src + val_src + test_src, tokenize_de)
tokenized_en_texts = prepare_texts(train_trg + val_trg + test_trg, tokenize_en)

# Train one Word2Vec model per language
de_model = Word2Vec(
    tokenized_de_texts,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)
en_model = Word2Vec(
    tokenized_en_texts,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)


In [15]:
# Fetch the vector for the word 'Mann' and its three nearest words
mann_vector = de_model.wv['Mann']
similar_words = de_model.wv.most_similar('Mann', topn=3)

# Show both results
print("Vector for 'Mann':", mann_vector)
print("Words similar to 'Mann':", similar_words)

Vector for 'Mann': [-9.29896593e-01  1.72107801e-01  1.16504049e+00 -5.26596189e-01
 -2.38793924e-01  4.96350825e-02 -3.50414038e-01 -6.99769437e-01
  6.78988993e-02  4.12205994e-01 -5.04133582e-01 -1.15436643e-01
  1.33464956e+00 -7.87142396e-01  8.76615226e-01 -7.01445460e-01
  9.45309475e-02 -1.42280072e-01 -3.23659301e-01 -1.25680089e+00
 -1.60028487e-01  5.66368520e-01  2.34313655e+00 -4.76924598e-01
  1.05712295e+00  7.44040132e-01 -1.20399952e+00  3.11883360e-01
 -4.56982136e-01  3.39530021e-01 -6.92506284e-02  5.79221010e-01
  1.15681219e+00 -4.68015075e-01 -8.88574302e-01  5.76740384e-01
 -1.38490951e+00 -3.63547713e-01 -5.66139281e-01 -4.93443608e-01
 -1.07930243e+00  6.54582586e-03 -2.26260990e-01  4.63841438e-01
  1.03379607e+00 -6.42075121e-01  3.95294785e-01  9.71473992e-01
 -8.67427588e-01  2.66395092e-01  7.24475265e-01 -2.81370163e-01
  1.23031035e-01 -8.98201704e-01 -4.29061413e-01 -1.05017163e-01
  6.67502880e-01  7.90818632e-01 -7.86053419e-01 -8.05626810e-01
  8.91