In [2]:
import io
import pickle
import string

import numpy as np
import tensorflow as tf
import tqdm
from nltk.corpus import stopwords
from tensorflow.keras import layers

def preprocess_text(sentences):
    """Lower-case tokens, strip punctuation and drop English stop words.

    Args:
        sentences: iterable of sentences, each an iterable of string tokens.

    Returns:
        A list with one token list per input sentence. Tokens that become
        empty after punctuation removal, or that are English stop words, are
        dropped; a sentence that loses all its tokens is kept as an empty list.
    """
    # Removing punctuation is a deliberate modelling decision.
    punctuation_table = str.maketrans("", "", string.punctuation)
    english_stop_words = set(stopwords.words("english"))

    cleaned_sentences = []
    for sentence in sentences:
        normalized = (token.lower().translate(punctuation_table) for token in sentence)
        cleaned_sentences.append(
            [token for token in normalized if token and token not in english_stop_words]
        )
    return cleaned_sentences


def save_tokenized_sentences(tokenized_sentences, file_path):
    """Pickle ``tokenized_sentences`` into ``file_path``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file_path, mode='wb') as sink:
        pickle.dump(tokenized_sentences, sink)


def load_tokenized_sentences(file_path):
    """Return the object previously pickled into ``file_path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    unpickling, so this must only be pointed at files from a trusted source.
    """
    with open(file_path, mode='rb') as source:
        return pickle.load(source)


def preprocess_text_unpacked(sentences):
    """Clean tokenized sentences and discard those left with fewer than two tokens.

    Each token is lower-cased and stripped of punctuation; empty tokens and
    tokens found in the combined Polish + English stop-word set are removed.

    Args:
        sentences: iterable of sentences, each an iterable of string tokens.

    Returns:
        A list of token lists; only sentences that still contain more than
        one token after cleaning are included.
    """
    # Removing punctuation is a deliberate modelling decision.
    punctuation_table = str.maketrans("", "", string.punctuation)
    stop_words = set(stopwords.words("polish.txt")) | set(stopwords.words('english'))

    kept_sentences = []
    for sentence in sentences:
        normalized = (token.lower().translate(punctuation_table) for token in sentence)
        tokens = [token for token in normalized if token and token not in stop_words]
        if len(tokens) > 1:
            kept_sentences.append(tokens)

    return kept_sentences


def mapping(sentences):
    """Build two-way lookup tables between tokens and integer ids.

    Id 0 is reserved for the padding entry 'null'; every other token receives
    the next free id in order of first appearance.

    NOTE(review): a corpus token that is literally 'null' is treated as
    already known and therefore shares id 0 with the padding entry.

    Args:
        sentences: iterable of token sequences.

    Returns:
        ``(word_to_id, id_to_word)`` — two dicts that are inverses of each other.
    """
    padding_token, padding_id = 'null', 0
    word_to_id = {padding_token: padding_id}
    id_to_word = {padding_id: padding_token}

    for sentence in sentences:
        for token in sentence:
            if token in word_to_id:
                continue
            # Ids are dense, so the current table size is the next free id.
            next_id = len(word_to_id)
            word_to_id[token] = next_id
            id_to_word[next_id] = token

    return word_to_id, id_to_word


def encode_data(sentences, word_to_idx):
    """Convert token sentences into equal-length lists of integer ids.

    Every sentence is right-padded with 0 up to the length of the longest
    sentence in ``sentences``.

    Args:
        sentences: iterable of token sequences.
        word_to_idx: dict mapping each token to its integer id.

    Returns:
        A list of id lists, all of the same length. An empty input yields
        an empty list.

    Raises:
        KeyError: if a token is missing from ``word_to_idx``.
    """
    # Materialize once: the data is traversed twice (length scan + encoding),
    # which would silently lose everything for a one-shot iterator.
    sentences = list(sentences)

    # default=0 keeps an empty corpus from raising ValueError in max().
    max_len = max((len(sentence) for sentence in sentences), default=0)

    encoded_sentences = []
    for sentence in sentences:
        encoded_sentence = [word_to_idx[word] for word in sentence]
        encoded_sentence.extend([0] * (max_len - len(encoded_sentence)))
        encoded_sentences.append(encoded_sentence)

    return encoded_sentences

def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    """Build skip-gram training examples with negative sampling.

    For every positive (target, context) pair produced by
    ``tf.keras.preprocessing.sequence.skipgrams`` one example is emitted:
    the target id, a context tensor holding the true context id followed by
    ``num_ns`` sampled negative ids, and a label tensor that is 1 for the
    first position and 0 for the rest.

    Args:
        sequences: iterable of integer-encoded sentences.
        window_size: skip-gram window size passed to ``skipgrams``.
        num_ns: number of negative samples drawn per positive pair.
        vocab_size: vocabulary size; used for the sampling table and as the
            upper bound of the negative sampler.
        seed: seed passed to the negative sampler.

    Returns:
        ``(targets, contexts, labels)`` — three parallel lists.
    """
    targets, contexts, labels = [], [], []
    sequence_utils = tf.keras.preprocessing.sequence

    sampling_table = sequence_utils.make_sampling_table(vocab_size)

    for sequence in tqdm.tqdm(sequences):
        # negative_samples=0: only positive pairs here, negatives are drawn below.
        positive_pairs, _ = sequence_utils.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0,
        )

        for target_word, context_word in positive_pairs:
            # Shape (1, 1): the sampler expects the true classes as a 2-D tensor.
            true_context = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)

            negatives, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=true_context,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling",
            )

            # True context first, sampled negatives after it.
            context = tf.concat([tf.squeeze(true_context, 1), negatives], 0)
            label = tf.constant([1] + [0] * num_ns, dtype="int64")

            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels


def cosine_similarity(first_vector, second_vector):
    """Return the cosine of the angle between two vectors.

    Computed as the dot product divided by the product of the two Euclidean
    norms. There is no guard for zero-length vectors.
    """
    dot_product = np.dot(first_vector, second_vector)
    norm_product = np.linalg.norm(first_vector) * np.linalg.norm(second_vector)
    return dot_product / norm_product

def predict_similar_words(model, text, words, ids, n_words_to_predict):
    """Return the words whose embeddings are most cosine-similar to ``text``.

    Args:
        model: object exposing ``get_layer('w2v_embedding').get_weights()``,
            whose first weight array is the embedding matrix (one row per id).
        text: query word; must be a key of ``words``.
        words: dict mapping word -> id.
        ids: dict mapping id -> word.
        n_words_to_predict: maximum number of neighbours to return.

    Returns:
        A list of ``(word, similarity)`` tuples, most similar first. The
        query word itself and the padding entry (id 0) are never returned.
        If fewer candidates exist than requested, all of them are returned.

    Raises:
        KeyError: if ``text`` is not in ``words``.
    """
    query_idx = words[text]
    embeddings = model.get_layer('w2v_embedding').get_weights()[0]
    query_vector = embeddings[query_idx]

    # One matrix-vector product instead of a Python-level loop over the vocabulary.
    norm_products = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_vector)
    similarities = embeddings @ query_vector / norm_products

    # Id 0 is the padding row: it is not a real word, so it must not be
    # offered as a neighbour. The query word is excluded as well.
    excluded = {0, query_idx}
    candidates = [idx for idx in range(len(embeddings)) if idx not in excluded]
    candidates.sort(key=lambda idx: similarities[idx], reverse=True)

    # Clamp so that asking for more neighbours than exist does not raise.
    n_results = max(0, min(n_words_to_predict, len(candidates)))
    return [(ids[idx], similarities[idx]) for idx in candidates[:n_results]]

class MyWord2Vec(tf.keras.Model):
    """Skip-gram word2vec model with separate target and context embeddings.

    The model scores a target word against a set of candidate context words
    by taking the dot product of their embeddings; the outputs are logits.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the two embedding tables.

        Args:
            vocab_size: number of rows in each embedding table.
            embedding_dim: size of each embedding vector.
        """
        super().__init__()
        # `input_length` is intentionally not passed: it is only a shape hint
        # and does not affect the weights. The original code filled it from
        # the notebook-level global `num_ns`, which made the class fail with
        # NameError unless that global happened to be defined first.
        self.target_embedding = layers.Embedding(vocab_size,
                                                 embedding_dim,
                                                 name="w2v_embedding")

        self.context_embedding = layers.Embedding(vocab_size,
                                                  embedding_dim)

    def call(self, pair):
        """Return the dot-product logits for a ``(target, context)`` batch.

        Args:
            pair: tuple ``(target, context)``. ``target`` holds one id per
                example (a trailing axis of size 1 is squeezed away);
                ``context`` holds the candidate context ids per example.

        Returns:
            Tensor with one logit per (example, candidate context) pair.
        """
        target, context = pair
        # Accept targets shaped (batch, 1) as well as (batch,).
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        word_emb = self.target_embedding(target)
        context_emb = self.context_embedding(context)
        # b: batch, c: candidate context, e: embedding dimension.
        output = tf.einsum('be,bce->bc', word_emb, context_emb)
        return output


In [2]:
# --- Polish corpus: run configuration ---
NAME = 'polish_model'  # directory used for the saved dataset, the saved model and the TSV exports
SEED = 2137            # seed handed to the negative sampler
window_size = 3        # skip-gram window
num_ns = 10            # negative samples per positive pair
BATCH_SIZE = 1024
BUFFER_SIZE = 10000    # shuffle buffer size

# --- Polish corpus: load, clean and encode ---
file_path = './polish_corpus/polish_tokenized_sentences.pkl'
polish_tokenized_sentences = load_tokenized_sentences(file_path)
preprocessed = preprocess_text_unpacked(polish_tokenized_sentences)

words, ids = mapping(preprocessed)
encoded_data = encode_data(preprocessed, words)
vocab_size = len(words)

In [None]:
# Build the skip-gram examples for the Polish corpus and turn the three
# parallel lists into NumPy arrays.
targets, contexts, labels = generate_training_data(
    sequences=encoded_data,
    window_size=window_size,
    num_ns=num_ns,
    vocab_size=vocab_size,
    seed=SEED,
)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

# Blank lines first so the shapes are separated from the progress bar.
print('\n')
print(
    f"targets.shape: {targets.shape}",
    f"contexts.shape: {contexts.shape}",
    f"labels.shape: {labels.shape}",
    sep="\n",
)

In [17]:
# The commented-out block below builds the shuffled, batched dataset from
# `targets`, `contexts` and `labels` and saves it under NAME. It is kept for
# reference; this cell currently reloads the dataset saved by an earlier run.
# dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# tf.data.Dataset.save(
#     dataset, NAME, compression=None, shard_func=None, checkpoint_args=None
# )

# Requires that a dataset was previously saved in the NAME directory.
dataset = tf.data.Dataset.load(NAME)

In [19]:
# The commented-out block below creates and compiles a fresh model. It is
# kept for reference; this cell currently reloads the model saved under NAME.
# embedding_dim = 200
# word2vec = MyWord2Vec(vocab_size, embedding_dim)
# word2vec.compile(optimizer='adam',
#                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
#                  metrics=['accuracy'])

# Requires that a model was previously saved in the NAME directory.
word2vec = tf.keras.models.load_model(NAME)

In [11]:
# Train for 10 epochs, then save the model into the NAME directory
# (the same directory the dataset is loaded from).
word2vec.fit(dataset, epochs=10)
word2vec.save(NAME)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: polish_model\assets


INFO:tensorflow:Assets written to: polish_model\assets


In [12]:
# Export the target embeddings as two aligned TSV files inside the NAME
# directory: one vector per line in vectors.tsv and the matching word per
# line in metadata.tsv.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]

# `with` guarantees both files are closed even if a write fails part-way.
with io.open(NAME + '/vectors.tsv', 'w', encoding='utf-8') as vectors, \
        io.open(NAME + '/metadata.tsv', 'w', encoding='utf-8') as metadata:
    # Use the id stored in the vocabulary to pick the embedding row, rather
    # than relying on the enumeration position matching the id.
    for word, index in words.items():
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        vectors.write('\t'.join([str(x) for x in vec]) + "\n")
        metadata.write(word + "\n")

In [20]:
# Five nearest neighbours of 'propagować' in the Polish embedding space.
predict_similar_words(
    model=word2vec,
    text='propagować',
    words=words,
    ids=ids,
    n_words_to_predict=5,
)

[('tranzyty', 0.5482787),
 ('uchylającym', 0.5306668),
 ('kananejczyków', 0.52538216),
 ('powiadomiona', 0.522284),
 ('szwedo', 0.50701684)]

-----------------------------------------------------

In [29]:
from nltk.corpus import brown

# Tokenized sentences of the Brown corpus (each sentence is a sequence of
# word strings), used as input to the preprocessing step.
sentences = brown.sents()

In [30]:
# --- Brown corpus: run configuration ---
NAME = 'brown_model'  # directory used for the saved dataset, the saved model and the TSV exports
SEED = 2137           # seed handed to the negative sampler
window_size = 2       # skip-gram window
num_ns = 4            # negative samples per positive pair
BATCH_SIZE = 1024
BUFFER_SIZE = 10000   # shuffle buffer size

# --- Brown corpus: clean and encode ---
preprocessed_brown = preprocess_text_unpacked(sentences)

words_brown, ids_brown = mapping(preprocessed_brown)
encoded_data_brown = encode_data(preprocessed_brown, words_brown)
vocab_size = len(words_brown)

In [4]:
# Build the skip-gram examples for the Brown corpus and turn the three
# parallel lists into NumPy arrays.
targets, contexts, labels = generate_training_data(
    sequences=encoded_data_brown,
    window_size=window_size,
    num_ns=num_ns,
    vocab_size=vocab_size,
    seed=SEED,
)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

# Blank lines first so the shapes are separated from the progress bar.
print('\n')
print(
    f"targets.shape: {targets.shape}",
    f"contexts.shape: {contexts.shape}",
    f"labels.shape: {labels.shape}",
    sep="\n",
)

100%|██████████| 57340/57340 [06:22<00:00, 149.74it/s]




targets.shape: (1265070,)
contexts.shape: (1265070, 5)
labels.shape: (1265070, 5)


In [31]:
# The commented-out block below builds the shuffled, batched dataset from
# `targets`, `contexts` and `labels` and saves it under NAME. It is kept for
# reference; this cell currently reloads the dataset saved by an earlier run.
# dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# tf.data.Dataset.save(
#     dataset, NAME, compression=None, shard_func=None, checkpoint_args=None
# )

# Requires that a dataset was previously saved in the NAME directory.
dataset = tf.data.Dataset.load(NAME)

In [32]:
# The commented-out block below creates and compiles a fresh model. It is
# kept for reference; this cell currently reloads the model saved under NAME.
# embedding_dim = 200
# word2vec = MyWord2Vec(vocab_size, embedding_dim)
# word2vec.compile(optimizer='adam',
#                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
#                  metrics=['accuracy'])

# Requires that a model was previously saved in the NAME directory.
word2vec = tf.keras.models.load_model(NAME)

In [9]:
# Train for 10 epochs, then save the model into the NAME directory
# (the same directory the dataset is loaded from).
word2vec.fit(dataset, epochs=10)
word2vec.save(NAME)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: brown_model\assets


INFO:tensorflow:Assets written to: brown_model\assets


In [10]:
# Export the target embeddings as two aligned TSV files inside the NAME
# directory: one vector per line in vectors.tsv and the matching word per
# line in metadata.tsv.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]

# `with` guarantees both files are closed even if a write fails part-way.
with io.open(NAME + '/vectors.tsv', 'w', encoding='utf-8') as vectors, \
        io.open(NAME + '/metadata.tsv', 'w', encoding='utf-8') as metadata:
    # Use the id stored in the vocabulary to pick the embedding row, rather
    # than relying on the enumeration position matching the id.
    for word, index in words_brown.items():
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        vectors.write('\t'.join([str(x) for x in vec]) + "\n")
        metadata.write(word + "\n")

In [33]:
# Five nearest neighbours of 'national' in the Brown embedding space.
predict_similar_words(
    model=word2vec,
    text='national',
    words=words_brown,
    ids=ids_brown,
    n_words_to_predict=5,
)

[('esse', 0.49134412),
 ('kingpin', 0.4220292),
 ('postreapportionment', 0.41746026),
 ('hushed', 0.4081241),
 ('satires', 0.3924919)]

-------------------------------------

In [34]:
from nltk.corpus import reuters
# One-time setup, left commented out: download the Reuters corpus data.
# import nltk
# nltk.download('reuters')
# Tokenized sentences of the Reuters corpus; rebinds `sentences`, which
# previously held the Brown corpus sentences.
sentences = reuters.sents()

In [35]:
# --- Reuters corpus: run configuration ---
NAME = 'reuters_model'  # directory used for the saved dataset, the saved model and the TSV exports
SEED = 2137             # seed handed to the negative sampler
window_size = 2         # skip-gram window
# NOTE(review): the recorded output of the training-data cell below shows
# contexts with 11 columns, which corresponds to num_ns = 10 rather than 4.
# Presumably that cell was last run with a different setting — confirm
# before retraining or regenerating the dataset.
num_ns = 4              # negative samples per positive pair
BATCH_SIZE = 1024
BUFFER_SIZE = 10000     # shuffle buffer size

# --- Reuters corpus: clean and encode ---
preprocessed_reuters = preprocess_text_unpacked(sentences)

words_reuters, ids_reuters = mapping(preprocessed_reuters)
encoded_data_reuters = encode_data(preprocessed_reuters, words_reuters)
vocab_size = len(words_reuters)

In [24]:
# Build the skip-gram examples for the Reuters corpus and turn the three
# parallel lists into NumPy arrays.
targets, contexts, labels = generate_training_data(
    sequences=encoded_data_reuters,
    window_size=window_size,
    num_ns=num_ns,
    vocab_size=vocab_size,
    seed=SEED,
)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

# Blank lines first so the shapes are separated from the progress bar.
print('\n')
print(
    f"targets.shape: {targets.shape}",
    f"contexts.shape: {contexts.shape}",
    f"labels.shape: {labels.shape}",
    sep="\n",
)

100%|██████████| 54716/54716 [07:49<00:00, 116.64it/s]




targets.shape: (1481282,)
contexts.shape: (1481282, 11)
labels.shape: (1481282, 11)


In [36]:
# The commented-out block below builds the shuffled, batched dataset from
# `targets`, `contexts` and `labels` and saves it under NAME. It is kept for
# reference; this cell currently reloads the dataset saved by an earlier run.
# dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# tf.data.Dataset.save(
#     dataset, NAME, compression=None, shard_func=None, checkpoint_args=None
# )

# Requires that a dataset was previously saved in the NAME directory.
dataset = tf.data.Dataset.load(NAME)

In [37]:
# The commented-out block below creates and compiles a fresh model. It is
# kept for reference; this cell currently reloads the model saved under NAME.
# embedding_dim = 200
# word2vec = MyWord2Vec(vocab_size, embedding_dim)
# word2vec.compile(optimizer='adam',
#                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
#                  metrics=['accuracy'])

# Requires that a model was previously saved in the NAME directory.
word2vec = tf.keras.models.load_model(NAME)

In [7]:
# Train for 10 epochs, then save the model into the NAME directory
# (the same directory the dataset is loaded from).
word2vec.fit(dataset, epochs=10)
word2vec.save(NAME)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: reuters_model\assets


INFO:tensorflow:Assets written to: reuters_model\assets


In [8]:
# Export the target embeddings as two aligned TSV files inside the NAME
# directory: one vector per line in vectors.tsv and the matching word per
# line in metadata.tsv.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]

# `with` guarantees both files are closed even if a write fails part-way.
with io.open(NAME + '/vectors.tsv', 'w', encoding='utf-8') as vectors, \
        io.open(NAME + '/metadata.tsv', 'w', encoding='utf-8') as metadata:
    # Use the id stored in the vocabulary to pick the embedding row, rather
    # than relying on the enumeration position matching the id.
    for word, index in words_reuters.items():
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        vectors.write('\t'.join([str(x) for x in vec]) + "\n")
        metadata.write(word + "\n")

In [39]:
# Peek at the word stored under id 100 to pick a query word for the next cell.
ids_reuters[100]

'major'

In [41]:
# Five nearest neighbours of 'major' in the Reuters embedding space.
predict_similar_words(
    model=word2vec,
    text='major',
    words=words_reuters,
    ids=ids_reuters,
    n_words_to_predict=5,
)

[('pinola', 0.33815446),
 ('deleted', 0.33553192),
 ('recalled', 0.32847714),
 ('triad', 0.32814038),
 ('dont', 0.32432526)]

----------------------------

In [12]:
import gensim.downloader as api

# Load the "text8" corpus through gensim's downloader.
# NOTE(review): `dataset` holds the raw corpus here, but the same name is
# rebound to a tf.data.Dataset in a later cell — re-running cells out of
# order will pick up the wrong object.
dataset = api.load("text8")

In [13]:
# --- text8 corpus: run configuration ---
NAME = 'text8_model'  # directory used for the saved dataset and the saved model
SEED = 2137           # seed handed to the negative sampler
window_size = 2       # skip-gram window
num_ns = 10           # negative samples per positive pair
BATCH_SIZE = 1000
BUFFER_SIZE = 10000   # shuffle buffer size

# --- text8 corpus: clean and encode ---
# Only the first 100 items of the loaded corpus are used.
preprocessed_text8 = preprocess_text_unpacked(list(dataset)[:100])

words_text8, ids_text8 = mapping(preprocessed_text8)
encoded_data_text8 = encode_data(preprocessed_text8, words_text8)
vocab_size = len(words_text8)

In [14]:
# Vocabulary size for the text8 subset (includes the padding entry).
len(words_text8)

52559

In [15]:
# Build the skip-gram examples for the text8 subset and turn the three
# parallel lists into NumPy arrays.
targets, contexts, labels = generate_training_data(
    sequences=encoded_data_text8,
    window_size=window_size,
    num_ns=num_ns,
    vocab_size=vocab_size,
    seed=SEED,
)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

# Blank lines first so the shapes are separated from the progress bar.
print('\n')
print(
    f"targets.shape: {targets.shape}",
    f"contexts.shape: {contexts.shape}",
    f"labels.shape: {labels.shape}",
    sep="\n",
)

100%|██████████| 100/100 [06:40<00:00,  4.00s/it]




targets.shape: (1305986,)
contexts.shape: (1305986, 11)
labels.shape: (1305986, 11)


In [16]:
# Build the shuffled, batched training dataset and persist it under NAME so
# later runs can reload it instead of regenerating the examples.
# Incomplete final batches are dropped.
dataset = (
    tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
dataset.save(NAME, compression=None, shard_func=None, checkpoint_args=None)

# dataset = tf.data.Dataset.load(NAME)

In [17]:
# Fresh model for the text8 subset. The model outputs logits and the labels
# are one-hot over the candidate contexts, hence categorical cross-entropy
# with from_logits=True.
embedding_dim = 200
word2vec = MyWord2Vec(vocab_size=vocab_size, embedding_dim=embedding_dim)
word2vec.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# word2vec = tf.keras.models.load_model(NAME)

In [18]:
# Train for 10 epochs, then save the model into the NAME directory
# (the same directory the dataset was saved to).
word2vec.fit(dataset, epochs=10)
word2vec.save(NAME)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: text8_model\assets


INFO:tensorflow:Assets written to: text8_model\assets


In [20]:
# Peek at the word stored under id 150 to pick a query word for the next cell.
ids_text8[150]

'intervention'

In [21]:
# Five nearest neighbours of 'intervention' in the text8 embedding space.
predict_similar_words(
    model=word2vec,
    text='intervention',
    words=words_text8,
    ids=ids_text8,
    n_words_to_predict=5,
)

[('approaches', 0.40962726),
 ('tailor', 0.4086811),
 ('monarchies', 0.40019533),
 ('kraftwerk', 0.3947389),
 ('complementary', 0.3904963)]