<a href="https://colab.research.google.com/github/Re14m/training/blob/master/2022-0321_recipie397.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [ゼロからword2vecを学習して可視化するレシピ](https://axross-recipe.com/recipes/397)

In [None]:
# Install the required package (notebook shell command)
!pip install tensorflow

In [None]:
# Corpus: a handful of short "X is a/an Y" sentences used as training text
corpus = [
    "dog is a canine",
    "canine is a mammal",
    "mammal is an animal",
    "animal is a living thing",
    "living thing is a thing",
    "bulldog is a dog",
    "kitty is a cat",
    "cat is a mammal",
]

In [None]:
# Text preprocessing: strip stop words from every sentence
def remove_stop_words(corpus, stop_words=None):
    """Return a copy of *corpus* with stop words removed from each sentence.

    Args:
        corpus: Iterable of sentences whose words are separated by single
            spaces.
        stop_words: Optional collection of words to drop. Defaults to
            ``("is", "a", "an", "will", "be")``.

    Returns:
        A list with one string per input sentence, holding the remaining
        words in their original order, joined by single spaces.
    """
    if stop_words is None:
        stop_words = ("is", "a", "an", "will", "be")
    stop_words = frozenset(stop_words)
    # Filter every token instead of calling list.remove(), which deletes only
    # the first occurrence and would leave repeated stop words in the text.
    return [
        " ".join(word for word in text.split(" ") if word not in stop_words)
        for text in corpus
    ]

# Overwrite the corpus with its stop-word-free version.
corpus = remove_stop_words(corpus)
# Bare expression: in a notebook this shows the cleaned corpus as cell output.
corpus

In [None]:
# Split the cleaned corpus into words and keep each distinct word once
words = {token for line in corpus for token in line.split(" ")}
words

In [None]:
# Collect (word, neighbouring word) pairs for Skip-Gram training
import pandas as pd

# Give every vocabulary word an integer id, in the iteration order of `words`.
word_to_id = {vocab_word: vocab_id for vocab_id, vocab_word in enumerate(words)}

# Tokenise each sentence on whitespace.
sentences = [line.split() for line in corpus]

window_size = 2

data = []
for tokens in sentences:
    for position, center in enumerate(tokens):
        # Window covers up to `window_size` tokens on each side of `position`.
        start = max(position - window_size, 0)
        stop = min(position + window_size, len(tokens)) + 1
        # Tokens equal to the centre word are skipped.
        data.extend(
            [center, context] for context in tokens[start:stop] if context != center
        )

df = pd.DataFrame(data, columns=["input", "label"])
print(df)

In [None]:
# Build the dataset
# Import the TensorFlow 1.x compatibility API and turn off TF2 behaviour.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import numpy as np

# Length of every one-hot vector: one slot per entry in `words`.
ONE_HOT_DIM = len(words)

def to_one_hot_encoding(data_point_index, dim=None):
    """Return a one-hot vector with a 1 at *data_point_index*.

    Args:
        data_point_index: Position in the vector to set to 1.
        dim: Length of the vector. Defaults to the module-level
            ``ONE_HOT_DIM``, so existing one-argument calls behave as before.

    Returns:
        A 1-D ``numpy.ndarray`` of length *dim* that is all zeros except for
        a single 1 at *data_point_index*.
    """
    if dim is None:
        # Resolved at call time so the current vocabulary size is used.
        dim = ONE_HOT_DIM
    one_hot_encoding = np.zeros(dim)
    one_hot_encoding[data_point_index] = 1
    return one_hot_encoding

# One-hot encode both sides of every (input, label) pair.
X = [to_one_hot_encoding(word_to_id[token]) for token in df["input"]]
Y = [to_one_hot_encoding(word_to_id[token]) for token in df["label"]]

# Stack the per-pair vectors into 2-D training arrays.
X_train = np.asarray(X)
Y_train = np.asarray(Y)

print(word_to_id, X_train, Y_train, sep="\n")

In [None]:
# Build and train the model
# Placeholders for batches of one-hot input words and one-hot context words.
x = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))
y_label = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))
EMBEDDING_DIM = 2

# Hidden layer: projects the one-hot input down to EMBEDDING_DIM values.
# b1 has shape [1], so the same bias is broadcast to every component.
w1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([1]))
hidden_layer = tf.add(tf.matmul(x, w1), b1)

# Output layer: maps the hidden values back to one score per vocabulary word.
w2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM]))
b2 = tf.Variable(tf.random_normal([1]))
logits = tf.add(tf.matmul(hidden_layer, w2), b2)
prediction = tf.nn.softmax(logits)

# Mean cross-entropy over the batch. It is computed from the logits with the
# fused op rather than as -sum(y * log(softmax)), because the explicit form
# yields NaN (0 * -inf) as soon as a predicted probability underflows to 0.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_label, logits=logits)
)

train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# Full-batch gradient descent: every step feeds the whole training set.
iteration = 20000
for i in range(iteration):
    sess.run(train_op, feed_dict={x: X_train, y_label: Y_train})
    if i % 3000 == 0:
        print(
            "iteration " + str(i) + " loss is : ",
            sess.run(loss, feed_dict={x: X_train, y_label: Y_train}),
        )

# The embedding of word id k is row k of the hidden-layer weights plus bias.
vectors = sess.run(w1 + b1)
w2v_df = pd.DataFrame(vectors, columns=["x1", "x2"])
# Label the rows from word_to_id itself (ids ascending) so each row is paired
# with the word whose one-hot index selected it, independent of set ordering.
w2v_df["word"] = sorted(word_to_id, key=word_to_id.get)
w2v_df = w2v_df[["word", "x1", "x2"]]
w2v_df

In [None]:
# Visualization (word2vec)
import matplotlib.pyplot as plt

# Size the figure when it is created. Assigning
# plt.rcParams["figure.figsize"] after plt.subplots() only affects figures
# created later, so it never resized this plot.
fig, ax = plt.subplots(figsize=(10, 10))

# Write each word at the coordinates of its 2-D embedding.
for word, x1, x2 in zip(w2v_df["word"], w2v_df["x1"], w2v_df["x2"]):
    ax.annotate(word, (x1, x2))

PADDING = 1.0

# Axis limits: per-column min/max of the embeddings, widened by PADDING.
column_min = np.amin(vectors, axis=0)
column_max = np.amax(vectors, axis=0)
x_axis_min = column_min[0] - PADDING
y_axis_min = column_min[1] - PADDING
x_axis_max = column_max[0] + PADDING
y_axis_max = column_max[1] + PADDING

ax.set_xlim(x_axis_min, x_axis_max)
ax.set_ylim(y_axis_min, y_axis_max)

plt.show()