In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Load the 20 Newsgroups posts with headers, footers and quoted replies
# removed; the shuffle is seeded so the document order is repeatable.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data
print('총 샘플 수 :', len(documents))

총 샘플 수 : 11314


In [3]:
news_df = pd.DataFrame({'document': documents})
# Build the cleaned text column in one chain:
#   1. replace every non-letter character with a space,
#   2. keep only words longer than three characters,
#   3. lowercase everything.
news_df['clean_doc'] = (
    news_df['document']
    .str.replace("[^a-zA-Z]", " ", regex=True)
    .apply(lambda text: ' '.join([word for word in text.split() if len(word) > 3]))
    .str.lower()
)

In [4]:
# True if any cell of the frame is missing (NaN/None).
news_df.isna().to_numpy().any()

False

In [5]:
# Cleaning can leave a document as an empty string; mark those cells as NaN so
# they are detected as missing, then re-check for missing values.
news_df = news_df.replace("", np.nan)
news_df.isnull().values.any()

True

In [6]:
# Discard every row that contains a NaN in any column.
news_df = news_df.dropna()
print('총 샘플 수 :', len(news_df))

총 샘플 수 : 10995


In [7]:
# NLTK's English stop-word list, kept as a list under its original name.
stop_words = stopwords.words('english')
# Set copy for O(1) membership tests: checking against the list rescans all
# stop words for every token of every document.
stop_word_set = set(stop_words)
# Split each cleaned document on whitespace and drop stop words in one pass.
tokenized_doc = news_df['clean_doc'].apply(
    lambda text: [token for token in text.split() if token not in stop_word_set]
)
tokenized_doc = tokenized_doc.to_list()

In [8]:
# Positions of documents left with at most one token after stop-word removal.
drop_train = [index for index, sentence in enumerate(tokenized_doc) if len(sentence) <= 1]
# Filter in plain Python instead of np.delete: np.delete first converts the
# ragged list of token lists into an array, which emits a deprecation warning
# on older NumPy and raises ValueError on NumPy >= 1.24.
drop_positions = set(drop_train)
tokenized_doc = [
    sentence
    for index, sentence in enumerate(tokenized_doc)
    if index not in drop_positions
]
print('총 샘플 수 :',len(tokenized_doc))

총 샘플 수 : 10940


  return array(a, dtype, copy=False, order=order)


In [9]:
# Fit a Keras tokenizer on the token lists to build the word -> integer index map.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)

# Integer-encode every document with the fitted vocabulary.
encoded = tokenizer.texts_to_sequences(tokenized_doc)

# Forward and reverse vocabulary lookups.
word2idx = tokenizer.word_index
idx2word = {index: word for word, index in word2idx.items()}

In [10]:
# Peek at the integer-encoded form of the first two documents.
preview = encoded[:2]
print(preview)

[[9, 59, 603, 207, 3278, 1495, 474, 702, 9470, 13686, 5533, 15227, 702, 442, 702, 70, 1148, 1095, 1036, 20294, 984, 705, 4294, 702, 217, 207, 1979, 15228, 13686, 4865, 4520, 87, 1530, 6, 52, 149, 581, 661, 4406, 4988, 4866, 1920, 755, 10668, 1102, 7837, 442, 957, 10669, 634, 51, 228, 2669, 4989, 178, 66, 222, 4521, 6066, 68, 4295], [1026, 532, 2, 60, 98, 582, 107, 800, 23, 79, 4522, 333, 7838, 864, 421, 3825, 458, 6488, 458, 2700, 4730, 333, 23, 9, 4731, 7262, 186, 310, 146, 170, 642, 1260, 107, 33568, 13, 985, 33569, 33570, 9471, 11491]]


In [11]:
# One slot more than the number of known words, so the largest word index is
# still a valid row of the embedding tables (assumes indices start at 1, as the
# Keras Tokenizer documents).
vocab_size = 1 + len(word2idx)
print('단어 집합의 크기 :', vocab_size)

단어 집합의 크기 : 64277


In [12]:
from tensorflow.keras.preprocessing.sequence import skipgrams

# Negative sampling: build skip-gram training data for the first ten documents
# only, as a quick sample. Each result is a (pairs, labels) tuple.
skip_grams = [
    skipgrams(sequence, vocabulary_size=vocab_size, window_size=10)
    for sequence in encoded[:10]
]

In [13]:
# Show the first five word pairs generated for the first document, each with
# its label (per the Keras skipgrams docs, 1 = observed pair, 0 = negative sample).
pairs, labels = skip_grams[0]
for i in range(5):
    target, context = pairs[i]
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        idx2word[target], target, idx2word[context], context, labels[i]))

(look (66), alleg (42753)) -> 0
(media (702), incidences (20294)) -> 1
(atrocities (4406), commited (7837)) -> 1
(reputation (5533), minimized (14706)) -> 0
(treating (4521), lenaduzzi (46374)) -> 0


In [14]:
# Number of documents for which skip-gram data has been generated so far.
n_samples = len(skip_grams)
print('전체 샘플 수 :', n_samples)

전체 샘플 수 : 10


In [15]:
# The first document's pair list and label list should have the same length.
print(len(pairs), len(labels), sep='\n')

2220
2220


In [16]:
# Generate (pairs, labels) skip-gram training data for every encoded document.
skip_grams = [
    skipgrams(sequence, vocabulary_size=vocab_size, window_size=10)
    for sequence in encoded
]

In [17]:
# SGNS (skip-gram with negative sampling) implemented from scratch
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.keras.layers import Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG

In [18]:
# Dimensionality of each word vector (used as the output size of both Embedding layers).
embed_size = 100

In [19]:
# Embedding table for the center (target) word: one integer index per example.
w_inputs = Input(shape=(1,), dtype='int32')
word_embedding = Embedding(input_dim=vocab_size, output_dim=embed_size)(w_inputs)

# Separate embedding table for the context (surrounding) word.
c_inputs = Input(shape=(1,), dtype='int32')
context_embedding = Embedding(input_dim=vocab_size, output_dim=embed_size)(c_inputs)

In [20]:
# Dot product of the two (batch, 1, embed_size) embeddings along the embedding
# axis, giving one similarity score per example with shape (batch, 1, 1).
dot_product = Dot(axes=2)([word_embedding, context_embedding])
# Flatten the score to (batch, 1). No `input_shape` argument here: in the
# functional API the shape comes from the incoming tensor, and recent Keras
# versions warn when `input_shape` is passed to a layer.
dot_product = Reshape((1,))(dot_product)
# Squash the score into (0, 1) so it can be trained against 0/1 labels.
output = Activation('sigmoid')(dot_product)

In [21]:
# Two-input model: (center word index, context word index) -> sigmoid score.
model = Model(inputs=[w_inputs, c_inputs], outputs=output)
model.summary()
# Binary cross-entropy against the 0/1 skip-gram labels, optimised with Adam.
model.compile(optimizer='adam', loss='binary_crossentropy')
# Write the architecture diagram to model3.png; as the cell's last expression,
# the return value is also displayed by the notebook.
plot_model(
    model,
    to_file='model3.png',
    show_shapes=True,
    show_layer_names=True,
    rankdir='TB',
)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 100)       6427700     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 100)       6427700     input_2[0][0]                    
______________________________________________________________________________________________

In [22]:
# Train for five passes over the corpus, using each document's skip-gram pairs
# as one batch. The printed loss is the SUM of per-batch losses in the epoch,
# not a mean.
for epoch in range(1, 6):
    loss = 0
    for couples, targets in skip_grams:
        if not couples:
            # No pairs for this document; an empty batch cannot be split into
            # the two input columns, so skip it.
            continue
        # Shape (n_pairs, 2): column 0 feeds the first model input (w_inputs),
        # column 1 the second (c_inputs). Built once instead of unzipping twice.
        pair_array = np.array(couples, dtype='int32')
        # Named batch_labels so the notebook-level `labels` is not overwritten.
        batch_labels = np.array(targets, dtype='int32')
        loss += model.train_on_batch([pair_array[:, 0], pair_array[:, 1]], batch_labels)
    print('Epoch :',epoch, 'Loss :',loss)

Epoch : 1 Loss : 4628.423934619874
Epoch : 2 Loss : 3681.937418319285
Epoch : 3 Loss : 3527.399136085063
Epoch : 4 Loss : 3324.8852211963385
Epoch : 5 Loss : 3099.0093686953187


In [23]:
import gensim

In [24]:
# Export learned embeddings in word2vec text format: a "<count> <dim>" header
# followed by one "<word> <v1> ... <vN>" line per vocabulary word.
# weights[0] is presumably the center-word embedding table (the Embedding layer
# created first); verify if the model definition changes.
vectors = model.get_weights()[0]
word_index = tokenizer.word_index
# `with` guarantees the file is flushed and closed even if a write fails;
# explicit UTF-8 keeps the file independent of the platform default encoding.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    # The header count is taken from the dict being written so it always
    # matches the number of lines that follow.
    f.write('{} {}\n'.format(len(word_index), embed_size))
    for word, i in word_index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, vectors[i]))))

In [25]:
# Reload the exported plain-text vectors as gensim KeyedVectors for similarity queries.
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    './vectors.txt',
    binary=False,
)

In [26]:
# Ten nearest neighbours of "soldiers" in the learned embedding space.
w2v.most_similar(positive=['soldiers'], topn=10)

[('shelling', 0.7879363894462585),
 ('wounded', 0.7878394722938538),
 ('fighting', 0.7834117412567139),
 ('massacred', 0.7812206149101257),
 ('civilians', 0.7802790999412537),
 ('kurds', 0.7777568697929382),
 ('territories', 0.7744124531745911),
 ('villages', 0.7742233872413635),
 ('inhabitants', 0.773139238357544),
 ('journalists', 0.7730282545089722)]

In [27]:
# Ten nearest neighbours of "doctor" in the learned embedding space.
w2v.most_similar(positive=['doctor'], topn=10)

[('fetus', 0.6056748032569885),
 ('therapy', 0.58948814868927),
 ('infections', 0.5721393823623657),
 ('motivated', 0.5524737238883972),
 ('treatment', 0.5509247779846191),
 ('monkeys', 0.5498855113983154),
 ('iraqi', 0.5473611950874329),
 ('dying', 0.5435658097267151),
 ('quack', 0.5425921678543091),
 ('pain', 0.5422199368476868)]

In [28]:
# Ten nearest neighbours of "police" in the learned embedding space.
w2v.most_similar(positive=['police'], topn=10)

[('disarming', 0.5951955914497375),
 ('armed', 0.5904083251953125),
 ('liberties', 0.5899626612663269),
 ('outcome', 0.5889502167701721),
 ('officers', 0.5852018594741821),
 ('untenable', 0.5816648602485657),
 ('coercion', 0.5810487270355225),
 ('firearms', 0.578995943069458),
 ('rocks', 0.5771613717079163),
 ('violations', 0.5758952498435974)]

In [29]:
# Ten nearest neighbours of "knife" in the learned embedding space.
w2v.most_similar(positive=['knife'], topn=10)

[('raped', 0.7723942995071411),
 ('burned', 0.7662860155105591),
 ('disgusted', 0.7302854657173157),
 ('possessions', 0.7281520962715149),
 ('reign', 0.7252038717269897),
 ('struggling', 0.7237371206283569),
 ('microdistrict', 0.722679853439331),
 ('obeyed', 0.7217905521392822),
 ('rulers', 0.7207414507865906),
 ('helicopter', 0.7199659943580627)]

In [30]:
# Ten nearest neighbours of "engine" in the learned embedding space.
w2v.most_similar(positive=['engine'], topn=10)

[('cylinder', 0.5849126577377319),
 ('tires', 0.5793577432632446),
 ('adjustable', 0.5786377787590027),
 ('metzeler', 0.5504566431045532),
 ('wheel', 0.5483571290969849),
 ('valve', 0.5468419194221497),
 ('chevy', 0.5426185727119446),
 ('honda', 0.5317287445068359),
 ('pickup', 0.5307794809341431),
 ('carb', 0.5306989550590515)]