## 영어 Word2Vec 만들기

In [2]:
import re
import urllib.request
import zipfile
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize

# 데이터 다운로드
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/09.%20Word%20Embedding/dataset/ted_en-20160408.xml", filename="ted_en-20160408.xml")


('ted_en-20160408.xml', <http.client.HTTPMessage at 0x218ff556eb0>)

In [3]:
# Parse the downloaded XML file. The context manager closes the file handle
# even if parsing fails (it was previously left open).
with open('ted_en-20160408.xml', 'r', encoding='UTF8') as targetXML:
    target_text = etree.parse(targetXML)

# Join the text of every <content> element into one newline-separated string.
parse_text = '\n'.join(target_text.xpath('//content/text()'))

# Remove every parenthesised span, parentheses included.
content_text = re.sub(r'\([^)]*\)', '', parse_text)

# Split the corpus into sentences.
sent_text = sent_tokenize(content_text)

# Lower-case each sentence and replace each run of characters other than
# a-z / 0-9 with a single space.
normalized_text = [re.sub(r"[^a-z0-9]+", " ", sentence.lower()) for sentence in sent_text]

# Word-tokenize every normalised sentence.
result = [word_tokenize(sentence) for sentence in normalized_text]

print(len(result))

273424


In [4]:
# word2vec 훈련
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

model = Word2Vec(sentences=result, vector_size=100, window=5, min_count=5, workers=4, sg=0)

In [5]:
model_result = model.wv.most_similar('man')
model_result

[('woman', 0.8394775986671448),
 ('guy', 0.8123396635055542),
 ('boy', 0.7718464732170105),
 ('lady', 0.7653430104255676),
 ('girl', 0.743292510509491),
 ('gentleman', 0.7418871521949768),
 ('soldier', 0.7393339276313782),
 ('kid', 0.6924528479576111),
 ('poet', 0.6733789443969727),
 ('friend', 0.6635307669639587)]

In [1]:
66724 + 29390 + 67997

164111

In [6]:
# Save the Word2Vec vectors and load them back.
# A relative path (like the dataset downloads in this notebook) is used
# instead of a machine-specific absolute directory so the cell runs anywhere.
w2v_path = 'eng_w2v'
model.wv.save_word2vec_format(w2v_path)
loaded_model = KeyedVectors.load_word2vec_format(w2v_path)

In [7]:
model_result = loaded_model.most_similar('man')
model_result

[('woman', 0.8394775986671448),
 ('guy', 0.8123396635055542),
 ('boy', 0.7718464732170105),
 ('lady', 0.7653430104255676),
 ('girl', 0.743292510509491),
 ('gentleman', 0.7418871521949768),
 ('soldier', 0.7393339276313782),
 ('kid', 0.6924528479576111),
 ('poet', 0.6733789443969727),
 ('friend', 0.6635307669639587)]

## 한국어 Word2Vec 만들기

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from gensim.models.word2vec import Word2Vec
from konlpy.tag import Okt

urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")


('ratings.txt', <http.client.HTTPMessage at 0x21c4852a0a0>)

In [9]:
train_data = pd.read_table('ratings.txt')
train_data.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [10]:
train_data.dropna(axis=0, inplace=True)
train_data.isnull().sum()

id          0
document    0
label       0
dtype: int64

In [11]:
import re

def preprocessing(text):
    """Keep only Hangul characters in *text*.

    Surrounding whitespace is trimmed, then every character that is not a
    Hangul syllable (가-힣), consonant jamo (ㄱ-ㅎ) or vowel jamo (ㅏ-ㅣ) is
    replaced by one space (one space per character; runs are not collapsed),
    and the result is trimmed again.
    """
    hangul_only = re.sub(r'[^ㄱ-ㅎ가-힣ㅏ-ㅣ]', ' ', text.strip())
    return hangul_only.strip()

train_data['document'] = train_data['document'].apply(preprocessing)
train_data.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,디자인을 배우는 학생으로 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...,1
2,4655635,폴리스스토리 시리즈는 부터 뉴까지 버릴께 하나도 없음 최고,1
3,9251303,와 연기가 진짜 개쩔구나 지루할거라고 생각했는데 몰입해서 봤다 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화,1


In [12]:
from tqdm import tqdm
# Stopword definitions
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
# Set copy for O(1) membership tests inside the token loop; the list above is
# left unchanged for any other code that reads `stopwords`.
stopword_set = set(stopwords)

# Tokenize with the Okt morphological analyzer (takes a while)
okt = Okt()

tokenized_data = []
for sentence in tqdm(train_data['document']):
    tokenized_sentence = okt.morphs(sentence, stem=True)  # tokenize with stem=True
    # Remove stopwords
    stopwords_removed_sentence = [word for word in tokenized_sentence if word not in stopword_set]
    tokenized_data.append(stopwords_removed_sentence)

100%|██████████| 199992/199992 [07:26<00:00, 447.65it/s]


In [8]:
from gensim.models import Word2Vec
model = Word2Vec(sentences = tokenized_data, vector_size=100, window=5, min_count=5, workers=4, sg=0)

model.wv.vectors.shape

(16386, 100)

In [9]:
model.wv.most_similar('대학교')

[('대학', 0.8255337476730347),
 ('설날', 0.8211284279823303),
 ('부천', 0.8175269365310669),
 ('뮬란', 0.8087301850318909),
 ('투니버스', 0.8086249232292175),
 ('국민학교', 0.8056362867355347),
 ('중계', 0.8023992776870728),
 ('선전', 0.8022359013557434),
 ('추석', 0.7995629906654358),
 ('메가박스', 0.7931147813796997)]

In [10]:
# OOV(Out-of-Vocabulary) 문제가 있음
model.wv.most_similar('경상대')

KeyError: "Key '경상대' not present in vocabulary"

## 네거티브 샘플링을 이용한 Word2Vec 구현

In [5]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm
import re

dataset = fetch_20newsgroups(shuffle=True, random_state=2023, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
len(documents)

11314

In [6]:
_ENGLISH_STOP_WORDS = None  # filled on first use by preprocessing()


def preprocessing(text, stop_words=None):
    """Normalise an English document into a space-joined string of keywords.

    The text is lower-cased, every non-letter character is replaced by a
    space, and only words longer than three characters that are not stop
    words are kept.

    Args:
        text: The raw document.
        stop_words: Optional container of words to drop (anything supporting
            ``in``). When omitted, NLTK's English stop-word list is used; it
            is loaded once and cached as a frozenset instead of being
            re-read and searched linearly on every call.

    Returns:
        The kept words joined by single spaces ('' when nothing is kept).
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    kept = [word for word in letters_only.split()
            if len(word) > 3 and word not in stop_words]
    # split()/join already removes leading, trailing and repeated whitespace.
    return ' '.join(kept)

news_df = pd.DataFrame({'document':documents})
news_df['clean_doc'] = news_df['document'].apply(preprocessing)
news_df.head()

Unnamed: 0,document,clean_doc
0,I have several people sharing my machine and w...,several people sharing machine would like sepa...
1,"\n\nLet's face it, if the words don't get into...",face words noggin first place hope tell sdpa m...
2,"\nWell, it's not an FTP site, but I got an 800...",well site number signetics signetics contain p...
3,Anyone around here read this yet?\nDoes Anita ...,anyone around read anita number tony
4,"\nHaving lived, played, and worked on and near...",lived played worked near navajo reservation nu...


In [7]:
news_df.isnull().sum()

document     0
clean_doc    0
dtype: int64

In [8]:
news_df.replace('', float('NaN'), inplace=True)
news_df.isnull().sum()

document     218
clean_doc    324
dtype: int64

In [9]:
news_df.dropna(inplace=True)
print(len(news_df))

10990


In [10]:
news_df['tokenized'] = news_df['clean_doc'].apply(lambda x : x.split())
tokenized_doc = news_df['tokenized'].to_list()

In [11]:
# Indices of the documents that have at most one token (kept for reference).
drop_train = [index for index, sentence in enumerate(tokenized_doc) if len(sentence) <= 1]
# Filter with a plain list comprehension instead of np.delete: the documents
# have different lengths, so NumPy would first have to build a ragged object
# array (a VisibleDeprecationWarning on older NumPy, a ValueError since 1.24).
tokenized_doc = [sentence for sentence in tokenized_doc if len(sentence) > 1]
print(len(tokenized_doc))

10940


  arr = asarray(arr)


In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)

word2idx = tokenizer.word_index
idx2word = {value:key for key, value in word2idx.items()}
encoded = tokenizer.texts_to_sequences(tokenized_doc)

In [13]:
len(encoded)

10940

In [14]:
vocab_size = len(word2idx)+1
vocab_size

64277

In [15]:
# 네거티브 샘플링을 위한 케라스의 skipgrams tool 사용
from tensorflow.keras.preprocessing.sequence import skipgrams

skip_grams = [skipgrams(sample, vocabulary_size = vocab_size, window_size=10) for sample in encoded[:10]]

In [16]:
# Show the first five (target, context) pairs of the first document with
# their label.
pairs, labels = skip_grams[0]
for i in range(5):
    target, context = pairs[i]
    print(f"({idx2word[target]:s} ({target:d}), {idx2word[context]:s} ({context:d})) -> {labels[i]:d}")

(would (1), predicting (9738)) -> 0
(depending (1606), settle (4865)) -> 1
(windows (50), aversion (31293)) -> 0
(sharing (3883), several (125)) -> 1
(things (49), grazing (18898)) -> 0


In [17]:
len(pairs), len(labels)

(1020, 1020)

In [18]:
# 전체 데이터에 대해서 수행
skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in encoded]

### Skip-gram with negative sampling(SGNS) 구현하기

In [19]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.keras.layers import Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG

# Size of each embedding vector
embedding_dim = 100

# Embedding for the center word: one integer word index per sample
w_inputs = Input(shape=(1,), dtype='int32')
word_embedding = Embedding(vocab_size, embedding_dim)(w_inputs)

# Embedding for the context (surrounding) word, with its own separate
# Embedding layer
c_inputs = Input(shape=(1,), dtype='int32')
context_embedding = Embedding(vocab_size, embedding_dim)(c_inputs)

In [20]:
# Dot product of the two embeddings along axis 2 (the embedding axis)
dot_product = Dot(axes=2)([word_embedding, context_embedding])
# Reshape the dot-product output to a single score per sample
dot_product = Reshape((1,), input_shape=(1,1))(dot_product)
# Sigmoid maps the score into (0, 1)
output = Activation('sigmoid')(dot_product)

# Two-input model: (center word index, context word index) -> score
model = Model(inputs=[w_inputs, c_inputs], outputs=output)
model.summary()

# Binary cross-entropy loss with the Adam optimizer
model.compile(loss='binary_crossentropy', optimizer='adam')
# Write a diagram of the model to model3.png
plot_model(model, to_file='model3.png', show_shapes=True, show_layer_names=True, rankdir='TB')

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 100)       6427700     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 100)       6427700     ['input_2[0][0]']                
                                                                                              

In [21]:
# Train for 5 epochs, one train_on_batch call per document.
for epoch in range(1, 6):
    loss = 0
    # Iterate the list directly (no unused enumerate) so tqdm knows the total
    # and can show real progress.
    for couples, couple_labels in tqdm(skip_grams):
        if not couples:
            # A document without any pair cannot be trained on.
            continue
        # Transpose [(target, context), ...] once into two parallel tuples.
        targets, contexts = zip(*couples)
        first_elem = np.array(targets, dtype='int32')
        second_elem = np.array(contexts, dtype='int32')
        X = [first_elem, second_elem]
        Y = np.array(couple_labels, dtype='int32')
        # Accumulate the per-batch loss over the epoch.
        loss += model.train_on_batch(X, Y)
    print('Epoch :', epoch, "Loss :", loss)

10940it [03:07, 58.40it/s]


Epoch : 1 Loss : 4639.579268962145


10940it [03:07, 58.42it/s]


Epoch : 2 Loss : 3660.6580528505147


10940it [03:07, 58.40it/s]


Epoch : 3 Loss : 3494.809946205467


10940it [03:06, 58.50it/s]


Epoch : 4 Loss : 3291.273645129055


10940it [03:13, 56.43it/s]

Epoch : 5 Loss : 3064.7308524660766





In [23]:
import gensim

# Export the trained embeddings in word2vec text format: a "<count> <dim>"
# header line followed by one "<word> <v1> ... <vN>" line per word.
# First weight array of the model -- presumably the center-word embedding
# matrix, since that Embedding layer is created first.
vectors = model.get_weights()[0]

# `with` closes the file even if a write fails; UTF-8 matches the default
# encoding gensim uses when reading the file back.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    # vocab_size includes the unused index 0, hence the -1.
    f.write('{} {}\n'.format(vocab_size-1, embedding_dim))
    for word, i in tokenizer.word_index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, vectors[i, :]))))

# Load the exported vectors
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [26]:
w2v.most_similar(positive=['soldiers'])

[('karabakh', 0.8725166320800781),
 ('tartars', 0.8528890609741211),
 ('extermination', 0.8524039387702942),
 ('terrorists', 0.851419985294342),
 ('karabagh', 0.8512776494026184),
 ('massacred', 0.8512627482414246),
 ('perpetrated', 0.8502932786941528),
 ('aviv', 0.8486507534980774),
 ('nagorno', 0.8477845788002014),
 ('bombing', 0.8458418250083923)]

In [27]:
w2v.most_similar(positive=['doctor'])

[('pain', 0.6986089944839478),
 ('allergic', 0.6542124152183533),
 ('quack', 0.6482238173484802),
 ('symptoms', 0.6418255567550659),
 ('migraine', 0.6369087100028992),
 ('obstruction', 0.6231420040130615),
 ('lyme', 0.6230742931365967),
 ('treatments', 0.612416684627533),
 ('disease', 0.6113265752792358),
 ('medicine', 0.6094194650650024)]

In [28]:
w2v.most_similar(positive=['knife'])

[('refugees', 0.7628820538520813),
 ('tartars', 0.7627054452896118),
 ('azerbaijanis', 0.7625837326049805),
 ('suffering', 0.7580936551094055),
 ('issuing', 0.7512878179550171),
 ('province', 0.7475813031196594),
 ('tyranny', 0.742268979549408),
 ('attackers', 0.7389000058174133),
 ('perished', 0.7386478185653687),
 ('caucasus', 0.7385434508323669)]

## FastText

In [13]:
from gensim.models import FastText

model = FastText(tokenized_data, vector_size=100, window=5, min_count=5, workers=4, sg=1)

In [14]:
model.wv.most_similar('경상대')

[('경상도', 0.8764791488647461),
 ('계백', 0.855137050151825),
 ('허준호', 0.8425992131233215),
 ('장미인애', 0.8420888185501099),
 ('티저', 0.8363873362541199),
 ('정두홍', 0.8363051414489746),
 ('흡수', 0.8354095220565796),
 ('한예리', 0.8339340090751648),
 ('정웅인', 0.8336555361747742),
 ('최정윤', 0.8331465721130371)]

## 자모 단위 한국어 FastText 학습하기

In [17]:
from konlpy.tag import Okt
import re
import pandas as pd
import urllib.request
from tqdm import tqdm
import hgtk

urllib.request.urlretrieve('https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt', filename='ratings_total.txt')

('ratings_total.txt', <http.client.HTTPMessage at 0x21c437b40d0>)

In [21]:
total_data = pd.read_table('ratings_total.txt', names=['ratings', 'reviews'])
print(len(total_data))

200000


In [22]:
total_data.head()

Unnamed: 0,ratings,reviews
0,5,배공빠르고 굿
1,2,택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고
2,5,아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...
3,2,선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...
4,5,민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ


### hgtk tutorial

In [23]:
hgtk.letter.decompose('남')

('ㄴ', 'ㅏ', 'ㅁ')

In [26]:
hgtk.letter.compose('ㄴ','ㅏ','ㅁ')

'남'

In [30]:
# 자모 단위 토큰화
def word_to_jamo(token):
    """Decompose every Hangul syllable of *token* into its jamo.

    Each syllable becomes three characters (initial, medial, final); an empty
    component is written as '-' so every syllable has a fixed width of 3.
    Characters that hgtk rejects as non-Hangul are copied through unchanged.

    Example: word_to_jamo('여 동 생') -> 'ㅇㅕ- ㄷㅗㅇ ㅅㅐㅇ'

    Args:
        token: The string to decompose.

    Returns:
        The jamo-level string.
    """
    pieces = []
    for char in token:
        try:
            # Split into initial, medial and final components
            cho, joong, jong = hgtk.letter.decompose(char)
        except hgtk.exception.NotHangulException:
            # Not a Hangul character: keep it as it is. Only this exception
            # is caught, so unrelated errors are no longer swallowed (which
            # used to drop the character silently).
            pieces.append(char)
            continue
        # An empty component is replaced by the placeholder '-'
        pieces.append((cho or '-') + (joong or '-') + (jong or '-'))
    return ''.join(pieces)

In [31]:
word_to_jamo('남 동 생')

'ㄴㅏㅁ ㄷㅗㅇ ㅅㅐㅇ'

In [33]:
word_to_jamo('여 동 생')

'ㅇㅕ- ㄷㅗㅇ ㅅㅐㅇ'

In [34]:
okt = Okt()
print(okt.morphs('선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다'))

['선물', '용', '으로', '빨리', '받아서', '전달', '했어야', '하는', '상품', '이었는데', '머그컵', '만', '와서', '당황', '했습니다']


In [35]:
def tokenize_by_jamo(x):
    """Split *x* into morphemes with Okt and convert each one with word_to_jamo."""
    morphemes = okt.morphs(x)
    return list(map(word_to_jamo, morphemes))

In [36]:
print(tokenize_by_jamo('선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다'))

['ㅅㅓㄴㅁㅜㄹ', 'ㅇㅛㅇ', 'ㅇㅡ-ㄹㅗ-', 'ㅃㅏㄹㄹㅣ-', 'ㅂㅏㄷㅇㅏ-ㅅㅓ-', 'ㅈㅓㄴㄷㅏㄹ', 'ㅎㅐㅆㅇㅓ-ㅇㅑ-', 'ㅎㅏ-ㄴㅡㄴ', 'ㅅㅏㅇㅍㅜㅁ', 'ㅇㅣ-ㅇㅓㅆㄴㅡㄴㄷㅔ-', 'ㅁㅓ-ㄱㅡ-ㅋㅓㅂ', 'ㅁㅏㄴ', 'ㅇㅘ-ㅅㅓ-', 'ㄷㅏㅇㅎㅘㅇ', 'ㅎㅐㅆㅅㅡㅂㄴㅣ-ㄷㅏ-']


- 전체 데이터 자모단위 토큰화

In [38]:
total_data['tokenized'] = total_data['reviews'].apply(tokenize_by_jamo)

In [39]:
tokenized_sen = total_data['tokenized'].to_list()
print(tokenized_sen[0])

['ㅂㅐ-ㄱㅗㅇ', 'ㅃㅏ-ㄹㅡ-ㄱㅗ-', 'ㄱㅜㅅ']


In [40]:
# 자모를 모아서 단어로 바꿔주는 함수
def jamo_to_word(jamo_sequence):
    """Rebuild a word from a jamo string such as word_to_jamo produces.

    Example: jamo_to_word('ㅇㅕ-ㄷㅗㅇㅅㅐㅇ') -> '여동생'

    Args:
        jamo_sequence: Jamo string in which every Hangul syllable occupies
            three characters, with '-' standing for an empty final consonant.

    Returns:
        The recomposed word, or *jamo_sequence* unchanged when hgtk cannot
        compose one of the chunks.
    """
    # 1. Chunk the sequence: three characters starting at each Hangul
    #    character, one character for anything else.
    tokenized_jamo = []
    index = 0
    while index < len(jamo_sequence):
        if hgtk.checker.is_hangul(jamo_sequence[index]):
            tokenized_jamo.append(jamo_sequence[index:index + 3])
            index += 3
        else:
            tokenized_jamo.append(jamo_sequence[index])
            index += 1

    # 2. Compose each three-character chunk back into one syllable; shorter
    #    chunks are copied through unchanged.
    syllables = []
    try:
        for jamo in tokenized_jamo:
            if len(jamo) != 3:
                syllables.append(jamo)
            elif jamo[2] == '-':
                # '-' marks an empty final consonant
                syllables.append(hgtk.letter.compose(jamo[0], jamo[1]))
            else:
                syllables.append(hgtk.letter.compose(jamo[0], jamo[1], jamo[2]))
    except hgtk.exception.NotHangulException:
        # The sequence cannot be recomposed: return the input unchanged.
        # Only this exception is caught, so unrelated errors no longer make
        # the function return a silently truncated word.
        return jamo_sequence

    return ''.join(syllables)

In [43]:
jamo_to_word('ㅇㅕ-ㄷㅗㅇㅅㅐㅇ')

'여동생'