#### 영어 Word2Vec 만들기(TED 강연 데이터)

In [None]:
!pip install konlpy
!pip install mecab-python
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [None]:
import re
import urllib.request
import zipfile
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
import nltk
# Fetch the 'punkt_tab' resource through NLTK's downloader.
# NOTE(review): presumably this is the data sent_tokenize/word_tokenize rely on — confirm.
nltk.download('punkt_tab')

In [None]:
# Download the ted_en-20160408.xml dataset file and store it under the same
# name in the current working directory.
ted_xml_url = "https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/09.%20Word%20Embedding/dataset/ted_en-20160408.xml"
urllib.request.urlretrieve(ted_xml_url, filename="ted_en-20160408.xml")

In [None]:
# Extract only the text found between <content> and </content> tags.
# The context manager guarantees the XML file handle is closed as soon as
# parsing finishes (the handle was previously left open).
with open('ted_en-20160408.xml', 'r', encoding='UTF8') as targetXML:
    target_text = etree.parse(targetXML)
parse_text = '\n'.join(target_text.xpath('//content/text()'))

# Remove every parenthesised span: an opening parenthesis, any characters
# other than ')', and the closing parenthesis.
content_text = re.sub(r'\([^)]*\)', '', parse_text)

# Split the whole corpus into sentences with NLTK.
sent_text = sent_tokenize(content_text)

# Lower-case each sentence and replace every run of characters other than
# a-z / 0-9 with a single space (this drops punctuation and symbols).
normalized_text = [
    re.sub(r"[^a-z0-9]+", " ", string.lower()) for string in sent_text
]

# Word-tokenize each normalized sentence; `result` is a list of token lists.
result = [word_tokenize(sentence) for sentence in normalized_text]

In [None]:
# Print how many entries `result` holds (the label reads "total number of samples").
print('총 샘플의 수: ', len(result))

In [None]:
# Show the first entry of `result` as a quick sanity check.
print(result[0])

In [None]:
!pip install gensim

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
# Word2Vec hyperparameters
# vector_size: dimensionality of the embedding vectors
# window: context window size
# min_count: minimum word frequency (words occurring fewer than min_count times are not trained)
# workers: number of worker threads used for training
# sg: 0 = CBOW, 1 = skip-gram
model = Word2Vec(sentences=result, vector_size=100, window=5, min_count=5, workers=4, sg=0)

In [None]:
# Find the words most similar to "man" (wv.most_similar()) and print them.
model_result = model.wv.most_similar("man")
print(model_result)

In [None]:
# Save the model's word vectors and load them back.
# save_word2vec_format writes model.wv to the file 'eng_w2v';
# KeyedVectors.load_word2vec_format then reads that same file.
model.wv.save_word2vec_format('eng_w2v')
loaded_model = KeyedVectors.load_word2vec_format("eng_w2v")

In [None]:
!pip install konlpy

#### 한국어 Word2Vec 만들기(네이버 영화 리뷰)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from gensim.models.word2vec import Word2Vec
from konlpy.tag import Okt

In [None]:
# Download ratings.txt from the e9t/nsmc repository into the current working
# directory, keeping the same file name.
ratings_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt"
urllib.request.urlretrieve(ratings_url, filename="ratings.txt")

In [None]:
# Load ratings.txt into a pandas DataFrame.
train_data = pd.read_table('ratings.txt')

In [None]:
# Preview the first rows of the loaded data.
train_data.head()

In [None]:
# True if at least one cell anywhere in the DataFrame is missing.
train_data.isnull().values.any()

In [None]:
# Drop every row that contains at least one missing value,
# then show how many rows remain.
train_data = train_data.dropna(how="any")
len(train_data)

In [None]:
# Remove every character that is neither Hangul (jamo ㄱ-ㅎ / ㅏ-ㅣ, syllables
# 가-힣) nor a space.
# The space must stay in the allowed set: without it all whitespace is deleted
# and adjacent words are fused together before tokenization.
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)

In [None]:
# List of Korean tokens treated as stopwords.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

In [None]:
import tqdm

okt = Okt()

# Only the first 50,000 rows are tokenized.
train_data = train_data[:50000]

# Build the lookup set once, outside the loop: membership tests on a set are
# O(1), whereas the list is scanned for every single token.
stopword_set = set(stopwords)

# One list of tokens per review, with stopwords removed.
tokenized_data = []
for sentence in tqdm.tqdm(train_data['document']):
    # Split the review into morphemes with Okt (stem=True).
    tokenized_sentence = okt.morphs(sentence, stem=True)
    stopwords_removed_sentence = [word for word in tokenized_sentence if word not in stopword_set]
    tokenized_data.append(stopwords_removed_sentence)

In [None]:
# Histogram (50 bins) of the number of tokens in each tokenized review.
review_lengths = list(map(len, tokenized_data))
plt.hist(review_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenized reviews: 100-dimensional vectors,
# context window of 5, words seen fewer than 5 times ignored,
# 4 worker threads, sg=0 (CBOW).
model = Word2Vec(sentences = tokenized_data, vector_size=100, window=5, min_count=5, workers=4, sg=0)

In [None]:
# Shape of the learned embedding matrix.
# NOTE(review): presumably (vocabulary size, vector_size) — confirm.
model.wv.vectors.shape

In [None]:
# Print the words most similar to "최민식" according to the trained model.
print(model.wv.most_similar("최민식"))

In [None]:
# Print the words most similar to "사랑" according to the trained model.
print(model.wv.most_similar("사랑"))

In [None]:
!python -m gensim.scripts.word2vec2tensor --input eng_w2v --output env_w2v