# 영어 Word2Vec 만들기
- 영어로 된 코퍼스를 다운받아 전처리를 수행
- 전처리한 데이터를 바탕으로 Word2Vec 수행

In [1]:
import nltk
# Fetch NLTK's "punkt" data package (the log below shows it being unzipped
# under tokenizers/). Presumably this is the data that sent_tokenize /
# word_tokenize rely on later in the notebook - confirm against the NLTK docs
# for the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import urllib.request
import zipfile
from lxml import etree
import re
from nltk.tokenize import word_tokenize, sent_tokenize

### 훈련 데이터 이해하기

In [4]:
# Download the transcript XML into the working directory under the same file
# name. The call's return value (a (filename, headers) pair) is the last
# expression in the cell, so the notebook displays it.
urllib.request.urlretrieve(
    url='https://raw.githubusercontent.com/GaoleMeng/RNN-and-FFNN-textClassification/master/ted_en-20160408.xml',
    filename='ted_en-20160408.xml',
)

('ted_en-20160408.xml', <http.client.HTTPMessage at 0x7f35f498bc50>)

### 훈련 데이터 전처리하기

In [5]:
# Parse the downloaded XML. The `with` block closes the file handle as soon as
# parsing is finished instead of leaving it open for the life of the kernel.
with open('ted_en-20160408.xml', 'r', encoding='utf8') as targetXML:
    target_text = etree.parse(targetXML)

# Concatenate the text of every <content> element, one element per line.
parse_text = '\n'.join(target_text.xpath('//content/text()'))

# Remove every parenthesised span "( ... )" from the text
# (presumably stage cues such as "(Laughter)" - confirm against the data).
content_text = re.sub(r'\([^)]*\)', '', parse_text)

# Split the cleaned text into sentences.
sent_text = sent_tokenize(content_text)

# Lower-case each sentence and collapse every run of characters other than
# a-z / 0-9 into a single space, which strips punctuation.
normalized_text = [
    re.sub(r"[^a-z0-9]+", " ", sentence.lower()) for sentence in sent_text
]

# Tokenise each normalised sentence: `result` is a list of token lists and is
# the training input used by the following cells.
result = [word_tokenize(sentence) for sentence in normalized_text]

In [6]:
# Print the number of tokenised sentences followed by the first three of them,
# one per line (sep='\n' puts the header and each sample on its own line).
print(f'총 샘플의 개수: {len(result)}', *result[:3], sep='\n')

총 샘플의 개수: 273424
['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']
['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']
['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']


### Word2Vec 훈련시키기
- size = 워드 벡터의 특징 값 개수, 즉 임베딩된 벡터의 차원 (gensim 4.0 이상에서는 `vector_size`)
- window = 컨텍스트 윈도우 크기 (중심 단어 앞뒤로 고려할 단어 수)
- min_count = 단어 최소 빈도 수 제한 (빈도가 이보다 적은 단어는 학습하지 않음)
- workers = 학습에 사용할 워커 스레드 수
- sg = 0은 CBOW, 1은 Skip-gram

In [7]:
from gensim.models import Word2Vec

# Train a CBOW (sg=0) model on the tokenised sentences.
# NOTE: gensim 4.0 renamed the embedding-dimension keyword from `size` to
# `vector_size`; the old spelling raises TypeError on gensim >= 4.0. If you are
# pinned to gensim 3.x, use `size=100` instead.
model = Word2Vec(
    sentences=result,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=5,      # minimum word frequency
    workers=4,        # worker threads used for training
    sg=0,             # 0 = CBOW, 1 = Skip-gram
)

In [9]:
# Query the trained vectors for the words most similar to "man" and keep the
# (word, score) pairs in model_result; the bare name on the last line makes the
# notebook display the list.
model_result = model.wv.most_similar('man')
model_result

[('woman', 0.8554869890213013),
 ('guy', 0.8004087209701538),
 ('lady', 0.7900898456573486),
 ('boy', 0.7818707823753357),
 ('soldier', 0.7612593173980713),
 ('girl', 0.7534786462783813),
 ('gentleman', 0.7513275146484375),
 ('poet', 0.6957038640975952),
 ('kid', 0.6824278235435486),
 ('friend', 0.674918532371521)]

In [11]:
# Analogy query "man - boy + girl": words in `positive` are added, words in
# `negative` are subtracted, and only the three best matches are returned.
model.wv.most_similar(
    positive=['man', 'girl'],
    negative=['boy'],
    topn=3,
)

[('woman', 0.835976243019104),
 ('soldier', 0.746188759803772),
 ('lady', 0.7460042238235474)]

### 모델 저장 및 로드

In [12]:
from gensim.models import KeyedVectors
# Round-trip the embeddings through disk: write the vectors held in model.wv to
# the file 'eng_w2v', then read that file back into a KeyedVectors object.
model.wv.save_word2vec_format('eng_w2v') # save the word vectors
loaded_model = KeyedVectors.load_word2vec_format("eng_w2v") # load them back

In [13]:
# Repeat the "man" similarity query on the reloaded vectors; the printed list
# below matches the one obtained from the in-memory model, showing that the
# save/load round trip preserved the vectors.
model_result = loaded_model.most_similar("man")
print(model_result)

[('woman', 0.8554869890213013), ('guy', 0.8004087209701538), ('lady', 0.7900898456573486), ('boy', 0.7818707823753357), ('soldier', 0.7612593173980713), ('girl', 0.7534786462783813), ('gentleman', 0.7513275146484375), ('poet', 0.6957038640975952), ('kid', 0.6824278235435486), ('friend', 0.674918532371521)]
