<a href="https://colab.research.google.com/github/RealMyeong/Going_Deeper_NLP/blob/main/OnlyCode/GoingDeeper_NLP_5_%EC%9B%8C%EB%93%9C%EC%9E%84%EB%B2%A0%EB%94%A9_onlycode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %%bash
# apt-get update
# apt-get install g++ openjdk-8-jdk python-dev python3-dev
# pip3 install JPype1
# pip3 install konlpy

In [None]:
# %env JAVA_HOME "/usr/lib/jvm/java-8-openjdk-amd64"

In [None]:
import re
from konlpy.tag import Okt
from collections import Counter

아래의 텍스트를 이용해 전처리, 토큰화를 진행하고 원-핫 인코딩을 해보겠습니다.

In [None]:
# Sample sentence used for the preprocessing, tokenization and one-hot steps below.
text = "임금님 귀는 당나귀 귀! 임금님 귀는 당나귀 귀! 실컷~ 소리치고 나니 속이 확 뚫려 살 것 같았어."
text

'임금님 귀는 당나귀 귀! 임금님 귀는 당나귀 귀! 실컷~ 소리치고 나니 속이 확 뚫려 살 것 같았어.'

In [None]:
# Strip every character that is not Hangul (consonant jamo ㄱ-ㅎ, vowel jamo
# ㅏ-ㅣ, syllables 가-힣) or a space, which removes the punctuation.
reg = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]")
text = reg.sub('', text)
print(text)

임금님 귀는 당나귀 귀 임금님 귀는 당나귀 귀 실컷 소리치고 나니 속이 확 뚫려 살 것 같았어


In [None]:
# Tokenize with the Okt tagger provided by KoNLPy; morphs() turns the
# cleaned sentence into a flat list of tokens.
okt = Okt()
tokens = okt.morphs(text)
print(tokens)

['임금님', '귀', '는', '당나귀', '귀', '임금님', '귀', '는', '당나귀', '귀', '실컷', '소리', '치고', '나니', '속이', '확', '뚫려', '살', '것', '같았어']


In [None]:
# Build the vocabulary with collections.Counter: token -> occurrence count.
vocab = Counter(tokens)
print(vocab)

Counter({'귀': 4, '임금님': 2, '는': 2, '당나귀': 2, '실컷': 1, '소리': 1, '치고': 1, '나니': 1, '속이': 1, '확': 1, '뚫려': 1, '살': 1, '것': 1, '같았어': 1})


In [None]:
# Keep only the 5 most frequent words as (word, count) pairs.
# The counts are rebuilt from `tokens` here rather than read from `vocab`,
# because this cell rebinds `vocab` to a list: calling vocab.most_common()
# again when the cell is re-run would raise AttributeError.
vocab_size = 5
vocab = Counter(tokens).most_common(vocab_size)
print(vocab)

[('귀', 4), ('임금님', 2), ('는', 2), ('당나귀', 2), ('실컷', 1)]


In [None]:
# Assign 1-based indices in frequency order: the more frequent the word,
# the smaller its index.
word2idx = {token: rank for rank, (token, _count) in enumerate(vocab, start=1)}
print(word2idx)

{'귀': 1, '임금님': 2, '는': 3, '당나귀': 4, '실컷': 5}


In [None]:
# Function that builds a one-hot vector for a single word.
def one_hot_encoding(word, word2index):
  """Return the one-hot vector of `word` under the mapping `word2index`.

  Args:
    word: The word to encode; must be a key of `word2index`.
    word2index: Mapping from word to its 1-based index.

  Returns:
    A list of len(word2index) integers that is all 0 except for a single 1
    at position index - 1.

  Raises:
    KeyError: If `word` is not a key of `word2index`.
  """
  one_hot_vector = [0] * len(word2index)
  # Look the word up in the mapping that was passed in, not in a notebook
  # global, so the function works with any vocabulary.
  index = word2index[word]
  # Indices are 1-based, list positions are 0-based.
  one_hot_vector[index-1] = 1
  return one_hot_vector


In [None]:
# '임금님' has index 2 in word2idx, so only the second slot of the vector is set.
one_hot_encoding("임금님", word2idx)

[0, 1, 0, 0, 0]

In [None]:
# 케라스를 이용한 원-핫 인코딩
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer # 단어장 만드는 역할
from tensorflow.keras.utils import to_categorical # 원-핫 인코딩 도와주는 역할

In [None]:
# Toy corpus: each inner list is one already-tokenized document.
text = [['강아지', '고양이', '강아지'],['애교', '고양이'], ['컴퓨터', '노트북']]

# Fit the Keras tokenizer on the corpus to build its word -> index vocabulary.
t = Tokenizer()
t.fit_on_texts(text)
print(t.word_index) # Print the integer index assigned to each word.

{'강아지': 1, '고양이': 2, '애교': 3, '컴퓨터': 4, '노트북': 5}


In [None]:
# Store the vocabulary size. word_index starts at 1, so one extra slot is
# added for index 0 when this value is later used as num_classes.
vocab_size = len(t.word_index) + 1

In [None]:
# With the vocabulary fitted, a token sequence made of known words can be
# converted into an integer sequence by the Keras tokenizer.
sub_text = ['강아지', '고양이', '강아지', '컴퓨터']
encoded = t.texts_to_sequences([sub_text])
print(encoded)

[[1, 2, 1, 4]]


In [None]:
# One-hot encode the stored integer sequence with to_categorical; each
# integer becomes a vector of length vocab_size.
one_hot = to_categorical(encoded, num_classes = vocab_size)
print(one_hot)

[[[0. 1. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0.]]]


In [None]:
import nltk
# Download the ABC corpus and the punkt tokenizer data
# (punkt is presumably what abc.sents() uses to split sentences - TODO confirm).
nltk.download('abc')
nltk.download('punkt')

[nltk_data] Downloading package abc to /root/nltk_data...
[nltk_data]   Unzipping corpora/abc.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.corpus import abc
# Load the ABC corpus as a sequence of sentences, each a list of word tokens.
corpus = abc.sents()
print("슝~")

슝~


In [None]:
# Peek at the first three tokenized sentences.
print(corpus[:3])

[['PM', 'denies', 'knowledge', 'of', 'AWB', 'kickbacks', 'The', 'Prime', 'Minister', 'has', 'denied', 'he', 'knew', 'AWB', 'was', 'paying', 'kickbacks', 'to', 'Iraq', 'despite', 'writing', 'to', 'the', 'wheat', 'exporter', 'asking', 'to', 'be', 'kept', 'fully', 'informed', 'on', 'Iraq', 'wheat', 'sales', '.'], ['Letters', 'from', 'John', 'Howard', 'and', 'Deputy', 'Prime', 'Minister', 'Mark', 'Vaile', 'to', 'AWB', 'have', 'been', 'released', 'by', 'the', 'Cole', 'inquiry', 'into', 'the', 'oil', 'for', 'food', 'program', '.'], ['In', 'one', 'of', 'the', 'letters', 'Mr', 'Howard', 'asks', 'AWB', 'managing', 'director', 'Andrew', 'Lindberg', 'to', 'remain', 'in', 'close', 'contact', 'with', 'the', 'Government', 'on', 'Iraq', 'wheat', 'sales', '.']]


In [None]:
# Number of sentences in the corpus.
print('코퍼스의 크기 :',len(corpus))

코퍼스의 크기 : 29059


In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the ABC corpus: 100-dimensional vectors, a context window
# of 5, words seen fewer than 5 times dropped, 4 worker threads, and sg=0
# (CBOW rather than skip-gram).
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.0+ renamed it to
# `vector_size`, so confirm which version the runtime installs.
model = Word2Vec(sentences = corpus, size = 100, window = 5, min_count = 5, workers = 4, sg = 0)
print("모델 학습 완료!")

모델 학습 완료!


In [None]:
# Words closest to "man" in the trained embedding space, as (word, similarity) pairs.
model_result = model.wv.most_similar("man")
print(model_result)

[('woman', 0.9343962073326111), ('Bang', 0.9235856533050537), ('asteroid', 0.9182454347610474), ('third', 0.9149532318115234), ('rally', 0.9120045304298401), ('skull', 0.9073055386543274), ('dinosaur', 0.9064410924911499), ('dog', 0.9029918909072876), ('penalty', 0.9011621475219727), ('bought', 0.8998528718948364)]


In [None]:
from gensim.models import KeyedVectors

# Save the trained vectors in word2vec format, then reload them as
# KeyedVectors. The location is kept in one variable so the save and the load
# cannot drift apart, and it points at the GoingDeeper_NLP folder that the
# word2vec2tensor cell further down reads from (this cell previously wrote to
# a different "GoingDeeper(NLP)" folder, so that cell picked up a stale file).
w2v_path = '/content/drive/MyDrive/AIFFEL/GoingDeeper_NLP/5_워드임베딩/w2v'
model.wv.save_word2vec_format(w2v_path)
loaded_model = KeyedVectors.load_word2vec_format(w2v_path)
print("모델  load 완료!")

모델  load 완료!


In [None]:
# Query the reloaded vectors for "man"; the recorded output matches the one
# produced by the in-memory model.
model_result = loaded_model.most_similar("man")
print(model_result)

[('woman', 0.9343962073326111), ('Bang', 0.9235856533050537), ('asteroid', 0.9182454347610474), ('third', 0.9149532318115234), ('rally', 0.9120045304298401), ('skull', 0.9073055386543274), ('dinosaur', 0.9064410924911499), ('dog', 0.9029918909072876), ('penalty', 0.9011621475219727), ('bought', 0.8998528718948364)]


In [None]:
# 'overacting' is not in the Word2Vec vocabulary, so the lookup raises
# KeyError. The error is the point of this cell, so it is caught and printed
# instead of being left to abort a "Run all".
try:
  print(loaded_model.most_similar('overacting'))
except KeyError as err:
  print(f"KeyError: {err}")

KeyError: ignored

In [None]:
# A misspelled word is also out of vocabulary for Word2Vec. The same probe
# word ('memoryy') as in the FastText and GloVe cells is used so that the
# three models are compared on identical input, and the expected KeyError is
# caught and printed rather than aborting a "Run all".
try:
  print(loaded_model.most_similar('memoryy'))
except KeyError as err:
  print(f"KeyError: {err}")

KeyError: ignored

In [None]:
# Convert the saved word2vec file into the w2v_tensor.tsv / w2v_metadata.tsv pair.
!python -m gensim.scripts.word2vec2tensor --input /content/drive/MyDrive/AIFFEL/GoingDeeper_NLP/5_워드임베딩/w2v --output /content/drive/MyDrive/AIFFEL/GoingDeeper_NLP/5_워드임베딩/w2v

2022-09-30 01:15:21,082 - word2vec2tensor - INFO - running /usr/local/lib/python3.7/dist-packages/gensim/scripts/word2vec2tensor.py --input /content/drive/MyDrive/AIFFEL/GoingDeeper_NLP/5_워드임베딩/w2v --output /content/drive/MyDrive/AIFFEL/GoingDeeper_NLP/5_워드임베딩/w2v
2022-09-30 01:15:21,082 - utils_any2vec - INFO - loading projection weights from /content/drive/MyDrive/AIFFEL/GoingDeeper_NLP/5_워드임베딩/w2v
2022-09-30 01:15:21,585 - utils_any2vec - INFO - loaded (10363, 100) matrix from /content/drive/MyDrive/AIFFEL/GoingDeeper_NLP/5_워드임베딩/w2v
2022-09-30 01:15:22,195 - word2vec2tensor - INFO - 2D tensor file saved to /content/drive/MyDrive/AIFFEL/GoingDeeper_NLP/5_워드임베딩/w2v_tensor.tsv
2022-09-30 01:15:22,196 - word2vec2tensor - INFO - Tensor metadata file saved to /content/drive/MyDrive/AIFFEL/GoingDeeper_NLP/5_워드임베딩/w2v_metadata.tsv
2022-09-30 01:15:22,197 - word2vec2tensor - INFO - finished running word2vec2tensor.py


In [None]:
from gensim.models import FastText
# Train FastText on the same corpus with skip-gram (sg=1); no vector size is
# passed, so the library default is used.
fasttext_model = FastText(corpus, window=5, min_count=5, workers=4, sg=1)
print("FastText 학습 완료!")

FastText 학습 완료!


In [None]:
# Unlike the Word2Vec lookup, FastText returns neighbours for 'overacting'.
fasttext_model.wv.most_similar('overacting')

[('interacting', 0.8626994490623474),
 ('extracting', 0.857528805732727),
 ('contracting', 0.8379862308502197),
 ('rising', 0.8371866941452026),
 ('malting', 0.8366826772689819),
 ('consuming', 0.8335932493209839),
 ('attracting', 0.8332030773162842),
 ('costing', 0.8227100372314453),
 ('rating', 0.8217033743858337),
 ('expressing', 0.8192507028579712)]

In [None]:
# A misspelled word still gets neighbours, with 'memory' ranked first.
fasttext_model.wv.most_similar('memoryy')

[('memory', 0.8895126581192017),
 ('musical', 0.8427258729934692),
 ('basic', 0.8344112634658813),
 ('technical', 0.825336217880249),
 ('technological', 0.813245415687561),
 ('mechanical', 0.7993186116218567),
 ('magic', 0.7976161241531372),
 ('mechanisms', 0.7963376045227051),
 ('imagine', 0.7953901290893555),
 ('colour', 0.7905169725418091)]

In [None]:
import gensim.downloader as api
glove_model = api.load("glove-wiki-gigaword-50")  # download the GloVe vectors
glove_model.most_similar("dog")  # find the words most similar to 'dog'



[('cat', 0.9218005537986755),
 ('dogs', 0.8513159155845642),
 ('horse', 0.7907583713531494),
 ('puppy', 0.7754921913146973),
 ('pet', 0.7724707722663879),
 ('rabbit', 0.7720813751220703),
 ('pig', 0.7490061521530151),
 ('snake', 0.7399188280105591),
 ('baby', 0.7395570874214172),
 ('bite', 0.7387937903404236)]

In [None]:
# 'overacting' is present in the GloVe vocabulary, so neighbours are returned.
glove_model.most_similar('overacting')

[('impudence', 0.7842013239860535),
 ('puerile', 0.781603217124939),
 ('winningly', 0.7644237875938416),
 ('grossness', 0.7576098442077637),
 ('deconstructions', 0.7489365935325623),
 ('over-the-top', 0.7460805177688599),
 ('buffoonery', 0.7460456490516663),
 ('impetuosity', 0.7415392994880676),
 ('sophomoric', 0.736961841583252),
 ('zaniness', 0.7353197336196899)]

In [None]:
# The misspelled word is not in the GloVe vocabulary, so the lookup raises
# KeyError. The error is the demonstration, so it is caught and printed
# rather than left as a failing cell.
try:
  print(glove_model.most_similar('memoryy'))
except KeyError as err:
  print(f"KeyError: {err}")

KeyError: ignored