**Word2Vec**

- Skip-gram

In [None]:
'''
embedding = torch.nn.Embedding(
    num_embeddings,
    embedding_dim,
    padding_idx=None,
    max_norm=None,
    norm_type=2.0
)
'''

'\nembedding = torch.nn.Embedding(\n    num_embeddings,\n    embedding_dim,\n    padding_idx=None,\n    max_norm=None,\n    norm_type=2.0\n)\n'

In [None]:
#기본 Skip-gram 클래스
from torch import nn

class VanillaSkipgram(nn.Module):
    """Plain skip-gram network: embedding lookup followed by a vocabulary-sized projection.

    Args:
        vocab_size: number of rows in the embedding table and number of output scores.
        embedding_dim: width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # One embedding vector per vocabulary entry.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Maps an embedding back to one score per vocabulary entry.
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_ids):
        """Return the linear projection of the embeddings looked up for `input_ids`."""
        return self.linear(self.embedding(input_ids))

In [None]:
!pip install Korpora konlpy



In [None]:
# Preprocess the movie-review dataset
import pandas as pd
from Korpora import Korpora
from konlpy.tag import Okt

# Load the "nsmc" corpus and keep only its `test` attribute as a DataFrame.
corpus=Korpora.load("nsmc")
corpus=pd.DataFrame(corpus.test)

# Tokenize each review's text with Okt.morphs(); `tokens` is one token list per review.
tokenizer=Okt()
tokens=[tokenizer.morphs(review) for review in corpus.text]
print(tokens[:3])


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at /root/Korpora/nsmc/ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at /root/Korpora/nsmc/ra

In [None]:
#단어 사전 구축
from collections import Counter

def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list: special tokens first, then the most frequent tokens.

    Args:
        corpus: iterable of token sequences.
        n_vocab: maximum number of corpus tokens to keep (by descending frequency).
        special_tokens: tokens placed at the front of the vocabulary.

    Returns:
        A new list; the `special_tokens` argument is left unmodified.
    """
    counter=Counter()
    for tokens in corpus:
        counter.update(tokens)
    # Copy so the caller's list is not extended as a side effect.
    vocab=list(special_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))
    return vocab

# Vocabulary: "<unk>" followed by up to 5000 of the most frequent tokens.
vocab=build_vocab(corpus=tokens, n_vocab=5000, special_tokens=["<unk>"])
# Forward and reverse lookups between a token and its position in `vocab`.
token_to_id={token: idx for idx, token in enumerate(vocab)}
id_to_token={idx: token for idx, token in enumerate(vocab)}

print(vocab[:10])
print(len(vocab))

['<unk>', '.', '이', '영화', '의', '..', '가', '에', '...', '을']
5001


In [None]:
#Skip-gram의 단어 쌍 추출
def get_word_pairs(tokens, window_size):
    """Collect skip-gram [center, context] pairs from tokenized sentences.

    Args:
        tokens: iterable of sentences, each a list of tokens.
        window_size: number of neighbours taken on each side of the center word.

    Returns:
        A list of [center_word, context_word] pairs covering every token
        position of every sentence. Empty sentences contribute nothing.
    """
    pairs=[]
    for sentence in tokens:
        sentence_length=len(sentence)
        for idx, center_word in enumerate(sentence):
            # Clamp the window to the sentence boundaries.
            window_start=max(0, idx - window_size)
            window_end=min(sentence_length, idx + window_size + 1)
            context_words=sentence[window_start:idx] + sentence[idx+1:window_end]
            # Emit pairs for *this* center word; doing it after the loop
            # would keep only the last token of each sentence.
            for context_word in context_words:
                pairs.append([center_word, context_word])
    return pairs

# Extract pairs with a window of 2 tokens on each side and preview the first five.
word_pairs=get_word_pairs(tokens, window_size=2)
print(word_pairs[:5])

[['ㅋ', '굳'], ['아니잖아', '더'], ['아니잖아', '더욱'], ['....', '보기'], ['....', '에는']]


In [None]:
#인덱스 쌍 변환
def get_index_pairs(word_pairs, token_to_id):
    """Translate [center, context] word pairs into [center_id, context_id] pairs.

    Words missing from `token_to_id` are mapped to the id of "<unk>".
    """
    fallback=token_to_id["<unk>"]
    lookup=token_to_id.get
    return [
        [lookup(center, fallback), lookup(context, fallback)]
        for center, context in word_pairs
    ]

# Convert the word pairs to id pairs and preview the first five.
index_pairs=get_index_pairs(word_pairs, token_to_id)
print(index_pairs[:5])

[[100, 595], [2596, 57], [2596, 903], [48, 160], [48, 246]]


In [None]:
# Wrap the id pairs in a DataLoader
import torch
from torch.utils.data import TensorDataset, DataLoader

# Column 0 holds the center-word ids, column 1 the context-word ids.
index_pairs=torch.tensor(index_pairs)
center_indexs=index_pairs[:, 0]
context_indexs=index_pairs[:, 1]

# Shuffled mini-batches of 32 (center, context) id pairs.
dataset=TensorDataset(center_indexs, context_indexs)
dataloader=DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
# Prepare the skip-gram model, loss and optimizer
from torch import optim

# Use the GPU when one is available, otherwise fall back to the CPU.
device="cuda" if torch.cuda.is_available() else "cpu"
word2vec=VanillaSkipgram(vocab_size=len(token_to_id), embedding_dim=128).to(device)
# Cross-entropy over the model's per-vocabulary scores, optimized with plain SGD.
criterion=nn.CrossEntropyLoss().to(device)
optimizer=optim.SGD(word2vec.parameters(), lr=0.1)

In [None]:
# Train the model
for epoch in range(10):
    cost=0.0
    for input_ids, target_ids in dataloader:
        input_ids=input_ids.to(device)
        target_ids=target_ids.to(device)

        # Scores for every vocabulary entry, compared against the context-word ids.
        logits=word2vec(input_ids)
        loss=criterion(logits, target_ids)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float; adding the tensor itself would keep
        # autograd history alive for the whole epoch.
        cost+=loss.item()

    # Mean loss per batch for this epoch.
    cost=cost / len(dataloader)
    print(f"Epoch: {epoch+1:4d}, Cost: {cost:.3f}")

Epoch:    1, Cost: 6.673
Epoch:    2, Cost: 6.069
Epoch:    3, Cost: 5.851
Epoch:    4, Cost: 5.709
Epoch:    5, Cost: 5.606
Epoch:    6, Cost: 5.523
Epoch:    7, Cost: 5.455
Epoch:    8, Cost: 5.397
Epoch:    9, Cost: 5.348
Epoch:   10, Cost: 5.303


In [None]:
# Extract the learned embedding values
# Copy the embedding table to a NumPy array and pair each vocabulary entry
# with its row.
embedding_matrix=word2vec.embedding.weight.detach().cpu().numpy()
token_to_embedding=dict(zip(vocab, embedding_matrix))

# Inspect one vocabulary entry; `index` selects which one.
index=30
token=vocab[index]
token_embedding=token_to_embedding[token]
print(token)
print(token_embedding)

연기
[ 0.5066367  -1.7869946   1.6448294   0.89922553  0.20326823 -0.2628825
 -2.2654488   0.9203757   1.2920295   0.5848853   1.574148   -0.8532711
 -0.00398266  0.12257556  1.3608975   0.4857879   0.18131097 -0.45535228
  0.29566503  0.2342968  -0.25485453 -0.6742077  -0.9360183  -1.2431601
  0.85525835  0.48338258 -0.44903103  1.1429794   1.7266386   1.6315706
 -0.07791245  0.53372175 -0.74694866  1.5385375  -0.66876096 -0.5279956
  0.41458336 -0.31082094 -0.46849412 -0.35467234 -0.45798895 -0.01768018
  0.08889236 -0.6118245  -1.0128369  -0.10165706  0.07623158  1.1023722
  2.1637952   1.537462   -0.5787339  -0.22282283  0.52543867  1.4715426
 -0.4562049   0.2290246  -0.5590634  -1.545982    0.1516511  -0.3736755
  0.1028283   0.8478788   0.70460135 -1.9054956  -0.6678803   0.9613172
  1.2960109   0.50879854 -1.0169264  -0.18116437  0.54497206  1.3762122
 -0.61338836  0.25766402 -0.14560688  0.77785456  1.2299052  -1.3991659
 -1.448754    1.6916965   1.5390346   0.5739683   0.1962339

In [None]:
#단어 임베딩 유사도 계산
import numpy as np
from numpy.linalg import norm

def cosine_similarity(a, b):
    """Return the cosine similarity between vector `a` and each row of matrix `b`."""
    dot_products=np.dot(b, a)
    # Product of each row's norm with the norm of the query vector.
    magnitudes=norm(b, axis=1)*norm(a)
    return dot_products / magnitudes

def top_n_index(cosine_matrix, n):
    """Return the indexes ranked 2nd through (n+1)-th by descending similarity.

    The top-ranked index is skipped - presumably it is the query itself when
    the query is one of the rows being compared.
    """
    ascending=cosine_matrix.argsort()
    descending=ascending[::-1]
    return descending[1:n+1]

# Score every vocabulary embedding against `token_embedding` and list the five
# closest entries (the top-ranked one is skipped by top_n_index).
cosine_matrix=cosine_similarity(token_embedding, embedding_matrix)
top_n=top_n_index(cosine_matrix, n=5)

print(f"{token}와 가장 유사한 5개 단어")
for index in top_n:
    print(f"{id_to_token[index]} - 유사도: {cosine_matrix[index]:.4f}")

연기와 가장 유사한 5개 단어
솔직한 - 유사도: 0.3102
담담하게 - 유사도: 0.2988
피디 - 유사도: 0.2815
주는데 - 유사도: 0.2758
에게도 - 유사도: 0.2723


- Gensim

In [None]:
!pip install gensim



In [None]:
'''
word2vec=gensim.models.Word2Vec(
    sentences=None,
    corpus_file=None,
    vector_size=100,
    alpha=0.025,
    window=5,
    min_count=5,
    workers=3,
    sg=0,
    hs=0,
    cbow_mean=1,
    negative=5,
    ns_exponent=0.75,
    max_final_vocab=None,
    epochs=5,
    batch_words=10000
)
'''

'\nword2vec=gensim.models.Word2Vec(\n    sentences=None,\n    corpus_file=None,\n    vector_size=100,\n    alpha=0.025,\n    window=5,\n    min_count=5,\n    workers=3,\n    sg=0,\n    hs=0,\n    cbow_mean=1,\n    negative=5,\n    ns_exponent=0.75,\n    max_final_vocab=None,\n    epochs=5,\n    batch_words=10000\n)\n'

In [None]:
!pip uninstall -y numpy gensim
!pip install numpy gensim

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Collecting numpy
  Using cached numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy, gensim
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tsf

In [None]:
# Train a Word2Vec model with Gensim
from gensim.models import Word2Vec

# sg=1 selects skip-gram (the reference signature in this notebook lists sg=0 as
# the default); NOTE: this rebinds `word2vec`, replacing the PyTorch model above.
word2vec=Word2Vec(
    sentences=tokens,
    vector_size=128,
    window=5,
    min_count=1,
    sg=1,
    epochs=3,
    max_final_vocab=10000
)

# Optional persistence of the trained model (left disabled):
#word2vec.save("../models/word2vec.model")
#word2vec=Word2Vec.load("../models/word2vec.model")

In [None]:
# Extract an embedding and compute similarities
word="연기"
# Vector for `word`, its five nearest neighbours, and its similarity to a second word.
print(word2vec.wv[word])
print(word2vec.wv.most_similar(word, topn=5))
print(word2vec.wv.similarity(w1=word, w2="연기력"))

[-0.35138252 -0.09575734  0.01078036  0.3296205  -0.06404735 -0.04847586
 -0.02165333 -0.17068434 -0.49626878  0.46267617  0.02163552 -0.27002376
 -0.32610792  0.03496901  0.03636095 -0.0592006  -0.21744326  0.07258674
 -0.21599753  0.2781314   0.6629464   0.16208528 -0.15428735 -0.18397978
 -0.23799847 -0.18186544 -0.265073   -0.19382411  0.06363969 -0.18098818
 -0.32142508  0.33520392  0.30456883 -0.14357653  0.12910873 -0.28297436
  0.1370951  -0.17673874 -0.07532267 -0.49993727 -0.06604859  0.11669078
 -0.07926828 -0.4765588  -0.31296754  0.25067112 -0.25892273 -0.27944908
  0.24863765  0.00984355  0.7199508   0.31880382  0.08363467  0.21541536
 -0.3441615   0.17021452  0.29313296  0.36821678 -0.00770938  0.3297394
  0.19861107 -0.35226375  0.22778349 -0.10607314 -0.33860916  0.52227736
 -0.04290876  0.11622152  0.4018581  -0.31056097 -0.38013083 -0.16095804
 -0.3973865   0.09471136 -0.03457201 -0.13507968 -0.27435318 -0.28389287
 -0.15536116  0.25298804  0.15752414  0.17916612  0.

**fastText**

In [None]:
'''
fasttext=gensim.models.FastText(
    sentences=None,
    corpus_file=None,
    vector_size=100,
    alpha=0.025,
    window=5,
    min_count=5,
    workers=3,
    sg=0,
    hs=0,
    cbow_mean=1,
    negative=5,
    ns_exponent=0.75,
    max_final_vocab=None,
    epochs=5,
    batch_words=10000,
    min_n=3,
    max_n=6
)
'''

'\nfasttext=gensim.models.FastText(\n    sentences=None,\n    corpus_file=None,\n    vector_size=100,\n    alpha=0.025,\n    window=5,\n    min_count=5,\n    workers=3,\n    sg=0,\n    hs=0,\n    cbow_mean=1,\n    negative=5,\n    ns_exponent=0.75,\n    max_final_vocab=None,\n    epochs=5,\n    batch_words=10000,\n    min_n=3,\n    max_n=6\n)\n'

In [None]:
# Preprocess the KorNLI dataset
from Korpora import Korpora

corpus=Korpora.load("kornli")
# Concatenate the texts and their paired texts into one collection
# (assumes both calls return lists of strings - TODO confirm).
corpus_texts=corpus.get_all_texts() + corpus.get_all_pairs()
# Whitespace tokenization: each sentence becomes a list of space-separated words.
tokens=[sentence.split() for sentence in corpus_texts]

print(tokens[:3])


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : KakaoBrain
    Repository : https://github.com/kakaobrain/KorNLUDatasets
    References :
        - Ham, J., Choe, Y. J., Park, K., Choi, I., & Soh, H. (2020). KorNLI and KorSTS: New Benchmark
           Datasets for Korean Natural Language Understanding. arXiv preprint arXiv:2004.03289.
           (https://arxiv.org/abs/2004.03289)

    This is the dataset repository for our paper
    "KorNLI and KorSTS: New Benchmark Datasets for Korean Natural Language Understanding."
    (https://arxiv.org/abs/2004.03289)
    We introduce KorNLI and KorSTS, which are NLI and STS datasets in Korean.

    # License
    Creative Commons Attribution-ShareAlike license (CC BY-SA 4.0)
    Details in https://creativecommons.org/licenses

[kornli] download multinli.train.ko.tsv: 83.6MB [00:02, 28.7MB/s]                            
[kornli] download snli_1.0_train.ko.tsv: 78.5MB [00:00, 146MB/s]                            
[kornli] download xnli.dev.ko.tsv: 516kB [00:00, 1.89MB/s]                            
[kornli] download xnli.test.ko.tsv: 1.04MB [00:00, 3.59MB/s]                            


[['개념적으로', '크림', '스키밍은', '제품과', '지리라는', '두', '가지', '기본', '차원을', '가지고', '있다.'], ['시즌', '중에', '알고', '있는', '거', '알아?', '네', '레벨에서', '다음', '레벨로', '잃어버리는', '거야', '브레이브스가', '모팀을', '떠올리기로', '결정하면', '브레이브스가', '트리플', 'A에서', '한', '남자를', '떠올리기로', '결정하면', '더블', 'A가', '그를', '대신하러', '올라가고', 'A', '한', '명이', '그를', '대신하러', '올라간다.'], ['우리', '번호', '중', '하나가', '당신의', '지시를', '세밀하게', '수행할', '것이다.']]


In [None]:
# Train a fastText model
from gensim.models import FastText

# sg=1 selects skip-gram; min_n/max_n bound the character n-gram lengths (2 to 6).
fastText=FastText(
    sentences=tokens,
    vector_size=128,
    window=5,
    min_count=5,
    sg=1,
    epochs=3,
    min_n=2,
    max_n=6
)

# Optional persistence of the trained model (left disabled):
#fastText.save("../models/fastText.model")
#fastText=FastText.load("../models/fastText.model")

In [None]:
# Out-of-vocabulary handling with fastText
oov_token="사랑해요"
# A vector is returned even though the token is not in the trained vocabulary
# (the membership check below printed False in the recorded run).
oov_vector=fastText.wv[oov_token]

print(oov_token in fastText.wv.index_to_key)
print(fastText.wv.most_similar(oov_vector, topn=5))

False
[('사랑해', 0.9067966938018799), ('사랑', 0.8586940169334412), ('사랑한', 0.8522242903709412), ('사랑해서', 0.843116819858551), ('사랑해.', 0.8415320515632629)]


**순환신경망**

In [None]:
'''
rnn=torch.nn.RNN(
    input_size,
    hidden_size,
    num_layers=1,
    nonlinearity="tanh",
    bias=True,
    batch_first=False,
    dropout=0,
    bidirectional=False
)
'''

'\nrnn=torch.nn.RNN(\n    input_size,\n    hidden_size,\n    num_layers=1,\n    nonlinearity="tanh",\n    bias=True,\n    batch_first=False,\n    dropout=0,\n    bidirectional=False\n)\n'

In [None]:
# Bidirectional multi-layer RNN
import torch
from torch import nn

input_size=128
output_size=256
num_layers=3
bidirectional=True

# `output_size` is passed as the hidden size of each direction.
model=nn.RNN(
    input_size=input_size,
    hidden_size=output_size,
    num_layers=num_layers,
    nonlinearity='tanh',
    batch_first=True,
    bidirectional=bidirectional,
)

batch_size=4
sequence_len=6

# batch_first=True, so inputs are laid out as (batch, sequence, feature).
inputs=torch.randn(batch_size, sequence_len, input_size)
# Initial hidden state: one slice per layer and per direction (3 * 2 = 6 here).
h_0=torch.rand(num_layers*(int(bidirectional) + 1), batch_size, output_size)

outputs, hidden=model(inputs, h_0)
print(outputs.shape)
print(hidden.shape)

torch.Size([4, 6, 512])
torch.Size([6, 4, 256])


In [None]:
'''
lstm=torch.nn.LSTM(
    input_size,
    hidden_size,
    num_layers=1,
    bias=True,
    batch_first=False,
    dropout=0,
    bidirectional=False,
    proj_size=0
)
'''

'\nlstm=torch.nn.LSTM(\n    input_size,\n    hidden_size,\n    num_layers=1,\n    bias=True,\n    batch_first=False,\n    dropout=0,\n    bidirectional=False,\n    proj_size=0\n)\n'

In [None]:
# Bidirectional multi-layer LSTM
import torch
from torch import nn

input_size=128
# NOTE: `ouput_size` is a misspelling of "output_size"; it is used consistently
# throughout this cell as the LSTM hidden size.
ouput_size=256
num_layers=3
bidirectional=True
proj_size=64

model=nn.LSTM(
    input_size=input_size,
    hidden_size=ouput_size,
    num_layers=num_layers,
    batch_first=True,
    bidirectional=bidirectional,
    proj_size=proj_size,
)

batch_size=4
sequence_len=6

# batch_first=True, so inputs are laid out as (batch, sequence, feature).
inputs=torch.randn(batch_size, sequence_len, input_size)
# Hidden state: last dimension is proj_size when a projection is configured,
# otherwise the full hidden size.
h_0=torch.rand(
    num_layers*(int(bidirectional) + 1),
    batch_size,
    proj_size if proj_size > 0 else ouput_size,
)
# Cell state always uses the full hidden size.
c_0=torch.rand(num_layers*(int(bidirectional) + 1), batch_size, ouput_size)

outputs, (h_n, c_n)=model(inputs, (h_0, c_0))

print(outputs.shape)
print(h_n.shape)
print(c_n.shape)

torch.Size([4, 6, 128])
torch.Size([6, 4, 64])
torch.Size([6, 4, 256])


  result = _VF.lstm(


In [None]:
#문장 분류 모델
from torch import nn

class SentenceClassifier(nn.Module):
    """Recurrent sentence classifier: embedding -> RNN/LSTM -> dropout -> linear.

    Args:
        n_vocab: number of rows in the embedding table.
        hidden_dim: hidden size of each recurrent direction.
        embedding_dim: width of each token embedding.
        n_layers: number of stacked recurrent layers.
        dropout: dropout probability, used inside the recurrent stack and
            before the final linear layer.
        bidirectional: whether the recurrent layer runs in both directions.
        model_type: "rnn" or "lstm".

    Raises:
        ValueError: if `model_type` is neither "rnn" nor "lstm".
    """

    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layers,
        dropout=0.5,
        bidirectional=True,
        model_type='lstm'
    ):
        super().__init__()

        # Id 0 is declared as the padding index.
        self.embedding=nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0
        )

        if model_type == "rnn":
            recurrent_cls=nn.RNN
        elif model_type == "lstm":
            recurrent_cls=nn.LSTM
        else:
            # Fail at construction time instead of leaving `self.model`
            # undefined and crashing later inside forward().
            raise ValueError(
                f"model_type must be 'rnn' or 'lstm', got {model_type!r}"
            )
        self.model=recurrent_cls(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True,
        )

        # A bidirectional layer concatenates both directions, doubling the width.
        classifier_in=hidden_dim*2 if bidirectional else hidden_dim
        self.classifier=nn.Linear(classifier_in, 1)
        self.dropout=nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one logit per input sequence, shape (batch, 1)."""
        embeddings=self.embedding(inputs)
        output, _=self.model(embeddings)
        # batch_first=True: take the recurrent output at the last time step.
        last_output=output[:, -1, :]
        last_output=self.dropout(last_output)
        logits=self.classifier(last_output)
        return logits

In [None]:
# Load the dataset
import pandas as pd
from Korpora import Korpora

# Only the corpus's `test` attribute is used as the working dataset here.
corpus=Korpora.load("nsmc")
corpus_df=pd.DataFrame(corpus.test)

# 90% of the rows (fixed seed) for training; the remaining rows for testing.
# NOTE: the names `train` and `test` are rebound to functions further down in
# this notebook, so this cell must run before anything reads them as DataFrames.
train=corpus_df.sample(frac=0.9, random_state=42)
test=corpus_df.drop(train.index)

print(train.head(5).to_markdown())
print("Training Data Size:", len(train))
print("Testing Data Size:", len(test))


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at /root/Korpora/nsmc/ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at /root/Korpora/nsmc/ra

In [None]:
#데이터 토큰화 및 단어 사전 구축
from konlpy.tag import Okt
from collections import Counter

def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list: special tokens first, then the most frequent tokens.

    Args:
        corpus: iterable of token sequences.
        n_vocab: maximum number of corpus tokens to keep (by descending frequency).
        special_tokens: tokens placed at the front of the vocabulary.

    Returns:
        A new list; the `special_tokens` argument is left unmodified.
    """
    counter=Counter()
    for tokens in corpus:
        counter.update(tokens)
    # Copy so the caller's list is not extended as a side effect.
    vocab=list(special_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))
    return vocab

# Tokenize the review text of both splits with Okt.morphs().
tokenizer=Okt()
train_tokens=[tokenizer.morphs(review) for review in train.text]
test_tokens=[tokenizer.morphs(review) for review in test.text]

# The vocabulary is built from the training tokens only; "<pad>" gets id 0 and
# "<unk>" id 1 because they are listed first.
vocab=build_vocab(corpus=train_tokens, n_vocab=5000, special_tokens=["<pad>", "<unk>"])
token_to_id={token: idx for idx, token in enumerate(vocab)}
id_to_token={idx: token for idx, token in enumerate(vocab)}

print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


In [None]:
#정수 인코딩 및 패딩
import numpy as np

def pad_sequences(sequences, max_length, pad_value):
    """Truncate or right-pad every sequence to exactly `max_length` items.

    Returns the fitted sequences stacked into a NumPy array.
    """
    def _fit(sequence):
        clipped=sequence[:max_length]
        return clipped + [pad_value] * (max_length - len(clipped))

    return np.asarray([_fit(sequence) for sequence in sequences])

# Integer-encode each review; tokens missing from the vocabulary map to "<unk>".
unk_id=token_to_id["<unk>"]
train_ids=[
    [token_to_id.get(token, unk_id) for token in review] for review in train_tokens
]
test_ids=[
    [token_to_id.get(token, unk_id) for token in review] for review in test_tokens
]

# Fix every review to 32 ids, truncating or right-padding with the "<pad>" id.
max_length=32
pad_id=token_to_id["<pad>"]
train_ids=pad_sequences(train_ids, max_length, pad_id)
test_ids=pad_sequences(test_ids, max_length, pad_id)

print(train_ids[0])
print(test_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [None]:
# Wrap the encoded data in DataLoaders
import torch
from torch.utils.data import TensorDataset, DataLoader


train_ids=torch.tensor(train_ids)
test_ids=torch.tensor(test_ids)

# Labels are stored as float32 tensors.
train_labels=torch.tensor(train.label.values, dtype=torch.float32)
test_labels=torch.tensor(test.label.values, dtype=torch.float32)

train_dataset=TensorDataset(train_ids, train_labels)
test_dataset=TensorDataset(test_ids, test_labels)

# Only the training batches are shuffled.
train_loader=DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader=DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Define the loss function and the optimizer
from torch import optim

n_vocab=len(token_to_id)
hidden_dim=64
embedding_dim=128
n_layers=2

# Use the GPU when one is available, otherwise fall back to the CPU.
device="cuda" if torch.cuda.is_available() else "cpu"
classifier=SentenceClassifier(
    n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, n_layers=n_layers
).to(device)
# The model emits a single raw logit per sequence, so the sigmoid is folded into the loss.
criterion=nn.BCEWithLogitsLoss().to(device)
optimizer=optim.RMSprop(classifier.parameters(), lr=0.001)

In [None]:
#모델 학습 및 테스트
def train(model, datasets, criterion, optimizer, device, interval):
    """Run one training pass over `datasets`, updating `model` in place.

    Every `interval` steps (starting with step 0) the mean of all batch losses
    seen so far in this pass is printed. Nothing is returned.
    """
    model.train()
    running_losses=[]

    for step, batch in enumerate(datasets):
        batch_ids, batch_labels=batch
        batch_ids=batch_ids.to(device)
        # Add a trailing dimension so the labels line up with the model's logits.
        batch_labels=batch_labels.to(device).unsqueeze(1)

        batch_loss=criterion(model(batch_ids), batch_labels)
        running_losses.append(batch_loss.item())

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % interval != 0:
            continue
        print(f"Train Loss {step}: {np.mean(running_losses)}")


def test(model, datasets, criterion, device):
    model.eval()
    losses=list()
    corrects=list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids=input_ids.to(device)
        labels=labels.to(device).unsqueeze(1)

        logits=model(input_ids)
        loss=criterion(logits, labels)
        losses.append(loss.item())
        yhat=torch.sigmoid(logits)>.5
        corrects.extend(
            torch.eq(yhat, labels).cpu().tolist()
        )

    print(f"Val Loss: {np.mean(losses)}, Val Accuracy: {np.mean(corrects)}")


# Number of passes over the training data, and how often (in steps) to log the loss.
epochs=5
interval=500

# Each epoch: one training pass followed by one evaluation pass.
for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)

Train Loss 0: 0.6964503526687622
Train Loss 500: 0.6935324485668403
Train Loss 1000: 0.6929982728534169
Train Loss 1500: 0.6757427425800682
Train Loss 2000: 0.6529014282289712
Train Loss 2500: 0.6295936134446863
Val Loss: 0.4977589296266294, Val Accuracy: 0.7594
Train Loss 0: 0.25559377670288086
Train Loss 500: 0.4828844573088511
Train Loss 1000: 0.47568262628206126
Train Loss 1500: 0.46985076532651393
Train Loss 2000: 0.4642885667660545
Train Loss 2500: 0.4590223305752543
Val Loss: 0.4423000474516957, Val Accuracy: 0.7796
Train Loss 0: 0.5183154344558716
Train Loss 500: 0.39548741858043596
Train Loss 1000: 0.40511455912362565
Train Loss 1500: 0.4021449322445563
Train Loss 2000: 0.3973186500575291
Train Loss 2500: 0.3979281688161918
Val Loss: 0.4268634514972425, Val Accuracy: 0.8042
Train Loss 0: 0.28801777958869934
Train Loss 500: 0.34975270482296955
Train Loss 1000: 0.3506694947342415
Train Loss 1500: 0.35307166569039156
Train Loss 2000: 0.3553510350563835
Train Loss 2500: 0.35526691

In [None]:
# Extract embeddings from the trained model
token_to_embedding=dict()
# Copy the classifier's embedding table to a NumPy array.
embedding_matrix=classifier.embedding.weight.detach().cpu().numpy()

# Pair each vocabulary entry with the row at the same position.
for word, emb in zip(vocab, embedding_matrix):
  token_to_embedding[word]=emb

token=vocab[1000]
print(token, token_to_embedding[token])

보고싶다 [-6.5684944e-02  9.8038960e-01  6.7936951e-01 -7.2716779e-01
 -5.3028971e-01 -9.6538115e-01 -1.7423812e-01 -3.5035372e-01
  1.3865409e+00  2.3131403e-01 -4.4144753e-01 -1.2203298e+00
  2.0606885e+00 -2.7916052e+00  1.1546307e+00 -1.7085491e+00
  1.1573409e+00 -4.6776158e-01 -2.0752938e+00  6.1785638e-01
 -7.0677716e-01  5.2808112e-01  6.3309377e-01 -2.3409292e-01
 -1.8632468e+00  4.0347210e-01 -5.7640767e-01  7.4862212e-01
  1.2225380e+00  2.2368523e-01  1.1497176e+00  1.8548336e+00
  8.0419749e-01 -8.8360643e-01  6.9644058e-01 -1.1349346e-01
  1.1079184e+00 -6.8089902e-01 -1.0229847e+00  1.0506445e+00
 -1.2056209e+00 -9.9437767e-01  9.1950691e-01 -6.9805789e-01
  1.1958611e+00  2.7181113e-01 -3.1870151e-01 -3.0849561e-01
 -2.5942824e+00  1.0716674e+00 -9.4971126e-01 -8.3079308e-01
  7.6429561e-02 -2.4256243e-01 -1.0329024e+00  1.1752014e+00
  1.9150287e+00 -1.4501592e+00 -1.5901750e+00  1.1315821e+00
 -1.6957401e+00 -1.9149452e+00 -5.6553084e-01  5.6782079e-01
 -4.4920194e-04  1.

In [None]:
'''
#사전 학습된 모델로 임베딩 계층 초기화
from gensim.models import Word2Vec

word2vec=Word2Vec.load("../models/word2vec.model")
init_embeddings=np.zeros((n_vocab, embedding_dim))

for index, token in id_to_token.items():
    if token not in ["<pad>", "<unk>"]:
        init_embeddings[index]=word2vec.wv[token]

embedding_layer=nn.Embedding.from_pretrained(
    torch.tensor(init_embeddings, dtype=torch.float32)
)
'''

'\n#사전 학습된 모델로 임베딩 계층 초기화\nfrom gensim.models import Word2Vec\n\nword2vec=Word2Vec.load("../models/word2vec.model")\ninit_embeddings=np.zeros((n_vocab, embedding_dim))\n\nfor index, token in id_to_token.items():\n    if token not in ["<pad>", "<unk>"]:\n        init_embeddings[index]=word2vec.wv[token]\n\nembedding_layer=nn.Embedding.from_pretrained(\n    torch.tensor(init_embeddings, dtype=torch.float32)\n)\n'

In [None]:
#사전 학습된 임베딩 계층 적용
class SentenceClassifier(nn.Module):
    """Recurrent sentence classifier with an optional pretrained embedding table.

    Args:
        n_vocab: number of embedding rows (used when no pretrained table is given).
        hidden_dim: hidden size of each recurrent direction.
        embedding_dim: embedding width (used when no pretrained table is given).
        n_layers: number of stacked recurrent layers.
        dropout: dropout probability, used inside the recurrent stack and
            before the final linear layer.
        bidirectional: whether the recurrent layer runs in both directions.
        model_type: "rnn" or "lstm".
        pretrained_embedding: optional 2-D array-like of embedding weights;
            when given, the embedding layer is created from it.

    Raises:
        ValueError: if `model_type` is neither "rnn" nor "lstm".
    """

    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layers,
        dropout=0.5,
        bidirectional=True,
        model_type="lstm",
        pretrained_embedding=None
    ):
        super().__init__()
        if pretrained_embedding is not None:
            self.embedding=nn.Embedding.from_pretrained(
                torch.tensor(pretrained_embedding, dtype=torch.float32)
            )
        else:
            # Id 0 is declared as the padding index.
            self.embedding=nn.Embedding(
                num_embeddings=n_vocab,
                embedding_dim=embedding_dim,
                padding_idx=0
            )

        # Feed the recurrent layer with the embedding layer's real width, so a
        # pretrained table whose width differs from `embedding_dim` still works.
        input_size=self.embedding.embedding_dim

        if model_type == "rnn":
            recurrent_cls=nn.RNN
        elif model_type == "lstm":
            recurrent_cls=nn.LSTM
        else:
            # Fail at construction time instead of leaving `self.model`
            # undefined and crashing later inside forward().
            raise ValueError(
                f"model_type must be 'rnn' or 'lstm', got {model_type!r}"
            )
        self.model=recurrent_cls(
            input_size=input_size,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True,
        )

        # A bidirectional layer concatenates both directions, doubling the width.
        classifier_in=hidden_dim * 2 if bidirectional else hidden_dim
        self.classifier=nn.Linear(classifier_in, 1)
        self.dropout=nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one logit per input sequence, shape (batch, 1)."""
        embeddings=self.embedding(inputs)
        output, _=self.model(embeddings)
        # batch_first=True: take the recurrent output at the last time step.
        last_output=output[:, -1, :]
        last_output=self.dropout(last_output)
        logits=self.classifier(last_output)
        return logits

In [None]:
'''
#사전 학습된 임베딩을 사용한 모델 학습
classifier=SentenceClassifier(
    n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim,
    n_layers=n_layers, pretrained_embedding=init_embeddings
).to(device)
criterion=nn.BCEWithLogitsLoss().to(device)
optimizer=optim.RMSprop(classifier.parameters(), lr=0.001)

epochs=5
interval=500

for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)
'''

'\n#사전 학습된 임베딩을 사용한 모델 학습\nclassifier=SentenceClassifier(\n    n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim,\n    n_layers=n_layers, pretrained_embedding=init_embeddings\n).to(device)\ncriterion=nn.BCEWithLogitsLoss().to(device)\noptimizer=optim.RMSprop(classifier.parameters(), lr=0.001)\n\nepochs=5\ninterval=500\n\nfor epoch in range(epochs):\n    train(classifier, train_loader, criterion, optimizer, device, interval)\n    test(classifier, test_loader, criterion, device)\n'

**합성곱 신경망**

In [None]:
'''
conv=torch.nn.Conv2d(
    in_channels,
    out_channels,
    kernel_size,
    stride=1,
    padding=0,
    dilation=1,
    groups=1,
    bias=True,
    padding_mode='zeros'
)
'''

"\nconv=torch.nn.Conv2d(\n    in_channels,\n    out_channels,\n    kernel_size,\n    stride=1,\n    padding=0,\n    dilation=1,\n    groups=1,\n    bias=True,\n    padding_mode='zeros'\n)\n"

In [None]:
'''
pool=torch.nn.MaxPool2d(
    kernel_size,
    stride=None,
    padding=0,
    dilation=1
)
'''

'\npool=torch.nn.MaxPool2d(\n    kernel_size,\n    stride=None,\n    padding=0,\n    dilation=1\n)\n'

In [None]:
'''
pool=torch.nn.AvgPool2d(
    kernel_size,
    stride=None,
    padding=0,
    count_include_pad=True
)
'''

'\npool=torch.nn.AvgPool2d(\n    kernel_size,\n    stride=None,\n    padding=0,\n    count_include_pad=True\n)\n'

In [None]:
#합성곱 모델
import torch
from torch import nn

class CNN(nn.Module):
    """Two conv/ReLU/max-pool stages followed by a 10-way linear layer.

    The linear layer expects 32 * 32 * 32 features, i.e. 32 channels over a
    32x32 map; with the strides below that corresponds to 3x256x256 inputs
    (256 -> 128 -> 64 in the first stage, 64 -> 64 -> 32 in the second).
    """

    def __init__(self):
        super().__init__()

        # Stage 1: stride-2 convolution, then 2x2 pooling.
        self.conv1=nn.Sequential(
            nn.Conv2d(
                in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=1
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # Stage 2: size-preserving convolution, then 2x2 pooling.
        self.conv2=nn.Sequential(
            nn.Conv2d(
                in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.fc=nn.Linear(32 * 32 * 32, 10)

    def forward(self, x):
        """Return 10 scores per image: (batch, 10), or (10,) for a single 3-D input."""
        x=self.conv1(x)
        x=self.conv2(x)
        # Flatten only the last three (channel, height, width) dimensions so the
        # batch dimension, when present, is preserved.
        x=torch.flatten(x, start_dim=x.dim() - 3)
        x=self.fc(x)
        return x

In [None]:
#합성곱 기반 문장 분류 모델 정의
import torch
from torch import nn

class SentenceClassifier(nn.Module):
    """Convolutional sentence classifier over a pretrained embedding table.

    One Conv1d -> ReLU -> MaxPool1d branch is built per entry of
    `filter_sizes`; each branch yields a single value per sequence, and the
    concatenated values feed two linear layers that produce one logit.

    Args:
        pretrained_embedding: 2-D array-like of embedding weights.
        filter_sizes: convolution kernel sizes, one branch each.
        max_length: length of the (padded) input sequences.
        dropout: dropout probability applied before the final linear layer.
    """

    def __init__(self, pretrained_embedding, filter_sizes, max_length, dropout=0.5):
        super().__init__()

        self.embedding=nn.Embedding.from_pretrained(
            torch.tensor(pretrained_embedding, dtype=torch.float32)
        )
        embedding_dim=self.embedding.weight.shape[1]

        conv=[]
        for size in filter_sizes:
            # An unpadded, stride-1 convolution over `max_length` positions
            # produces `max_length - size + 1` outputs; pooling over exactly
            # that many reduces each branch to one value and skips none.
            conv_length=max_length - size + 1
            conv.append(
                nn.Sequential(
                    nn.Conv1d(
                        in_channels=embedding_dim,
                        out_channels=1,
                        kernel_size=size
                    ),
                    nn.ReLU(),
                    nn.MaxPool1d(kernel_size=conv_length),
                )
            )
        self.conv_filters=nn.ModuleList(conv)

        output_size=len(filter_sizes)
        self.pre_classifier=nn.Linear(output_size, output_size)
        self.dropout=nn.Dropout(dropout)
        self.classifier=nn.Linear(output_size, 1)

    def forward(self, inputs):
        """Return one logit per input sequence, shape (batch, 1)."""
        embeddings=self.embedding(inputs)
        # Conv1d expects (batch, channels, length), so move the embedding axis forward.
        embeddings=embeddings.permute(0, 2, 1)

        conv_outputs=[conv(embeddings) for conv in self.conv_filters]
        # Each branch is (batch, 1, 1); drop the pooled axis and join the branches.
        concat_outputs=torch.cat([conv.squeeze(-1) for conv in conv_outputs], dim=1)

        logits=self.pre_classifier(concat_outputs)
        logits=self.dropout(logits)
        logits=self.classifier(logits)
        return logits

In [None]:
# Load the dataset
import pandas as pd
from Korpora import Korpora

# Fetch the "nsmc" corpus through Korpora and wrap its `test` attribute in a
# DataFrame; only that portion of the corpus is used from here on.
corpus=Korpora.load("nsmc")
corpus_df=pd.DataFrame(corpus.test)

# Randomly sample 90% of the rows for training (fixed seed so the split is
# reproducible); the rows whose index was not sampled become the test set.
train=corpus_df.sample(frac=0.9, random_state=42)
test=corpus_df.drop(train.index)

# Preview the first five training rows and report the size of each split.
print(train.head(5).to_markdown())
print("Training Data Size:", len(train))
print("Testing Data Size:", len(test))

#데이터 토큰화 및 단어 사전 구축
from konlpy.tag import Okt
from collections import Counter

def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list from tokenized text.

    Args:
        corpus: Iterable of token sequences (one sequence per document).
        n_vocab: Maximum number of corpus tokens to keep, chosen by frequency.
        special_tokens: Tokens placed at the front of the vocabulary. The
            caller's list is left untouched.

    Returns:
        A new list: the special tokens followed by up to ``n_vocab`` of the
        most frequent corpus tokens, most frequent first.
    """
    counter=Counter()
    for tokens in corpus:
        counter.update(tokens)
    # Copy instead of aliasing: appending to the argument itself would
    # silently grow the caller's special-token list on every call.
    vocab=list(special_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))
    return vocab

# Split every review of both splits into tokens with KoNLPy's Okt tagger.
tokenizer=Okt()
train_tokens=[tokenizer.morphs(review) for review in train.text]
test_tokens=[tokenizer.morphs(review) for review in test.text]

# The vocabulary is built from the training tokens only: the two special
# tokens come first, followed by at most 5000 of the most frequent tokens.
vocab=build_vocab(corpus=train_tokens, n_vocab=5000, special_tokens=["<pad>", "<unk>"])
# Lookup tables in both directions; a token's id is its position in `vocab`.
token_to_id={token: idx for idx, token in enumerate(vocab)}
id_to_token={idx: token for idx, token in enumerate(vocab)}

# Show the first ten entries and the total vocabulary size.
print(vocab[:10])
print(len(vocab))

#정수 인코딩 및 패딩
import numpy as np

def pad_sequences(sequences, max_length, pad_value):
    """Force every sequence to exactly ``max_length`` items.

    Sequences longer than ``max_length`` are cut off at the end; shorter ones
    are filled on the right with ``pad_value``.

    Args:
        sequences: Iterable of lists.
        max_length: Target length of every row.
        pad_value: Filler appended to short sequences.

    Returns:
        ``np.asarray`` of the resulting rows.
    """
    def _fit(row):
        # Clip first, then top up with however many fillers are still missing.
        clipped = row[:max_length]
        missing = max_length - len(clipped)
        return clipped + [pad_value] * missing

    return np.asarray([_fit(row) for row in sequences])

# Map every token to its vocabulary id; tokens missing from the vocabulary
# fall back to the id of "<unk>".
unk_id=token_to_id["<unk>"]
train_ids=[
    [token_to_id.get(token, unk_id) for token in review] for review in train_tokens
]
test_ids=[
    [token_to_id.get(token, unk_id) for token in review] for review in test_tokens
]

# Truncate/pad every review to 32 ids, filling with the id of "<pad>".
# `train_ids` / `test_ids` are rebound from lists of lists to NumPy arrays here.
max_length=32
pad_id=token_to_id["<pad>"]
train_ids=pad_sequences(train_ids, max_length, pad_id)
test_ids=pad_sequences(test_ids, max_length, pad_id)

# Show the first encoded review of each split.
print(train_ids[0])
print(test_ids[0])

# Wrap the data in DataLoaders
import torch
from torch.utils.data import TensorDataset, DataLoader


# Convert the padded id arrays to tensors (rebinding the same names).
train_ids=torch.tensor(train_ids)
test_ids=torch.tensor(test_ids)

# Labels are stored as float32; the disabled training cell further down feeds
# them to nn.BCEWithLogitsLoss.
train_labels=torch.tensor(train.label.values, dtype=torch.float32)
test_labels=torch.tensor(test.label.values, dtype=torch.float32)

# Pair each id row with its label.
train_dataset=TensorDataset(train_ids, train_labels)
test_dataset=TensorDataset(test_ids, test_labels)

# Batches of 16; only the training loader shuffles.
train_loader=DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader=DataLoader(test_dataset, batch_size=16, shuffle=False)


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at /root/Korpora/nsmc/ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at /root/Korpora/nsmc/ra

In [None]:
# Define the loss function and the optimizer
# NOTE: the whole cell body is wrapped in a string literal, so none of it
# executes; the notebook merely echoes the string back as the cell output.
# NOTE(review): the disabled code passes `init_embeddings`, which is not
# defined anywhere in the visible part of this notebook -- confirm where it is
# created before re-enabling. `n_vocab`, `hidden_dim`, `embedding_dim` and
# `n_layers` are assigned but not used by the code shown here.
'''
from torch import optim

n_vocab=len(token_to_id)
hidden_dim=64
embedding_dim=128
n_layers=2

device="cuda" if torch.cuda.is_available() else "cpu"
filter_sizes=[3,3,4,4,5,5]
classifier=SentenceClassifier(
    pretrained_embedding=init_embeddings,
    filter_sizes=filter_sizes,
    max_length=max_length
).to(device)

criterion=nn.BCEWithLogitsLoss().to(device)
optimizer=optim.Adam(classifier.parameters(), lr=0.001)
'''

'\nfrom torch import optim\n\nn_vocab=len(token_to_id)\nhidden_dim=64\nembedding_dim=128\nn_layers=2\n\ndevice="cuda" if torch.cuda.is_available() else "cpu"\nfilter_sizes=[3,3,4,4,5,5]\nclassifier=SentenceClassifier(\n    pretrained_embedding=init_embeddings,\n    filter_sizes=filter_sizes,\n    max_length=max_length\n).to(device)\n\ncriterion=nn.BCEWithLogitsLoss().to(device)\noptimizer=optim.Adam(classifier.parameters(), lr=0.001)\n'

In [None]:
# Train and evaluate the model
# NOTE: the whole cell body is wrapped in a string literal, so none of it
# executes; the notebook merely echoes the string back as the cell output.
# NOTE(review): if re-enabled, the `train` / `test` functions below would
# rebind the module-level names `train` and `test`, which the data-loading
# cell assigns to DataFrames.
# NOTE(review): the disabled `test` loop runs the model without
# `torch.no_grad()` -- consider adding it when re-enabling.
'''
def train(model, datasets, criterion, optimizer, device, interval):
    model.train()
    losses=list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids=input_ids.to(device)
        labels=labels.to(device).unsqueeze(1)

        logits=model(input_ids)
        loss=criterion(logits, labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % interval == 0:
            print(f"Train Loss {step}: {np.mean(losses)}")


def test(model, datasets, criterion, device):
    model.eval()
    losses=list()
    corrects=list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids=input_ids.to(device)
        labels=labels.to(device).unsqueeze(1)

        logits=model(input_ids)
        loss=criterion(logits, labels)
        losses.append(loss.item())
        yhat=torch.sigmoid(logits)>.5
        corrects.extend(
            torch.eq(yhat, labels).cpu().tolist()
        )

    print(f"Val Loss: {np.mean(losses)}, Val Accuracy: {np.mean(corrects)}")


epochs=5
interval=500

for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)
'''

'\ndef train(model, datasets, criterion, optimizer, device, interval):\n    model.train()\n    losses=list()\n\n    for step, (input_ids, labels) in enumerate(datasets):\n        input_ids=input_ids.to(device)\n        labels=labels.to(device).unsqueeze(1)\n\n        logits=model(input_ids)\n        loss=criterion(logits, labels)\n        losses.append(loss.item())\n\n        optimizer.zero_grad()\n        loss.backward()\n        optimizer.step()\n\n        if step % interval == 0:\n            print(f"Train Loss {step}: {np.mean(losses)}")\n\n\ndef test(model, datasets, criterion, device):\n    model.eval()\n    losses=list()\n    corrects=list()\n\n    for step, (input_ids, labels) in enumerate(datasets):\n        input_ids=input_ids.to(device)\n        labels=labels.to(device).unsqueeze(1)\n\n        logits=model(input_ids)\n        loss=criterion(logits, labels)\n        losses.append(loss.item())\n        yhat=torch.sigmoid(logits)>.5\n        corrects.extend(\n            torch.