# 재현성 구현

In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to `random`, NumPy and PyTorch (CPU and CUDA).

    Side effects:
        Sets the PYTHONHASHSEED environment variable and switches cuDNN to
        deterministic mode with auto-tuning disabled.
    """
    random.seed(seed)
    # Python reads PYTHONHASHSEED at interpreter start-up, so this mainly
    # affects processes spawned after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN auto-tuning may choose different kernels from run to run.
    torch.backends.cudnn.benchmark = False

# Notebook-wide configuration.
SEED = 42  # seed passed to reset_seeds() and KFold
DATA_PATH = "/content/drive/MyDrive/data/"  # folder holding review_train.csv / review_test.csv

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

'cuda'

# 데이터 불러오기

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the labelled training reviews and the unlabelled test reviews from DATA_PATH.
train_csv = DATA_PATH + 'review_train.csv'
test_csv = DATA_PATH + 'review_test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

train.shape, test.shape

((2000, 3), (1000, 2))

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      2000 non-null   object
 1   review  2000 non-null   object
 2   target  2000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 47.0+ KB


In [6]:
train.head()

Unnamed: 0,id,review,target
0,train_0,이런 최고의 영화를 이제서야 보다니,1
1,train_1,안봤지만 유승준나와서 비추.,0
2,train_2,시대를 못 따라간 연출과 촌스러운 영상미.,0
3,train_3,원소전 굿,1
4,train_4,ㅋㅋㅋㅋ 개봉영화평점단사람이1명 ㅋㅋㅋㅋ,1


# Kiwi Morpheme

In [7]:
%pip install kiwipiepy

Collecting kiwipiepy
  Downloading kiwipiepy-0.20.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting kiwipiepy-model<0.21,>=0.20 (from kiwipiepy)
  Downloading kiwipiepy_model-0.20.0.tar.gz (34.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.7/34.7 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading kiwipiepy-0.20.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m65.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: kiwipiepy-model
  Building wheel for kiwipiepy-model (setup.py) ... [?25l[?25hdone
  Created wheel for kiwipiepy-model: filename=kiwipiepy_model-0.20.0-py3-none-any.whl size=34818026 sha256=cf595381abcbe859586130612276227f1e69c27e9fceaf292d3e5d96b0900a7e
  Stored in directory: /root/.cache/pip/wheels/b6/b1/66/2be9840f

In [8]:
from kiwipiepy import Kiwi

kiwi = Kiwi()
kiwi

Kiwi(num_workers=1, model_path=None, integrate_allomorph=True, load_default_dict=True, load_typo_dict=True, model_type='knlm', typos=None, typo_cost_threshold=2.5)

In [9]:
text = train['review'][0]
text

'이런 최고의 영화를 이제서야 보다니'

In [10]:
kiwi.analyze(text, top_n=2) # 형태소 토큰화

[([Token(form='이런', tag='MM', start=0, len=2),
   Token(form='최고', tag='NNG', start=3, len=2),
   Token(form='의', tag='JKG', start=5, len=1),
   Token(form='영화', tag='NNG', start=7, len=2),
   Token(form='를', tag='JKO', start=9, len=1),
   Token(form='이제서야', tag='MAG', start=11, len=4),
   Token(form='보', tag='VV', start=16, len=1),
   Token(form='다니', tag='EF', start=17, len=2)],
  -63.7940559387207),
 ([Token(form='이런', tag='MM', start=0, len=2),
   Token(form='최고', tag='NNG', start=3, len=2),
   Token(form='의', tag='JKG', start=5, len=1),
   Token(form='영화', tag='NNG', start=7, len=2),
   Token(form='를', tag='JKO', start=9, len=1),
   Token(form='이제', tag='NNG', start=11, len=2),
   Token(form='서', tag='JKB', start=13, len=1),
   Token(form='야', tag='JX', start=14, len=1),
   Token(form='보', tag='VV', start=16, len=1),
   Token(form='다니', tag='EC', start=17, len=2)],
  -64.6888656616211)]

## analyze 메서드
- 문서 전달 시

In [11]:
result = kiwi.analyze(text) # 형태소 토큰화
result[0][0]

[Token(form='이런', tag='MM', start=0, len=2),
 Token(form='최고', tag='NNG', start=3, len=2),
 Token(form='의', tag='JKG', start=5, len=1),
 Token(form='영화', tag='NNG', start=7, len=2),
 Token(form='를', tag='JKO', start=9, len=1),
 Token(form='이제서야', tag='MAG', start=11, len=4),
 Token(form='보', tag='VV', start=16, len=1),
 Token(form='다니', tag='EF', start=17, len=2)]

## tokenize 메서드
- 하나의 형태소 분석결과 반환

In [12]:
result = kiwi.tokenize(text)
result

[Token(form='이런', tag='MM', start=0, len=2),
 Token(form='최고', tag='NNG', start=3, len=2),
 Token(form='의', tag='JKG', start=5, len=1),
 Token(form='영화', tag='NNG', start=7, len=2),
 Token(form='를', tag='JKO', start=9, len=1),
 Token(form='이제서야', tag='MAG', start=11, len=4),
 Token(form='보', tag='VV', start=16, len=1),
 Token(form='다니', tag='EF', start=17, len=2)]

In [13]:
result[0].form # 토큰 문자열

'이런'

In [14]:
result[0].tag # 품사 문자열

'MM'

- iterable한 객체 전달할 경우, map 객체 반환

In [15]:
result = kiwi.tokenize(train['review'].iloc[:2])

for tokens in result:
  print(tokens)
  break

[Token(form='이런', tag='MM', start=0, len=2), Token(form='최고', tag='NNG', start=3, len=2), Token(form='의', tag='JKG', start=5, len=1), Token(form='영화', tag='NNG', start=7, len=2), Token(form='를', tag='JKO', start=9, len=1), Token(form='이제서야', tag='MAG', start=11, len=4), Token(form='보', tag='VV', start=16, len=1), Token(form='다니', tag='EF', start=17, len=2)]


## 불용어

In [16]:
from kiwipiepy.utils import Stopwords

stopwords = Stopwords()
stopwords.stopwords # set자료형 > 중복 제거 + 연산가능

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에서', 'JKB'),
 ('와', 'JC'),
 ('와', 'JKB'),
 ('우리', 'NP'),
 ('원', 'NNB'),


- 불용어 추가하기

In [17]:
# 1개 추가
stopwords.add('크아아아악')
stopwords.stopwords

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에서', 'JKB'),
 ('와', 'JC'),
 ('와', 'JKB'),
 ('우리', 'NP'),
 ('원', 'NNB'),


In [18]:
# 2개 이상
stopwords.add(['컹스','에베베'])
stopwords.stopwords

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에베베', 'NNP'),
 ('에서', 'JKB'),
 ('와', 'JC'),
 ('와', 'JKB'),
 ('우리', 'NP')

In [19]:
# 품사 지정 후 추가 > 튜플로 전달
stopwords.add( ('크아아아악', 'NNP'))
stopwords.stopwords

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에베베', 'NNP'),
 ('에서', 'JKB'),
 ('와', 'JC'),
 ('와', 'JKB'),
 ('우리', 'NP')

- 불용어 삭제하기

In [20]:
stopwords.remove( ('크아아아악', 'NNP'))
stopwords.stopwords

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에베베', 'NNP'),
 ('에서', 'JKB'),
 ('와', 'JC'),
 ('와', 'JKB'),
 ('우리', 'NP')

## 결합하기

In [21]:
text = train['review'][1]
text

'안봤지만 유승준나와서 비추.'

In [22]:
result = kiwi.tokenize(text)
result

[Token(form='안', tag='MAG', start=0, len=1),
 Token(form='보', tag='VV', start=1, len=1),
 Token(form='었', tag='EP', start=1, len=1),
 Token(form='지만', tag='EC', start=2, len=2),
 Token(form='유승준', tag='NNP', start=5, len=3),
 Token(form='나오', tag='VV', start=8, len=2),
 Token(form='어서', tag='EC', start=9, len=2),
 Token(form='비추', tag='VV', start=12, len=2),
 Token(form='.', tag='SF', start=14, len=1)]

In [23]:
tokens = [ (t.form, t.tag) for t in result ] # 토큰 문자열, 품사 문자열
tokens

[('안', 'MAG'),
 ('보', 'VV'),
 ('었', 'EP'),
 ('지만', 'EC'),
 ('유승준', 'NNP'),
 ('나오', 'VV'),
 ('어서', 'EC'),
 ('비추', 'VV'),
 ('.', 'SF')]

## 토큰화 해보기
- 불용어 제거 + N, V로 시작하는 품사들만 토큰화해 보기

In [24]:
# Build a new default stopword set; the entries added to the earlier object are not reused.
stopwords = Stopwords()
# Passing the whole review column makes tokenize return one token list per review.
result = kiwi.tokenize(train['review'], stopwords=stopwords)

# Keep only the surface forms whose POS tag starts with 'N' or 'V'.
train_list = [
  [t.form for t in tokens if t.tag[0] in 'NV']
  for tokens in tqdm(result, total=len(train['review']))
]

  0%|          | 0/2000 [00:00<?, ?it/s]

In [25]:
[ len(tokens) for tokens in train_list]

[2,
 3,
 5,
 2,
 4,
 3,
 8,
 3,
 3,
 8,
 4,
 5,
 9,
 14,
 17,
 10,
 4,
 3,
 15,
 6,
 5,
 5,
 3,
 4,
 2,
 4,
 13,
 2,
 6,
 27,
 3,
 0,
 12,
 20,
 4,
 7,
 1,
 8,
 9,
 1,
 15,
 1,
 3,
 6,
 9,
 10,
 8,
 2,
 6,
 12,
 7,
 5,
 5,
 6,
 4,
 6,
 8,
 4,
 2,
 5,
 6,
 5,
 8,
 5,
 5,
 2,
 3,
 5,
 7,
 6,
 3,
 4,
 3,
 6,
 5,
 3,
 10,
 3,
 9,
 12,
 6,
 2,
 10,
 3,
 4,
 3,
 9,
 14,
 10,
 12,
 21,
 3,
 5,
 1,
 14,
 5,
 17,
 16,
 6,
 6,
 11,
 2,
 0,
 0,
 3,
 6,
 5,
 4,
 3,
 1,
 4,
 4,
 15,
 8,
 3,
 13,
 8,
 5,
 6,
 2,
 7,
 7,
 10,
 4,
 9,
 5,
 3,
 4,
 3,
 9,
 5,
 20,
 5,
 4,
 8,
 1,
 3,
 10,
 2,
 6,
 2,
 5,
 1,
 8,
 5,
 3,
 9,
 1,
 2,
 18,
 6,
 4,
 2,
 3,
 3,
 11,
 3,
 6,
 13,
 5,
 6,
 4,
 5,
 2,
 5,
 2,
 2,
 7,
 7,
 6,
 14,
 2,
 12,
 23,
 11,
 12,
 6,
 2,
 5,
 2,
 6,
 2,
 1,
 5,
 6,
 4,
 11,
 13,
 5,
 4,
 4,
 19,
 4,
 3,
 2,
 23,
 3,
 7,
 1,
 2,
 1,
 34,
 3,
 1,
 6,
 4,
 7,
 2,
 6,
 3,
 5,
 3,
 2,
 1,
 4,
 5,
 8,
 3,
 6,
 9,
 1,
 5,
 12,
 11,
 4,
 0,
 1,
 1,
 1,
 4,
 3,
 1,
 7,
 4,
 2,
 7,
 21,
 2,
 3,
 

In [26]:
min(len(tokens) for tokens in train_list) # 버리는 샘플 발생

0

In [27]:
cnt = np.array([len(tokens) for tokens in train_list])
cnt

array([2, 3, 5, ..., 4, 3, 3])

In [28]:
mask = cnt == 0
mask.sum()

49

In [29]:
train.loc[mask]

Unnamed: 0,id,review,target
31,train_31,대박....,1
102,train_102,What a great drama!!!,1
103,train_103,Space Jason!!!!,0
225,train_225,the roles play very real touching,1
307,train_307,참신하지는 않다,0
342,train_342,...,0
470,train_470,별로,0
524,train_524,harry potter go!,1
546,train_546,글쎄~ 별로던데~,0
581,train_581,ㅋㅋ,1


# spacy 형태소 분석기
- 딥러닝 기반

In [30]:
!python -m spacy download ko_core_news_sm

Collecting ko-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ko_core_news_sm-3.7.0/ko_core_news_sm-3.7.0-py3-none-any.whl (14.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.7/14.7 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ko-core-news-sm
Successfully installed ko-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ko_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [31]:
import spacy

In [32]:
nlp = spacy.load('ko_core_news_sm')

In [33]:
doc = nlp(text)
doc

안봤지만 유승준나와서 비추.

In [34]:
doc[0].text, doc[0].lemma_, doc[0].tag_

('안봤지만', '안봤지+만', 'nq+jxt')

In [35]:
train_list = []
# NOTE: the loop rebinds the notebook-level `text` and `doc`; after it finishes,
# `text` holds the last review, which later cells read.
for text in tqdm(train['review']):
  doc = nlp(text)
  # A lemma joins its morphemes with '+' (e.g. '안봤지+만'); split and flatten them.
  train_list.append(
    [piece for tokens in doc for piece in tokens.lemma_.split('+')]
  )

  0%|          | 0/2000 [00:00<?, ?it/s]

In [38]:
len(train_list)

2000

# konlpy 형태소 분석기
- C++, 자바 등 다른 언어로 개발된 오픈소스 형태소 분석 라이브러리
- 파이썬에서도 쉽게 사용 가능한 라이브러리

- 사용방법
  - 클래스 객체 생성
  - morphs 메서드 + pos 메서드 사용
  - morphs 메서드
    - 토큰화(tokenize)

  - pos 메서드
    - 품사 태깅이 추가된 토큰화

In [39]:
%pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (493 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.8/493.8 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.1 konlpy-0.6.0


In [40]:
from konlpy.tag import Okt, Komoran, Hannanum, Kkma

text

'보다가감동...진짜울뻔'

## Okt 클래스

In [41]:
tokenizer = Okt() # 객체 생성

In [42]:
tokenizer.morphs(text) # 토큰화

['보다가', '감동', '...', '진짜', '울', '뻔']

In [43]:
tokenizer.pos(text) # 품사태깅

[('보다가', 'Verb'),
 ('감동', 'Noun'),
 ('...', 'Punctuation'),
 ('진짜', 'Noun'),
 ('울', 'Modifier'),
 ('뻔', 'Noun')]

## Komoran 클래스

In [44]:
tokenizer = Komoran()
tokenizer.morphs(text) # 토큰화

['보', '다가', '감동', '...', '진짜', '울', '뻔']

In [45]:
tokenizer.pos(text) # 품사태깅

[('보', 'VV'),
 ('다가', 'EC'),
 ('감동', 'NNG'),
 ('...', 'SE'),
 ('진짜', 'NNG'),
 ('울', 'NNP'),
 ('뻔', 'NNB')]

## Hannanum 클래스
- 결과 안좋음

In [46]:
tokenizer = Hannanum()
tokenizer.morphs(text) # 토큰화

['보다가감동', '...', '진짜울뻔']

In [47]:
tokenizer.pos(text) # 품사태깅

[('보다가감동', 'N'), ('...', 'S'), ('진짜울뻔', 'N')]

## Kkma 클래스

In [48]:
tokenizer = Kkma()
tokenizer.morphs(text) # 토큰화

['보', '다가', '감동', '...', '진짜', '울', 'ㄹ', '뻔']

In [49]:
tokenizer.pos(text) # 품사태깅

[('보', 'VV'),
 ('다가', 'ECD'),
 ('감동', 'NNG'),
 ('...', 'SE'),
 ('진짜', 'MAG'),
 ('울', 'VV'),
 ('ㄹ', 'ETD'),
 ('뻔', 'NNB')]

# mecab 형태소 분석기
- 성능 우수함

In [50]:
%pip install python-mecab-ko

Collecting python-mecab-ko
  Downloading python_mecab_ko-1.3.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting python-mecab-ko-dic (from python-mecab-ko)
  Downloading python_mecab_ko_dic-2.1.1.post2-py3-none-any.whl.metadata (1.4 kB)
Downloading python_mecab_ko-1.3.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (577 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/577.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m399.4/577.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m577.1/577.1 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_mecab_ko_dic-2.1.1.post2-py3-none-any.whl (34.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.5/34.5 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-mecab-ko-d

In [51]:
from mecab import MeCab

In [52]:
tokenizer = MeCab()
tokenizer.morphs(text) # 토큰화

['보', '다가', '감동', '.', '..', '진짜', '울', '뻔']

In [53]:
tokenizer.pos(text) # 품사태깅

[('보', 'VV'),
 ('다가', 'EC'),
 ('감동', 'NNG'),
 ('.', 'SF'),
 ('..', 'SY'),
 ('진짜', 'MAG'),
 ('울', 'VV+ETM'),
 ('뻔', 'NNB')]

# kiwi로 학습데이터, 테스트데이터 만들기

- kiwi를 이용해 학습데이터와 테스트 데이터를 토큰화하여 train_list, test_list에 담기

In [54]:
# kiwi = Kiwi()
# result = kiwi.tokenize(train['review'])

# train_list = []
# for tokens in tqdm(result, total = len(train['review'])):
#   token = [ t.form for t in tokens]
#   # print(token)
#   train_list.append(token)

In [55]:
# Tokenize every training review with a fresh Kiwi analyzer, keeping only the surface forms.
kiwi = Kiwi()
train_reviews = train['review']
result = kiwi.tokenize(train_reviews)

train_list = []
for tokens in tqdm(result, total=len(train_reviews)):
  train_list.append([t.form for t in tokens])

  0%|          | 0/2000 [00:00<?, ?it/s]

In [56]:
# result = kiwi.tokenize(test['review'])

# test_list = []
# for tokens in tqdm(result, total = len(test['review'])):
#   token = [ t.form for t in tokens ]
#   # print(token)
#   test_list.append(token)

In [57]:
# Tokenize every test review with the same Kiwi analyzer, keeping only the surface forms.
test_reviews = test['review']
result = kiwi.tokenize(test_reviews)

test_list = []
for tokens in tqdm(result, total=len(test_reviews)):
  test_list.append([t.form for t in tokens])

  0%|          | 0/1000 [00:00<?, ?it/s]

- TF-IDF 벡터화

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the space-joined training tokens, keeping the 500 most frequent terms.
# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of two or
# more word characters, so single-character morphemes are presumably dropped here —
# confirm this is intended.
vec = TfidfVectorizer(max_features = 500)
train_data = vec.fit_transform(
    [' '.join(t) for t in train_list]
).toarray() # dense numpy array; .toarray() is the documented method, unlike the `.A` shorthand

In [59]:
# Reuse the vocabulary fitted on the training data; never refit on the test set.
test_data = vec.transform(
    [' '.join(t) for t in test_list]
).toarray() # dense numpy array; .toarray() is the documented method, unlike the `.A` shorthand

In [60]:
train_data.shape, test_data.shape

((2000, 500), (1000, 500))

In [61]:
(train_data.sum(axis = 1) == 0).sum() # 샘플 손실

131

In [62]:
target = train['target'].to_numpy().reshape(-1,1)
target

array([[1],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]])

In [63]:
target.shape

(2000, 1)

# 데이터셋 클래스 구현

In [64]:
class ReviewDataset(torch.utils.data.Dataset):
  """Map-style dataset that serves feature rows and, optionally, their labels.

  Args:
    x: Feature matrix exposing `.shape` and row indexing (e.g. a 2-D numpy array).
    y: Optional labels indexable by row; omit for inference data.

  Each item is a dict with a float32 tensor under 'x' and, when labels were
  given, a float32 tensor under 'y'.
  """
  def __init__(self, x, y= None):
    self.x = x
    self.y = y

  def __len__(self): # number of samples
    return self.x.shape[0] # len(self.x)

  def __getitem__(self, idx):
    # torch.tensor converts by value. The legacy torch.Tensor(...) constructor
    # treats a bare integer as a size and returns uninitialized memory, which
    # corrupts scalar labels.
    item = {'x': torch.tensor(self.x[idx], dtype=torch.float32)}

    if self.y is not None:
      item['y'] = torch.tensor(self.y[idx], dtype=torch.float32)
    return item

- 결과 확인하기

In [65]:
dt = ReviewDataset(train_data, target)
dt[0]

{'x': tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000

In [66]:
dl = torch.utils.data.DataLoader(dt, batch_size = 2, shuffle = False)
batch = next(iter(dl))
batch['x']

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0

# 신경망클래스 구현

In [67]:
class Net(torch.nn.Module):
  """Three-layer MLP that maps a feature vector to a single logit.

  The hidden widths are `in_features // 2` and `in_features // 4`; the layers
  live in `self.seq`, so parameter names are `seq.0.*`, `seq.2.*`, `seq.4.*`.
  """
  def __init__(self, in_features):
    super().__init__()
    hidden_wide = in_features // 2
    hidden_narrow = in_features // 4
    layers = [
        torch.nn.Linear(in_features, hidden_wide),
        torch.nn.ReLU(),
        torch.nn.Linear(hidden_wide, hidden_narrow),
        torch.nn.ReLU(),
        torch.nn.Linear(hidden_narrow, 1),  # one output unit for binary classification
    ]
    self.seq = torch.nn.Sequential(*layers)

  def forward(self, x):
    logits = self.seq(x)
    return logits

- 결과 확인하기

In [68]:
model = Net(train_data.shape[1])
model(batch['x'])

tensor([[-0.0476],
        [-0.0502]], grad_fn=<AddmmBackward0>)

# 학습데이터 loop함수 구현

In [69]:
def train_loop(dl, model, loss_fn, optimizer, device):
  """Run one optimization pass over `dl` and return the mean per-batch loss.

  Args:
    dl: Iterable of batches, each a dict with tensors under 'x' and 'y'.
    model: Module to train; switched to training mode.
    loss_fn: Callable taking (prediction, target) and returning a scalar loss.
    optimizer: Optimizer holding the model parameters.
    device: Device the batch tensors are moved to before the forward pass.

  Returns:
    Sum of the batch losses divided by `len(dl)`.
  """
  model.train()  # training mode
  batch_losses = []
  for batch in dl:
    inputs = batch['x'].to(device)
    labels = batch['y'].to(device)
    loss = loss_fn(model(inputs), labels)

    optimizer.zero_grad()  # clear gradients left from the previous step
    loss.backward()        # backpropagation
    optimizer.step()       # weight update

    batch_losses.append(loss.item())
  return sum(batch_losses) / len(dl)

# 테스트데이터 loop함수 구현

In [70]:
@torch.no_grad() # 경사추적 중단
def test_loop(dl, model, loss_fn, device):
  epoch_loss = 0
  model.eval() # 평가모드

  act = torch.nn.Sigmoid() # 회귀에서 사용 x, 시그모이드: 0~1값으로 전환
  pred_list = []
  for batch in dl:
    pred = model(batch['x'].to(device))
    if batch.get('y') is not None:
      loss = loss_fn(pred, batch['y'].to(device))
      epoch_loss += loss.item()

    pred = act(pred) # 회귀에서 사용 x
    pred = pred.to('cpu').numpy()
    pred_list.append(pred)

  pred = np.concatenate(pred_list)
  epoch_loss /= len(dl)
  return epoch_loss, pred

# 하이퍼파라미터 정의

In [71]:
# Training hyperparameters shared by every fold.
n_splits = 5     # k for KFold
epochs = 100     # maximum number of epochs per fold
batch_size = 32
loss_fn = torch.nn.BCEWithLogitsLoss()  # expects raw logits

# 조합 후 KFold학습 수행

In [72]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
cv = KFold(n_splits, shuffle = True, random_state = SEED)

In [78]:
is_holdout = False  # True: stop after the first fold (hold-out run)
reset_seeds(SEED)
score_list = []

for i, (tri, vai) in enumerate(cv.split(train_data)):
  # Every fold starts from a freshly initialized model and optimizer.
  model = Net(train_data.shape[1]).to(device)
  optimizer = torch.optim.Adam(model.parameters())

  # Training split
  train_dt = ReviewDataset(train_data[tri], target[tri])
  train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

  # Validation split (not shuffled, so predictions line up with target[vai])
  valid_dt = ReviewDataset(train_data[vai], target[vai])
  valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

  best_score = 0
  patience = 0
  for epoch in tqdm(range(epochs)):
    train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
    valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)
    # Binary decision at 0.5 (a multi-class model would use np.argmax(pred, axis=1)).
    pred = np.where(pred > 0.5, 1, 0)
    score = accuracy_score(target[vai], pred)

    print(train_loss, valid_loss, score)

    if score > best_score:
      # New best validation accuracy: reset the counter and checkpoint the weights only.
      best_score = score
      patience = 0
      torch.save(model.state_dict(), f'model_{i}.pt')
    else:
      patience += 1

    # Early stopping after 5 consecutive epochs without improvement.
    if patience == 5:
      break

  score_list.append(best_score)
  print(f'Fold-{i}, Best_ACC : {best_score}')

  if is_holdout:
    break

  0%|          | 0/100 [00:00<?, ?it/s]

0.6779016542434693 0.6348923444747925 0.7425
0.4906171798706055 0.528977456001135 0.73
0.33965747982263567 0.5838596683282119 0.7275
0.2703105825185776 0.6018388168169901 0.7325
0.21373428001999856 0.6627016663551331 0.73
0.16670855157077313 0.7283804577130538 0.7425
Fold-0, Best_ACC : 0.7425


  0%|          | 0/100 [00:00<?, ?it/s]

0.6815186095237732 0.6439240895784818 0.73
0.5076069188117981 0.4670379563019826 0.75
0.3504284451901913 0.5017967934791858 0.7525
0.29247548401355744 0.5410867814834301 0.75
0.23615211695432664 0.6063439089518327 0.7425
0.19252692088484763 0.6534140064166143 0.75
0.158438363969326 0.7037263971108657 0.7575
0.1290012864023447 0.7789507783376254 0.735
0.11237670689821243 0.8078050636328183 0.7525
0.10037568844854831 0.8618983970238612 0.745
0.09757903277873993 0.8881353139877319 0.7375
0.09070828262716532 0.9239786267280579 0.7325
Fold-1, Best_ACC : 0.7575


  0%|          | 0/100 [00:00<?, ?it/s]

0.682146384716034 0.6404711145621079 0.7075
0.5122410422563553 0.48458409080138576 0.75
0.35291823983192444 0.5241169975354121 0.7175
0.28368502140045165 0.5717123036201184 0.745
0.23063354969024658 0.6394529887116872 0.735
0.18686667889356612 0.7103899396382846 0.73
0.15128869093954564 0.7802940251735541 0.7375
Fold-2, Best_ACC : 0.75


  0%|          | 0/100 [00:00<?, ?it/s]

0.6830993902683258 0.6483701834311852 0.755
0.513739560842514 0.48113471498856175 0.745
0.3549695006012917 0.5345077354174393 0.7475
0.28203669518232344 0.5832182650382702 0.74
0.23061728447675706 0.6216307924343989 0.735
0.1797681902348995 0.6920329057253324 0.7275
Fold-3, Best_ACC : 0.755


  0%|          | 0/100 [00:00<?, ?it/s]

0.679105486869812 0.6349620819091797 0.735
0.5022326737642289 0.5015302346302912 0.7525
0.34549822837114336 0.5187520545262557 0.75
0.2715306407213211 0.5970469644436469 0.7425
0.2192111800611019 0.6480880700624906 0.7375
0.17189027175307273 0.7141022498791034 0.735
0.1346240770816803 0.8124356544934787 0.7325
Fold-4, Best_ACC : 0.7525


In [79]:
np.mean(score_list)

0.7515

# test데이터 추론하기

In [80]:
test_dt = ReviewDataset(test_data)
test_dt[0]

{'x': tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000

In [81]:
test_dl = torch.utils.data.DataLoader(test_dt, batch_size = batch_size, shuffle = False)

In [82]:
# Predict the test set with the best checkpoint of every fold.
# NOTE: this expects model_0.pt .. model_{n_splits-1}.pt to exist, i.e. the
# training cell must have run with is_holdout = False.
pred_list = []
for i in range(n_splits):
  model = Net(train_data.shape[1]).to(device)
  # map_location lets a checkpoint saved on the GPU load on a CPU-only runtime.
  state_dict = torch.load(f'model_{i}.pt', map_location=device, weights_only=True)
  model.load_state_dict(state_dict)

  # The test set has no labels, so the returned loss is ignored.
  _, pred = test_loop(test_dl, model, loss_fn, device)
  pred_list.append(pred)

In [83]:
pred = np.mean(pred_list, axis = 0) # arithmetic-mean ensemble of the fold probabilities
# Each row holds one sigmoid probability, so argmax over axis 1 would always be 0.
# Threshold at 0.5 instead, matching the validation step.
pred = (pred > 0.5).astype(int).reshape(-1)
pred.shape

(1000,)