- 형태소 분석기는 품사를 태깅(tagger)해주는 라이브러리

# 데이터 불러오기

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 재현성 구현

In [2]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

DATA_PATH = "/content/drive/MyDrive/data/"
SEED = 42

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# nltk(Natural Language Toolkit)
- python에서 가장 오래되고 유명한 자연어 처리 라이브러리

In [3]:
import nltk
nltk.download('punkt_tab') # 토크나이저 모델 > 함수 or 클래스 실행가능
nltk.download('stopwords') # 불용어 리스트
nltk.download('averaged_perceptron_tagger_eng') # 품사정보

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [4]:
train = pd.read_csv(f'{DATA_PATH}train_news.csv')
test = pd.read_csv(f'{DATA_PATH}test_news.csv')

train.shape, test.shape

((89320, 3), (38280, 2))

In [5]:
train.head()

Unnamed: 0,title,desc,target
0,Sudan Postpones Decision to Expel Oxfam and Sa...,Sudan has decided to postpone a decision to ex...,0
1,Coming Soon: Mobile TV,Cell phone manufacturers are teaming up to bri...,2
2,Experts warn of Internet flu vaccine scam,Although the United States is experiencing a s...,3
3,Bollor ups Havas stake to 20.2,Corporate raider Vincent Bollor said yesterday...,2
4,"Hurricane Ivan Kills 20 in Grenada, Heads West...",Reuters - Hurricane Ivan killed at least 20 pe...,0


In [6]:
text = train['desc'].loc[0]
text

'Sudan has decided to postpone a decision to expel the heads of two British aid agencies - Oxfam and Save the Children - citing administrative difficulties and humanitarian grounds.'

## 토큰화(tokenize)

In [7]:
from nltk.tokenize import word_tokenize
word_tokenize(text)

['Sudan',
 'has',
 'decided',
 'to',
 'postpone',
 'a',
 'decision',
 'to',
 'expel',
 'the',
 'heads',
 'of',
 'two',
 'British',
 'aid',
 'agencies',
 '-',
 'Oxfam',
 'and',
 'Save',
 'the',
 'Children',
 '-',
 'citing',
 'administrative',
 'difficulties',
 'and',
 'humanitarian',
 'grounds',
 '.']

In [8]:
type(word_tokenize(text))

list

## 불용어(stopwords)

In [9]:
from nltk.corpus import stopwords
stopwords.words('english') # 영어 불용어: 노이즈 제거

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [10]:
len(stopwords.words('english'))

179

## 품사 태깅(Part-of-Speech Tagging)

In [11]:
tokens = word_tokenize(text)
nltk.tag.pos_tag(tokens) # 토큰화된 객체 필요

[('Sudan', 'NNP'),
 ('has', 'VBZ'),
 ('decided', 'VBN'),
 ('to', 'TO'),
 ('postpone', 'VB'),
 ('a', 'DT'),
 ('decision', 'NN'),
 ('to', 'TO'),
 ('expel', 'VB'),
 ('the', 'DT'),
 ('heads', 'NNS'),
 ('of', 'IN'),
 ('two', 'CD'),
 ('British', 'JJ'),
 ('aid', 'NN'),
 ('agencies', 'NNS'),
 ('-', ':'),
 ('Oxfam', 'NNP'),
 ('and', 'CC'),
 ('Save', 'NNP'),
 ('the', 'DT'),
 ('Children', 'NNP'),
 ('-', ':'),
 ('citing', 'VBG'),
 ('administrative', 'JJ'),
 ('difficulties', 'NNS'),
 ('and', 'CC'),
 ('humanitarian', 'JJ'),
 ('grounds', 'NNS'),
 ('.', '.')]

- N or V or J로 시작하는 품사들의 단어만 다시 새로운 리스트로 담기

In [12]:
# 품사 태깅
pos_tags = nltk.pos_tag(tokens)

# N, V, J로 시작하는 품사의 단어만 필터링
filtered_words = [word for word, pos in pos_tags if pos.startswith(('N', 'V', 'J'))]
filtered_words

['Sudan',
 'has',
 'decided',
 'postpone',
 'decision',
 'expel',
 'heads',
 'British',
 'aid',
 'agencies',
 'Oxfam',
 'Save',
 'Children',
 'citing',
 'administrative',
 'difficulties',
 'humanitarian',
 'grounds']

- 정규표현식
  - \w: 알파벳, 숫자, _

In [13]:
train['clean'] = train['desc'].str.replace('[^\w] +', '', regex = True).str.lower() # 특정 문자 매칭 시, 삭제
test['clean'] = test['desc'].str.replace('[^\w] +', '', regex = True).str.lower()

train['clean']

Unnamed: 0,clean
0,sudan has decided to postpone a decision to ex...
1,cell phone manufacturers are teaming up to bri...
2,although the united states is experiencing a s...
3,corporate raider vincent bollor said yesterday...
4,reuters hurricane ivan killed at least 20 peop...
...,...
89315,vodafone uk has introduced a service that will...
89316,pullman last weekin studying usc game filmcoug...
89317,the former chairman of us software firm comput...
89318,an australian detainee being arraigned wednesd...


## Lemmatiztion(표제어 추출)
- 사전에 등재된 형태(lemma)로 바꾸는 것
- lemmatization는 문장에서 단어의 원형을 추출하는 과정
    - is, are -> be
    - having -> have


In [14]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

- .lemmatize('단어')
  - 해당 단어의 원형

In [15]:
wnl = WordNetLemmatizer() # 표제어 객체 생성
wnl.lemmatize('dogs')

'dog'

In [16]:
wnl = WordNetLemmatizer()
wnl.lemmatize('us')

'u'

In [17]:
wnl = WordNetLemmatizer()
wnl.lemmatize('is') # 오류도 많음

'is'

## spacy
- 딥러닝 기반의 형태소 분석 라이브러리

In [18]:
import spacy

In [19]:
nlp = spacy.load('en_core_web_sm') # 영어모델 load
nlp

<spacy.lang.en.English at 0x7883ddf9e530>

In [20]:
text = train['clean'].iloc[0]
text

'sudan has decided to postpone a decision to expel the heads of two british aid agencies oxfam and save the children citing administrative difficulties and humanitarian grounds.'

In [21]:
doc = nlp(text) # 토큰 클래스 객체, iterable
display(type(doc)) # 데이터타입: .doc.Doc > 문자열 형태
doc # 품사 태깅 결과

spacy.tokens.doc.Doc

sudan has decided to postpone a decision to expel the heads of two british aid agencies oxfam and save the children citing administrative difficulties and humanitarian grounds.

In [22]:
len(doc)

28

In [23]:
doc[0]

sudan

In [24]:
type(doc[0]) # 토큰 객체, 데이터타입: .token.Token

spacy.tokens.token.Token

In [25]:
doc[0].text # 토큰화된 텍스트의 원문

'sudan'

In [26]:
doc[0].lemma_ # 표제어: 단어 원형

'sudan'

In [27]:
doc[0].tag_ # 품사

'NNP'

In [28]:
doc[0].is_alpha # 알파벳 여부

True

In [29]:
doc[0].is_stop # 불용어 여부, sudan > 불용어 x

False

In [30]:
cols = ['단어', '표제어', '품사','알파벳여부','불용어여부']
data = [ [token.text, token.lemma_, token.tag_, token.is_alpha, token.is_stop] for token in doc]
data

[['sudan', 'sudan', 'NNP', True, False],
 ['has', 'have', 'VBZ', True, True],
 ['decided', 'decide', 'VBN', True, False],
 ['to', 'to', 'TO', True, True],
 ['postpone', 'postpone', 'VB', True, False],
 ['a', 'a', 'DT', True, True],
 ['decision', 'decision', 'NN', True, False],
 ['to', 'to', 'TO', True, True],
 ['expel', 'expel', 'VB', True, False],
 ['the', 'the', 'DT', True, True],
 ['heads', 'head', 'NNS', True, False],
 ['of', 'of', 'IN', True, True],
 ['two', 'two', 'CD', True, True],
 ['british', 'british', 'JJ', True, False],
 ['aid', 'aid', 'NN', True, False],
 ['agencies', 'agency', 'NNS', True, False],
 ['oxfam', 'oxfam', 'NNS', True, False],
 ['and', 'and', 'CC', True, True],
 ['save', 'save', 'VB', True, False],
 ['the', 'the', 'DT', True, True],
 ['children', 'child', 'NNS', True, False],
 ['citing', 'cite', 'VBG', True, False],
 ['administrative', 'administrative', 'JJ', True, False],
 ['difficulties', 'difficulty', 'NNS', True, False],
 ['and', 'and', 'CC', True, True],
 

In [31]:
pd.DataFrame(data, columns = cols)

Unnamed: 0,단어,표제어,품사,알파벳여부,불용어여부
0,sudan,sudan,NNP,True,False
1,has,have,VBZ,True,True
2,decided,decide,VBN,True,False
3,to,to,TO,True,True
4,postpone,postpone,VB,True,False
5,a,a,DT,True,True
6,decision,decision,NN,True,False
7,to,to,TO,True,True
8,expel,expel,VB,True,False
9,the,the,DT,True,True


In [32]:
doc = nlp.tokenizer(text)
doc

sudan has decided to postpone a decision to expel the heads of two british aid agencies oxfam and save the children citing administrative difficulties and humanitarian grounds.

In [33]:
doc[1].tag_ # 품사 정보추출 불가

''

In [34]:
doc[1].lemma_ # 표제어 추출 불가

''

- 품사 N,V,J,R로 시작하는 토큰들만 토큰화하기
  - - nlp.tokenizer 사용

In [35]:
train_list = []

for text in tqdm(train['clean']):
  doc = nlp.tokenizer(text)
  # tmp = list(doc)
  tmp = [ t for t in doc if not t.is_alpha]
  train_list.append(tmp)

  0%|          | 0/89320 [00:00<?, ?it/s]

In [36]:
train_list

[[.],
 [.],
 [falldon't, 're, .],
 [20.2, #, 39;s, .],
 [20, people\as, grenadawhere\looting, swept\through, .],
 [1, -, 9, titans15, -, 12moving, 3, -, 0, 35, .],
 [-, #, 39lives, #, 39demands, .],
 [-, -, .],
 [(, filed26/11/2004)increasing, .],
 [#, 39;s, 19, -, -],
 [ , (, as2,000, (, 1,250, miles)a, 's, .],
 [.],
 [3, -, 1, .],
 [-, 30, -],
 [(, #, 39;s, (, ), .],
 [-you, .],
 [400, .],
 [65, .],
 [-at, .],
 [", ...],
 ['s, -, 's, \$10, .],
 [28, o'neal, 19, 100, -, 94, .],
 [-, .],
 [-, -, #, 39;s, .],
 [ ,
  (,
  jan8,
  \$1.1,
  corp&lt;a,
  href="http://www.reuters.co.uk,
  /,
  financequotelookup.jhtml?ticker,
  =,
  msft.o,
  =,
  =,
  =,
  news"&gt;msft.o&lt;/a&gtafter,
  extensionplaintiff'sattorneys,
  .],
 [ , (, 's, -, 5, -, 4, .],
 [2003, .],
 [.],
 [-, .],
 [otsegominn.isquot;crunch, #, 39housecereal, 49, 7, -, -, 6, -, -, .],
 [-a, -, 1, 2, 57, .],
 [-, 20, .],
 [3.5, 2005but, 175,000the],
 [(, #, 39;s, #, 39;t, #, 39;s, -, .],
 [.],
 [100, .],
 [corp.which, 11, .],


- nltk 활용해 모든 문서에 대해 불용어 제거와 명사+동사+형용사+부사만 토큰화해서 train_list에 담기
- test_list 동일 작업

In [37]:
train_list = []
stop_words = stopwords.words('english') # 불용어

for text in tqdm(train['clean']):
  tokens = word_tokenize(text) # 토큰화
  tokens = nltk.tag.pos_tag(tokens) # 품사 태깅
  tokens = [ t for t,p in tokens if t not in stop_words and p[0] in "NVJR"] # 특정품사
  # print(tokens)
  train_list.append(' '.join(tokens))


  0%|          | 0/89320 [00:00<?, ?it/s]

In [38]:
test_list = []
stop_words = stopwords.words('english') # 불용어

for text in tqdm(test['clean']):
  tokens = word_tokenize(text) # 토큰화
  tokens = nltk.tag.pos_tag(tokens) # 품사 태깅
  tokens = [ t for t,p in tokens if t not in stop_words and p[0] in "NVJR"] # 특정품사
  # print(tokens)
  test_list.append(' '.join(tokens))


  0%|          | 0/38280 [00:00<?, ?it/s]

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(max_features=500)
train_data = vec.fit_transform(train_list).A

In [40]:
train_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [41]:
test_data = vec.transform(test_list).A
test_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

- 스케일링

In [42]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

- 정답데이터 만들기

In [43]:
train['target'].nunique()

4

In [44]:
target = train['target'].to_numpy()
target.shape, type(target)

((89320,), numpy.ndarray)

# 데이터셋 클래스 구현

In [45]:
class NewsDataset(torch.utils.data.Dataset):
  def __init__(self, x, y= None):
    self.x = x
    self.y = y

  def __len__(self): # 샘플 수 반환
    return self.x.shape[0] # len(self.x)

  def __getitem__(self, idx):
    item = {}
    item['x'] = torch.Tensor(self.x[idx])

    if self.y is not None:
      item['y'] = torch.tensor(self.y[idx])
    return item

- 결과 확인하기

In [46]:
dt = NewsDataset(train_data, target)
dt[0]

{'x': tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.3333, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5000, 0.0000, 0.0000,
         0.0000, 0.0000

In [47]:
dl = torch.utils.data.DataLoader(dt, batch_size = 2, shuffle = False)
batch = next(iter(dl))
batch['x']

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.3333, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0

# 신경망 모델클래스 구현

In [48]:
class Net(torch.nn.Module):
  def __init__(self, in_features):
    super().__init__()
    self.seq = torch.nn.Sequential(
        torch.nn.Linear(in_features, in_features // 2),
        torch.nn.ReLU(),
        torch.nn.Linear(in_features // 2, in_features // 4),
        torch.nn.ReLU(),
        torch.nn.Linear(in_features // 4, 4) # 다중분류 target 4개
    )
  def forward(self,x):
    return self.seq(x)

- 결과 확인하기

In [49]:
model = Net(train_data.shape[1])
model(batch['x'])

tensor([[0.0320, 0.0220, 0.0696, 0.0067],
        [0.0344, 0.0243, 0.0624, 0.0015]], grad_fn=<AddmmBackward0>)

- customize 클래스 구현

In [50]:
# class ResidualBlock(torch.nn.Module):
#   def __init__(self, in_features):
#     super().__init__()
#     self.fx = torch.nn.Sequential(
#         torch.nn.Linear(in_features, in_features),
#         torch.nn.ReLU(),
#         torch.nn.Dropout(0.5),
#         torch.nn.Linear(in_features, in_features)
#     )
#     self.relu = torch.nn.ReLU()
#   def forward(self, x):
#     fx = self.fx(x)
#     hx = fx + x
#     return self.relu(hx)

- customize클래스 결과 확인하기

In [51]:
# model = ResidualBlock(train_data.shape[1])
# model(batch['x'])

In [52]:
# class Net(torch.nn.Module):
#   def __init__(self, in_features, n_layers):
#     super().__init__()

#     self.init_layer = torch.nn.Sequential(
#         torch.nn.Linar(in_features, in_features // 2),
#         torch.nn.BatchNorm1d(in_features // 2), # 피처 개수 절반 감소
#         torch.nn.LeakyReLU()
#     )

#     res_list = [ResidualBlock(in_features // 2) for _ in range(n_layers)]
#     self.seq = torch.nn.Sequential(*res_list)
#     self.output_layer = torch.nn.Linear(in_features // 2, 4) # 다중분류 target 4개
#   def forward(self,x):
#     return self.output_layer(x)

- customize 결과 확인하기

In [53]:
# model = ResidualBlock(train_data.shape[1])
# model(batch['x'])

# 학습데이터 loop함수 구현

In [54]:
def train_loop(dl, model, loss_fn, optimizer, device):
  epoch_loss = 0
  model.train() # 학습모드
  for batch in dl:
    pred = model(batch['x'].to(device))
    loss = loss_fn(pred, batch['y'].to(device))

    optimizer.zero_grad() # 기울기 0 초기화
    loss.backward() # 역전파
    optimizer.step() # 가중치업데이트

    epoch_loss += loss.item()
  epoch_loss /= len(dl)
  return epoch_loss

# 테스트데이터 loop함수 구현

In [55]:
@torch.no_grad() # 경사추적 중단
def test_loop(dl, model, loss_fn, device):
  epoch_loss = 0
  model.eval() # 평가모드

  act = torch.nn.Softmax(dim = 1) # 회귀에서 사용 x, 시그모이드: 0~1값으로 전환
  pred_list = []
  for batch in dl:
    pred = model(batch['x'].to(device))
    if batch.get('y') is not None:
      loss = loss_fn(pred, batch['y'].to(device))
      epoch_loss += loss.item()

    pred = act(pred) # 회귀에서 사용 x
    pred = pred.to('cpu').numpy()
    pred_list.append(pred)

  pred = np.concatenate(pred_list)
  epoch_loss /= len(dl)
  return epoch_loss, pred

# 하이퍼파라미터 정의

In [56]:
batch_size = 32
loss_fn = torch.nn.CrossEntropyLoss()
epochs = 100
n_splits = 5 #kfold의 k값

# 조합 후 KFold학습 수행

In [57]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
cv = KFold(n_splits, shuffle = True, random_state = SEED)

In [58]:
is_holdout = False
reset_seeds(SEED)
score_list = []

for i, (tri,vai) in enumerate(cv.split(train_data)):
  model = Net(train_data.shape[1]).to(device)
  optimizer = torch.optim.Adam(model.parameters())

  # 학습데이터
  train_dt = NewsDataset(train_data[tri], target[tri])
  train_dl = torch.utils.data.DataLoader(train_dt, batch_size = batch_size, shuffle = True)

  #검증데이터
  valid_dt = NewsDataset(train_data[vai], target[vai])
  valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size = batch_size, shuffle = False)

  best_score = 0
  patience = 0
  for epoch in tqdm(range(epochs)):
    train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
    valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)
    # pred = (pred > 0.5).astype(int) # 이진분류
    pred = np.argmax(pred, axis=1) # 다중분류
    score = f1_score(target[vai], pred, average = 'micro')

    print(train_loss, valid_loss, score)
    patience += 1

    if score > best_score:
      patience = 0
      best_score = score
      torch.save(model.state_dict(), f'model_{i}.pt') # 가중치만 저장

    if patience == 5:
      break

  score_list.append(best_score)
  print(f'Fold-{i}, Best_f1_score : {best_score}')

  if is_holdout:
    break

  0%|          | 0/100 [00:00<?, ?it/s]

0.5771148931109953 0.5247757324752312 0.8014442454097627
0.5027504933291468 0.5105618209659734 0.804411106135244
0.46470358850333493 0.5064143551270309 0.8103448275862069
0.4173963458727306 0.5051883470820187 0.8119122257053292
0.36399313214243767 0.5267393444806391 0.8096730855351545
0.3099215173675936 0.5567391740561385 0.8086094939543216
0.2571617887199819 0.6164883873117205 0.8016681594267802
0.2101748498941597 0.7037417752425018 0.7972458575906852
0.172155393083174 0.8236788076236977 0.7944469323779668
Fold-0, Best_f1_score : 0.8119122257053292


  0%|          | 0/100 [00:00<?, ?it/s]

0.5776979319727906 0.5214314135541216 0.8025078369905956
0.5050212087262334 0.5179872082043205 0.8042991491267353
0.46813286707786184 0.4959338169831496 0.8110725481415136
0.42258249487822974 0.4964362194958228 0.8146551724137931
0.3731661364686393 0.5187565381896944 0.8081056874160323
0.3223942636082218 0.5539676196779889 0.8061464397671294
0.2711232418558934 0.6109842589135034 0.8043551276309897
0.22356020890266384 0.6908709262639338 0.8045230631437528
0.18428313876308278 0.7838136996534089 0.7943909538737125
Fold-1, Best_f1_score : 0.8146551724137931


  0%|          | 0/100 [00:00<?, ?it/s]

0.5757601936462478 0.5355397725147937 0.7965181370353784
0.5033070473955503 0.518522501538286 0.8050268696820421
0.4653961396489611 0.503028203688282 0.8113524406627854
0.42099537841885604 0.5071268130713584 0.8114084191670399
0.3713228474323172 0.5203801634153334 0.8117442901925661
0.3209798136918709 0.5597986918149352 0.8050268696820421
0.2695889395903137 0.6054710147565827 0.8046909986565159
0.22449001514232325 0.6948753230320373 0.8010523958799821
0.18465612723462474 0.7926518472981154 0.7917599641737573
0.1509984158675165 0.8992205006945112 0.7927115987460815
Fold-2, Best_f1_score : 0.8117442901925661


  0%|          | 0/100 [00:00<?, ?it/s]

0.5763661253828667 0.526179089715741 0.8049149126735333
0.5020912064533836 0.5100606297657614 0.8116323331840574
0.46264418634634835 0.5063594026567684 0.8103448275862069
0.417605148143716 0.5066876203514808 0.8130877742946708
0.3651059735691499 0.5225188906216237 0.8130317957904165
0.31126968163381336 0.5627851254824876 0.8085535154500672
0.2596343744994945 0.6231060436325337 0.8049708911777878
0.21453950224395055 0.695144939694507 0.8017241379310345
0.17745652686607674 0.78204834462545 0.8001567398119123
Fold-3, Best_f1_score : 0.8130877742946708


  0%|          | 0/100 [00:00<?, ?it/s]

0.5770398771487913 0.5218821562999902 0.8043551276309897
0.5037852655804269 0.5161604560338534 0.8013882669055082
0.4653226780520391 0.4949953960044866 0.8132557098074339
0.4192911057369729 0.5047094503380958 0.8106247201074788
0.36881531231302184 0.5147152559995012 0.8092812360053739
0.3161039503797575 0.5536640863499083 0.8090013434841021
0.2647693200905228 0.6125129248033909 0.8001007613076578
0.21748970462567355 0.697075673900478 0.7978616211374832
Fold-4, Best_f1_score : 0.8132557098074339


In [59]:
np.mean(score_list)

0.8129310344827585

# test데이터 추론하기

In [60]:
test_dt = NewsDataset(test_data)
test_dt[0]

{'x': tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.3333, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000

In [61]:
test_dl = torch.utils.data.DataLoader(test_dt, batch_size = batch_size, shuffle = False)

In [62]:
pred_list = []
for i in range(n_splits):
  model = Net(train_data.shape[1]).to(device)
  state_dict = torch.load(f'model_{i}.pt', weights_only=True)
  model.load_state_dict(state_dict)

  _, pred = test_loop(test_dl, model, loss_fn, device)
  pred_list.append(pred)

In [63]:
pred = np.mean(pred_list, axis = 0) # 산술평균 앙상블
pred = np.argmax(pred, axis = 1)
pred.shape

(38280,)