
#### torchtext 라이브러리로 텍스트 분류 <hr>
- [1]단계 - 데이터 전처리 : 숫자형식으로 변환하는 것 까지

- [1-1] 데이터 준비 => 내장 데이터셋 활용  
    * AG_NEWS 데이터셋 반복자 : 레이블(label) + 문장의 튜플(tuple) 형태

In [104]:
### 모듈 로딩 
import torchtext
import torch
from torchtext.datasets import AG_NEWS

torchtext.__version__

'0.17.2+cpu'

In [106]:
### ===> Pytorch 내장 데이터셋 뉴스 학습 데이터 추출
train_iter, test_iter = AG_NEWS()

- [1-2] 데이터 처리 파이프라인 준비 <hr>

In [107]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

### ==> 특별 문자 토큰
UNK = '<UNK>'
PAD = '<PAD>'

### ==> 토커나이즈 생성
tokenizer = get_tokenizer('basic_english')

In [88]:
### ===> 토큰 제너레이터 함수 : 데이터 추출하여 토큰화 
def yield_tokens(data_iter):
    for _, text in data_iter:
        # 라벨, 텍스트 --> 텍스트 토큰화
        yield tokenizer(text)

In [89]:
### ===> 단어사전 생성
vocab = build_vocab_from_iterator(yield_tokens(train_iter), 
                                  specials=[UNK])

### ===> <UNK> 인덱스 0으로 설정
vocab.set_default_index(vocab[UNK])

In [90]:
### ===> 텍스트 >>>> 정수 인코딩
text_pipeline = lambda x: vocab(tokenizer(x))

### ===> 레이틀 >>> 정수 인코딩 (0~3)
label_pipeline = lambda x: int(x) - 1

- [1-3] 데이터 배치(batch)와 반복자 생성 <hr>
    * torch.utils.data.DataLoader : getitem(), len() 구현한 맵 형태(map-style)
    * collate_fn() : DataLoader로부터 생성된 샘플 배치 함수 
        - 입력 : DataLoader에 배치 크기(batch size)가 있는 배치(batch) 데이터

In [91]:
### ===> 모듈로딩
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

In [92]:
### ===> 실행 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [108]:
### ===> DataLoader에서 배치크기만큼 데이터셋 반환 함수 
def collate_batch(batch):
    # 배치크기 만큼의 라벨, 텍스트, 오프셋 값 저장 변수 
    label_list, text_list, offsets = [], [], [0]
    
    # 1개씩 뉴스기사, 라벨 추출 해서 저장 
    for (_label, _text) in batch:
         # 라벨 인코딩 후 저장
         label_list.append(label_pipeline(_label))
         
         # 텍스트 인코딩 후 저장
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         
         # 텍스트 offset 즉 , 텍스트 크기/길이 저장
         offsets.append(processed_text.size(0))
    
    # 텐서화 진행     
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    
    return label_list.to(device), text_list.to(device), offsets.to(device)

- [1-4]데이터셋 준비 <hr>

In [94]:
### ===> 학습용, 검증용, 테스트용 DataSet 준비 
BATCH_SIZE = 32

### 학습용, 검증용, 테스트용 Dataset, DataLoader 준비
trainDL = DataLoader( train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch )
testDL  = DataLoader( test_iter, batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate_batch )

In [95]:
### ===> 확인
import torch.nn as nn

HIDDEN_SIZE = 3
EMBEDD_DIM  = 10
VOCAB_SIZE  = len(vocab) 

for labels, texts, offests in trainDL:
    print(labels.shape, labels, )
    print(texts.shape, texts, sep='\n\n')
    print(offests.shape, offests)
    
    # embedding = nn.EmbeddingBag(VOCAB_SIZE, EMBEDD_DIM, sparse=False)
    # result =embedding(texts, offests)
    # print(result.shape, result.ndim)
    
    # rnn = nn.RNN(EMBEDD_DIM, HIDDEN_SIZE, batch_first=True)
    # output, fn=rnn(result)
    # print('[RNN]============\n', output, fn)
    break

torch.Size([32, 10]) 2
 tensor([[-0.4415, -0.6726,  0.3127],
        [-0.8577, -0.4919,  0.4364],
        [-0.8075, -0.4232,  0.1802],
        [-0.6846, -0.5509,  0.0737],
        [-0.6224, -0.3496,  0.4169],
        [-0.7757, -0.2900,  0.3750],
        [-0.7625, -0.5083,  0.2790],
        [-0.8242, -0.3862,  0.3161],
        [-0.6136, -0.2336,  0.2761],
        [-0.5345, -0.6679, -0.3465],
        [-0.7885, -0.6065, -0.0407],
        [-0.7854, -0.4097, -0.0153],
        [-0.4621, -0.3325,  0.0480],
        [-0.7990, -0.4478, -0.1109],
        [-0.6724, -0.6953,  0.3589],
        [-0.8295, -0.1134,  0.5401],
        [-0.4827, -0.3993,  0.3873],
        [-0.4912, -0.4899,  0.1477],
        [-0.8387, -0.4681,  0.4121],
        [-0.7527, -0.0494,  0.4377],
        [-0.5338, -0.5573,  0.3731],
        [-0.8375, -0.3220,  0.2115],
        [-0.5681, -0.5502, -0.2035],
        [-0.6891, -0.1344,  0.0648],
        [-0.6046, -0.5930,  0.1568],
        [-0.7800, -0.3389,  0.0522],
        [-0.71

- [2] 학습 준비 <hr>
- [2-1] 모델 설계

In [96]:
### ===> 모듈로딩
import torch.nn as nn
import torch.optim as optim 

In [109]:
### ==> 모델 설계
### 입력층 : EmbeddingBag Layer - 레이어와 분류(classification) 목적을 위한 선형 레이어, 텍스트의 길이는 오프셋(offset)으로 저장, 패딩(padding) 필요하지는 않음
### 은닉층 : Linear - 4개 클래스 분류 
class TextModel(nn.Module):
    def __init__(self, VOCAB_SIZE, EMBEDD_DIM, HIDDEN_SIZE, NUM_CLASS):
        super().__init__()
        # 모델 구성 층 정의
        self.embedding = nn.EmbeddingBag(VOCAB_SIZE, EMBEDD_DIM, sparse=False)
        self.fc = nn.Linear(EMBEDD_DIM, NUM_CLASS)
        
        # self.embedding = nn.EmbeddingBag(VOCAB_SIZE, EMBEDD_DIM, sparse=False)
        # self.rnn = nn.RNN(EMBEDD_DIM, HIDDEN_SIZE)
        # self.fc = nn.Linear(HIDDEN_SIZE, NUM_CLASS)
        self.init_weights()

    # 가중치 초기화 
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    # 순방향 학습 진행
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        # output, _ = self.rnn(embedded)
        # return self.fc(output[:, -1,:])
        return self.fc(embedded)

- [2-2] 학습 관련  변수 및 인스턴스 준비 <hr>

In [98]:
### ==> 학습 관련 하이퍼파라미터와 인스턴스
HIDDEN_SIZE = 3
EMBEDD_DIM  = 64
VOCAB_SIZE  = len(vocab) 
NUM_CLASS   = len( set( [ label for label, _ in train_iter ] ) )
EPOCHS      = 100  
LR          = 5 
BATCH_SIZE  = 32

In [99]:
### ==> 학습 관련 인스턴스
MODEL = TextModel(VOCAB_SIZE, EMBEDD_DIM, 
                  HIDDEN_SIZE, NUM_CLASS).to(device)

CRITERION = nn.CrossEntropyLoss()
OPTIMIZER = optim.SGD(MODEL.parameters(), lr=LR)
SCHEDULER = optim.lr_scheduler.StepLR(OPTIMIZER, 1.0, gamma=0.1)

In [100]:
MODEL

TextModel(
  (embedding): EmbeddingBag(95811, 64, mode='mean')
  (fc): Linear(in_features=64, out_features=4, bias=True)
)

- [2-3]학습관련 함수들 <hr>

In [110]:
### ===> 학습 진행 함수
def train(model, dataloader, optimizer, criterion, epoch):
    # 학습 모드
    model.train()
    
    # 학습 평가 관련 변수들
    total_acc, total_count = 0, 0
    log_interval = 300

    # 배치 학습 진행
    for idx, (label, text, offsets) in enumerate(dataloader):
        
        label, text, offsets = label.to(device), text.to(device), offsets.to(device), 
        
        # 학습진행
        predicted_label = model(text, offsets)
        #print(f'predicted_label : {predicted_label.shape}   label : {label.shape} ')
        
        # 손실 계산 및 W,b 업데이트
        optimizer.zero_grad()
        loss = criterion(predicted_label, label)
        loss.backward()
        # 기울기 소실 및 폭주 예방을 위한 양극단 값 자르기 
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        
        # 배치 학습 평가 
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print( f"===> epoch {epoch:3d} | {idx:5d}/{len(dataloader):5d} batches  ===> accuracy {total_acc /total_count:8.3f}" )
            total_acc, total_count = 0, 0

        break

In [102]:
### ===> 검증 및 테스트 함수
def evaluate(model,dataloader, criterion):
    # 검증 모드 
    model.eval()
    
    # 검증 평가 관련 변수
    total_acc, total_count = 0, 0

    # 검증 진행
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count

In [103]:
### ====> 예측 함수
def predict(model, text, text_pipeline):
    with torch.no_grad():
        # 토큰화 > 정수 변환  > 텐서
        text = torch.tensor(text_pipeline(text))
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

- [3] 학습 진행 <hr>

In [83]:
### 학습 및 검증 진행
for epoch in range(1, EPOCHS + 1):
    total_accu = train(MODEL, trainDL, OPTIMIZER, CRITERION, epoch)
    accu_val = evaluate(MODEL, testDL, CRITERION)
    if total_accu is not None and total_accu > accu_val:
        SCHEDULER.step()
    else:
        total_accu = accu_val
    print("-" * 59)
    print(f"===> end of epoch {epoch:3d}     valid accuracy {accu_val:8.3f} ")
    print("-" * 59)

-----------------------------------------------------------
===> end of epoch   1     valid accuracy    0.251 
-----------------------------------------------------------




-----------------------------------------------------------
===> end of epoch   2     valid accuracy    0.253 
-----------------------------------------------------------
-----------------------------------------------------------
===> end of epoch   3     valid accuracy    0.264 
-----------------------------------------------------------
-----------------------------------------------------------
===> end of epoch   4     valid accuracy    0.270 
-----------------------------------------------------------
-----------------------------------------------------------
===> end of epoch   5     valid accuracy    0.257 
-----------------------------------------------------------
-----------------------------------------------------------
===> end of epoch   6     valid accuracy    0.266 
-----------------------------------------------------------
-----------------------------------------------------------
===> end of epoch   7     valid accuracy    0.283 
----------------------------------

- [4] 평가 데이터로 모델 평가<hr>



In [69]:
print('TESTING......')
accu_test = evaluate(MODEL, testDL, CRITERION)
print(f'Test Acc : {accu_test:8.3f}')

TESTING......
Test Acc :    0.628


- [5] 임의 데이터로 모델 평가 <hr> 




In [111]:
ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}


### ==> 임의의 데이터 
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

ex_text_str_02= "As food prices continued to rise, consumer prices continued to rise in the 3% range for the second consecutive month.\
                According to the \"March Consumer Price Trend\" released by the National Statistical Office on the 2nd, the consumer price index \
                last month was 113.94 (2020 = 100), up 3.1% from the same month last year.\
                This year's consumer price growth rate increased again to 3.1% in February after recording 2.8% in January this year.\
                Prices of agricultural, livestock and fisheries products rose 11.7 percent year-on-year, up more than 11.4 percent from the previous month.\
                Among them, agricultural prices rose 20.5% year-on-year, marking the second consecutive month of increase of 20% following the previous month's 20.9%.\
                In particular, the price of apples rose 88.2 percent, which was larger than the previous month (71.0 percent), the largest increase ever since January 1980, \
                when statistics began to be compiled."

print(f"NEWS => {ag_news_label[predict(MODEL, ex_text_str, text_pipeline)]}")

NEWS => World
