##### [ LSTM기반 텍스트 생성]
- 기사의 제목을 생성 ==> 앞 부분 2개 단어 입력으로 나머지 생성

- [1] 데이터 준비 <hr>

In [55]:
### ===> 모듈로딩
import pandas as pd
import os
import string

In [56]:
### ===> 데이터 관련 경로
data_dir = './data/text/'
filename = './data/text/ArticlesApril2017.csv'

In [57]:
dataDF= pd.read_csv(filename)

dataDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 886 entries, 0 to 885
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   abstract          22 non-null     object
 1   articleID         886 non-null    object
 2   articleWordCount  886 non-null    int64 
 3   byline            886 non-null    object
 4   documentType      886 non-null    object
 5   headline          886 non-null    object
 6   keywords          886 non-null    object
 7   multimedia        886 non-null    int64 
 8   newDesk           886 non-null    object
 9   printPage         886 non-null    int64 
 10  pubDate           886 non-null    object
 11  sectionName       886 non-null    object
 12  snippet           886 non-null    object
 13  source            886 non-null    object
 14  typeOfMaterial    886 non-null    object
 15  webURL            886 non-null    object
dtypes: int64(3), object(13)
memory usage: 110.9+ KB


In [58]:
dataDF.head(2)

Unnamed: 0,abstract,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,,58def1347c459f24986d7c80,716,By STEPHEN HILTNER and SUSAN LEHMAN,article,Finding an Expansive View of a Forgotten Peop...,"['Photography', 'New York Times', 'Niger', 'Fe...",3,Insider,2,2017-04-01 00:15:41,Unknown,One of the largest photo displays in Times his...,The New York Times,News,https://www.nytimes.com/2017/03/31/insider/nig...
1,,58def3237c459f24986d7c84,823,By GAIL COLLINS,article,"And Now, the Dreaded Trump Curse","['United States Politics and Government', 'Tru...",3,OpEd,23,2017-04-01 00:23:58,Unknown,Meet the gang from under the bus.,The New York Times,Op-Ed,https://www.nytimes.com/2017/03/31/opinion/and...


- [2] 커스텀 테이서셋 <hr>

In [41]:
import numpy as np
import glob

from torch.utils.data import Dataset, DataLoader

In [42]:
class TextGeneration(Dataset):    

    def __init__(self):
        all_headlines = []

        # 모든 헤드라인 텍스트 로딩
        for filename in glob.glob(data_dir+"*.csv"):
            if 'Articles' in filename:
                article_df = pd.read_csv(filename)

                # 데이터셋의 headline의 값을 all_headlines에 추가
                all_headlines.extend(list(article_df.headline.values))
                break

        # headline 중 unknown 값 제거
        all_headlines = [h for h in all_headlines if h != "Unknown"]
        
        # 구두점 제거 및 전처리가 된 문장들을 리스트로 반환
        self.corpus = [self.clean_text(x) for x in all_headlines]
        self.BOW = {}

        # 모든 문장의 단어 추출해 고유번호 지정
        for line in self.corpus:
            for word in line.split():
                if word not in self.BOW.keys():
                    self.BOW[word] = len(self.BOW.keys())

        # 모델의 입력으로 사용할 데이터
        self.data = self.generate_sequence(self.corpus)
   
    ##  전처리 함수 
    def clean_text(self, txt):
        # 소문자 변환 및 특수문자 제거
        txt = "".join(v for v in txt if v not in string.punctuation).lower()
        return txt         
        
    ## 단어 순서 지정 함수     
    def generate_sequence(self, txt):
        seq = []

        for line in txt:
            line = line.split()
            line_bow = [self.BOW[word] for word in line]  # 단어 => 숫자 

            data=[([line_bow[i], line_bow[i+1]], line_bow[i+2])  for i in range(len(line_bow)-2)]
            
            seq.extend(data)
        return seq
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        data = np.array(self.data[i][0])  
        label = np.array(self.data[i][1]).astype(np.float32)  

        return data, label

- [3] 모델 정의 <hr>

In [43]:
import torch
import torch.nn as nn

In [44]:
### 하이퍼파라미터 
EMBEDDING_DIM = 16
HIDDEN_SIZE = 64
NUM_LAYERS = 5

In [45]:

class LSTM(nn.Module):
   ## 모델 구조 정의 
   def __init__(self, num_embeddings):
       super(LSTM, self).__init__()

       # 임베딩층
       self.embed = nn.Embedding( num_embeddings=num_embeddings, embedding_dim=EMBEDDING_DIM)
       
       # LSTM 5개층
       self.lstm = nn.LSTM( input_size=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, 
                            num_layers=NUM_LAYERS, batch_first=True)
       
       # 분류 위한 MLP층
       self.fc1 = nn.Linear(128, num_embeddings)
       self.fc2 = nn.Linear(num_embeddings,num_embeddings)

       # 활성화 함수
       self.relu = nn.ReLU()
       

   ## 순방향 학습 진행 함수 
   def forward(self, x):
       x = self.embed(x)

       # LSTM 모델 예측값
       x, _ = self.lstm(x)
       x = torch.reshape(x, (x.shape[0], -1))
       x = self.fc1(x)
       x = self.relu(x)
       x = self.fc2(x)

       return x

- [3] 학습 준비 <hr>

In [46]:
### ===> 모듈로딩
import tqdm

from torch.utils.data.dataloader import DataLoader
from torch.optim.adam import Adam

In [47]:
# 디바이스 설정
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 학습관련 하이퍼파라미터 
BATCH_SIZE = 64
LR = 0.001
EPOCHS = 200

# 데이터셋 정의
TEXT_DS = TextGeneration()  
DATA_LD = DataLoader(TEXT_DS, batch_size=BATCH_SIZE)
print(f'TEXT_DS.BOW : {len(TEXT_DS.BOW)}개')
VOCAB_SIZE = len(TEXT_DS.BOW)

# 모델 정의
MODEL = LSTM(num_embeddings=VOCAB_SIZE).to(DEVICE) 
print(f'MODE DESC\n{MODEL}')

OPTIMIZER = Adam(MODEL.parameters(), lr=LR)

TEXT_DS.BOW : 2482개
MODE DESC
LSTM(
  (embed): Embedding(2482, 16)
  (lstm): LSTM(16, 64, num_layers=5, batch_first=True)
  (fc1): Linear(in_features=128, out_features=2482, bias=True)
  (fc2): Linear(in_features=2482, out_features=2482, bias=True)
  (relu): ReLU()
)


In [48]:
def training():
    # 학습 모드 설정 
    MODEL.train()
    
    for epoch in range(EPOCHS):
        # 진행 프로그래스바 출력 연동
        iterator = tqdm.tqdm(DATA_LD)
        
        for data, label in iterator:
            # 기울기 초기화
            OPTIMIZER.zero_grad()

            # 모델의 예측값
            pred = MODEL(torch.tensor(data, dtype=torch.long).to(DEVICE))

            # 정답 레이블 >>> long 텐서 반환
            loss = nn.CrossEntropyLoss()(
                pred, torch.tensor(label, dtype=torch.long).to(DEVICE))
            
            # 오차 역전파
            loss.backward()
            OPTIMIZER.step()

            iterator.set_description(f"epoch{epoch} loss:{loss.item()}")

        torch.save(MODEL.state_dict(), f"lstm_{epoch}.pth")

In [49]:
### 학습 진행
training()

  pred = MODEL(torch.tensor(data, dtype=torch.long).to(DEVICE))
  pred, torch.tensor(label, dtype=torch.long).to(DEVICE))
epoch0 loss:7.371738433837891: 100%|██████████| 63/63 [00:13<00:00,  4.71it/s] 
epoch1 loss:7.014400482177734: 100%|██████████| 63/63 [00:14<00:00,  4.34it/s] 
epoch2 loss:6.794592380523682: 100%|██████████| 63/63 [00:11<00:00,  5.57it/s] 
epoch3 loss:6.57385778427124: 100%|██████████| 63/63 [00:10<00:00,  5.75it/s]  
epoch4 loss:6.361748695373535: 100%|██████████| 63/63 [00:10<00:00,  6.00it/s] 
epoch5 loss:6.066788673400879: 100%|██████████| 63/63 [00:08<00:00,  7.24it/s] 
epoch6 loss:5.953599452972412: 100%|██████████| 63/63 [00:08<00:00,  7.25it/s] 
epoch7 loss:5.90260648727417: 100%|██████████| 63/63 [00:08<00:00,  7.28it/s]  
epoch8 loss:5.759116172790527: 100%|██████████| 63/63 [00:09<00:00,  6.90it/s] 
epoch9 loss:5.67263126373291: 100%|██████████| 63/63 [00:10<00:00,  6.23it/s] 
epoch10 loss:5.622330665588379: 100%|██████████| 63/63 [00:16<00:00,  3.89it/s]

In [52]:
### ===> 생성 
def generate(model, BOW, string="finding an ", strlen=10):
   device = "cuda" if torch.cuda.is_available() else "cpu"

   print(f"input word: {string}")

   with torch.no_grad():
       for p in range(strlen):
           # 입력 문장 텐서로 변경
           words = torch.tensor(
               [BOW[w] for w in string.split()], dtype=torch.long).to(device)

           # 입력 텐서 shape 변환
           input_tensor = torch.unsqueeze(words[-2:], dim=0)
           output = model(input_tensor)                                 # 모델 이용해 예측
           output_word = (torch.argmax(output).cpu().numpy())
           string += list(BOW.keys())[output_word]          # 문장에 예측된 단어 추가
           string += " "

   print(f"predicted sentence: {string}")


In [53]:

MODEL.load_state_dict(torch.load("lstm.pth", map_location=DEVICE))
pred = generate(MODEL, TEXT_DS.BOW)

input word: finding an 
predicted sentence: finding an expansive view of a lynching victory peace on another trade 
