라이브러리 임포트

In [None]:
from importlib.metadata import version
import nltk
import tensorflow
import summa
import pandas as pd

# Report the installed versions of the libraries this notebook relies on.
# nltk, TensorFlow and pandas expose `__version__`; summa's version is read
# from the installed package metadata via `importlib.metadata.version`.
print("nltk 버전:", nltk.__version__)
print("TensorFlow 버전:", tensorflow.__version__)
print("Pandas 버전:", pd.__version__)
print("Summa 버전:", version('summa'))

nltk 버전: 3.9.1
TensorFlow 버전: 2.18.0
Pandas 버전: 2.2.2
Summa 버전: 1.2.0


In [2]:
import urllib.request
import pandas as pd

# Download the news-summary CSV from GitHub into a local file
# (relative path, so it lands in the current working directory).
filename = "news_summary_more.csv"
url = "https://raw.githubusercontent.com/sunnysai12345/News_Summary/master/news_summary_more.csv"
urllib.request.urlretrieve(url=url, filename=filename)

# Load the downloaded file into a DataFrame, decoding it as ISO-8859-1.
data = pd.read_csv(filepath_or_buffer=filename, encoding='iso-8859-1')

In [3]:
# Show 10 randomly drawn rows as a quick sanity check of the loaded data.
data.sample(n=10)

Unnamed: 0,headlines,text
94373,Man turns portable toilets into homes for the ...,"TK Devine from Los Angeles, US turns portable ..."
86209,Death toll from Kabul truck-bomb attack rises ...,The death toll from last week's truck-bomb exp...
4678,Iran confirms it is in talks with Taliban,Iran is holding talks with the Afghan Taliban ...
19993,Gorakhpur tragedy due to internal hospital pol...,UP CM Yogi Adityanath has said that the deaths...
27449,"Call it 'magic' train, not bullet train: Rahul...",Taking a dig at the Centre's bullet train proj...
36890,DHFL shares plunge 16% following reports of go...,Dewan Housing Finance (DHFL) shares on Thursda...
35140,"There is no currency shortage, enough cash in ...",The RBI has clarified that there is no currenc...
54598,DNA test to determine Thailand lottery jackpot...,A DNA test will decide the rightful winner of ...
35658,Russia spied on emails of ex-spy's daughter: UK,Russian intelligence agencies were spying on t...
4554,Let Mumbai be open all night on New Year: Aadi...,In a letter to Maharashtra CM Devendra Fadnavi...


텍스트 전처리

In [None]:
import nltk
print("NLTK 버전:", nltk.__version__)

# Download the NLTK resources the preprocessing step needs: the tokenizer
# data first, then the stop-word corpus. The stop-word download is kept as a
# bare call on the last line so its return value is still the cell's result.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)
nltk.download('stopwords')

NLTK 버전: 3.9.1


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\FOCUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FOCUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\FOCUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [14]:
import re
from nltk.corpus import stopwords

# Contraction dictionary (sample entries). Keys must be lowercase because
# they are matched against already-lowercased text.
contraction_mapping = {
    "can't": "cannot",
    "won't": "will not",
    # Add further contractions here as needed.
}

# Filled on first use so the stop-word corpus is read once instead of once
# per processed row.
_stop_words_cache = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, loading them on first use."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(stopwords.words('english'))
    return _stop_words_cache


def preprocess_text(text):
    """Normalise one English text for tokenization.

    Steps, in order: lowercase, expand the contractions listed in
    ``contraction_mapping``, replace every character that is not an ASCII
    letter with a space, tokenize with ``nltk.word_tokenize``, drop NLTK
    English stop words, and re-join the remaining tokens with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercase, space-separated string (possibly empty).

    Raises:
        AttributeError: If ``text`` is not a string (e.g. a missing value).
    """
    # Lowercase BEFORE expanding contractions: the mapping keys are lowercase,
    # so a capitalised form such as "Can't" would otherwise never match.
    text = text.lower()
    for contraction, expansion in contraction_mapping.items():
        text = text.replace(contraction, expansion.lower())

    # Strip digits, punctuation and any other non-letter characters.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Tokenize and filter out stop words using the cached set.
    stop_words = _english_stop_words()
    words = [word for word in nltk.word_tokenize(text) if word not in stop_words]

    # Join the surviving tokens back into a single string.
    return ' '.join(words)

# Clean the article bodies and then the headlines, overwriting each column
# with its preprocessed version.
for column_name in ('text', 'headlines'):
    data[column_name] = data[column_name].apply(preprocess_text)



인코더 및 디코더 입력 데이터 토큰화

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence lengths for the encoder (article) and decoder (headline) sides.
max_text_len, max_summary_len = 80, 10


def _fit_and_pad(texts, maxlen):
    """Fit a fresh Tokenizer on `texts` and return (tokenizer, sequences, padded).

    `sequences` are the integer-encoded texts; `padded` is the same data
    padded at the end to `maxlen` columns.
    """
    # NOTE(review): `truncating` is left at its Keras default - confirm which
    # end of an over-long sequence is meant to be cut.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=maxlen, padding='post')
    return tokenizer, sequences, padded


# Encoder side: article text.
x_tokenizer, x_sequences, x_padded = _fit_and_pad(data['text'], max_text_len)

# Decoder side: headlines.
y_tokenizer, y_sequences, y_padded = _fit_and_pad(data['headlines'], max_summary_len)

# Vocabulary sizes: one more than the number of indexed words, leaving room
# for index 0, which the Keras tokenizer does not assign to any word.
x_vocab_size = len(x_tokenizer.word_index) + 1
y_vocab_size = len(y_tokenizer.word_index) + 1

어텐션 메커니즘을 활용한 seq2seq 모델 구축

In [17]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.models import Model

# Encoder: a fixed-length sequence of max_text_len token ids is embedded into
# 128-dimensional vectors and run through a 256-unit LSTM. The LSTM returns
# its output at every time step (used by attention below) plus its final
# hidden and cell states (used to initialise the decoder).
encoder_inputs = Input(shape=(max_text_len,))
enc_emb = Embedding(x_vocab_size, 128, trainable=True)(encoder_inputs)
encoder_lstm = LSTM(256, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Decoder: accepts a variable-length sequence of token ids (shape=(None,)),
# embeds it with its own 128-dimensional embedding and runs a 256-unit LSTM
# that starts from the encoder's final states. The decoder's own final states
# are discarded here.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(y_vocab_size, 128, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Attention: the Keras Attention layer is called with
# [query, value] = [decoder outputs, encoder outputs].
attn_layer = tf.keras.layers.Attention()
attn_out = attn_layer([decoder_outputs, encoder_outputs])

# Concatenate the decoder outputs and the attention output along the last
# (feature) axis.
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attn_out])

# Output layer: softmax over the headline vocabulary at each decoder step.
# Note that `decoder_outputs` is rebound from the LSTM outputs to these
# per-step probabilities.
decoder_dense = Dense(y_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

# Final training model: (encoder tokens, decoder tokens) -> per-step
# probabilities over the headline vocabulary.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [18]:
# Decoder input is every headline token except the last; the target is the
# same sequence shifted one step to the left, with a trailing axis of size 1
# added for the sparse categorical cross-entropy loss.
# NOTE(review): the visible preprocessing adds no explicit start/end-of-sequence
# tokens to the headlines, so the first headline token is only ever an input,
# never a target - confirm this is intended.
decoder_input_data = y_padded[:, :-1]
num_samples, summary_len = y_padded.shape
decoder_target_data = y_padded[:, 1:].reshape(num_samples, summary_len - 1, 1)

model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')
history = model.fit(
    x=[x_padded, decoder_input_data],
    y=decoder_target_data,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
[1m 404/1384[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m12:21[0m 757ms/step - loss: 6.8560

KeyboardInterrupt: 