<a href="https://colab.research.google.com/github/kiseijuuzzz/Python_Quest/blob/main/news_sammary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import urllib.request

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Download the NLTK resources used by the preprocessing and summarisation
# steps. 'punkt_tab' is fetched in addition to 'punkt' because recent NLTK
# releases load the Punkt tokenizer tables from 'punkt_tab'; with only
# 'punkt' installed, word_tokenize()/sent_tokenize() raise LookupError there.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Download the news-summary dataset to the working directory.
url = "https://raw.githubusercontent.com/sunnysai12345/News_Summary/master/news_summary_more.csv"
file_path = "news_summary_more.csv"
urllib.request.urlretrieve(url, filename=file_path)

# Load the dataset; the 'text' and 'headlines' columns are used below.
data = pd.read_csv(file_path, encoding='iso-8859-1')

# Text-cleaning helper applied to both the article and the headline columns.
_STOP_WORDS = None  # cache for the English stop-word set, filled on first use


def _english_stop_words():
    """Return NLTK's English stop words as a set, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Loaded lazily so the corpus is only touched after it was downloaded.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise a piece of English text for tokenisation.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, the result is split with NLTK's
    ``word_tokenize`` and English stop words are dropped.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # The stop-word set is cached: this function runs once per dataset row,
    # and re-reading the corpus on every call is pure overhead.
    stop_words = _english_stop_words()
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)  # strip special characters
    word_tokens = word_tokenize(text)
    return ' '.join(w for w in word_tokens if w not in stop_words)

# Clean both text columns in place with the shared preprocessing function.
for column in ('text', 'headlines'):
    data[column] = data[column].apply(preprocess_text)

# Articles are the model inputs; headlines are the targets.
X, Y = data['text'], data['headlines']

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# ---------------------------------------------------------------------------
# Attention-based seq2seq headline generator: tokenisation, model, training
# and greedy decoding.
# ---------------------------------------------------------------------------

# Sequence-length settings shared by encoding, the model and decoding.
MAX_TEXT_LEN = 30        # encoder time steps (article tokens)
MAX_SUMMARY_LEN = 8      # headline tokens kept, not counting the markers
DECODER_LEN = MAX_SUMMARY_LEN + 1  # decoder steps: start marker + headline
START_TOKEN = 'starttoken'
END_TOKEN = 'endtoken'
DECODE_BATCH_SIZE = 64   # rows decoded per forward pass (bounds softmax memory)

# Fit the tokenizers on the training split only.
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(X_train))

# The headline tokenizer is fitted on marker-wrapped headlines so that the
# start/end markers receive regular vocabulary ids.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(
    [START_TOKEN + ' ' + headline + ' ' + END_TOKEN for headline in Y_train]
)
start_id = y_tokenizer.word_index[START_TOKEN]
end_id = y_tokenizer.word_index[END_TOKEN]

# Integer-encode and pad the encoder inputs.
X_train_seq = x_tokenizer.texts_to_sequences(X_train)
X_test_seq = x_tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_TEXT_LEN, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_TEXT_LEN, padding='post')


def _frame_and_pad(sequences):
    """Wrap id sequences as [start, tokens..., end] and right-pad with zeros.

    Headlines are cut to their first MAX_SUMMARY_LEN tokens *before* the
    markers are added, so truncation can never remove the start or end marker.
    """
    framed = [[start_id] + seq[:MAX_SUMMARY_LEN] + [end_id] for seq in sequences]
    return pad_sequences(framed, maxlen=MAX_SUMMARY_LEN + 2, padding='post')


# Integer-encode the headlines and add the start/end markers.
Y_train_seq = y_tokenizer.texts_to_sequences(Y_train)
Y_test_seq = y_tokenizer.texts_to_sequences(Y_test)
Y_train_input = _frame_and_pad(Y_train_seq)
Y_test_input = _frame_and_pad(Y_test_seq)

# Teacher forcing: the decoder reads the sequence without its last position
# and is trained to predict the same sequence shifted left by one.
decoder_train_in, decoder_train_target = Y_train_input[:, :-1], Y_train_input[:, 1:]
decoder_test_in, decoder_test_target = Y_test_input[:, :-1], Y_test_input[:, 1:]

# Model definition.
latent_dim = 300

encoder_inputs = Input(shape=(MAX_TEXT_LEN,))
enc_emb = Embedding(len(x_tokenizer.word_index) + 1, latent_dim, trainable=True)(encoder_inputs)

# Encoder LSTM: per-step outputs feed the attention layer, the final states
# initialise the decoder.
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Decoder LSTM. The input length matches the arrays actually fed to the model
# (DECODER_LEN columns), so the declared and the real shapes agree.
decoder_inputs = Input(shape=(DECODER_LEN,))
dec_emb_layer = Embedding(len(y_tokenizer.word_index) + 1, latent_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Dot-product attention: decoder states are the queries, encoder outputs the values.
attn_layer = Attention()
attn_out = attn_layer([decoder_outputs, encoder_outputs])

# Concatenate the attention context with the decoder LSTM output.
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

# Per-step softmax over the headline vocabulary.
decoder_dense = Dense(len(y_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# Train the model.
history = model.fit(
    [X_train_pad, decoder_train_in], decoder_train_target,
    epochs=50, batch_size=128,
    validation_data=([X_test_pad, decoder_test_in], decoder_test_target),
)

# Plot the training and validation loss.
# NOTE(review): the labels are Korean; matplotlib's default font may not
# contain Hangul glyphs - confirm a suitable font is configured.
plt.plot(history.history['loss'], label='훈련 손실')
plt.plot(history.history['val_loss'], label='검증 손실')
plt.xlabel('에포크')
plt.ylabel('손실')
plt.legend()
plt.show()


def _greedy_decode(encoder_batch):
    """Generate headline ids for a batch of padded articles, one step at a time.

    Decoding starts from the start marker only; the reference headline is
    never fed to the model. At step t the full model is run on the tokens
    produced so far and the arg-max at position t becomes token t + 1. The
    decoder LSTM is unidirectional and attention only looks at the encoder,
    so position t does not depend on the still-empty later positions.

    Returns:
        An int array of shape (len(encoder_batch), DECODER_LEN) with the
        generated token ids.
    """
    decoder_batch = np.zeros((len(encoder_batch), DECODER_LEN), dtype='int32')
    decoder_batch[:, 0] = start_id
    generated = np.zeros_like(decoder_batch)
    for step in range(DECODER_LEN):
        probs = model.predict_on_batch([encoder_batch, decoder_batch])
        generated[:, step] = probs[:, step, :].argmax(axis=-1)
        if step + 1 < DECODER_LEN:
            decoder_batch[:, step + 1] = generated[:, step]
    return generated


def _ids_to_text(token_ids):
    """Convert generated ids to words, stopping at the end marker or padding."""
    words = []
    for token_id in token_ids:
        token_id = int(token_id)
        if token_id == 0 or token_id == end_id:
            break
        if token_id == start_id:
            continue
        # Id 0 (padding) has no entry in index_word, hence .get() with a guard.
        word = y_tokenizer.index_word.get(token_id)
        if word is not None:
            words.append(word)
    return ' '.join(words)


# Generate abstractive summaries for the test set in mini-batches; batching
# avoids one predict call per row and keeps the softmax output small.
generated_summaries = []
for offset in range(0, len(X_test_pad), DECODE_BATCH_SIZE):
    batch_ids = _greedy_decode(X_test_pad[offset:offset + DECODE_BATCH_SIZE])
    generated_summaries.extend(_ids_to_text(row) for row in batch_ids)

# Print the reference headline next to the generated one for the first five
# test samples, with a blank line after each pair.
for sample_pos in range(5):
    print("원래 요약문:", Y_test.iloc[sample_pos])
    print("추상적 요약 결과:", generated_summaries[sample_pos], end="\n\n")

# Extractive baseline: use the first two sentences of each article.
# The preprocessed column has had all punctuation replaced by spaces, so it
# no longer contains sentence boundaries; the unprocessed articles are
# reloaded and looked up through the index labels that X_test kept from the
# original DataFrame.
raw_articles = pd.read_csv(file_path, encoding='iso-8859-1')['text']
for i in range(5):  # first five test samples only
    text = raw_articles.loc[X_test.index[i]]
    extracted_summary = ' '.join(sent_tokenize(text)[:2])
    print("원문:", text)
    print("추출적 요약 결과:", extracted_summary)
    print()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
