<a href="https://colab.research.google.com/github/SeohyunLyoo/Study/blob/main/Attention%20-%20Summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf
import nltk
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
from google.colab import drive
drive.mount('/content/drive')


In [None]:
%cd drive/MyDrive/Study



# **1. 데이터 준비 및 불용어 취합**

In [12]:
data = pd.read_csv('Reviews.csv', nrows = 100000)
print(len(data)), print(data.info())

100000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      100000 non-null  int64 
 1   ProductId               100000 non-null  object
 2   UserId                  100000 non-null  object
 3   ProfileName             99994 non-null   object
 4   HelpfulnessNumerator    100000 non-null  int64 
 5   HelpfulnessDenominator  100000 non-null  int64 
 6   Score                   100000 non-null  int64 
 7   Time                    100000 non-null  int64 
 8   Summary                 99998 non-null   object
 9   Text                    100000 non-null  object
dtypes: int64(5), object(5)
memory usage: 7.6+ MB
None


(None, None)

In [13]:
data.isnull().sum()

Unnamed: 0,0
Id,0
ProductId,0
UserId,0
ProfileName,6
HelpfulnessNumerator,0
HelpfulnessDenominator,0
Score,0
Time,0
Summary,2
Text,0


In [14]:
df = data[['Text', 'Summary']]
df.head()

Unnamed: 0,Text,Summary
0,I have bought several of the Vitality canned d...,Good Quality Dog Food
1,Product arrived labeled as Jumbo Salted Peanut...,Not as Advertised
2,This is a confection that has been around a fe...,"""Delight"" says it all"
3,If you are looking for the secret ingredient i...,Cough Medicine
4,Great taffy at a great price. There was a wid...,Great taffy


In [15]:
### Text 및 Summary 항목 중복 여부 체크
df['Text'].nunique(), df['Summary'].nunique()

(88426, 72348)

In [16]:
### Text열 중복 데이터 삭제 (Summary는 중복 데이터 유지, Text에 대한 동일 Summary 지원)
df.drop_duplicates(subset = ['Text'], inplace = True)
print(len(df))

88426


In [17]:
### Text 및 Summary Null 값 여부 확인
df['Text'].isnull().sum(), df['Summary'].isnull().sum()

(0, 1)

In [18]:
### Summary 항목 null 값인 데이터 삭제
df.dropna(axis=0, inplace=True)
df['Text'].isnull().sum(), df['Summary'].isnull().sum()

(0, 0)

In [19]:
### NLTK 불용어 취합
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
len(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


179

In [20]:
print(stop_words)

{'is', 'few', 'didn', "you've", 'they', 'we', "haven't", 'was', 'how', 'theirs', 'until', 'then', 'it', 'll', 'shan', 'too', 'on', 'had', 'yours', 'before', "shouldn't", 'out', 'why', 'don', 'over', 'all', 'just', 'once', 'ma', 'same', 'i', 'shouldn', 'here', 'each', 'ours', "shan't", "couldn't", 'after', "aren't", 'above', "isn't", 'very', 'having', 'when', 'only', 'won', 'been', 'in', "doesn't", 'has', 'ourselves', 'but', 'down', 'with', 'are', 'o', 'because', 'which', 'him', "don't", 'haven', 'hasn', "needn't", 'wasn', 'couldn', 'against', 'you', 'off', 'm', 'y', "you're", 'by', 'their', 'mightn', 'our', 'hers', 'from', 'his', 'myself', "it's", 'and', 'will', 'can', 'most', 't', 'my', 'again', 'through', 'if', 'those', 'wouldn', 'being', 'than', 'into', 'own', "hasn't", 'a', "should've", 'ain', "she's", 'the', "you'd", 'your', 'himself', 'them', 'should', 'not', "mustn't", 'these', 're', 'yourself', 'where', 'while', 'aren', 'hadn', 'other', 'am', 'needn', 'what', 'of', 've', 'under

# **2. 데이터 전처리**

In [None]:
!pip install contractions
import contractions

In [33]:
def preprocess_sentence(sentence, remove_stopwords=True):

    if not isinstance(sentence, str):
        sentence = str(sentence)

    sentence = sentence.lower()                             ### 텍스트 소문자화
    sentence = BeautifulSoup(sentence, "lxml").text         ### ,  등의 html 태그 제거
    sentence = re.sub(r'\([^)]*\)', '', sentence)           ### 괄호로 닫힌 문자열  제거 Ex) my husband (and myself) for => my husband for
    sentence = re.sub('"','', sentence)                     ### 쌍따옴표 " 제거

    ###sentence = ' '.join([contractions[t] if t in contractions else t for t in sentence.split(" ")])           ### 약어 정규화
    sentence = ' '.join([contractions.fix(t) for t in sentence.split(" ")])

    sentence = re.sub(r"'s\b","",sentence)                  ### 소유격 제거. Ex) roland's -> roland
    sentence = re.sub("[^a-zA-Z]", " ", sentence)           ### 영어 외 문자(숫자, 특수문자 등) 공백으로 변환
    sentence = re.sub('[m]{2,}', 'mm', sentence)            ### m이 3개 이상이면 2개로 변경. Ex) ummmmmmm yeah -> umm yeah

    ### 불용어 제거
    if remove_stopwords:
        tokens = ' '.join(word for word in sentence.split() if not word in stop_words if len(word) > 1)
    ### 불용어 미제거 (Summary)
    else:
        tokens = ' '.join(word for word in sentence.split() if len(word) > 1)

    return tokens

In [29]:
### Text : 불용어 제거, Summary : 불용어 유지
temp_text = 'Everything I bought was great, infact I ordered twice and the third ordered was<br />for my mother and father.'
temp_summary = 'Great way to start (or finish) the day!!!'
print(preprocess_sentence(temp_text))
print(preprocess_sentence(temp_summary, 0))

everything bought great infact ordered twice third ordered wasfor mother father
great way to start the day


In [36]:
### Text 전처리
clean_text = []

for s in df['Text']:
    clean_text.append(preprocess_sentence(s))

print(clean_text[:5])

['bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better', 'product arrived labeled jumbo salted peanuts peanuts actually small sized unsalted sure error vendor intended represent product jumbo', 'confection around centuries light pillowy citrus gelatin nuts case filberts cut tiny squares liberally coated powdered sugar tiny mouthful heaven chewy flavorful highly recommend yummy treat familiar story lewis lion witch wardrobe treat seduces edmund selling brother sisters witch', 'looking secret ingredient robitussin believe found got addition root beer extract ordered made cherry soda flavor medicinal', 'great taffy great price wide assortment yummy taffy delivery quick taffy lover deal']


In [37]:
### Summary 전처리
clean_summary = []

for s in df['Summary']:
    clean_summary.append(preprocess_sentence(s, 0))

clean_summary[:5]

['good quality dog food',
 'not as advertised',
 'delight says it all',
 'cough medicine',
 'great taffy']

In [38]:
df['Text'] = clean_text
df['Summary'] = clean_summary
df.head()

Unnamed: 0,Text,Summary
0,bought several vitality canned dog food produc...,good quality dog food
1,product arrived labeled jumbo salted peanuts p...,not as advertised
2,confection around centuries light pillowy citr...,delight says it all
3,looking secret ingredient robitussin believe f...,cough medicine
4,great taffy great price wide assortment yummy ...,great taffy


In [40]:
# 길이가 공백인 샘플은 NULL 값으로 변환
df.replace('', np.nan, inplace=True)
print(df.isnull().sum())

Text        0
Summary    56
dtype: int64


In [41]:
### SUMMARY 항목이 NULL인 경우 해당 건 삭제
df.dropna(axis=0, inplace=True)
len(df)

88369

In [42]:
df.isnull().sum()

Unnamed: 0,0
Text,0
Summary,0


In [44]:
### Text / Summary Padding Size 결정
text_max_len = 50
summary_max_len = 10

In [47]:
df = df[df['Text'].apply(lambda x: len(x.split()) <= text_max_len)]
df = df[df['Summary'].apply(lambda x: len(x.split()) <= summary_max_len)]
print('전체 샘플수 :',(len(df)))

전체 샘플수 : 67610


In [48]:
df[:5]

Unnamed: 0,Text,Summary
0,bought several vitality canned dog food produc...,good quality dog food
1,product arrived labeled jumbo salted peanuts p...,not as advertised
2,confection around centuries light pillowy citr...,delight says it all
3,looking secret ingredient robitussin believe f...,cough medicine
4,great taffy great price wide assortment yummy ...,great taffy


In [49]:
### decoder_input, decoder_output 시작/종료 토큰 추가
df['decoder_input'] = df['Summary'].apply(lambda x : 'sostoken ' + x)
df['decoder_output'] = df['Summary'].apply(lambda x : x + ' eostoken')
df.head()

Unnamed: 0,Text,Summary,decoder_input,decoder_output
0,bought several vitality canned dog food produc...,good quality dog food,sostoken good quality dog food,good quality dog food eostoken
1,product arrived labeled jumbo salted peanuts p...,not as advertised,sostoken not as advertised,not as advertised eostoken
2,confection around centuries light pillowy citr...,delight says it all,sostoken delight says it all,delight says it all eostoken
3,looking secret ingredient robitussin believe f...,cough medicine,sostoken cough medicine,cough medicine eostoken
4,great taffy great price wide assortment yummy ...,great taffy,sostoken great taffy,great taffy eostoken


In [51]:
### Encoder / Decoder의 레이블 지정
encoder_input  = np.array(df['Text'])
decoder_input  = np.array(df['decoder_input'])
decoder_target = np.array((df['decoder_output']))