<a href="https://colab.research.google.com/github/Rnlcksgdkd/Project_AI/blob/ando/NLP_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **I. 텍스트 전처리**

> ## **형태소 분석기 어간추출**

In [None]:
# Okt is the Open Korean Text morphological analyzer shipped with KoNLPy;
# the class lives in konlpy.tag (there is no importable module named `okt`).
from konlpy.tag import Okt

okt = Okt()
# `text` must already hold the sentence to analyze.
# stem=True asks the analyzer to stem the returned tokens.
okt.morphs(text, stem=True)

> ## **최대단어수 및 최대단어길이 제한**

In [None]:
# Keep at most 30 words per sentence and at most 10 characters per word.
# X is expected to be a sequence of sentences, each a sequence of word strings.
X = [[word[:10] for word in sentence[:30]] for sentence in X]

# Preview the first sentences; slicing (instead of indexing 0..4) avoids an
# IndexError when X holds fewer than five sentences.
for sentence in X[:5]:
    print(sentence)

> ## **html 태그 삭제**

In [None]:
# I : input  (string containing HTML markup)
# O : output (the same text with the tags removed)

# The BeautifulSoup class is exported by the bs4 package.
from bs4 import BeautifulSoup

# NOTE: `input` is a placeholder for the HTML string; it must be assigned
# before this line runs, otherwise it refers to the builtin input() function.
output = BeautifulSoup(input, 'html5lib').get_text()

> ## **정규식 이용**

In [None]:
# Character clean-up function for Korean (and English) text
def clean_str(string):
    """Normalize a raw sentence into lower-cased, space-separated tokens.

    Processing order:
      1. Every character other than Hangul syllables, ASCII letters, digits,
         parentheses, comma, exclamation mark, question mark, apostrophe and
         backtick is replaced by a space.
      2. English contractions ('s, 've, n't, 're, 'd, 'll) are split off with
         a leading space.
      3. Comma, exclamation mark, parentheses and question mark are surrounded
         by spaces.
      4. Runs of whitespace collapse to one space, then all apostrophes are
         removed.

    Args:
        string: The text to clean.

    Returns:
        The cleaned, lower-cased text. Leading/trailing spaces are kept.
    """
    # Local import so the function also works when no earlier cell imported re.
    import re

    string = re.sub(r"[^가-힣A-Za-z0-9(),!?\'\`]", " ", string)

    # (pattern, replacement) pairs, applied strictly in this order.
    # The replacements for "(", ")" and "?" are plain characters: escaping them
    # in a replacement string would emit a literal backslash into the output.
    rules = (
        (r"'s", " 's"),
        (r"'ve", " 've"),
        (r"n't", " n't"),
        (r"'re", " 're"),
        (r"'d", " 'd"),
        (r"'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
        (r"'{2,}", "'"),
        (r"'", ""),
    )
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)

    return string.lower()


In [None]:

# Clean-up that keeps Hangul only
def clean_hangle(string):
    """Remove every character that is neither Hangul nor whitespace.

    The character class is negated so that Hangul syllables, Hangul jamo and
    whitespace survive, matching the pattern used by ``preprocessing`` in this
    notebook.

    Args:
        string: The text to clean.

    Returns:
        ``string`` with all non-Hangul, non-whitespace characters deleted.
    """
    # Local import so the function also works when no earlier cell imported re.
    import re

    return re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", string)

# Clean-up that keeps English letters only
def clean_english(string):
    """Replace every character that is not an ASCII letter with a space.

    Args:
        string: The text to clean.

    Returns:
        ``string`` with each non-letter character replaced by one space
        (consecutive spaces are not collapsed).
    """
    # Local import so the function also works when no earlier cell imported re.
    import re

    return re.sub("[^a-zA-Z]", " ", string)



> ## **불용어 사전 정의**

In [None]:

# English stop words (the list defined by NLTK)

import nltk
from nltk.corpus import stopwords

# Download the stop-word corpus before it is read below.
nltk.download('stopwords')

# De-duplicated collection of the English stop words.
stop_words = {word for word in stopwords.words('english')}


In [None]:

# Korean stop-word definitions (particles, endings and other very common morphemes)

stop_words = [
    '은', '는', '이', '가',
    '하', '아', '것', '들',
    '의', '있', '되', '수',
    '보', '주', '등', '한',
]


> ## **전처리 함수**
  - re 모듈 이용한 문자정리
  - okt 형태소 추출
  - 불용어 처리  
  


In [None]:

def preprocessing(text, okt, limit_words, stop_words, print_option=False):
    """Clean one sentence and return its stop-word-filtered morphemes.

    Stages, in order:
      1. Delete every character that is not Hangul or whitespace.
      2. Split on whitespace, keep at most ``limit_words[0]`` words and
         truncate each kept word to ``limit_words[1]`` characters, then join
         the words back with single spaces.
      3. Run ``okt.morphs(..., stem=True)`` on the resulting string.
      4. Drop every token contained in ``stop_words``.

    Args:
        text: The raw sentence (a string).
        okt: An object exposing ``morphs(text, stem=True)`` that returns a
            list of tokens.
        limit_words: Indexable pair ``(max_word, max_length)``.
        stop_words: Container of tokens to remove.
        print_option: When True, print the text after every stage.

    Returns:
        The list of remaining tokens.
    """
    # Local import so the function also works when no earlier cell imported re.
    import re

    def _trace(stage, value):
        # Print one labelled intermediate result, only in verbose mode.
        if print_option:
            print(stage.ljust(15, ' ') + ": ", value)

    _trace("원본", text)

    text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", text)
    _trace("정규표현식 처리", text)

    max_word = limit_words[0]
    max_length = limit_words[1]
    # Limit by words, not characters, and hand a string (not a list) to the
    # morphological analyzer in the next stage.
    words = text.split()
    text = " ".join(word[:max_length] for word in words[:max_word])
    _trace("문장 길이 조절", text)

    text = okt.morphs(text, stem=True)
    _trace("okt 형태소 추출", text)

    text = [token for token in text if token not in stop_words]
    _trace("불용어 처리", text)

    # Blank separator lines belong to the verbose trace only; printing them
    # unconditionally would emit two lines per processed sentence.
    if print_option:
        print(" ")
        print(" ")

    return text

# **II. Tokenizer/Vectorizer**


> ## **Keras Tokenizer/Padding 이용**
  - 말뭉치 (sentences) , num_words (단어사전 길이)  
  

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def Tokenize_Padding(sentences, num_words=20000):
    """Fit a Keras Tokenizer on a corpus and return it with the padded sequences.

    Args:
        sentences: The corpus; passed to ``Tokenizer.fit_on_texts`` and
            ``Tokenizer.texts_to_sequences``.
        num_words: Vocabulary size limit handed to ``Tokenizer``.

    Returns:
        A ``(tokenizer, X)`` tuple: the fitted tokenizer and the index
        sequences of ``sentences`` after ``pad_sequences(..., padding='post')``.
    """
    # Honor the caller's vocabulary size instead of a hard-coded constant.
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(sentences)

    sequences = tokenizer.texts_to_sequences(sentences)
    # Pad the sequences computed above (padding is appended after each sequence).
    X = pad_sequences(sequences, padding='post')
    return tokenizer, X


In [None]:

# NOTE(review): fit_tokenizer is not defined anywhere in this notebook —
# presumably a helper from another version of these utilities; confirm.
tokenizer = fit_tokenizer(20000 ,clean_text)

# NOTE(review): these calls pass four positional arguments
# (tokenizer, texts, 15, 'post'), whereas the Tokenize_Padding defined in this
# notebook accepts (sentences, num_words) and returns a (tokenizer, X) tuple.
# The call sites and the definition do not match — reconcile before running.
train_inputs = Tokenize_Padding(tokenizer , clean_text , 15 ,  'post')
test_inputs = Tokenize_Padding(tokenizer , clean_text_test , 15 , 'post')

# Labels are read from the 'label' column of each DataFrame.
train_labels = np.array(train_df['label'])
test_labels = np.array(test_df['label'])

# Last expression of the cell: shows the shapes of all four arrays.
train_inputs.shape , test_inputs.shape , train_labels.shape , test_labels.shape

# **III. Data Save**

In [None]:
import json

# Output directory; every file below is written into it.
data_path = "/content/Project_AI/ando/NaverMovie/"

# Source DataFrames, saved without the index column.
train_df.to_csv(data_path + "Naver_MR_train.csv", index=False)
test_df.to_csv(data_path + "Naver_MR_test.csv", index=False)

# Model inputs and labels. Each test file is written from the test array
# (not from the train array).
np.save(data_path + "Naver_MR_train_input.npy", train_inputs)
np.save(data_path + "Naver_MR_test_input.npy", test_inputs)
np.save(data_path + "Naver_MR_train_label.npy", train_labels)
np.save(data_path + "Naver_MR_test_label.npy", test_labels)

# NOTE(review): word_vocab is not defined in this notebook — presumably the
# tokenizer's word-to-index mapping; confirm where it comes from.
data_configs = {
    'vocab': word_vocab,
    # presumably +1 accounts for the padding index 0 — verify against the model
    'vocab_size': len(word_vocab) + 1,
}

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is fixed explicitly; the context manager guarantees the file is flushed and closed.
with open(data_path + "data_config.json", 'w', encoding='utf-8') as config_file:
    json.dump(data_configs, config_file, ensure_ascii=False)