# 안녕하세요^^ 
# AIVLE 미니 프로젝트에 오신 여러분을 환영합니다.
* 본 과정에서는 실제 사례와 데이터를 기반으로 문제를 해결하는 전체 과정을 자기 주도형 실습으로 진행해볼 예정입니다.
* 앞선 교육과정을 정리하는 마음과 지금까지 배운 내용을 바탕으로 문제 해결을 해볼게요!
* 미니 프로젝트를 통한 문제 해결 과정 'A에서 Z까지', 지금부터 시작합니다!

## Text Preprocessing
### reference
> * [Google guide](https://developers.google.com/machine-learning/guides/text-classification/step-3)
> * N-grams
>> * [scikit-learn working with text data](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#)
>> * [scikit-learn text feature extraction](https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction)
>> * [한글 자료](https://datascienceschool.net/03%20machine%20learning/03.01.03%20Scikit-Learn%EC%9D%98%20%EB%AC%B8%EC%84%9C%20%EC%A0%84%EC%B2%98%EB%A6%AC%20%EA%B8%B0%EB%8A%A5.html)
> * Sequence
>> * [keras text classification](https://keras.io/examples/nlp/text_classification_from_scratch/)
>> * [tensorflow text classification](https://www.tensorflow.org/tutorials/keras/text_classification)

### 0. 라이브러리 설치 및 불러오기

In [1]:
## import sklearn
import pandas as pd
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import tensorflow as tf
import nltk

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Plot settings: draw text with the "Malgun Gothic" font family and turn off the
# Unicode minus glyph on axes. (fm.findSystemFonts() can be used to list the
# fonts installed on this machine.)
plt.rcParams.update({
    'font.family': ["Malgun Gothic"],
    "axes.unicode_minus": False,
})

# Show the GPUs TensorFlow can see: physical devices first, then logical ones.
# An empty list means no GPU is visible.
for list_devices in (tf.config.list_physical_devices, tf.config.list_logical_devices):
    print(list_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[LogicalDevice(name='/device:GPU:0', device_type='GPU')]


### 1.  데이터 가져오기

In [2]:
# Load the spam dataset and discard every row that contains a missing value.
data = pd.read_csv('./data/spam.csv').dropna(axis=0)

#### 1-1. processing label

In [None]:
# Convert the label column to numeric codes: 'ham' -> 0, 'spam' -> 1.
#
# The result is assigned back to the column in a single step. The previous
# chained form (data['label'].loc[mask] = value) writes through an intermediate
# Series, which raises SettingWithCopyWarning and leaves `data` unchanged when
# pandas copy-on-write is active.
#
# Values that are not in the mapping (including already-converted 0/1) are kept
# as they are, so re-running this cell is harmless.
label_codes = {'ham': 0, 'spam': 1}
data['label'] = data['label'].map(lambda value: label_codes.get(value, value))
data.head()

In [4]:
# Features are the 'text' column; the target is the 'label' column.
x, y = data['text'], data['label']

### 2. Train Validation(Test) Split

In [None]:
# Split the labelled data into training (80%) and validation (20%) sets.
# The split is seeded so that re-running the notebook reproduces the same
# partition; the feature matrices and the labels are saved in separate cells
# further down and must come from the same split to stay aligned.
X_tr, X_val, Y_tr, Y_val = train_test_split(x, y, test_size=.2, random_state=42)

In [None]:
# Load the held-out test set; text and labels live in two separate CSV files.
# NOTE(review): unlike the training data, no dropna() and no 'ham'/'spam' -> 0/1
# mapping is applied here — confirm the test files are already clean and that
# their labels use the same encoding as data['label'].
test_text_frame = pd.read_csv('./data/spam_test_text.csv')
X_te = test_text_frame['text']
test_label_frame = pd.read_csv('./data/spam_test_label.csv')
Y_te = test_label_frame['label']

### 3. Vectorize texts

#### 3-1. N-grams Vectorize [참고](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#extracting-features-from-text-files)

In [6]:
# 1. Count vectorize
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is learned from the training texts only; the validation and
# test texts are then encoded against that fixed vocabulary.
count_vec = CountVectorizer(min_df=1)
x_train_c = count_vec.fit_transform(X_tr)
x_val_c, x_te_c = (count_vec.transform(texts) for texts in (X_val, X_te))

# (number of training documents, vocabulary size)
x_train_c.shape

(16071, 29532)

In [None]:
# count_mecab_vec = CountVectorizer(tokenizer=mecab.morphs, min_df=1)
# x_train_mecab_c = count_mecab_vec.fit_transform(X_tr)
# x_val_mecab_c = count_mecab_vec.transform(X_val)
# x_te_mecab_c = count_mecab_vec.transform(X_te)

In [7]:
# 2. TF-IDF transform
from sklearn.feature_extraction.text import TfidfTransformer

# The transformer is fitted on the training counts only and then applied
# unchanged to the validation and test count matrices.
transformer = TfidfTransformer(smooth_idf=True)
x_train_tfidf = transformer.fit_transform(x_train_c)
x_val_tfidf, x_te_tfidf = (transformer.transform(counts) for counts in (x_val_c, x_te_c))

# Same shape as the count matrix: the transform re-weights, it does not resize.
x_train_tfidf.shape

(16071, 29532)

In [None]:
# tfidf_vec = TfidfVectorizer(tokenizer=mecab.morphs)
# x_train_tfidfv = tfidf_vec.fit_transform(X_tr)
# x_val_tfidfv = tfidf_vec.transform(X_val)
# x_te_tfidfv = tfidf_vec.transform(X_te)
# x_train_tfidfv.shape

In [None]:
# TfidfVectorizer = CountVectorizer + TfidfTransformer (but 완벽하게 같지는 않음) 
# (x_train_tfidf != x_train_tfidfv)

In [None]:
# print('Count Vectorizer Vocabulary size: ', len(count_vec.vocabulary_))
# print('Count Vectorizer(Mecab tokenizer) Vocabulary size: ', len(count_mecab_vec.vocabulary_))
# print('TF-IDF Vectorizer(Mecab tokenizer) Vocabulary size: ', len(tfidf_vec.vocabulary_))

In [8]:
# Select the top 'k' vectorized features, scored with f_classif against the
# training labels.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

TOP_K = 20000

# k is capped at the number of available features so a small vocabulary does
# not make SelectKBest fail.
selector = SelectKBest(f_classif, k=min(TOP_K, x_train_tfidf.shape[1]))
selector.fit(x_train_tfidf, Y_tr)

# Apply the selector fitted on the training data to every split so that all
# three matrices share the same columns. The test split was previously left
# out, unlike in the other vectorization cells.
x_train_ngram = selector.transform(x_train_tfidf).astype('float32')
x_val_ngram = selector.transform(x_val_tfidf).astype('float32')
x_te_ngram = selector.transform(x_te_tfidf).astype('float32')

In [9]:
# Shape of the training matrix after feature selection:
# (number of training documents, number of selected features).
x_train_ngram.shape

(16071, 20000)

#### 3-2. Sequence Vectorize [참고](https://developers.google.com/machine-learning/guides/text-classification/step-3)

In [None]:
import tensorflow as tf
import numpy as np

# Use the public Keras preprocessing modules. `tensorflow.python.*` is not part
# of TensorFlow's public API and is not guaranteed to stay importable.
sequence = tf.keras.preprocessing.sequence
text = tf.keras.preprocessing.text

TOP_K = 20000              # passed to the Tokenizer as num_words
MAX_SEQUENCE_LENGTH = 100  # upper bound for the padded sequence length

# Split each message into morphemes and re-join them with spaces so that the
# Keras tokenizer can split on whitespace.
# NOTE(review): `mecab` is not defined anywhere in the visible notebook. An
# object exposing `.morphs(str)` must be created before this cell is run.
X_mor_tr = X_tr.apply(lambda x:' '.join(mecab.morphs(x)))
X_mor_val = X_val.apply(lambda x:' '.join(mecab.morphs(x)))
X_mor_te = X_te.apply(lambda x:' '.join(mecab.morphs(x)))

# The word index is built from the training texts only.
tokenizer = text.Tokenizer(num_words=TOP_K, char_level=False)
tokenizer.fit_on_texts(X_mor_tr)

# Encode every split as lists of integer word indices.
X_mor_tr_seq = tokenizer.texts_to_sequences(X_mor_tr)
X_mor_val_seq = tokenizer.texts_to_sequences(X_mor_val)
X_mor_te_seq = tokenizer.texts_to_sequences(X_mor_te)

# Padded length = longest training sequence, capped at MAX_SEQUENCE_LENGTH.
max_length = min(max(len(seq) for seq in X_mor_tr_seq), MAX_SEQUENCE_LENGTH)

print(max_length)

# Pad/truncate the *integer sequences* (not the raw text Series) to a common
# length, so that all three splits end up with the same number of columns.
X_mor_tr_seq = sequence.pad_sequences(X_mor_tr_seq, maxlen=max_length)
X_mor_val_seq = sequence.pad_sequences(X_mor_val_seq, maxlen=max_length)
X_mor_te_seq = sequence.pad_sequences(X_mor_te_seq, maxlen=max_length)

In [None]:
import pickle

# Persist the fitted tokenizer so the same word index can be reloaded later.
with open('tokenizer.pickle', 'wb') as pickle_file:
    pickle.Pickler(pickle_file, protocol=pickle.HIGHEST_PROTOCOL).dump(tokenizer)

# Number of distinct words seen by the tokenizer, plus one.
# NOTE(review): presumably the +1 accounts for index 0 used for padding — confirm.
len(tokenizer.word_index) + 1

### 4. Save data

#### 4-1. Save N-grams

In [12]:
# Check the shape of the data vectorized with the N-gram (TF-IDF) approach.
import numpy as np

for caption, tfidf_matrix in (
    ('X_train tfidf shape:', x_train_tfidf),
    ('X_val tfidf shape:', x_val_tfidf),
    ('X_te tfidf shape:', x_te_tfidf),
):
    print(caption, tfidf_matrix.shape)

X_train seq shape: (16071, 20000)
X_val seq shape: (4018, 20000)


In [13]:
# Save the preprocessed TF-IDF matrices so they can be reused for model training.
import scipy.sparse

for npz_name, tfidf_matrix in (
    ('X_tfidf_train', x_train_tfidf),
    ('X_tfidf_val', x_val_tfidf),
    ('X_tfidf_te', x_te_tfidf),
):
    scipy.sparse.save_npz(npz_name, tfidf_matrix)

#### 4-2. Save sequence

In [None]:
# Report the shape of each padded sequence array, then write all of them to disk.
sequence_splits = (
    ('X_mor_train seq shape:', 'X_mor_sequence_train', X_mor_tr_seq),
    ('X_mor_val seq shape:', 'X_mor_sequence_val', X_mor_val_seq),
    ('X_mor_te seq shape:', 'X_mor_sequence_te', X_mor_te_seq),
)

for caption, npy_name, padded in sequence_splits:
    print(caption, padded.shape)

for caption, npy_name, padded in sequence_splits:
    np.save(npy_name, arr=padded)

X_train seq shape: (16071, 20000)
X_val seq shape: (4018, 20000)


#### 4-3. Save label

In [15]:
# Check the shape of the label data for every split before saving it.
for caption, labels in (
    ('Y_train seq shape:', Y_tr),
    ('Y_val seq shape:', Y_val),
    ('Y_te seq shape:', Y_te),
):
    print(caption, labels.shape)

Y_train seq shape: (16071,)
Y_val seq shape: (4018,)


In [16]:
# Write the labels of each split to disk.
for npy_name, labels in (
    ('y_train', Y_tr),
    ('y_val', Y_val),
    ('y_te', Y_te),
):
    np.save(npy_name, arr=labels)