# **데이터 살펴보기**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re

In [None]:
#경로 설정
import os
os.chdir('./')

In [None]:
#파일 불러오기
train = pd.read_csv('/content/drive/MyDrive/Classification_Author/data/new_train.csv', encoding = 'utf-8')
test = pd.read_csv('/content/drive/MyDrive/Classification_Author/data/test_x.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('/content/drive/MyDrive/Classification_Author/data/sample_submission.csv', encoding = 'utf-8')

In [None]:
#train 데이터 살펴보기
train

In [None]:
#test 데이터 살펴보기
test

In [None]:
#sample_submission
sample_submission

# **전처리**

In [None]:
#부호를 제거해주는 함수
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

train['text']=train['text'].apply(alpha_num)

In [None]:
#부호가 사라진 것을 확인할 수 있습니다.
train['text'][4]

'Have mercy gentlemen odin flung up his hands Dont write that anyway have some shame Here Ive torn my heart asunder before you and you seize the opportunity and are fingering the wounds in both halves Oh my God'

In [None]:
# 불용어 제거해주는 함수
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in base_stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)

# 불용어
base_stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [None]:
len(base_stopwords)

153

In [None]:
#전처리 적용
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords)

In [None]:
# train test 분리
X_train = np.array([x for x in train['text']])
X_test = np.array([x for x in test['text']])
y_train = np.array([x for x in train['author']])


In [None]:
# nltk 라이브러리를 사용한 불용어 제거
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

nltk_stopword = set(stopwords.words('english'))

# Stopword 만 제거한 결과
def remove_stopwords_nltk(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in nltk_stopword:
            final_text.append(i.strip())
    return " ".join(final_text)


train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords_nltk)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords_nltk)

X_train = np.array([x for x in train['text']])
X_test = np.array([x for x in test['text']])
y_train = np.array([x for x in train['author']])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
len(nltk_stopword)

179

In [None]:
train

Unnamed: 0,index,text,author
0,0,almost choking much much wanted say strange ex...,3
1,1,sister asked suppose,2
2,2,engaged one day walked perusing janes last let...,1
3,3,captain porch keeping carefully way treacherou...,4
4,4,mercy gentlemen odin flung hands dont write an...,3
...,...,...,...
54874,54874,mr smith odin whispered hardly dared hope come,2
54875,54875,told plan captain us settled details accomplis...,4
54876,54876,sincere wellwisher friend sister lucy odin,1
54877,54877,wanted lend money,3


# **모델링**

In [None]:
pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 12.0MB/s eta 0:00:01[K     |█████████▌                      | 20kB 11.0MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 8.6MB/s eta 0:00:01[K     |███████████████████             | 40kB 8.0MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 4.7MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 5.4MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.1MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3092123 sha256=033b62497449140ea137f8a3fdc6fd62fe2d66cc5f7b0fe320194039ac90f945
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c15

In [None]:
# FastText 사용
import fasttext
import gensim
from gensim.models.keyedvectors import KeyedVectors

In [None]:
pip install -U gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/44/52/f1417772965652d4ca6f901515debcd9d6c5430969e8c02ee7737e6de61c/gensim-4.0.1-cp37-cp37m-manylinux1_x86_64.whl (23.9MB)
[K     |████████████████████████████████| 23.9MB 165kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.0.1


In [None]:
FastText = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Classification_Author/Embedding/fasttext.vec')

#FastText = gensim.models.Word2Vec.load_word2vec_format('/content/drive/MyDrive/colab/fasttext/fasttext.bin', binary = True)


In [None]:
print(FastText)

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7f37e2b16490>


In [None]:
Word2Vec_model = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Classification_Author/Embedding/GoogleNews-vectors-negative300.bin.gz', binary = True)

In [None]:
print(FastText_model)
print(Word2Vec_model)

<fasttext.FastText._FastText object at 0x7f383d0dad10>
<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7f383ce52bd0>


In [None]:
#파라미터 설정
vocab_size = 47136
embedding_dim = 16
max_length = 500
padding_type='post'
#oov_tok = "<OOV>"

In [None]:
#tokenizer에 fit
tokenizer = Tokenizer(num_words = vocab_size)#, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [None]:
print(tokenizer)
print('===')
print(word_index)

<keras_preprocessing.text.Tokenizer object at 0x7f37f931c5d0>
===


In [None]:
import time

In [None]:
# Fasttext 임베딩 과정
FT_embedding_matrix = np.zeros((vocab_size,100))

def get_vector(word):
    if word in FastText:
        return FastText[word]
    else:
        return None

for word, idx in word_index.items():
    temp = get_vector(word)
    if temp is not None:
        FT_embedding_matrix[idx] = temp
'''
for idx, word in word_index.items():
    if word in FastText:
      embedding_vector = FastText[word]
      FT_embedding_matrix[idx] = embedding_vector
'''  

In [None]:
# Word2Vec 임베딩 과정
W2V_embedding_matrix = np.zeros((vocab_size,300))

def get_vector(word):
    if word in Word2Vec_model:
        return Word2Vec_model[word]
    else:
        return None

for word,idx in word_index.items():
    temp = get_vector(word)
    if temp is not None:
        W2V_embedding_matrix[idx] = temp

In [None]:
#데이터를 sequence로 변환해주고 padding 해줍니다.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

In [None]:
#가벼운 NLP모델 생성
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [None]:
# compile model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
print(model.summary())


Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 16)           754176    
_________________________________________________________________
global_average_pooling1d_3 ( (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_9 (Dense)              (None, 5)                 125       
Total params: 754,709
Trainable params: 754,709
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# fit model
num_epochs = 20
history = model.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2)

Epoch 1/20
1372/1372 - 17s - loss: 1.5650 - accuracy: 0.2754 - val_loss: 1.5452 - val_accuracy: 0.2695
Epoch 2/20
1372/1372 - 16s - loss: 1.4464 - accuracy: 0.3755 - val_loss: 1.3431 - val_accuracy: 0.3936
Epoch 3/20
1372/1372 - 15s - loss: 1.2297 - accuracy: 0.4932 - val_loss: 1.1785 - val_accuracy: 0.5155
Epoch 4/20
1372/1372 - 15s - loss: 1.1085 - accuracy: 0.5422 - val_loss: 1.1064 - val_accuracy: 0.5402
Epoch 5/20
1372/1372 - 15s - loss: 1.0281 - accuracy: 0.5877 - val_loss: 1.0500 - val_accuracy: 0.5712
Epoch 6/20
1372/1372 - 15s - loss: 0.9586 - accuracy: 0.6246 - val_loss: 1.0058 - val_accuracy: 0.6074
Epoch 7/20
1372/1372 - 15s - loss: 0.8961 - accuracy: 0.6558 - val_loss: 0.9543 - val_accuracy: 0.6311
Epoch 8/20
1372/1372 - 15s - loss: 0.8346 - accuracy: 0.6846 - val_loss: 0.9184 - val_accuracy: 0.6503
Epoch 9/20
1372/1372 - 15s - loss: 0.7783 - accuracy: 0.7087 - val_loss: 0.8838 - val_accuracy: 0.6663
Epoch 10/20
1372/1372 - 16s - loss: 0.7310 - accuracy: 0.7298 - val_loss:

In [None]:
from keras.layers import LSTM

In [None]:
#가벼운 NLP모델 생성(Using FastText)
model_using_FT = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 100, weights = [FT_embedding_matrix] ,input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [None]:
# compile model
model_using_FT.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
print(model_using_FT.summary())


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 100)          4713600   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 24)                2424      
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 125       
Total params: 4,716,149
Trainable params: 4,716,149
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=4, verbose=1)

# fit model
num_epochs = 20
history = model_using_FT.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2, callbacks = [early_stopping])

Epoch 1/20
1372/1372 - 80s - loss: 1.5443 - accuracy: 0.3047 - val_loss: 1.4692 - val_accuracy: 0.4145
Epoch 2/20
1372/1372 - 78s - loss: 1.3030 - accuracy: 0.4614 - val_loss: 1.1872 - val_accuracy: 0.5059
Epoch 3/20
1372/1372 - 81s - loss: 1.0778 - accuracy: 0.5659 - val_loss: 1.0403 - val_accuracy: 0.5986
Epoch 4/20
1372/1372 - 80s - loss: 0.9152 - accuracy: 0.6514 - val_loss: 0.9070 - val_accuracy: 0.6514
Epoch 5/20
1372/1372 - 78s - loss: 0.7893 - accuracy: 0.7107 - val_loss: 0.8221 - val_accuracy: 0.7005
Epoch 6/20
1372/1372 - 78s - loss: 0.6962 - accuracy: 0.7496 - val_loss: 0.7973 - val_accuracy: 0.7026
Epoch 7/20
1372/1372 - 80s - loss: 0.6245 - accuracy: 0.7775 - val_loss: 0.7405 - val_accuracy: 0.7319
Epoch 8/20
1372/1372 - 80s - loss: 0.5763 - accuracy: 0.7914 - val_loss: 0.7196 - val_accuracy: 0.7351
Epoch 9/20
1372/1372 - 80s - loss: 0.5339 - accuracy: 0.8084 - val_loss: 0.7065 - val_accuracy: 0.7458
Epoch 10/20
1372/1372 - 79s - loss: 0.5032 - accuracy: 0.8181 - val_loss:

In [None]:
#가벼운 NLP모델 생성(Using Word2Vec)
model_using_W2V = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 300,weights = [W2V_embedding_matrix] ,input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [None]:
# compile model
model_using_W2V.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
print(model_using_W2V.summary())


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 300)          14140800  
_________________________________________________________________
global_average_pooling1d_2 ( (None, 300)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 24)                7224      
_________________________________________________________________
dense_7 (Dense)              (None, 5)                 125       
Total params: 14,148,149
Trainable params: 14,148,149
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=4, verbose=1)

# fit model
num_epochs = 20
history = model_using_W2V.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2, callbacks = [early_stopping])

Epoch 1/20
1372/1372 - 274s - loss: 1.4404 - accuracy: 0.3739 - val_loss: 1.1875 - val_accuracy: 0.5454
Epoch 2/20
1372/1372 - 272s - loss: 1.0449 - accuracy: 0.5868 - val_loss: 0.9771 - val_accuracy: 0.6449
Epoch 3/20
1372/1372 - 255s - loss: 0.8325 - accuracy: 0.6896 - val_loss: 0.8175 - val_accuracy: 0.6887
Epoch 4/20
1372/1372 - 258s - loss: 0.6901 - accuracy: 0.7499 - val_loss: 0.7454 - val_accuracy: 0.7265
Epoch 5/20
1372/1372 - 256s - loss: 0.6018 - accuracy: 0.7821 - val_loss: 0.7372 - val_accuracy: 0.7239
Epoch 6/20
1372/1372 - 255s - loss: 0.5423 - accuracy: 0.8024 - val_loss: 0.7250 - val_accuracy: 0.7337
Epoch 7/20
1372/1372 - 248s - loss: 0.4957 - accuracy: 0.8193 - val_loss: 0.7388 - val_accuracy: 0.7259
Epoch 8/20
1372/1372 - 245s - loss: 0.4594 - accuracy: 0.8309 - val_loss: 0.7434 - val_accuracy: 0.7302
Epoch 9/20
1372/1372 - 238s - loss: 0.4287 - accuracy: 0.8422 - val_loss: 0.7582 - val_accuracy: 0.7268
Epoch 10/20
1372/1372 - 256s - loss: 0.4028 - accuracy: 0.8531 -

In [None]:
# predict values
pred = model_using_FT.predict_proba(test_padded)

In [None]:
pred

In [None]:
# submission
sample_submission[['0','1','2','3','4']] = pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.244451,0.130629,0.211542,0.27031,0.143069
1,1,0.244451,0.130629,0.211542,0.27031,0.143069
2,2,0.244451,0.130629,0.211542,0.27031,0.143069
3,3,0.244451,0.130629,0.211542,0.27031,0.143069
4,4,0.244451,0.130629,0.211542,0.27031,0.143069
...,...,...,...,...,...,...
19612,19612,0.244451,0.130629,0.211542,0.27031,0.143069
19613,19613,0.244451,0.130629,0.211542,0.27031,0.143069
19614,19614,0.244451,0.130629,0.211542,0.27031,0.143069
19615,19615,0.244451,0.130629,0.211542,0.27031,0.143069


In [None]:
sample_submission.to_csv('/content/drive/MyDrive/Classification_Author/submission.csv', index = False, encoding = 'utf-8')

#Result of Train  
## Base stopwords + Keras Embedding  
1372/1372 - 7s - loss: 0.4276 - accuracy: 0.8455 - val_loss: 0.7625 - val_accuracy: 0.7305
## Base stopwords + FastText  
1372/1372 - 78s - loss: 0.4719 - accuracy: 0.8304 - val_loss: 0.7002 - val_accuracy: 0.7514
## Base stopwords + Word2Vec
1372/1372 - 256s - loss: 0.4028 - accuracy: 0.8531 - val_loss: 0.7596 - val_accuracy: 0.7366



