<a href="https://colab.research.google.com/github/TitusChoi/Novelist_Classification/blob/min/Baseline_Bi_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **데이터 살펴보기**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install attention

In [3]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
import gensim
from gensim.models.keyedvectors import KeyedVectors
from keras.layers import LSTM, Bidirectional, Dropout
from attention import Attention
from keras.optimizers import Adam,Nadam
import nltk

In [7]:
#파일 불러오기
train = pd.read_csv('/content/drive/MyDrive/Novelist_Classification/datasets/new_train.csv', encoding = 'utf-8')
test = pd.read_csv('/content/drive/MyDrive/Novelist_Classification/datasets/test_x.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('/content/drive/MyDrive/Novelist_Classification/datasets/sample_submission.csv', encoding = 'utf-8')

# **전처리**

In [8]:
#부호를 제거해주는 함수
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

In [9]:
# 불용어 제거해주는 함수
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in base_stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)

# 불용어
base_stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [10]:
#전처리 적용
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords)

In [11]:
# train test 분리
X_train = np.array([x for x in train['text']])
X_test = np.array([x for x in test['text']])
y_train = np.array([x for x in train['author']])

# **모델링**

In [12]:
#파라미터 설정
vocab_size = 63728
embedding_dim = 16
max_length = 500
padding_type='post'
#oov_tok = "<OOV>"

In [13]:
#tokenizer에 fit
tokenizer = Tokenizer(num_words = vocab_size)#, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [14]:
# 사전 학습된 glove 불러오고 임베딩 층에 적용시키기 
embedding_dict= dict()
f = open('/content/drive/MyDrive/Novelist_Classification/embbeding/glove.txt', encoding='utf8')

for line in f:
    word_vector = line.split()
    word = word_vector[0]
    word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
    embedding_dict[word] = word_vector_arr
f.close

embedding_matrix = np.zeros((vocab_size, 50))

for word, i in word_index.items():
    temp = embedding_dict.get(word)
    if temp is not None:
        embedding_matrix[i] = temp

In [15]:
# Glove 임베딩 과정
vocab = nltk.FreqDist(np.hstack(train['text']))

glove = dict()
f = open('/content/drive/MyDrive/Novelist_Classification/embbeding/glove.txt',encoding='UTF8')
for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    glove[word] = vector

f.close()

In [16]:
FastText = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Novelist_Classification/embbeding/fasttext.vec')

In [17]:
# Fasttext 임베딩 과정
FT_embedding_matrix = np.zeros((vocab_size,100))

def get_vector(word):
    if word in FastText:
        return FastText[word]
    else:
        return None

for word, idx in word_index.items():
    temp = get_vector(word)
    if temp is not None:
        FT_embedding_matrix[idx] = temp

In [18]:
#데이터를 sequence로 변환해주고 padding 해줍니다.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

In [19]:
# loss감소가 10번 이상 미발생이면 stop
earlystopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss',mode = 'min', patience=5, verbose=1)

In [None]:
# Bi-LSTM 2계층

In [52]:
#가벼운 NLP모델 생성
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64,return_sequences=True,dropout=0.5)),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    tf.keras.layers.Dense(8, activation='softmax')
])

In [53]:
# compile model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='Adam',
              metrics=['accuracy'])

# model summary
print(model.summary())


Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 500, 16)           1019648   
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 500, 128)          41472     
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dropout_12 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 8)                 1032      
Total params: 1,160,968
Trainable params: 1,160,968
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# fit model
num_epochs = 30
history = model.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2, callbacks = [earlystopper])

In [54]:
#가벼운 NLP모델 생성(Using FastText)
model_using_FT = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 100, weights = [FT_embedding_matrix] ,input_length=max_length),
    Bidirectional(LSTM(64,return_sequences=True,dropout=0.5)),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    tf.keras.layers.Dense(8, activation='softmax')
])

In [55]:
# compile model
model_using_FT.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
print(model_using_FT.summary())


Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 500, 100)          6372800   
_________________________________________________________________
bidirectional_14 (Bidirectio (None, 500, 128)          84480     
_________________________________________________________________
bidirectional_15 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dropout_13 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 8)                 1032      
Total params: 6,557,128
Trainable params: 6,557,128
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# fit model
num_epochs = 30
history = model_using_FT.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2, callbacks = [earlystopper])

In [56]:
#가벼운 NLP모델 생성(Using Glove)
model_using_Glove = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 50,weights = [embedding_matrix] ,input_length=max_length),
    Bidirectional(LSTM(64,return_sequences=True,dropout=0.5)),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    tf.keras.layers.Dense(8, activation='softmax')
])

In [57]:
# compile model
model_using_Glove.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
print(model_using_Glove.summary())


Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 500, 50)           3186400   
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 500, 128)          58880     
_________________________________________________________________
bidirectional_17 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dropout_14 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 8)                 1032      
Total params: 3,345,128
Trainable params: 3,345,128
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# fit model
num_epochs = 30
history = model_using_Glove.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2, callbacks = [earlystopper])

In [None]:
# Bi-LSTM + Attention Mechanism

In [46]:
#가벼운 NLP모델 생성
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64,return_sequences=True,dropout=0.3)),
    Dropout(0.5),
    Attention(32),
    tf.keras.layers.Dense(8, activation='softmax')
])

In [47]:
# compile model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='Adam',
              metrics=['accuracy'])

# model summary
print(model.summary())


Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 500, 16)           1019648   
_________________________________________________________________
bidirectional_9 (Bidirection (None, 500, 128)          41472     
_________________________________________________________________
dropout_9 (Dropout)          (None, 500, 128)          0         
_________________________________________________________________
last_hidden_state (Lambda)   (None, 128)               0         
_________________________________________________________________
attention_score_vec (Dense)  (None, 500, 128)          16384     
_________________________________________________________________
attention_score (Dot)        (None, 500)               0         
_________________________________________________________________
attention_weight (Activation (None, 500)              

In [None]:
# fit model
num_epochs = 30
history = model.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2, callbacks = [earlystopper])

In [48]:
#가벼운 NLP모델 생성(Using FastText)
model_using_FT = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 100, weights = [FT_embedding_matrix] ,input_length=max_length),
    Bidirectional(LSTM(64,return_sequences=True,dropout=0.3)),
    Dropout(0.5),
    Attention(32),
    tf.keras.layers.Dense(8, activation='softmax')
])

In [49]:
# compile model
model_using_FT.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
print(model_using_FT.summary())


Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 500, 100)          6372800   
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 500, 128)          84480     
_________________________________________________________________
dropout_10 (Dropout)         (None, 500, 128)          0         
_________________________________________________________________
last_hidden_state (Lambda)   (None, 128)               0         
_________________________________________________________________
attention_score_vec (Dense)  (None, 500, 128)          16384     
_________________________________________________________________
attention_score (Dot)        (None, 500)               0         
_________________________________________________________________
attention_weight (Activation (None, 500)             

In [None]:
# fit model
num_epochs = 30
history = model_using_FT.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2, callbacks = [earlystopper])

In [50]:
#가벼운 NLP모델 생성(Using Glove)
model_using_Glove = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 50,weights = [embedding_matrix] ,input_length=max_length),
    Bidirectional(LSTM(64,return_sequences=True,dropout=0.3)),
    Dropout(0.5),
    Attention(32),
    tf.keras.layers.Dense(8, activation='softmax')
])

In [51]:
# compile model
model_using_Glove.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
print(model_using_Glove.summary())


Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 500, 50)           3186400   
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 500, 128)          58880     
_________________________________________________________________
dropout_11 (Dropout)         (None, 500, 128)          0         
_________________________________________________________________
last_hidden_state (Lambda)   (None, 128)               0         
_________________________________________________________________
attention_score_vec (Dense)  (None, 500, 128)          16384     
_________________________________________________________________
attention_score (Dot)        (None, 500)               0         
_________________________________________________________________
attention_weight (Activation (None, 500)             

In [None]:
# fit model
num_epochs = 30
history = model_using_Glove.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2, callbacks = [earlystopper])

In [None]:
# predict values
pred = model_using_FT.predict_proba(test_padded)

In [None]:
pred

In [None]:
# submission
sample_submission[['0','1','2','3','4']] = pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.244451,0.130629,0.211542,0.27031,0.143069
1,1,0.244451,0.130629,0.211542,0.27031,0.143069
2,2,0.244451,0.130629,0.211542,0.27031,0.143069
3,3,0.244451,0.130629,0.211542,0.27031,0.143069
4,4,0.244451,0.130629,0.211542,0.27031,0.143069
...,...,...,...,...,...,...
19612,19612,0.244451,0.130629,0.211542,0.27031,0.143069
19613,19613,0.244451,0.130629,0.211542,0.27031,0.143069
19614,19614,0.244451,0.130629,0.211542,0.27031,0.143069
19615,19615,0.244451,0.130629,0.211542,0.27031,0.143069


In [None]:
sample_submission.to_csv('/content/drive/MyDrive/Novelist_Classification/datasets/submission.csv', index = False, encoding = 'utf-8')