## 12.2 20-Newsgroup Dataset Classification with GloVe and Keras
TA: Bokyung Son (*Computational Linguistics Lab*)

### Data Specification
This dataset is a collection of 20,000 messages, collected from 20 different netnews newsgroups. 1,000 messages from each of the 20 newsgroups were chosen at random and partitioned by newsgroup name. The list of newsgroups is as follows:

- alt.atheism
- talk.politics.guns
- talk.politics.mideast
- talk.politics.misc
- talk.religion.misc
- soc.religion.christian
- comp.sys.ibm.pc.hardware
- comp.graphics
- comp.os.ms-windows.misc
- comp.sys.mac.hardware
- comp.windows.x
- rec.autos
- rec.motorcycles
- rec.sport.baseball
- rec.sport.hockey
- sci.crypt
- sci.electronics
- sci.space
- sci.med
- misc.forsale

For Windows users,

[Download file](http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz)

[Download 7-zip](https://www.7-zip.org/) and use it to unzip the tar.gz file

In [None]:
# For macOS/Linux users: download the dataset archive, then extract it
!wget http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz
!tar -xvzf news20.tar.gz

In [None]:
# Download pretrained GloVe
# !wget http://nlp.stanford.edu/data/glove.6B.zip

In [2]:
import os
import sys
sys.path.append(os.pardir) 

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Embedding, Dense, Input, Bidirectional, LSTM
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [8]:
# Notebook-wide configuration constants.
GLOVE_DIR = '/Users/sonbokyung/prg/embedding/glove.6B'
# GLOVE_DIR = '.'
TEXT_DATA_DIR = './20_newsgroup'
MAX_SEQUENCE_LENGTH = 1000 # common sequence length that padding brings every sample to
MAX_FEATURES = 20000 # how many of the most common words to consider
EMBEDDING_DIM = 100 # dimensionality chosen for the word embeddings
VALIDATION_SPLIT = 0.15
TEST_SPLIT = 0.15

In [None]:
- 로이터 데이터셋은 이미 2000개 정도의 word가 indexing된 상태로 들어왔다. (즉, 가공된 데이터셋)
- 실제 텍스트를 다룰 때는 그 작업을 직접 해야 한다.

### 1. Vectorize words

In [12]:
import codecs

# Build the lookup: word -> pre-trained GloVe embedding vector.
# Words from the news articles are later mapped onto these vectors.
# NOTE: the file name is resolved against the current working directory.
embeddings_index = {}

# Stream the file inside a context manager: the handle is always closed and
# the file is consumed line by line instead of being loaded in one piece.
with codecs.open("glove.6B.100d.txt", "r", "utf-8") as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]  # first field is the token itself
        coefs = np.asarray(values[1:], dtype='float32')  # remaining fields are the vector
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


### 2. Prepare text samples and their labels

In [7]:
print('Processing text dataset...')

texts = []  # message bodies, one entry per file read
labels_index = {}  # newsgroup directory name -> numeric label id
labels = []  # label id of each entry in `texts`, in the same order

# Each sub-directory of TEXT_DATA_DIR is treated as one class; directories are
# visited in sorted order, so label ids follow that order.
for group_name in sorted(os.listdir(TEXT_DATA_DIR)):
    group_dir = os.path.join(TEXT_DATA_DIR, group_name)
    if not os.path.isdir(group_dir):
        continue

    label_id = len(labels_index)
    labels_index[group_name] = label_id

    for message_name in sorted(os.listdir(group_dir)):
        # Only files whose names consist solely of digits are read.
        if not message_name.isdigit():
            continue

        message_path = os.path.join(group_dir, message_name)
        with open(message_path, encoding='latin-1') as message_file:
            body = message_file.read()

        # The first blank line ends the header; keep the text from there on.
        header_end = body.find('\n\n')
        if header_end > 0:
            body = body[header_end:]

        texts.append(body)
        labels.append(label_id)

print('Found %s texts.' % len(texts))

Processing text dataset...


NameError: name 'TEXT_DATA_DIR' is not defined

### 3. Preprocessing
See [Tokenizer documentation](https://keras.io/preprocessing/text/#tokenizer) and its [older version](https://faroit.github.io/keras-docs/1.2.2/preprocessing/text/#tokenizer)

In [None]:
# The Tokenizer class vectorizes text: each text becomes a list of word indices (ranks).
# fit_on_texts: learns the vocabulary from the list of sample texts.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(texts)  # train on `texts (list of sample texts)`
sequences = tokenizer.texts_to_sequences(texts)  # return list of sequences (one per text)

In [None]:
# This helps determine MAX_SEQUENCE_LENGTH:
# look at the distribution of sequence lengths to decide the length to pad/cut to.
seqlen = np.array([len(sequence) for sequence in sequences])
np.histogram(seqlen, bins=50)

In [None]:
# Map: word -> rank/index (int), taken from the fitted tokenizer
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Pad
# Most sequences are at most ~1000 tokens long, so padding to a length like 20000 would be wasteful;
# the length is therefore simply picked from the `seqlen` distribution.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Vectorize the integer label ids into one-hot form, then report both tensor shapes
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
# Split the data into train / validation / test sets.

# Required: manually shuffle the data first.
# By default, validation_split in `fit` does not shuffle the data.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
num_test_samples = int(TEST_SPLIT * data.shape[0])

# Use an explicit start index for the test portion. Negative slicing breaks
# when num_test_samples == 0: `data[-0:]` is the whole array and
# `data[k:-0]` is empty, which would leak everything into the test set and
# leave the training set empty.
test_start = data.shape[0] - num_test_samples

# first portion goes for validation
x_val = data[:num_validation_samples]
y_val = labels[:num_validation_samples]

# last portion goes for test
x_test = data[test_start:]
y_test = labels[test_start:]

# remaining for train
x_train = data[num_validation_samples:test_start]
y_train = labels[num_validation_samples:test_start]

print(x_train.shape, x_val.shape, x_test.shape)

### 4. Embedding layer
See [Embedding documentation](https://faroit.github.io/keras-docs/1.2.2/layers/embeddings/#embedding)

In [None]:
# Build the table: word index -> pre-trained embedding vector
print('Preparing embedding matrix...')

# The original author's note says the +1 accounts for the unknown word.
# NOTE(review): Keras' Tokenizer documents index 0 as reserved (never assigned
# to a word), which is the more likely reason for the extra row -- confirm.
num_words = min(MAX_FEATURES, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for token, rank in word_index.items():
    # Only ranks below MAX_FEATURES get a row in the matrix.
    if rank < MAX_FEATURES:
        glove_vector = embeddings_index.get(token)
        if glove_vector is not None:
            embedding_matrix[rank] = glove_vector
        # Words missing from the embedding index keep their all-zero row.

# Load the pre-trained vectors into an Embedding layer.
# NOTE: the layer is frozen (trainable=False) so the weights are not updated.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)
# output: (samples, sequence length, embedding dim)
# output: (samples, sequence length, embedding dim)

### 5. Training

In [None]:
# path where all models are saved
BASE_PATH = './news20_model/'
if not os.path.exists(BASE_PATH):
    os.mkdir(BASE_PATH)

In [None]:
def create_checkpoint(model_name):
    """Return a ModelCheckpoint callback that writes into ``BASE_PATH/model_name``.

    The target directory is created if it does not exist yet, including
    ``BASE_PATH`` itself and any intermediate directories.

    Args:
        model_name: Name of the sub-directory under ``BASE_PATH`` used for
            this model's checkpoint files.

    Returns:
        A ``ModelCheckpoint`` monitoring ``val_loss`` with
        ``save_best_only=True``; files are named
        ``{epoch:02d}-{val_loss:.4f}.hdf5``.
    """
    model_dir = os.path.join(BASE_PATH, model_name)
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it does not
    # fail when the parent is missing or the directory already exists.
    os.makedirs(model_dir, exist_ok=True)

    return ModelCheckpoint(filepath=os.path.join(model_dir, '{epoch:02d}-{val_loss:.4f}.hdf5'),
                           monitor='val_loss',
                           verbose=1,
                           save_best_only=True)

In [None]:
# Training hyper-parameters.
batch_size = 128  # samples per batch
max_epochs = 30  # upper bound on the number of epochs

# Early-stopping callback that watches the validation loss (patience of 3).
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=3)

In [None]:
print('Training...')

# train a 3-layer bi-LSTM model
# Input: int32 sequences of length MAX_SEQUENCE_LENGTH, passed through the
# embedding layer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# The first two bi-LSTM layers set return_sequences=True so the next LSTM
# receives a sequence; the last one does not set it.
x = Bidirectional(LSTM(128, return_sequences=True))(embedded_sequences)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(128))(x)
# Softmax output with one unit per entry in labels_index.
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

checkpoint = create_checkpoint('lstm')  # checkpoint callback
# Fit on the training split, validating on (x_val, y_val); both callbacks
# monitor val_loss.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=max_epochs,
          validation_data=(x_val, y_val),
          shuffle=True,
          callbacks=[early_stopping, checkpoint])

In [None]:
# Evaluate the model on x_test / y_test and print the returned loss and accuracy.
loss, acc = model.evaluate(x_test, y_test,
                           batch_size=batch_size)

print(f'----- Evaluation loss and metrics for {len(y_test)} test samples -----')
print('Test loss:', loss)
print('Test accuracy:', acc)