# 1. Import data

## 1.1 Load AG News data from a local copy of the fast.ai NLP datasets
Source: https://registry.opendata.aws/fast-ai-nlp/

In [None]:
# Local CSV copies of the AG News train and test splits, relative to this notebook.
train_file = '../news_data/train.csv'
test_file = '../news_data/test.csv'

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
def load_data(file):
    """Read a header-less news CSV and name its three columns.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame whose columns are ``topic``, ``head`` and ``body``.
    """
    frame = pd.read_csv(file, header=None)
    frame.columns = ['topic', 'head', 'body']
    return frame

### Combine datasets to preprocess together

In [146]:
# Load both splits and tag every row with its origin so they can be preprocessed
# together and separated again later (set == 1 -> train, set == 0 -> test).
train = load_data(train_file)
train['set'] = np.ones(len(train), dtype=int)
print(f'Train set: {len(train)}')

test = load_data(test_file)
test['set'] = np.zeros(len(test), dtype=int)
print(f'Test set: {len(test)}')

ratio = round(len(test)/(len(train)+len(test)), 3)
print(f'Test size ratio = {ratio}')

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it the
# test rows keep their own 0-based labels and collide with the first train rows,
# which makes any label-based lookup or index-aligned assignment ambiguous.
df = pd.concat([train, test], ignore_index=True)

Train set: 120000
Test set: 7600
Test size ratio = 0.06


In [147]:
# Preview the first rows of the combined train + test frame.
df.head()

Unnamed: 0,topic,head,body,set
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",1
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,1
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,1
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,1
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...",1


Topic labels:
1. World
2. Sports
3. Business
4. Sci/Tech

## 1.2 Import GloVe word vectors from Stanford NLP

Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. <a href="https://nlp.stanford.edu/pubs/glove.pdf">GloVe: Global Vectors for Word Representation</a>.

In [90]:
# Directory that holds (or will receive) the GloVe archive and its extracted folder.
glove_dir = '../../preprocessing/'
# Archive file name and the URL it is downloaded from.
glove_zip = 'glove.6B.zip'
glove_url = 'https://nlp.stanford.edu/data/glove.6B.zip'

# Dimensionality of the word vectors to use; it selects which
# glove.6B.<v_size>d.txt file is read.
v_size = 100
glove_file = f'glove.6B.{v_size}d.txt'

In [18]:
def download_and_unzip(url, local_dir, zip_name):
    """Download a zip archive and extract it, unless that was already done.

    The archive is saved as ``<local_dir>/<zip_name>`` and extracted into
    ``<local_dir>/<zip_name minus its last 4 characters>`` (i.e. without the
    ``.zip`` suffix). If that entry already exists, nothing is downloaded.

    Args:
        url: HTTP(S) location of the zip archive.
        local_dir: Directory holding the archive and the extracted folder;
            created if missing.
        zip_name: File name of the archive, expected to end in ``.zip``.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    import os
    import zipfile

    # Same target the loader derives: archive name with the ".zip" suffix cut off.
    extract_dir = os.path.join(local_dir, zip_name[:-4])
    if os.path.exists(extract_dir):
        print(f'File already downloaded and extracted at {local_dir}')
        return None

    # Imported lazily so the "already extracted" path needs no network library.
    import requests

    if local_dir:
        os.makedirs(local_dir, exist_ok=True)
    zip_file = os.path.join(local_dir, zip_name)

    with requests.get(url, stream=True) as r:
        # Fail loudly instead of silently skipping the download on a bad status.
        r.raise_for_status()
        print(f'Saving zip file to {local_dir}..')
        with open(zip_file, 'wb') as file:
            # Stream in 1 MiB chunks so the whole archive never sits in memory.
            for chunk in r.iter_content(chunk_size=1 << 20):
                file.write(chunk)

    # Re-open the finished file for reading; a handle opened with 'wb' cannot be
    # read back by ZipFile.
    print(f'Unzipping files to {local_dir}..')
    with zipfile.ZipFile(zip_file) as z:
        z.extractall(extract_dir)

# Fetch and extract the GloVe archive; only a message is printed if it is already there.
download_and_unzip(glove_url, glove_dir, glove_zip)

File already downloaded and extracted at ../../preprocessing/


In [19]:
def load_glove(glove_file, local_dir, glove_zip):
    """Parse a GloVe text file into a word -> vector dictionary.

    The file is read from ``<local_dir>/<glove_zip minus last 4 chars>/<glove_file>``.
    Each line is split on whitespace: the first token is the word and the
    remaining tokens are its coefficients.

    Args:
        glove_file: Name of the vector file inside the extracted folder.
        local_dir: Directory containing the extracted folder.
        glove_zip: Archive file name; its last four characters are dropped to
            obtain the extracted folder name.

    Returns:
        dict mapping each word to a ``float32`` NumPy array.
    """
    path = '/'.join([local_dir, glove_zip[:-4], glove_file])
    vectors = {}
    with open(path, 'r', encoding='utf-8') as handle:
        for row in handle:
            tokens = row.split()
            vectors[tokens[0]] = np.array(tokens[1:], dtype="float32")
        print(f'Loaded {len(vectors)} word vectors into memory..')
    return vectors

# Parse the selected GloVe file into a {word: vector} dict kept in memory.
glove_vector = load_glove(glove_file, glove_dir, glove_zip)

Loaded 400000 word vectors into memory..


# 2. Preprocessing

## 2.1 Encode topic label (One-Hot)

In [156]:
from sklearn import preprocessing as pp
from tensorflow.keras.utils import to_categorical

# Map the raw topic ids to consecutive class indices, then one-hot encode them.
encoder = pp.LabelEncoder()
encoded_topics = encoder.fit_transform(df['topic'].values)
print(f'Encoded {len(encoder.classes_)} classes of topic labels..')
encoded_topics = to_categorical(encoded_topics)
# One one-hot row per example, stored positionally next to the text columns.
df['y'] = list(encoded_topics)
# Show the one-hot matrix as the cell output.
encoded_topics

Encoded 4 classes of topic labels..


array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]], dtype=float32)

## 2.2 Tokenize header sentences

In [149]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed length of every encoded headline (shorter ones are padded at the end).
max_sentence_length = 30

# Fit the vocabulary on every headline of the combined frame.
docs = df['head'].values
t = Tokenizer()
t.fit_on_texts(docs)

# Headlines -> lists of word ids -> fixed-width integer rows.
encoded_docs = pad_sequences(
    t.texts_to_sequences(docs),
    maxlen=max_sentence_length,
    padding='post',
)
print(f'Tokenized {len(t.word_index) + 1} words from {len(encoded_docs)} sentences..')
df['X'] = list(encoded_docs)

Tokenized 37595 words from 127600 sentences..


## 2.3 Create the embedding matrix from the GloVe vectors (100-d)

In [73]:
# One row per token id (plus row 0); ids whose word is missing from GloVe stay all-zero.
embedding_matrix = np.zeros((len(t.word_index) + 1, v_size))
for word, i in t.word_index.items():
    vector = glove_vector.get(word)
    if vector is not None:
        embedding_matrix[i] = vector
print(embedding_matrix.shape)

(37595, 100)


## 2.4 Train-test-split
Using the original split provided by the repo

In [157]:
# Use bracket access: attribute access (`df.set`) would break if DataFrame ever
# gained a `set` attribute.
split = df['set'].to_numpy()
size = len(df)

# Boolean masks over the origin flag (1 = train, 0 = test) select rows in one
# vectorised step instead of looping over every example in Python.
train_mask = split == 1
test_mask = split == 0

X_train = np.asarray(encoded_docs[train_mask], dtype='int32')
y_train = np.asarray(encoded_topics[train_mask], dtype='int32')

X_test = np.asarray(encoded_docs[test_mask], dtype='int32')
y_test = np.asarray(encoded_topics[test_mask], dtype='int32')

In [177]:
# First 200 training examples and their labels, kept as a small sample.
egx, egy = X_train[:200], y_train[:200]

# 3. Create model

## 3.1 Model definition

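Layer order (numbers match the comments in the cells below): 1\. Embedding -> 2. Conv1D -> 3. MaxPooling1D -> 3.5 Flatten + Dropout -> 4. Dense (x2, with Dropout in between) -> 5. Softmax output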

In [272]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dropout, Flatten, Dense

# Seed NumPy and TensorFlow: layer weight initialisation and dropout draw from
# TensorFlow's random generator, which np.random.seed alone does not control.
np.random.seed(42)
tf.random.set_seed(42)

# Width of the softmax output layer = number of distinct topic labels.
num_classes = len(encoder.classes_)

In [273]:
# Vocabulary size for the embedding table: one slot per tokenizer word plus index 0.
num_words = 1 + len(t.word_index)

# Empty sequential container; the cells below append its layers one by one.
model = Sequential()

In [274]:
# 1. Embedding: frozen lookup table initialised from the GloVe embedding matrix.
# NOTE(review): `weights=` supplies the initial values, so the 'uniform'
# initializer is presumably redundant here; confirm before removing it.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=v_size,
    embeddings_initializer='uniform',
    weights=[embedding_matrix],
    input_length=max_sentence_length,
    trainable=False,
    name='embedding',
)
model.add(embedding_layer)

In [275]:
# 2. Conv1D: 128 filters sliding over windows of 3 consecutive token vectors.
conv_layer = Conv1D(128, 3, activation='relu', name='conv1d')
model.add(conv_layer)

In [276]:
# 3. Max pooling over windows of 5 positions.
pool_layer = MaxPooling1D(pool_size=5, name='maxpool')
model.add(pool_layer)

In [277]:
# 3.5 Flatten the pooled feature maps, then apply 25% dropout.
for layer in (Flatten(name='flat'), Dropout(0.25, name='dropout1')):
    model.add(layer)

In [278]:
# 4. Fully connected head: Dense(128) -> Dropout(25%) -> Dense(64).
for layer in (
    Dense(128, activation='relu', name='dense1'),
    Dropout(0.25, name='dropout2'),
    Dense(64, activation='relu', name='dense2'),
):
    model.add(layer)

In [279]:
# 5. Output: one softmax unit per topic class.
output_layer = Dense(num_classes, activation='softmax', name='output')
model.add(output_layer)

In [285]:
# Adam with an explicit learning rate; the one-hot targets pair with
# categorical cross-entropy.
opt = keras.optimizers.Adam(learning_rate=0.001)
# Name the AUC metric explicitly: the auto-generated name picks up a numeric
# suffix (e.g. "auc_7") whenever this cell is re-run in the same kernel, which
# makes the reported metric names depend on execution history.
model.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=[keras.metrics.AUC(name='auc'), 'acc'],
)

## 3.2 Training model

In [291]:
# Train on the full training split; keep the History object so the per-epoch
# metrics remain accessible, and show it as the cell result.
history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=8,
    verbose=2,
)
history

Epoch 1/8
7500/7500 - 8s - loss: 0.2559 - auc_7: 0.9875 - acc: 0.9086 - 8s/epoch - 1ms/step
Epoch 2/8
7500/7500 - 8s - loss: 0.2543 - auc_7: 0.9876 - acc: 0.9102 - 8s/epoch - 1ms/step
Epoch 3/8
7500/7500 - 8s - loss: 0.2525 - auc_7: 0.9879 - acc: 0.9101 - 8s/epoch - 1ms/step
Epoch 4/8
7500/7500 - 8s - loss: 0.2508 - auc_7: 0.9880 - acc: 0.9110 - 8s/epoch - 1ms/step
Epoch 5/8
7500/7500 - 8s - loss: 0.2500 - auc_7: 0.9881 - acc: 0.9109 - 8s/epoch - 1ms/step
Epoch 6/8
7500/7500 - 8s - loss: 0.2479 - auc_7: 0.9883 - acc: 0.9114 - 8s/epoch - 1ms/step
Epoch 7/8
7500/7500 - 8s - loss: 0.2475 - auc_7: 0.9882 - acc: 0.9119 - 8s/epoch - 1ms/step
Epoch 8/8
7500/7500 - 8s - loss: 0.2468 - auc_7: 0.9884 - acc: 0.9120 - 8s/epoch - 1ms/step


<keras.callbacks.History at 0x14dd0aa2f70>

In [292]:
# Score the model on the held-out test split and print each metric next to its name.
scores = model.evaluate(X_test, y_test, verbose=0)
for metric_name, metric_value in zip(model.metrics_names, scores):
    print(f'{metric_name}: {metric_value:.5f}')

loss: 0.38904
auc_7: 0.97480
acc: 0.87711
