In [1]:
# Import necessary libraries
import random

import gensim.downloader
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import contractions
import string, re
import operator

from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from keras import regularizers
from keras.optimizers import Adam
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 1000)

In [2]:
# Download necessary nltk packages
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pekxu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pekxu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Table of Contents<a id='home'></a>

- [Data pre-processing](#DataPreprocessing)
    - [Text cleaning](#text-clean)
        - [Case-folding](#casefold)
        - [Removing Contractions](#cont)
        - [Removing punctuations](#punc)
        - [Lemmatization](#lemm)
        - [Removing common stop words](#stop)
    - [Prepare train and development data](#data)
- [Model building](#model)
    - [Taking last word](#LSTM)
    - [Maximum pooling](#CNN)
- [Model training and evaluation](#eval)
    - [Prepare test set](#pre-test)
    - Training
        - [LSTM](#tLSTM)
        - [CNN](#tCNN)
    - [Final test accuracy](#final)

## Download word2vec embeddings

In [3]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [4]:
w2v = gensim.downloader.load('word2vec-google-news-300')

In [5]:
w2v

<gensim.models.keyedvectors.KeyedVectors at 0x1bb3d3b7d90>

## Import TREC Dataset

We download the **TREC dataset** and import the dataset using the `pandas` library.

The Text REtrieval Conference (TREC) Question Classification dataset contains 5500 labeled questions in training set and another 500 for test set.

The dataset has 6 coarse class labels and 50 fine class labels. Average length of each sentence is 10, vocabulary size of 8700.

In [6]:
df_train = pd.read_csv('./dataset/train.csv')
df_train.drop(['label-fine'], axis=1, inplace=True)
df_train = df_train.drop_duplicates(keep='first') # Drop 71 duplicates, length should be 5381

print(df_train.info())
df_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 5381 entries, 0 to 5451
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   label-coarse  5381 non-null   int64 
 1   text          5381 non-null   object
dtypes: int64(1), object(1)
memory usage: 126.1+ KB
None


Unnamed: 0,label-coarse,text
0,0,How did serfdom develop in and then leave Russia ?
1,1,What films featured the character Popeye Doyle ?
2,0,How can I find a list of celebrities ' real names ?
3,1,What fowl grabs the spotlight after the Chinese Year of the Monkey ?
4,2,What is the full form of .com ?


The data fields are the same among all splits. 

- text (str): Text of the question. 
- coarse_label (ClassLabel): Corase class label. Possible values are:
    > 'ABBR' (0): Abbreviation  
    > 'ENTY' (1): Entity  
    > 'DESC' (2): Description and abstract concept  
    > 'HUM' (3): Human being  
    > 'NUM' (4): Numeric value   
    > 'LOC' (5): Location

In [7]:
to_replace = {0: 'ABBR', 
              1: 'ENTY', 
              2: 'DESC', 
              3: 'HUM', 
              4: 'NUM', 
              5: 'LOC'}

df_train['label-coarse'] = df_train['label-coarse'].replace(to_replace)
df_train.sample(5)

Unnamed: 0,label-coarse,text
3283,ENTY,What are you caught in if a haboob blows up ?
2880,HUM,What is the name of the pop singer whose song became the theme song for a brand of catsup ?
5266,HUM,Who is the richest person in the world ?
2954,ABBR,How is digital audio used ?
4317,HUM,What team did Babe Ruth play his first major league game for ?


## Data pre-processing<a id='DataPreprocessing'></a>

### Check coverage

Check the portion of words being covered by the pre-trained word2vec embeddings.

In [8]:
def build_vocab(texts):
    """
    Function to count the occurence of each word in the given corpus.
    
    Variable
    ========
    texts (pd.DataFrame): A column containing the corpus.
    
    Return
    ======
    vocab (dictionary): A dictionary which count the occurrence of each word in the given texts.
                        With key as the word, value as the count of the occurrence of respective word.
    """
    
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def check_coverage(vocab, embeddings_index):
    """
    Function to check how many word in the vocab is being covered by the given embeddings.
    
    Variables
    =========
    vocab (dictionary): A dictionary containing the count of occurence each word in a corpus.
    embeddings_index: Word embeddings that is being used. In this case, word2vec.
    
    Return
    ======
    unknown_words (dictionary): A dictionary containing all the vocab unrecognised by the given embeddings and their respective frequency.
    """
    
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for {:.2%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

In [9]:
vocab = build_vocab(df_train['text'])

In [10]:
cvg = check_coverage(vocab, w2v)

Found embeddings for 91.71% of vocab
Found embeddings for 77.77% of all text


## Text cleaning<a id='text-clean'></a>

Several steps were done to clean the corpus.
- [Case-folding](#casefold)
- [Removing Contractions](#cont)
- [Removing punctuations](#punc)
- [Lemmatization](#lemm)
- [Removing common stop words](#stop)

### Case-folding<a id='casefold'></a>

In [11]:
df_train['text-clean'] = df_train['text'].apply(lambda x: x.lower())

### Expanding the contraction<a id='cont'></a>

In [12]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", 
                       "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": 
                       "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", 
                       "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", 
                       "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", 
                       "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam", 
                       "mayn't": "may not", "might've": "might have","mightn't": "might not", "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", 
                       "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", 
                       "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                       "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is",
                       "that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is",
                       "they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", 
                       "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", 
                       "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", 
                       "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", 
                       "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                       "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                       "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have",
                       "what's": "what is"}

In [13]:
def clean_contractions(text, mapping):
    """
    Function used to replace the texts using the mapping.
    
    Variables
    =========
    text (str): Sentence that contains sub-string that needed to be replaced.
    mapping (dictionary): Dictionary contains the mapping of the replacement of words.
    
    Return
    ======
    text (str): Sentence with some words replaced according to the mapping.
    """
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])

    return text

In [14]:
df_train['text-clean'] = df_train['text-clean'].str.replace(" '", "'") # Remove the space to ensure clear_contractions work properly

In [15]:
df_train['text-clean'] = df_train['text-clean'].apply(lambda x: clean_contractions(x, contraction_mapping))

In [16]:
df_train['text-clean'] = df_train['text-clean'].str.replace('u.s.', 'USA')

In [17]:
# Check coverage to see if it improved
vocab = build_vocab(df_train['text-clean'])
cvg = check_coverage(vocab, w2v)

Found embeddings for 76.25% of vocab
Found embeddings for 76.20% of all text


In [18]:
# Top-10 frequent words that is not being covered
cvg[:10]

[('?', 5261),
 ('of', 1524),
 ('a', 1012),
 ('to', 604),
 (',', 559),
 ('and', 419),
 ("''", 396),
 ('.', 89),
 (':', 62),
 ("'", 36)]

### Removing punctuations<a id='punc'></a>

In [19]:
def remove_punct(text):
    """
    Function to remove the punctuations given a sentence.
    """
    return text.translate(str.maketrans('', '', string.punctuation))

In [20]:
df_train['text-clean'] = df_train['text-clean'].apply(lambda x: remove_punct(x))

In [21]:
vocab = build_vocab(df_train['text-clean'])
cvg = check_coverage(vocab, w2v)

Found embeddings for 82.36% of vocab
Found embeddings for 88.18% of all text


In [22]:
cvg[:10]

[('of', 1524),
 ('a', 1013),
 ('to', 604),
 ('and', 419),
 ('10', 12),
 ('1984', 10),
 ('gould', 9),
 ('1963', 7),
 ('mozambique', 7),
 ('15', 7)]

### Lemmatization<a id='lemm'></a>

In [23]:
def lemmatize_(seq):
    """
    Variable
    ========
    seq (str): A long string containing your sequence.
    
    Return
    ======
    Lemmatize string, readied to be tokenize.
    
    """
    # Define lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Separate the sequence by space, into a list
    seq_list = seq.split(' ')
    
    # Final sequence list
    output_list = []
    
    for word in seq_list:
        # 1. Lemmatize verbs - Only lemmatize verbs into their respective base forms
        word = lemmatizer.lemmatize(str(word), pos='v')
    
        # Lemmatize nouns - In case some plural nouns like `films`, lemmatize to base form `film`
        word = lemmatizer.lemmatize(str(word), pos='n')
        
        output_list.append(word)
        
    return ' '.join(output_list) # Merge the list into a complete sentence via adding spaces in-between

In [24]:
df_train['text-clean'] = df_train['text-clean'].apply(lambda x: lemmatize_(x))

In [25]:
vocab = build_vocab(df_train['text-clean'])
cvg = check_coverage(vocab, w2v)

Found embeddings for 80.82% of vocab
Found embeddings for 88.22% of all text


In [26]:
cvg[:10]

[('of', 1524),
 ('a', 1137),
 ('to', 604),
 ('and', 419),
 ('10', 12),
 ('1984', 10),
 ('gould', 9),
 ('1963', 7),
 ('mozambique', 7),
 ('15', 7)]

### Removing common stop-words<a id='stop'></a>

In [27]:
filtered_stop_words = {'of', 'a', 'to', 'and'}

# Function to remove stopwords
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in filtered_stop_words]
    return ' '.join(filtered_words)

In [28]:
df_train['text-clean'] = df_train['text-clean'].apply(lambda x: remove_stopwords(x))

In [29]:
vocab = build_vocab(df_train['text-clean'])
cvg = check_coverage(vocab, w2v)

Found embeddings for 80.86% of vocab
Found embeddings for 95.65% of all text


In [30]:
cvg[:10]

[('10', 12),
 ('1984', 10),
 ('gould', 9),
 ('1963', 7),
 ('mozambique', 7),
 ('15', 7),
 ('1899', 7),
 ('nnp', 7),
 ('11', 7),
 ('mutombo', 6)]

[Return to top](#home)

## Preparing train and development data<a id='data'></a>

- Combining two random chosen class into "OTHERS"
- Split into train-development sets
- Tokenize and pad the sequences
- Build embedding matrix

In [31]:
import random
random.seed(100) # Set randomly

class_list = df_train['label-coarse'].value_counts().index.to_list()
chosen_class = random.sample(class_list, 4)
other_class = list(set(class_list) - set(chosen_class))
print(f"{other_class} will be replaced with 'OTHERS'")

['DESC', 'ABBR'] will be replaced with 'OTHERS'


In [32]:
# Replace the selected classes with the label 'OTHERS'
df_train.loc[df_train['label-coarse'] == other_class[0], 'label-coarse'] = 'OTHERS'
df_train.loc[df_train['label-coarse'] == other_class[1], 'label-coarse'] = 'OTHERS'

df_train['label-coarse'].value_counts()

label-coarse
ENTY      1245
OTHERS    1239
HUM       1215
NUM        858
LOC        824
Name: count, dtype: int64

In [33]:
X = df_train.drop(['label-coarse', 'text'], axis=1)
y = df_train.drop(['text', 'text-clean'], axis=1)

In [34]:
to_replace = {'OTHERS': 0, 
              'ENTY': 1, 
              'HUM': 2, 
              'NUM': 3,
              'LOC': 4}

y['label-coarse'] = y['label-coarse'].replace(to_replace)
y['label-coarse'] = y['label-coarse'].replace(to_replace)

In [35]:
# Train test split before tokenizing them
X_train, X_dev, y_train, y_dev = train_test_split(X, y,
                                                  stratify=y, 
                                                  test_size=500, 
                                                  random_state=100)

In [36]:
# Sanity check
len(X_train), len(X_dev), len(y_train), len(y_dev)

(4881, 500, 4881, 500)

### Creating embedding matrix

In [37]:
EMBEDDING_SIZE = w2v.vector_size
MAX_LENGTH = 40
NUM_CLASSES = 5

In [38]:
print(f"Vocabulary size: {len(w2v.index_to_key)}")
print(f"Embedding size: {w2v.vector_size}")

Vocabulary size: 3000000
Embedding size: 300


In [39]:
tokenizer = Tokenizer(oov_token='OOV')
tokenizer.fit_on_texts(X_train['text-clean'].tolist())

# Tokenize both train and dev sets
tokenized_train = tokenizer.texts_to_sequences(X_train['text-clean'].tolist())
X_train = pad_sequences(tokenized_train, maxlen=MAX_LENGTH)

tokenized_dev = tokenizer.texts_to_sequences(X_dev['text-clean'].tolist())
X_dev = pad_sequences(tokenized_dev, maxlen=MAX_LENGTH)

# One-hot encoding the test labels
y_train = to_categorical(y_train, num_classes=NUM_CLASSES)
y_dev = to_categorical(y_dev, num_classes=NUM_CLASSES)

In [40]:
vocab_size = len(tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_size, EMBEDDING_SIZE))

for word, i in tokenizer.word_index.items():
    try:
        embedding_matrix[i] = w2v[word]
    except KeyError:
        continue

In [41]:
len(embedding_matrix)

6973

[Return to top](#home)

# Model building <a id='model'></a>

In this section, we start to build the models using different aggregation methods. A total of two were explored, as listed below：
- [Taking last word](#LSTM)
    - Simple LSTM
    - Deeper LSTM
- [Maximum pooling](#CNN)
    - CNN without maximum pooling
    - CNN with maximum pooling

## Taking last word as the representation <a id='LSTM'></a>

For this aggregation method, RNN (Recurrent Neural Network), specifically LSTM (Long-Short Term Memory, both uni-directional and bi-directional) was implemented.

As the words are being parsed into the LSTM network sequentially, at the last word, the LSTM network should produce the context of the whole sequence up to that particular word. Thus, the final embeddings extracted should contain the summary of the whole sequence.

In [45]:
inp = Input(shape=(MAX_LENGTH, ))
x = Embedding(input_dim=vocab_size,
              output_dim=EMBEDDING_SIZE,
              weights=[embedding_matrix],
              input_length=MAX_LENGTH,
              trainable=False)(inp)
dropout = Dropout(0.5)(x)

lstm = Bidirectional(CuDNNLSTM(units=150, return_sequences=True))(dropout)
dropout = Dropout(0.5)(lstm)

flat = Flatten()(dropout)

output = Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=regularizers.l2(0.001))(flat)

model_lstm = Model(inputs=inp, outputs=output)

optimizer = Adam(learning_rate=0.003)
model_lstm.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])

In [51]:
print(model_lstm.summary())
tf.keras.utils.plot_model(model_lstm, show_shapes=True)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 40)]              0         
                                                                 
 embedding_2 (Embedding)     (None, 40, 300)           2091900   
                                                                 
 dropout_3 (Dropout)         (None, 40, 300)           0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 40, 300)          542400    
 nal)                                                            
                                                                 
 dropout_4 (Dropout)         (None, 40, 300)           0         
                                                                 
 flatten_1 (Flatten)         (None, 12000)             0         
                                                           

In [52]:
model_lstm_deep = Sequential()

model_lstm_deep.add(Embedding(vocab_size, 
                    output_dim=EMBEDDING_SIZE, 
                    weights=[embedding_matrix], 
                    input_length=MAX_LENGTH, 
                    trainable=False))

model_lstm_deep.add(Bidirectional(CuDNNLSTM(units=256, return_sequences=True)))
model_lstm_deep.add(Bidirectional(CuDNNLSTM(units=256, return_sequences=True)))
model_lstm_deep.add(Dropout(0.2))
model_lstm_deep.add(CuDNNLSTM(units=256))

model_lstm_deep.add(Dense(NUM_CLASSES, activation='softmax'))

optimizer = Adam(learning_rate=0.001, clipvalue=0.5)
model_lstm_deep.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])

In [53]:
model_lstm_deep.summary()
tf.keras.utils.plot_model(model_lstm_deep, show_shapes=True)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 40, 300)           2091900   
                                                                 
 bidirectional_4 (Bidirectio  (None, 40, 512)          1142784   
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 40, 512)          1576960   
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 40, 512)           0         
                                                                 
 cu_dnnlstm_7 (CuDNNLSTM)    (None, 256)               788480    
                                                                 
 dense_3 (Dense)             (None, 5)                

[Return to top](#home)

## Maximum pooling <a id='CNN'></a>

For maximum pooling, a combination of CNN (Convolutional Neural Network) and LSTM were implemented.

Via maximum pooling, the model should be able to extract the most salient features from the convolved embeddings.

In [55]:
inp = Input(shape=(MAX_LENGTH, ))
x = Embedding(input_dim=vocab_size,
              output_dim=EMBEDDING_SIZE,
              weights=[embedding_matrix],
              input_length=MAX_LENGTH,
              trainable=False)(inp)
dropout = Dropout(0.5)(x)

conv1d = Conv1D(filters=300, kernel_size=3, activation='relu', strides=1, padding='valid')(dropout)

lstm = Bidirectional(CuDNNLSTM(units=150, return_sequences=True))(conv1d)
dropout = Dropout(0.5)(lstm)
flat = Flatten()(dropout)

output = Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=regularizers.l2(0.001))(flat)

model_conv = Model(inputs=inp, outputs=output)

optimizer = Adam(learning_rate=0.003)
model_conv.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])

In [56]:
model_conv.summary()
tf.keras.utils.plot_model(model_conv, show_shapes=True)

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 40)]              0         
                                                                 
 embedding_5 (Embedding)     (None, 40, 300)           2091900   
                                                                 
 dropout_8 (Dropout)         (None, 40, 300)           0         
                                                                 
 conv1d_1 (Conv1D)           (None, 38, 300)           270300    
                                                                 
 bidirectional_7 (Bidirectio  (None, 38, 300)          542400    
 nal)                                                            
                                                                 
 dropout_9 (Dropout)         (None, 38, 300)           0         
                                                           

In [62]:
inp = Input(shape=(MAX_LENGTH, ))
x = Embedding(input_dim=vocab_size,
              output_dim=EMBEDDING_SIZE,
              weights=[embedding_matrix],
              input_length=MAX_LENGTH,
              trainable=False)(inp)
dropout = Dropout(0.5)(x)

conv1d = Conv1D(filters=300, kernel_size=3, activation='relu', strides=1, padding='valid')(dropout)
maxpool = MaxPooling1D(pool_size=2, strides=2)(conv1d)

lstm = Bidirectional(CuDNNLSTM(units=150, return_sequences=True))(maxpool)
dropout = Dropout(0.5)(lstm)
flat = Flatten()(dropout)

output = Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=regularizers.l2(0.001))(flat)

model_pool = Model(inputs=inp, outputs=output)

optimizer = Adam(learning_rate=0.003)
model_pool.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])

In [63]:
model_pool.summary()
tf.keras.utils.plot_model(model_pool, show_shapes=True)

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 40)]              0         
                                                                 
 embedding_6 (Embedding)     (None, 40, 300)           2091900   
                                                                 
 dropout_10 (Dropout)        (None, 40, 300)           0         
                                                                 
 conv1d_2 (Conv1D)           (None, 38, 300)           270300    
                                                                 
 max_pooling1d (MaxPooling1D  (None, 19, 300)          0         
 )                                                               
                                                                 
 bidirectional_8 (Bidirectio  (None, 19, 300)          542400    
 nal)                                                      

[Return to top](#home)

# Model training and evaluation<a id='eval'></a>

In this section, a custom callback was introduced to conduct model training and evaluation at once.

- [Prepare test set](#pre-test)
- Training
    - [LSTM](#tLSTM)
    - [CNN](#tCNN)
- [Final test accuracy](#final)

In [57]:
class EvaluateTestSet(Callback):
    """
    Custom callback class to evaluate the test set after every epoch.
    """
    def __init__(self, test_data):
        super().__init__()
        self.test_data = test_data

    def on_epoch_end(self, epoch, logs={}):
        x_test, y_test = self.test_data
        loss, acc = self.model.evaluate(x_test, y_test, verbose=0, batch_size=128)
        logs['test_loss'] = loss
        logs['test_acc'] = acc
        print(f'\nTesting loss: {loss}, acc: {acc}')

## Prepare test set<a id='pre-test'></a>

Load in the test set, replace the class labels with 'OTHERS', and implement the same text cleaning processes.

In [58]:
df_test = pd.read_csv('./dataset/test.csv')
df_test.drop(['label-fine'], axis=1, inplace=True)

to_replace = {0: 'ABBR', 
              1: 'ENTY', 
              2: 'DESC', 
              3: 'HUM', 
              4: 'NUM', 
              5: 'LOC'}

df_test['label-coarse'] = df_test['label-coarse'].replace(to_replace)

df_test['text-clean'] = df_test['text'].apply(lambda x: x.lower())
df_test['text-clean'] = df_test['text-clean'].apply(lambda x: clean_contractions(x, contraction_mapping))
df_test['text-clean'] = df_test['text-clean'].str.replace('u.s.', 'USA')
df_test['text-clean'] = df_test['text-clean'].apply(lambda x: remove_punct(x))
df_test['text-clean'] = df_test['text-clean'].apply(lambda x: lemmatize_(x))
df_test['text-clean'] = df_test['text-clean'].apply(lambda x: remove_stopwords(x))

df_test['label-coarse'] = df_test['label-coarse'].replace(['ABBR', 'DESC'], 'OTHERS')

to_replace = {'OTHERS': 0, 
              'ENTY': 1, 
              'HUM': 2, 
              'NUM': 3,
              'LOC': 4}
df_test['label-coarse'] = df_test['label-coarse'].replace(to_replace)

x_test = df_test.drop(['label-coarse', 'text'], axis=1)
y_test = df_test.drop(['text', 'text-clean'], axis=1)

tokenized_test = tokenizer.texts_to_sequences(x_test['text-clean'].tolist())
X_test = pad_sequences(tokenized_test, maxlen=MAX_LENGTH)
y_test = to_categorical(y_test, num_classes=NUM_CLASSES)

## Model Training

### LSTM <a id='tLSTM'></a>

In [60]:
file_path = "./model/model_lstm.h5"
test_callback = EvaluateTestSet((X_test, y_test))
model_checkpt = ModelCheckpoint(file_path, monitor='test_acc', mode='max', verbose=1, save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_acc', mode='max', factor=0.2, patience=10, min_lr=0.0001)

history = model_lstm.fit(X_train, y_train,
                         batch_size=4,
                         epochs=50,
                         validation_data=(X_dev, y_dev),
                         callbacks=[reduce_lr, test_callback, model_checkpt])

Epoch 1/50
Testing loss: 0.5333226919174194, acc: 0.878000020980835

Epoch 1: test_acc improved from -inf to 0.87800, saving model to .\model_lstm.h5
Epoch 2/50
Testing loss: 0.40755531191825867, acc: 0.8880000114440918

Epoch 2: test_acc improved from 0.87800 to 0.88800, saving model to .\model_lstm.h5
Epoch 3/50
Testing loss: 0.4188373386859894, acc: 0.8820000290870667

Epoch 3: test_acc did not improve from 0.88800
Epoch 4/50
Testing loss: 0.3410651385784149, acc: 0.9259999990463257

Epoch 4: test_acc improved from 0.88800 to 0.92600, saving model to .\model_lstm.h5
Epoch 5/50
Testing loss: 0.34282007813453674, acc: 0.921999990940094

Epoch 5: test_acc did not improve from 0.92600
Epoch 6/50
Testing loss: 0.32152295112609863, acc: 0.9240000247955322

Epoch 6: test_acc did not improve from 0.92600
Epoch 7/50
Testing loss: 0.3137790858745575, acc: 0.9279999732971191

Epoch 7: test_acc improved from 0.92600 to 0.92800, saving model to .\model_lstm.h5
Epoch 8/50
Testing loss: 0.31409418

For taking the last word, via simple LSTM model, the highest test accuracy ever achieved is **95.400%**.

In [61]:
file_path = "./model/model_lstm_deep.h5"
test_callback = EvaluateTestSet((X_test, y_test))
model_checkpt = ModelCheckpoint(file_path, monitor='test_acc', mode='max', verbose=1, save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_acc', mode='max', factor=0.2, patience=10, min_lr=0.0001)

history = model_lstm_deep.fit(X_train, y_train,
                         batch_size=4,
                         epochs=50,
                         validation_data=(X_dev, y_dev),
                         callbacks=[reduce_lr, test_callback, model_checkpt])

Epoch 1/50
Testing loss: 0.48813316226005554, acc: 0.8180000185966492

Epoch 1: test_acc improved from -inf to 0.81800, saving model to .\model_lstm_deep.h5
Epoch 2/50
Testing loss: 0.3176787197589874, acc: 0.8899999856948853

Epoch 2: test_acc improved from 0.81800 to 0.89000, saving model to .\model_lstm_deep.h5
Epoch 3/50
Testing loss: 0.3589704632759094, acc: 0.8920000195503235

Epoch 3: test_acc improved from 0.89000 to 0.89200, saving model to .\model_lstm_deep.h5
Epoch 4/50
Testing loss: 0.2934108376502991, acc: 0.9039999842643738

Epoch 4: test_acc improved from 0.89200 to 0.90400, saving model to .\model_lstm_deep.h5
Epoch 5/50
Testing loss: 0.39433354139328003, acc: 0.8999999761581421

Epoch 5: test_acc did not improve from 0.90400
Epoch 6/50
Testing loss: 0.2584786117076874, acc: 0.9179999828338623

Epoch 6: test_acc improved from 0.90400 to 0.91800, saving model to .\model_lstm_deep.h5
Epoch 7/50
Testing loss: 0.2738567590713501, acc: 0.9179999828338623

Epoch 7: test_acc d

For taking the last word, via a more complex LSTM model, the highest test accuracy ever achieved is  **93.800%**

### CNN <a id='tCNN'></a>

In [59]:
file_path = "./model/model_conv.h5"
test_callback = EvaluateTestSet((X_test, y_test))
model_checkpt = ModelCheckpoint(file_path, monitor='test_acc', mode='max', verbose=1, save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_acc', mode='max', factor=0.2, patience=10, min_lr=0.0001)

history = model_conv.fit(X_train, y_train,
                         batch_size=4,
                         epochs=50,
                         validation_data=(X_dev, y_dev),
                         callbacks=[reduce_lr, test_callback, model_checkpt])

Epoch 1/50
Testing loss: 0.4558641314506531, acc: 0.8539999723434448

Epoch 1: test_acc improved from -inf to 0.85400, saving model to .\model_conv.h5
Epoch 2/50
Testing loss: 0.3502139449119568, acc: 0.9020000100135803

Epoch 2: test_acc improved from 0.85400 to 0.90200, saving model to .\model_conv.h5
Epoch 3/50
Testing loss: 0.3616127669811249, acc: 0.8999999761581421

Epoch 3: test_acc did not improve from 0.90200
Epoch 4/50
Testing loss: 0.3307274281978607, acc: 0.9139999747276306

Epoch 4: test_acc improved from 0.90200 to 0.91400, saving model to .\model_conv.h5
Epoch 5/50
Testing loss: 0.33718931674957275, acc: 0.8980000019073486

Epoch 5: test_acc did not improve from 0.91400
Epoch 6/50
Testing loss: 0.3448626399040222, acc: 0.9139999747276306

Epoch 6: test_acc did not improve from 0.91400
Epoch 7/50
Testing loss: 0.32052090764045715, acc: 0.9300000071525574

Epoch 7: test_acc improved from 0.91400 to 0.93000, saving model to .\model_conv.h5
Epoch 8/50
Testing loss: 0.3588872

For normal Conv1D (without MaxPool), the highest test accuracy ever achieved is  **93.000%**

In [64]:
file_path = "./model/model_pool.h5"
test_callback = EvaluateTestSet((X_test, y_test))
model_checkpt = ModelCheckpoint(file_path, monitor='test_acc', mode='max', verbose=1, save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_acc', mode='max', factor=0.2, patience=10, min_lr=0.0001)

history = model_pool.fit(X_train, y_train,
                         batch_size=4,
                         epochs=50,
                         validation_data=(X_dev, y_dev),
                         callbacks=[reduce_lr, test_callback, model_checkpt])

Epoch 1/50
Testing loss: 0.39754751324653625, acc: 0.8920000195503235

Epoch 1: test_acc improved from -inf to 0.89200, saving model to .\model_pool.h5
Epoch 2/50
Testing loss: 0.35684043169021606, acc: 0.8960000276565552

Epoch 2: test_acc improved from 0.89200 to 0.89600, saving model to .\model_pool.h5
Epoch 3/50
Testing loss: 0.3149435520172119, acc: 0.9079999923706055

Epoch 3: test_acc improved from 0.89600 to 0.90800, saving model to .\model_pool.h5
Epoch 4/50
Testing loss: 0.3819805681705475, acc: 0.8980000019073486

Epoch 4: test_acc did not improve from 0.90800
Epoch 5/50
Testing loss: 0.31369540095329285, acc: 0.9100000262260437

Epoch 5: test_acc improved from 0.90800 to 0.91000, saving model to .\model_pool.h5
Epoch 6/50
Testing loss: 0.31594935059547424, acc: 0.9139999747276306

Epoch 6: test_acc improved from 0.91000 to 0.91400, saving model to .\model_pool.h5
Epoch 7/50
Testing loss: 0.42605212330818176, acc: 0.8899999856948853

Epoch 7: test_acc did not improve from 0.

For normal Conv1D (with MaxPool), the highest test accuracy ever achieved is  **93.200%**

Comparing all 4 models above, we can conclude:
- The models'ranking (based on their respective highest test accuracy): Simple LSTM > Complex LSTM > Conv1D with Maxpool > Conv1D without Maxpool
- Simple LSTM performed better than complex LSTM.
- Convolutional Network performed better when maximum pooling is applied. This might due to the effectiveness of extracting the salient features from the convolved embeddings.
- LSTM works better than CNN, and hence, taking last word performed better than maximum pooling.

[Return to top](#home)

## Final test accuracy<a id='final'></a>

After experimenting with different model configurations, we found that Simple LSTM models with taking last word as the representation of the whole sequence embeddings yielded the best result. As such, we present our final test accuracy using that model.

In [65]:
model_lstm.load_weights('./model/model_lstm.h5')
model_lstm.evaluate(X_test, y_test)



[0.3133699297904968, 0.9539999961853027]

In [66]:
# Inverse the 'to_replace' dictionary to map numbers back to text labels
label_to_text = {v: k for k, v in to_replace.items()}

# Assuming 'model' is your trained Keras model and 'X_test' is the test data in numerical format
predictions = model_lstm.predict(X_test)

# Convert predictions to class labels
predicted_classes = np.argmax(predictions, axis=1)

# 'y_test' are the true labels and 'df' is the DataFrame with the corresponding text
true_classes = np.argmax(y_test, axis=1)

# Get the indices where predictions and true labels differ
incorrect_indices = np.where(predicted_classes != true_classes)[0]

# Print the index, predicted label, true label, and raw text
for index in incorrect_indices:
    predicted_label_text = label_to_text[predicted_classes[index]]
    true_label_text = label_to_text[true_classes[index]]
    print(f"Index: {index}, Predicted: {predicted_label_text}, True: {true_label_text}, Text: {df_test.iloc[index]['text']}")

Index: 7, Predicted: OTHERS, True: ENTY, Text: What is Australia 's national flower ?
Index: 27, Predicted: ENTY, True: LOC, Text: What imaginary line is halfway between the North and South Poles ?
Index: 54, Predicted: ENTY, True: OTHERS, Text: What is done with worn or outdated flags ?
Index: 124, Predicted: OTHERS, True: NUM, Text: What is the earth 's diameter ?
Index: 184, Predicted: OTHERS, True: ENTY, Text: What is natural gas composed of ?
Index: 186, Predicted: ENTY, True: HUM, Text: What French ruler was defeated at the battle of Waterloo ?
Index: 190, Predicted: NUM, True: ENTY, Text: What is the sales tax in Minnesota ?
Index: 204, Predicted: ENTY, True: NUM, Text: What is the melting point of copper ?
Index: 223, Predicted: NUM, True: ENTY, Text: What is the electrical output in Madrid , Spain ?
Index: 234, Predicted: OTHERS, True: ENTY, Text: What are the two houses of the Legislative branch ?
Index: 245, Predicted: ENTY, True: LOC, Text: What is the longest suspension br

[Return to top](#home)