# Chapter 8: From Human Neurons to Artificial Neurons for Understanding Text

## Let's talk Keras

In [1]:
# Import the Sequential container and create an empty model; layers are stacked onto it one at a time
from keras.models import Sequential
model = Sequential()

In [2]:
# Add layers to the sequential stack: a 64-unit dense layer with ReLU activation,
# followed by a dropout layer constructed with the value 0.3
from keras.layers import Dense, Dropout
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.3))

In [3]:
# Compile the model, specifying the loss function, the optimizer and the metric to report
model.compile(loss='binary_crossentropy',optimizer= 'adam', metrics=['accuracy'])

## Building a question classifier using neural networks

#### 1) Importing the basic libraries

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

#### 2) Reading the dataset

In [5]:
# Read each file inside a `with` block so its handle is closed as soon as the
# lines have been loaded. Read-only mode is sufficient: the files are never written.
with open('training_data.txt', 'r') as train_data:
    train = pd.DataFrame(train_data.readlines(), columns = ['Question'])

with open('test_dataset.txt', 'r') as test_data:
    test = pd.DataFrame(test_data.readlines(), columns = ['Question'])

#### 3) Checking some data

In [6]:
train.head()

Unnamed: 0,Question
0,DESC:manner How did serfdom develop in and the...
1,ENTY:cremat What films featured the character ...
2,DESC:manner How can I find a list of celebriti...
3,ENTY:animal What fowl grabs the spotlight afte...
4,ABBR:exp What is the full form of .com ?\n


#### 4) Splitting the data points to obtain question strings and coarse and fine question categories

In [7]:
# Every raw line looks like "<COARSE>:<fine> <question text>". Apply the same
# split to the training frame first and then to the test frame.
for frame in (train, test):
    # Split once on the first space: [label, question text]
    label_and_text = frame.Question.apply(lambda line: line.split(' ', 1))
    frame['QType'] = label_and_text.apply(lambda pieces: pieces[0])
    frame['Question'] = label_and_text.apply(lambda pieces: pieces[1])
    # Split the label on ':' into its coarse and fine parts
    coarse_and_fine = frame.QType.apply(lambda label: label.split(':'))
    frame['QType-Coarse'] = coarse_and_fine.apply(lambda pieces: pieces[0])
    frame['QType-Fine'] = coarse_and_fine.apply(lambda pieces: pieces[1])

#### 5) Checking cleaned dataset

In [8]:
train.head()

Unnamed: 0,Question,QType,QType-Coarse,QType-Fine
0,How did serfdom develop in and then leave Russ...,DESC:manner,DESC,manner
1,What films featured the character Popeye Doyle...,ENTY:cremat,ENTY,cremat
2,How can I find a list of celebrities ' real na...,DESC:manner,DESC,manner
3,What fowl grabs the spotlight after the Chines...,ENTY:animal,ENTY,animal
4,What is the full form of .com ?\n,ABBR:exp,ABBR,exp


#### 6) Removing the QType and QType-Fine variables, as our focus is on predicting the coarse classes

In [9]:
# pop() removes the named column from the frame in place and returns it.
# The value returned by the last call (the test set's QType-Fine column) is
# what the notebook echoes as this cell's output.
train.pop('QType')
train.pop('QType-Fine')
test.pop('QType')
test.pop('QType-Fine')

0           dist
1           city
2           desc
3            def
4           date
         ...    
495          ind
496     currency
497        count
498    substance
499          def
Name: QType-Fine, Length: 500, dtype: object

In [10]:
train.head()

Unnamed: 0,Question,QType-Coarse
0,How did serfdom develop in and then leave Russ...,DESC
1,What films featured the character Popeye Doyle...,ENTY
2,How can I find a list of celebrities ' real na...,DESC
3,What fowl grabs the spotlight after the Chines...,ENTY
4,What is the full form of .com ?\n,ABBR


#### 7) Checking the different classes in our dataset

In [11]:
# Collect the distinct coarse labels that occur in the training set
classes = np.unique(np.array(train['QType-Coarse']))
classes

array(['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'], dtype=object)

#### 8) Using label encoding to convert classes into integer identifiers

In [12]:
# Fit the encoder on the labels of both splits so every class seen in either
# one gets an integer id, then overwrite the label column of each frame.
le = LabelEncoder()
combined_labels = train['QType-Coarse'].tolist() + test['QType-Coarse'].tolist()
le.fit(pd.Series(combined_labels).values)
for frame in (train, test):
    frame['QType-Coarse'] = le.transform(frame['QType-Coarse'].values)

#### 9) Preprocessing our dataset

In [13]:
# The different processing steps that make up the preprocessing pipeline

def text_clean(corpus):
    """Lower-case every document and blank out all non-letter characters.

    Each row is split on whitespace, every character outside ``a-z``/``A-Z``
    in each word is replaced by a space, the word is lower-cased, and the
    words are joined back together with single spaces.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents (questions).

    Returns
    -------
    pandas.Series
        One cleaned string per input row, in input order.
    """
    non_letters = re.compile('[^a-zA-Z]')
    # Collect the rows in a list and build the Series once. Growing a Series
    # with append() copies it on every iteration, and Series.append no longer
    # exists in pandas 2.x.
    cleaned_rows = [
        ' '.join(non_letters.sub(' ', word).lower() for word in row.split())
        for row in corpus
    ]
    # Explicit dtype keeps an empty corpus from producing a dtype warning.
    return pd.Series(cleaned_rows, dtype=object)

def stopwords_removal(corpus):
    """Tokenise each document on whitespace and drop English stop words.

    The wh-words (who, what, when, ...) are kept even though they appear in
    the stop-word list.

    Parameters
    ----------
    corpus : iterable of str
        Documents to tokenise.

    Returns
    -------
    list of list of str
        For each document, its remaining tokens in original order.
    """
    wh_words = {'who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom'}
    # Set difference instead of remove(): no KeyError if the stop-word list
    # happens not to contain one of the wh-words.
    stop = set(stopwords.words('english')) - wh_words
    return [
        [token for token in document.split() if token not in stop]
        for document in corpus
    ]

def lemmatize(corpus):
    """Lemmatize every token of every document, treating each token as a verb.

    ``corpus`` is an iterable of token lists; a list of token lists of the
    same shape is returned.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_docs = []
    for tokens in corpus:
        lemmatized_docs.append([lemmatizer.lemmatize(token, pos = 'v') for token in tokens])
    return lemmatized_docs

def stem(corpus, stem_type = None):
    """Stem every token of every document.

    The English Snowball stemmer is used when ``stem_type`` equals
    ``'snowball'``; any other value (including the default ``None``) selects
    the Porter stemmer. ``corpus`` is an iterable of token lists and a list
    of token lists of the same shape is returned.
    """
    use_snowball = stem_type == 'snowball'
    stemmer = SnowballStemmer(language = 'english') if use_snowball else PorterStemmer()
    return [list(map(stemmer.stem, tokens)) for tokens in corpus]

def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    """Run the text preprocessing pipeline over a corpus.

    Steps, in order: optional cleaning (``text_clean``), tokenisation with or
    without stop-word removal (``stopwords_removal``), optional lemmatization
    (``lemmatize``), optional stemming (``stem``), and finally re-joining each
    document's tokens with single spaces.

    Parameters
    ----------
    corpus : iterable of str
        Documents to process.
    cleaning : bool
        Apply ``text_clean`` first.
    stemming : bool
        Apply ``stem`` to the tokens.
    stem_type : str or None
        Passed through to ``stem``.
    lemmatization : bool
        Apply ``lemmatize`` to the tokens.
    remove_stopwords : bool
        Tokenise via ``stopwords_removal``; otherwise just split on whitespace.

    Returns
    -------
    list of str
        One processed string per input document.
    """
    # Flags are tested for truthiness rather than compared with `== True`.
    if cleaning:
        corpus = text_clean(corpus)

    # Either branch turns every document into a list of tokens.
    if remove_stopwords:
        corpus = stopwords_removal(corpus)
    else:
        corpus = [document.split() for document in corpus]

    if lemmatization:
        corpus = lemmatize(corpus)

    if stemming:
        corpus = stem(corpus, stem_type)

    return [' '.join(tokens) for tokens in corpus]

In [14]:
# Preprocess the training and test questions in one pass. The training rows
# come first in the combined series, so the result keeps that order.
all_corpus = pd.Series(train.Question.tolist() + test.Question.tolist()).astype(str)
all_corpus = preprocess(all_corpus, remove_stopwords = True)

  cleaned_corpus = pd.Series()


#### 10) Splitting our data into training and testing sets

In [15]:
# The first len(train) entries belong to the training set, the rest to the test set
n_train_rows = len(train)
train_corpus, test_corpus = all_corpus[:n_train_rows], all_corpus[n_train_rows:]

#### 11) Vectorizing our text data using TF-IDF

In [16]:
# Fit the vectorizer on the training questions only, then reuse the fitted
# vectorizer to transform the test questions
vectorizer = TfidfVectorizer()
tf_idf_matrix_train = vectorizer.fit_transform(train_corpus)
tf_idf_matrix_test = vectorizer.transform(test_corpus)

#### 12) Importing Keras and various libraries

In [17]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model 
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout, Input
from keras.utils import np_utils

#### 13) One hot encoding class labels

In [18]:
# One-hot encode the integer class ids.
# - The width comes from the label encoder, which was fitted on the labels of
#   both splits, so both matrices get a column for every known class.
# - to_categorical is taken from tf.keras so that the notebook does not mix the
#   standalone `keras` package with `tensorflow.keras`
#   (NOTE(review): `keras.utils.np_utils` is not available in recent Keras releases).
num_classes = len(le.classes_)
y_train = tf.keras.utils.to_categorical(train['QType-Coarse'], num_classes)
y_test = tf.keras.utils.to_categorical(test['QType-Coarse'], num_classes)

#### 14) Defining the network architecture

In [19]:
# Feed-forward classifier: TF-IDF features -> 128 ReLU units -> dropout -> softmax over the classes
model = Sequential()

model.add(Dense(128, activation='relu', input_dim=tf_idf_matrix_train.shape[1]))
model.add(Dropout(0.3))
# One output unit per column of the one-hot label matrix, instead of a hard-coded count
model.add(Dense(y_train.shape[1], activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1027968   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 774       
Total params: 1,028,742
Trainable params: 1,028,742
Non-trainable params: 0
_________________________________________________________________


#### 15) Training the model

In [20]:
def convert_sparse_matrix_to_sparse_tensor(X):
    """Convert a SciPy sparse matrix into a ``tf.SparseTensor``.

    Parameters
    ----------
    X : object exposing ``tocoo()``
        The sparse matrix to convert (e.g. the output of a vectorizer).

    Returns
    -------
    tf.SparseTensor
        Built from the (row, col) index pairs, the stored values and the
        shape of the COO form of ``X``. The entries are not reordered here.
    """
    coo = X.tocoo()
    # One (row, col) pair per stored value, as a plain int64 ndarray.
    # np.mat is avoided because it was removed in NumPy 2.0.
    indices = np.column_stack((coo.row, coo.col)).astype(np.int64)
    return tf.SparseTensor(indices, coo.data, coo.shape)

In [21]:
 # Replace the training TF-IDF matrix with its tf.SparseTensor equivalent.
 # NOTE(review): the name is rebound to a different type, so this cell is
 # presumably not re-runnable without re-running the vectorizer cell first.
 tf_idf_matrix_train=convert_sparse_matrix_to_sparse_tensor(tf_idf_matrix_train)

In [22]:
 # Put the sparse tensor's entries into the canonical row-major order
 tf_idf_matrix_train= tf.sparse.reorder(tf_idf_matrix_train)

In [23]:
# Train for 10 epochs with a batch size of 100; the value returned by fit() is kept in training_history
training_history = model.fit(tf_idf_matrix_train, y_train, epochs=10, batch_size=100)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### 16) Evaluating the model using "accuracy" as the metric

In [24]:
# Convert the test TF-IDF matrix to a sparse tensor and reorder its entries,
# mirroring what was done for the training matrix
tf_idf_matrix_test = tf.sparse.reorder(
    convert_sparse_matrix_to_sparse_tensor(tf_idf_matrix_test)
)

In [25]:
# evaluate() returns the loss followed by the metric the model was compiled with;
# only the metric is reported, formatted to four decimal places
loss, accuracy = model.evaluate(tf_idf_matrix_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))

Testing Accuracy: 0.8580


#### 17) Saving the model architecture and weights

In [26]:
import h5py
# Persist the model in two parts: the architecture as a JSON file ...
model_structure = model.to_json()
with open("question_classification_model.json", "w") as json_file:
    json_file.write(model_structure)
    
# ... and the weights in a separate .h5 file
model.save_weights("question_classification_weights.h5")