# Chapter 8: From Human Neurons to Artificial Neurons for Understanding Text

## Exploring the biology Behind Neural Networks

## How does a neural network learn?

## Understanding regularization

## Let's talk Keras

In [1]:
#importing and initializing the Sequential model
from keras.models import Sequential
model = Sequential()

In [2]:
#adding layers to the sequential stack
from keras.layers import Dense, Dropout
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.3))

In [3]:
#compiling our model and specifying loss function and optimizer
model.compile(loss='binary_crossentropy',optimizer= 'adam', metrics=['accuracy'])

## Building a question classifier using neural networks

#### 1) Importing the basic libraries

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

#### 2) Reading the dataset

In [5]:
train_data = open('training_data.txt', 'r+')
test_data = open('test_dataset.txt', 'r+')
train = pd.DataFrame(train_data.readlines(), columns = ['Question'])
test = pd.DataFrame(test_data.readlines(), columns = ['Question'])

#### 3) Checking some data

In [6]:
train.head()

Unnamed: 0,Question
0,DESC:manner How did serfdom develop in and the...
1,ENTY:cremat What films featured the character ...
2,DESC:manner How can I find a list of celebriti...
3,ENTY:animal What fowl grabs the spotlight afte...
4,ABBR:exp What is the full form of .com ?\n


#### 4) Splitting the data points to obtain question strings and coarse and fine question categories

In [7]:
train['QType'] = train.Question.apply(lambda x: x.split(' ', 1)[0])
train['Question'] = train.Question.apply(lambda x: x.split(' ', 1)[1])
train['QType-Coarse'] = train.QType.apply(lambda x: x.split(':')[0])
train['QType-Fine'] = train.QType.apply(lambda x: x.split(':')[1])
test['QType'] = test.Question.apply(lambda x: x.split(' ', 1)[0])
test['Question'] = test.Question.apply(lambda x: x.split(' ', 1)[1])
test['QType-Coarse'] = test.QType.apply(lambda x: x.split(':')[0])
test['QType-Fine'] = test.QType.apply(lambda x: x.split(':')[1])

#### 5) Checking cleaned dataset

In [8]:
train.head()

Unnamed: 0,Question,QType,QType-Coarse,QType-Fine
0,How did serfdom develop in and then leave Russ...,DESC:manner,DESC,manner
1,What films featured the character Popeye Doyle...,ENTY:cremat,ENTY,cremat
2,How can I find a list of celebrities ' real na...,DESC:manner,DESC,manner
3,What fowl grabs the spotlight after the Chines...,ENTY:animal,ENTY,animal
4,What is the full form of .com ?\n,ABBR:exp,ABBR,exp


#### 6) Removing the QType and QType-Fine variables, as our focus is on predicting the coarse classes

In [9]:
train.pop('QType')
train.pop('QType-Fine')
test.pop('QType')
test.pop('QType-Fine')

0           dist
1           city
2           desc
3            def
4           date
         ...    
495          ind
496     currency
497        count
498    substance
499          def
Name: QType-Fine, Length: 500, dtype: object

In [10]:
train.head()

Unnamed: 0,Question,QType-Coarse
0,How did serfdom develop in and then leave Russ...,DESC
1,What films featured the character Popeye Doyle...,ENTY
2,How can I find a list of celebrities ' real na...,DESC
3,What fowl grabs the spotlight after the Chines...,ENTY
4,What is the full form of .com ?\n,ABBR


#### 7) Checking the different classes in our dataset

In [11]:
classes = np.unique(np.array(train['QType-Coarse']))
classes

array(['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'], dtype=object)

#### 8) Using label encoding to convert classes into integral identfiers

In [12]:
le = LabelEncoder()

le.fit(pd.Series(train['QType-Coarse'].tolist() + test['QType-Coarse'].tolist()).values)

train['QType-Coarse'] = le.transform(train['QType-Coarse'].values)
test['QType-Coarse'] = le.transform(test['QType-Coarse'].values)

Unnamed: 0,Question,QType-Coarse
0,How did serfdom develop in and then leave Russ...,1
1,What films featured the character Popeye Doyle...,2
2,How can I find a list of celebrities ' real na...,1
3,What fowl grabs the spotlight after the Chines...,2
4,What is the full form of .com ?\n,0


#### 9) Preprocessing our dataset

In [13]:
#The different process present in the pipeline

#tokenizing
def tokenizer(corpus, keep_list = []):
    cleaned_rows = []
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub('[^a-zA-Z0-9]', ' ', word).lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_rows.append(' '.join(qs))
    return pd.Series(cleaned_rows)

#removing stopwords
def remove_stops(corpus):
    stop = set(stopwords.words('english'))
    corpus = [[word for word in sentence.split() if word not in stop] for sentence in corpus]
    return corpus

#stemming
def stemmer(corpus, stem_type):
    if stem_type == 'Porter':
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(word) for word in sentence] for sentence in corpus]          

    if stem_type == 'Snowball':
        stemmer = SnowballStemmer(language='english')
        corpus = [' '.join([stemmer.stem(word) for word in sentence]) for sentence in corpus] 

        return corpus
#lemmatization
def lemmatizer(corpus):
    lemmatizer = WordNetLemmatizer()
    corpus = [' '.join([lemmatizer.lemmatize(x, pos = 'v') for x in x]) for x in corpus]
    return corpus

#function to preprocess
def preprocess(corpus, keep_list = [], stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    
    corpus = tokenizer(corpus, keep_list)
    
    if remove_stopwords:
        corpus = remove_stops(corpus)
    
    if stemming:
        corpus = stemmer(corpus, stem_type)
        
    if lemmatization:
        corpus = lemmatizer(corpus)
    
    corpus = [' '.join(x) for x in corpus]     
    
    return corpus  

In [14]:
all_corpus = pd.Series(train.Question.tolist() + test.Question.tolist()).astype(str)
all_corpus = preprocess(all_corpus, remove_stopwords = True)

['serfdom develop leave russia',
 'films featured character popeye doyle',
 'find list celebrities real names',
 'fowl grabs spotlight chinese year monkey',
 'full form com',
 'contemptible scoundrel stole cork lunch',
 'team baseball st louis browns become',
 'oldest profession',
 'liver enzymes',
 'name scar faced bounty hunter old west',
 'ozzy osbourne born',
 'heavier objects travel downhill faster',
 'pride yankees',
 'killed gandhi',
 'considered costliest disaster insurance industry ever faced',
 'sprawling u state boasts airports',
 'repealed amendment u constitution deal',
 'many jews executed concentration camps wwii',
 'nine inch nails',
 'annotated bibliography',
 'date boxing day',
 'articles clothing tokens monopoly',
 'name 11 famous martyrs',
 'olympic motto',
 'origin name scarlett',
 'second used vowel english',
 'inventor silly putty',
 'highest waterfall united states',
 'name golf course myrtle beach',
 'two states enclose chesapeake bay',
 'abbreviation aids stan

#### 10) Splitting our data into training and testings sets

In [15]:
train_corpus = all_corpus[0:train.shape[0]]
test_corpus = all_corpus[train.shape[0]:]

#### 11) Vectorizing our text data using TD-IDF

In [21]:
vectorizer = TfidfVectorizer()
tf_idf_matrix_train = vectorizer.fit_transform(train_corpus)
tf_idf_matrix_test = vectorizer.transform(test_corpus)

#### 12) Importing Keras and various libraries

In [22]:
import keras
from keras.models import Sequential, Model
from keras import layers
from keras.layers import Dense, Dropout, Input
from keras.utils import np_utils

#### 13) One hot encoding class labels

In [23]:
y_train = np_utils.to_categorical(train['QType-Coarse'], train['QType-Coarse'].nunique())

y_test = np_utils.to_categorical(test['QType-Coarse'], train['QType-Coarse'].nunique())

#### 14) Defining the network architecture

In [24]:
model = Sequential()
model.add(Dense(128, activation='relu',input_dim=tf_idf_matrix_train.shape[1]))
model.add(Dropout(0.3))
model.add(Dense(6, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 128)               881408    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 774       
Total params: 882,182
Trainable params: 882,182
Non-trainable params: 0
_________________________________________________________________


#### 15) Training our model

In [25]:
model.fit(tf_idf_matrix_train, y_train, batch_size=100, epochs=10)

InvalidArgumentError: indices[1] = [0,3600] is out of order. Many sparse ops require sorted indices.
    Use `tf.sparse.reorder` to create a correctly ordered copy.

 [Op:SerializeManySparse]