# <center> Text Classification with RNN </center>

___

### Get the current working directory

In [1]:
import os, sys

PATH = os.getcwd()

TEXT_DATA_DIR = os.path.join(PATH, "txt")

TEXT_DATA_DIR

'/Users/shreyasi25/Downloads/20191229_Batch70_CSE7321c_Lab05_RNN/txt'

### Get the list of folders inside data path

In [2]:
TEXT_DATA_DIR_LIST = os.listdir(TEXT_DATA_DIR)

if '.DS_Store' in TEXT_DATA_DIR_LIST :
    TEXT_DATA_DIR_LIST.remove('.DS_Store')

print(TEXT_DATA_DIR_LIST)

['kalam', 'obama', 'romney']


### Preparing the text data
Iterate over the folders in which our text documents are stored, and format them into a list of documents. 

Also prepare a list of class indices matching the documents

In [3]:
docs = []          # list of text samples
labels = []        # list of label ids
labels_Index = {}  # dictionary mapping label index to label name

for name in TEXT_DATA_DIR_LIST:
    
    path = os.path.join(TEXT_DATA_DIR, name)
    files = sorted(os.listdir(path))

    print("{} dir has following docs {} \n".format( name, files ))

    key = len(labels_Index)
    labels_Index[key] = name

    for fname in files:

        with open( os.path.join(path, fname), encoding = "ISO-8859-1") as file :
            text = file.read()
            docs.append(text)

        labels.append(key)

print(len(labels), 'docs with labels --> ', labels)       

kalam dir has following docs ['doc01.txt', 'doc02.txt', 'doc03.txt', 'doc04.txt', 'doc05.txt', 'doc06.txt', 'doc07.txt', 'doc08.txt', 'doc09.txt', 'doc10.txt', 'doc11.txt', 'doc12.txt'] 

obama dir has following docs ['obama01.txt', 'obama02.txt', 'obama03.txt', 'obama04.txt', 'obama05.txt', 'obama06.txt', 'obama07.txt', 'obama08.txt', 'obama09.txt', 'obama10.txt', 'obama11.txt', 'obama12.txt'] 

romney dir has following docs ['romney01.txt', 'romney02.txt', 'romney03.txt', 'romney04.txt', 'romney05.txt', 'romney06.txt', 'romney07.txt', 'romney08.txt', 'romney09.txt', 'romney10.txt', 'romney11.txt', 'romney12.txt'] 

36 docs with labels -->  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


###  Load the libraries

In [4]:
import numpy as np
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Conv1D, MaxPooling1D, LSTM, Embedding, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn import metrics

Using TensorFlow backend.


Format the text samples and labels into tensors that can be fed into a neural network. 

To do this, we will rely on Keras utilities 

- keras.preprocessing.text.Tokenizer 

`Tokenizer` : Class for vectorizing texts, or/and turning texts into sequences (=list of word indexes, where the word of rank i in the dataset (starting at 1) has index i).

`fit_on_texts(texts)` : list of texts to train on.
        
`word_index` : Dictionary mapping words (str) to their rank/index (int). Only set after fit_on_texts was called.

In [5]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts(docs)

word_Index = tokenizer.word_index

vocab_Size = len(word_Index) + 1
print('Found %s unique tokens.' % vocab_Size)

Found 7314 unique tokens.


In [6]:
word_Index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'in': 6,
 'that': 7,
 'i': 8,
 'for': 9,
 'is': 10,
 'we': 11,
 'our': 12,
 'you': 13,
 'will': 14,
 'it': 15,
 'this': 16,
 'with': 17,
 'have': 18,
 'on': 19,
 'are': 20,
 'be': 21,
 'who': 22,
 'as': 23,
 'not': 24,
 'by': 25,
 'my': 26,
 'from': 27,
 'but': 28,
 'can': 29,
 'he': 30,
 'has': 31,
 'all': 32,
 'was': 33,
 'their': 34,
 'they': 35,
 'america': 36,
 'what': 37,
 'at': 38,
 'or': 39,
 'when': 40,
 'one': 41,
 'us': 42,
 'an': 43,
 'science': 44,
 'more': 45,
 'people': 46,
 'so': 47,
 'president': 48,
 'his': 49,
 'work': 50,
 'nation': 51,
 'do': 52,
 'about': 53,
 'me': 54,
 'your': 55,
 'would': 56,
 'time': 57,
 'world': 58,
 'there': 59,
 'new': 60,
 '\x96': 61,
 'up': 62,
 'like': 63,
 'which': 64,
 'years': 65,
 'because': 66,
 'how': 67,
 'been': 68,
 'country': 69,
 'if': 70,
 'american': 71,
 'am': 72,
 'these': 73,
 'out': 74,
 'where': 75,
 'many': 76,
 'than': 77,
 'every': 78,
 'friends': 79,
 'had': 80,


`texts_to_sequences(texts)` : Takes list of texts to turn to sequences, returns list of sequences (one per text input).

In [7]:
# integer encode the documents
sequences = tokenizer.texts_to_sequences(docs)
print(docs[33], '\n', sequences[33])

September 21, 2012, Written words nor my own spoken words will never and can never honestly express nor explain exactly how I feel about Barack H. Obama. Obama is so EVIL, wrotten and wicked. The most horrid Liar that I have ever seen in my whole life. And how he ever got into congress and to the Presidency in the first place is so unbelievable to me. It is a horrible shock to my whle body actually. Obama is not in any way a President. George Washington and John Adams and Abraham Lincoln would not believe it either if they could see what has become of America, now. I saw a lady who told me that she is going to vote for Obama. I told her that Obama is a Marxist-Communist and that he Murders Babies in the Womb and that he wants to ruin America and that he hates America, GOD and Israel. I told her that Obama is a Dictator and that he wants to bring in Sharia Law and that he will bring us all into Misery. But she did not believe me, she thought that I was crazy to be wearing a ROMNEY T-shi

Lets check the length of each document.

In [8]:
sorted([(len(d), i) for i, d in enumerate(sequences)])

[(279, 33),
 (289, 9),
 (453, 7),
 (677, 24),
 (809, 34),
 (812, 18),
 (830, 23),
 (1025, 11),
 (1141, 35),
 (1201, 4),
 (1582, 21),
 (1742, 22),
 (1877, 14),
 (1879, 32),
 (1884, 20),
 (1981, 12),
 (2168, 19),
 (2203, 28),
 (2315, 30),
 (2382, 17),
 (2398, 13),
 (2484, 16),
 (2486, 25),
 (2510, 10),
 (2690, 26),
 (2734, 29),
 (2950, 5),
 (2956, 2),
 (3000, 0),
 (3049, 8),
 (3153, 6),
 (3185, 1),
 (3411, 27),
 (4110, 31),
 (4123, 3),
 (4676, 15)]

- keras.preprocessing.sequence.pad_sequences

The sequences have different lengths and Keras requires inputs to be vectorized and all inputs to have the same length. 

We will pad all input sequences to have the length of 1000. 

In [9]:
MAX_SEQUENCE_LENGTH = 1000

data = pad_sequences(sequences, maxlen = MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)

Shape of data tensor: (36, 1000)


Default `truncating='pre'`

In [10]:
print(sequences[11], '\n\n', data[11])

[292, 106, 5267, 3, 5, 1471, 4, 159, 106, 6, 64, 1176, 290, 3529, 3, 335, 159, 357, 253, 5268, 1, 330, 47, 7, 73, 357, 29, 21, 246, 24, 127, 6, 1, 669, 28, 98, 9, 1, 747, 3, 114, 79, 58, 31, 47, 314, 305, 1, 106, 5269, 106, 2, 637, 106, 263, 106, 2, 462, 106, 28, 581, 3, 1, 3530, 4, 645, 436, 2, 5270, 1106, 439, 3, 45, 77, 402, 497, 6, 1, 58, 107, 15, 10, 767, 3, 362, 4, 292, 106, 6, 78, 1161, 4, 159, 85, 6, 78, 679, 4, 1, 144, 47, 314, 211, 4, 1, 567, 282, 1299, 25, 1, 337, 235, 1, 206, 484, 2, 203, 3010, 102, 712, 452, 4, 567, 2134, 147, 2135, 23, 286, 23, 511, 401, 3132, 8, 33, 628, 773, 37, 11, 671, 27, 47, 76, 2135, 2, 391, 3531, 2311, 9, 1164, 1905, 2, 401, 47, 76, 838, 5271, 47, 76, 2505, 383, 2, 47, 211, 4, 567, 2076, 27, 712, 567, 5272, 103, 40, 11, 2667, 1, 567, 37, 10, 1, 5273, 7, 29, 21, 333, 3, 1, 245, 1106, 4, 232, 497, 46, 3, 1072, 84, 17, 299, 203, 2, 2081, 3, 1322, 34, 182, 438, 4, 85, 8, 33, 486, 37, 14, 21, 1, 146, 1472, 7, 14, 169, 3, 272, 292, 106, 8, 56, 63, 3, 21

Default `padding='pre'`

In [11]:
print(sequences[33], '\n\n', data[33])

[915, 3695, 662, 1610, 448, 865, 26, 182, 1552, 448, 14, 153, 2, 29, 153, 7227, 1533, 865, 1470, 1227, 67, 8, 826, 53, 1278, 1475, 138, 138, 10, 47, 1408, 7228, 2, 7229, 1, 168, 7230, 7231, 7, 8, 18, 268, 305, 6, 26, 816, 85, 2, 67, 30, 268, 343, 94, 325, 2, 3, 1, 1265, 6, 1, 108, 252, 10, 47, 7232, 3, 54, 15, 10, 5, 7233, 2485, 3, 26, 7234, 2507, 1070, 138, 10, 24, 6, 173, 141, 5, 48, 687, 295, 2, 167, 7235, 2, 2322, 1912, 56, 24, 160, 15, 1464, 70, 35, 174, 184, 37, 31, 210, 4, 36, 96, 8, 407, 5, 923, 22, 350, 54, 7, 99, 10, 221, 3, 698, 9, 138, 8, 350, 105, 7, 138, 10, 5, 7236, 7237, 2, 7, 30, 7238, 7239, 6, 1, 7240, 2, 7, 30, 743, 3, 4161, 36, 2, 7, 30, 7241, 36, 194, 2, 939, 8, 350, 105, 7, 138, 10, 5, 3783, 2, 7, 30, 743, 3, 230, 6, 3993, 607, 2, 7, 30, 14, 230, 42, 32, 94, 3299, 28, 99, 248, 24, 160, 54, 99, 375, 7, 8, 33, 3925, 3, 21, 2418, 5, 943, 1166, 7242, 165, 46, 14, 83, 24, 268, 336, 865, 89, 34, 414, 27, 1, 7243, 1596, 1091, 7244, 129, 94, 1, 194, 333, 411, 587, 7, 35, 

### Split data into train test

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size = 0.2, random_state = 123, 
                                                     stratify = labels)

#### Convert target into one-hot encoded

In [13]:
Y_train = to_categorical(y_train)
Y_test = to_categorical(y_test)

In [14]:
Y_train

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.]], dtype=float32)

### Preparing the Embedding layer

Compute an index mapping words to known embeddings, by parsing the data dump of pre-trained embeddings:

In [15]:
embeddings_index = {}

with open(os.path.join(PATH, 'glove.6b.50d.txt'), encoding="utf8") as f :
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [16]:
embedding_Matrix = np.zeros((vocab_Size, 50))

for word, i in word_Index.items():
    
    embedding_Vector = embeddings_index.get(word)
    
    if embedding_Vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_Matrix[i] = embedding_Vector

print (embedding_Matrix.shape)

(7314, 50)


### Create the embedding layer

The embedding layer can be seeded with the GloVe word embedding weights. 

- We chose the 50-dimensional version, therefore the Embedding layer must be defined with output_dim set to 50. 
- We do not want to update the learned word weights in this model, therefore we will set the trainable attribute for the model to be False.

In [17]:
sequence_input = Input( shape = (MAX_SEQUENCE_LENGTH,), dtype='int32')

embedded_sequences = Embedding(vocab_Size, 50, weights = [embedding_Matrix], 
                               trainable = False)(sequence_input)


### 1. Build 1D convnet

In [18]:
x = Conv1D(64, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(4)(x)
x = Conv1D(64, 5, activation='relu')(x)
x = MaxPooling1D(4)(x)
x = Conv1D(64, 5, activation='relu')(x)
x = MaxPooling1D(4)(x)  # global max pooling
x = Flatten()(x)
x = Dense(64, activation='relu')(x)
preds = Dense(len(TEXT_DATA_DIR_LIST), activation='softmax')(x)

model = Model(sequence_input, preds)

In [19]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 50)          365700    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 64)           16064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 249, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 245, 64)           20544     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 61, 64)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 57, 64)            20544     
__________

In [20]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.fit(X_train, Y_train, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1a2c04b6d8>

#### Evaluate the model

In [21]:
model.evaluate(X_test, Y_test)



[0.5816278457641602, 0.625]

#### Make predictions on test data 

In [22]:
Y_pred = model.predict(X_test)
print(Y_pred)

[[4.6966944e-02 6.7747629e-01 2.7555683e-01]
 [9.0951747e-01 7.1693525e-02 1.8788939e-02]
 [5.4703359e-02 5.6427693e-01 3.8101968e-01]
 [2.6960969e-02 5.5746186e-01 4.1557717e-01]
 [9.9486887e-01 4.9004219e-03 2.3066011e-04]
 [6.0076278e-02 7.0722181e-01 2.3270196e-01]
 [5.7064582e-02 4.9829453e-01 4.4464085e-01]
 [1.3497594e-01 5.8634394e-01 2.7868021e-01]]


In [23]:
y_pred =[]

for i in Y_pred:
    y_pred.append(np.argmax(i))

print(y_pred)

[1, 0, 1, 1, 0, 1, 1, 1]


In [35]:
y_test

[1, 0, 2, 2, 0, 1, 1, 2]

In [24]:
metrics.accuracy_score(y_test, y_pred)

0.625

In [25]:
metrics.confusion_matrix(y_test, y_pred)

array([[2, 0, 0],
       [0, 3, 0],
       [0, 3, 0]])

### 2. LSTM

In [31]:
lstm = LSTM(256, return_sequences = True)(embedded_sequences)
lstm = LSTM(64)(lstm)
preds = Dense(3, activation='softmax')(lstm)

model = Model(sequence_input, preds)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 50)          365700    
_________________________________________________________________
lstm_4 (LSTM)                (None, 1000, 256)         314368    
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                82176     
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 195       
Total params: 762,439
Trainable params: 396,739
Non-trainable params: 365,700
_________________________________________________________________


In [32]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.fit(X_train, Y_train, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1a37110710>

#### Evaluate the model

In [33]:
model.evaluate(X_test, Y_test)



[1.226965308189392, 0.625]

#### Make predictions on test data 

In [36]:
Y_pred = model.predict(X_test)
print(Y_pred)

[[0.00377436 0.9438871  0.05233848]
 [0.99316347 0.0048024  0.00203409]
 [0.00510955 0.08603683 0.9088536 ]
 [0.00329399 0.97817296 0.01853301]
 [0.02031444 0.96698475 0.0127009 ]
 [0.00348596 0.9583928  0.03812126]
 [0.01057569 0.6052689  0.3841554 ]
 [0.0060836  0.70055145 0.293365  ]]


In [37]:
y_pred =[]

for i in Y_pred:
    y_pred.append(np.argmax(i))

print(y_pred)

[1, 0, 2, 1, 1, 1, 1, 1]


In [38]:
y_test

[1, 0, 2, 2, 0, 1, 1, 2]

In [39]:
metrics.accuracy_score(y_test, y_pred)

0.625

In [40]:
metrics.confusion_matrix(y_test, y_pred)

array([[1, 1, 0],
       [0, 3, 0],
       [0, 2, 1]])