In [1]:
import numpy as np # linear algebra
import seaborn as sns
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import tensorflow as tf
import os.path

### Data pre-processing

In [2]:
tweets = pd.read_csv('socialmedia-disaster-tweets.csv', encoding='ansi')

In [3]:
tweets.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,keyword,location,text,tweetid,userid
0,778243823,True,golden,156,,Relevant,1.0,Relevant,,,Just happened a terrible car crash,1.0,
1,778243824,True,golden,152,,Relevant,1.0,Relevant,,,Our Deeds are the Reason of this #earthquake M...,13.0,
2,778243825,True,golden,137,,Relevant,1.0,Relevant,,,"Heard about #earthquake is different cities, s...",14.0,
3,778243826,True,golden,136,,Relevant,0.9603,Relevant,,,"there is a forest fire at spot pond, geese are...",15.0,
4,778243827,True,golden,138,,Relevant,1.0,Relevant,,,Forest fire near La Ronge Sask. Canada,16.0,


In [4]:
tweets = tweets.drop(labels=['_unit_id', '_golden', '_unit_state', '_trusted_judgments', '_last_judgment_at', 'choose_one:confidence', 'choose_one_gold', 'keyword', 'location', 'tweetid', 'userid'], axis=1)

Let's convert String category variables to Integer category variables.

In [5]:
tweets.choose_one = tweets.choose_one.replace(to_replace=['Relevant', 'Not Relevant', 'Can\'t Decide'], value=[1, 0, 0])

In [6]:
tweets.head()

Unnamed: 0,choose_one,text
0,1,Just happened a terrible car crash
1,1,Our Deeds are the Reason of this #earthquake M...
2,1,"Heard about #earthquake is different cities, s..."
3,1,"there is a forest fire at spot pond, geese are..."
4,1,Forest fire near La Ronge Sask. Canada


### Build Vocabulary

In [7]:
vocabulary = []

all_tweets = tweets.text.values

In [8]:
for tweet in all_tweets:
    words = tweet.split()
    for word in words:
        if word.isalnum() and word.lower() not in vocabulary:
            vocabulary.append(word.lower())
vocabulary = sorted(vocabulary)
print(len(vocabulary))

14325


In [9]:
pad_char = "$0$"
if pad_char not in vocabulary:
    vocabulary.append(pad_char)
vocab_size = len(vocabulary)
print(vocab_size)

vocab_dict = {}
for i, word in enumerate(vocabulary):
    vocab_dict[word] = i

14326


In [10]:
len(vocab_dict)

14326

In [11]:
print(vocabulary[0:10], vocabulary[-1])

['0', '02', '03', '04', '05', '05th', '06', '061', '06jst', '08'] $0$


Just keep the columns with features.

In [12]:
X = tweets.drop(labels=['choose_one'], axis=1).values
y = tweets.choose_one.values

In [13]:
print(X[:10], y[:10])
print(len(X))

[['Just happened a terrible car crash']
 ['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all']
 ['Heard about #earthquake is different cities, stay safe everyone.']
 ['there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all']
 ['Forest fire near La Ronge Sask. Canada']
 ["All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"]
 ['13,000 people receive #wildfires evacuation orders in California ']
 ['Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ']
 ['#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires']
 ['Apocalypse lighting. #Spokane #wildfires']] [1 1 1 1 1 1 1 1 1 1]
10876


In [14]:
X = [x[0].split() for x in X]

In [15]:
print(X[:10], y[:10])

[['Just', 'happened', 'a', 'terrible', 'car', 'crash'], ['Our', 'Deeds', 'are', 'the', 'Reason', 'of', 'this', '#earthquake', 'May', 'ALLAH', 'Forgive', 'us', 'all'], ['Heard', 'about', '#earthquake', 'is', 'different', 'cities,', 'stay', 'safe', 'everyone.'], ['there', 'is', 'a', 'forest', 'fire', 'at', 'spot', 'pond,', 'geese', 'are', 'fleeing', 'across', 'the', 'street,', 'I', 'cannot', 'save', 'them', 'all'], ['Forest', 'fire', 'near', 'La', 'Ronge', 'Sask.', 'Canada'], ['All', 'residents', 'asked', 'to', "'shelter", 'in', "place'", 'are', 'being', 'notified', 'by', 'officers.', 'No', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected'], ['13,000', 'people', 'receive', '#wildfires', 'evacuation', 'orders', 'in', 'California'], ['Just', 'got', 'sent', 'this', 'photo', 'from', 'Ruby', '#Alaska', 'as', 'smoke', 'from', '#wildfires', 'pours', 'into', 'a', 'school'], ['#RockyFire', 'Update', '=>', 'California', 'Hwy.', '20', 'closed', 'in', 'both', 'direct

In [16]:
def export_trimmed_glove_vectors(vocab):
    glove_filename  = 'glove.6B.300d.txt'
    trimmed_filename = 'trimmed_gloves.npz'
    dim = 300
    if os.path.isfile(trimmed_filename):
        return
    
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename, encoding="utf8") as f:
        for line in f:
            line = line.strip().split(' ')
            word = line[0]
            embedding = [float(x) for x in line[1:]]
            if word in vocab:
                word_idx = vocab[word]
                embeddings[word_idx] = np.asarray(embedding)

    np.savez_compressed(trimmed_filename, embeddings=embeddings)

In [17]:
def get_trimmed_glove_vectors():
    filename = 'trimmed_gloves.npz'
    with np.load(filename) as data:
        return data["embeddings"]

In [18]:
export_trimmed_glove_vectors(vocab_dict)

In [19]:
embeddings = get_trimmed_glove_vectors()

In [20]:
def string_to_vocab(lines):
    X_onehot = []

    for line in lines:
        temp_X = np.array([vocabulary.index(word.lower()) for word in line if word.isalnum() or word == pad_char])
        temp_oh_X = np.zeros((len(temp_X), vocab_size))
        temp_oh_X[np.arange(len(temp_X)),temp_X] = 1
        X_onehot.append(temp_oh_X)
        
    return X_onehot

In [21]:
def string_to_word_ids(lines):
    X_word_ids = []

    for line in lines:
        temp_X = np.array([vocabulary.index(word.lower()) for word in line if word.isalnum() or word == pad_char])
        X_word_ids.append(temp_X)
        
    return X_word_ids

In [22]:
string_to_word_ids([X[0], X[1]])

[array([ 6966,  5834,   469, 12665,  2308,  3196]),
 array([ 9134,  3524,  1076, 12718, 10350,  8931, 12768,  7948,   814,
         5137, 13429,   813])]

In [23]:
def pad_to_max(X):
    padded_X = []
    cleared_X = []
    
    for x in X:
        tempx = [word for word in x if word.isalnum()]
        cleared_X.append(tempx)
    
    cleaned_lengths = [len(x) for x in cleared_X]
    max_len = max(cleaned_lengths)
    print("max_len", max_len)
    for x in cleared_X:
        tempx = [word for word in x if word.isalnum()]
        tempx.extend([pad_char]*(max_len-len(x)-1))
        padded_X.append(tempx)
    
    return padded_X, cleaned_lengths

In [24]:
def extract_axis_1(data, ind):

    batch_range = tf.range(tf.shape(data)[0])
    indices = tf.stack([batch_range, ind], axis=1)
    res = tf.gather_nd(data, indices)

    return res

In [25]:
def get_feed_dict(feed_X, feed_Y):
    feed_dict = {}
    feed_dict['data'], feed_dict['seq_length'] = pad_to_max(feed_X)
    feed_dict['data'] = string_to_word_ids(feed_dict['data'])
    feed_dict['labels'] = feed_Y
    
    return feed_dict

In [26]:
print_x = pad_to_max(X[0:10])

for x in print_x:
    print(x)

max_len 19
[['Just', 'happened', 'a', 'terrible', 'car', 'crash', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$'], ['Our', 'Deeds', 'are', 'the', 'Reason', 'of', 'this', 'May', 'ALLAH', 'Forgive', 'us', 'all', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$'], ['Heard', 'about', 'is', 'different', 'stay', 'safe', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$'], ['there', 'is', 'a', 'forest', 'fire', 'at', 'spot', 'geese', 'are', 'fleeing', 'across', 'the', 'I', 'cannot', 'save', 'them', 'all', '$0$'], ['Forest', 'fire', 'near', 'La', 'Ronge', 'Canada', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$'], ['All', 'residents', 'asked', 'to', 'in', 'are', 'being', 'notified', 'by', 'No', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected'], ['people', 'receive', 'evacuation', 'orders', 'in', 'California', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0$', '$0

In [27]:
print(get_feed_dict(X[0:10], y[0:10]))

max_len 19
{'data': [array([ 6966,  5834,   469, 12665,  2308,  3196, 14325, 14325, 14325,
       14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325]), array([ 9134,  3524,  1076, 12718, 10350,  8931, 12768,  7948,   814,
        5137, 13429,   813, 14325, 14325, 14325, 14325, 14325, 14325]), array([ 5941,   518,  6729,  3773, 12081, 10978, 14325, 14325, 14325,
       14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325]), array([12737,  6729,   469,  5128,  4959,  1196, 11956,  5402,  1076,
        5025,   578, 12718,  6317,  2277, 11072, 12725,   813, 14325]), array([ 5128,  4959,  8600,  7222, 10842,  2259, 14325, 14325, 14325,
       14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325]), array([  813, 10610,  1157, 12890,  6453,  1076,  1579,  8801,  2176,
        8730,  9123,  4536,  9075, 11393,  6453,  9617,  9087,  1076,
        4625]), array([ 9435, 10367,  4536,  9087,  6453,  2220, 14325, 14325, 14325,
       14325, 14325, 14325, 14325, 14325, 1

In [28]:
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

In [29]:
print(len(X), round(len(X) * 0.9))

10876 9788


Randomly select indexes to use for training.

In [30]:
train_index = np.random.choice(len(X), round(len(X) * 0.9), replace=False)

In [31]:
print(train_index, len(train_index))

[ 7405 10573  2013 ...    78  4420  9733] 9788


In [32]:
test_index = np.array(list(set(range(len(X))) - set(train_index)))
X = np.array(X)
train_X = X[train_index]
train_y = y[train_index]
test_X = X[test_index]
test_y = y[test_index]

In [33]:
print(test_index, len(test_index))

[4098 8194 2052 ... 2043 4092 2046] 1088


In [34]:
train_X[0:10], train_y[0:10]

(array([list(["I'm", 'about', 'to', 'be', 'obliterated']),
        list(['#Insurance:', 'Texas', 'Seeks', 'Comment', 'on', 'Rules', 'for', 'Changes', 'to', 'Windstorm', 'Insurer', 'http://t.co/rb02svlpPu']),
        list(['lets', 'hope', 'this', 'concert', 'ends', 'with', 'zero', 'casualties', 'amen']),
        list(['Storm', 'in', 'RI', 'worse', 'than', 'last', 'hurricane.', 'My', 'city&amp;3others', 'hardest', 'hit.', 'My', 'yard', 'looks', 'like', 'it', 'was', 'bombed.', 'Around', '20000K', 'still', 'without', 'power']),
        list(['Tell', '@BarackObama', 'to', 'rescind', 'medals', 'of', "'honor'", 'given', 'to', 'US', 'soldiers', 'at', 'the', 'Massacre', 'of', 'Wounded', 'Knee.', 'SIGN', 'NOW', '&amp;', 'RT!', 'https://t.co/u4r8dRiuAc']),
        list(["I'm", 'more', 'into', 'the', 'healing/reviving', 'side', 'of', 'the', 'game', 'rather', 'than', 'better', 'attacking', 'so', 'for', 'now', 'Siren', '&gt;', 'all', 'other', 'characters', '(except', 'new', 'girl).']),
        list(

### Building Tensorflow Graph
Let's start building Tensorflow model. Tensorflow is a graph based framework. So we have to build the whole graph of our model before we start any kind of training. For most ML problems, the graph building process is pretty straight forward.

- Step 1: Define placeholders for input Data and Labels.
- Step 2: Pass the Data through the layers.
- Step 3: Define a loss function.
- Step 4: Apply Gradient Descent (or other optimisers) on the loss.

Placeholders are special type of Tensorflow variables. These variables don't hold any particular values and are used as input nodes to the graph. The data is fed to these variables in order to Train the model or infer using the model.

- "data" is used for data input. The shape is Batch Size x Input Features. 
- "labels" is used for output labels. Batch Size x Output Space

To keep the batch size "variable", we use None instead of a specific Batch Size.

In [35]:
data = tf.placeholder(dtype=tf.int32, shape=[None, None], name='Data_Input')
labels = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='Labels_Input')
seq_lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='Seq_Lengths')
keep_prob = tf.placeholder(dtype=tf.float32, shape=[], name="dropout")

In [36]:
word_embeddings_matrix = tf.Variable(embeddings, name="word_embeddings", dtype=tf.float32, trainable=False)

In [37]:
data

<tf.Tensor 'Data_Input:0' shape=(?, ?) dtype=int32>

In [38]:
data.shape

TensorShape([Dimension(None), Dimension(None)])

Now pass the data through all layers.

In [39]:
input_embeddings = tf.nn.embedding_lookup(word_embeddings_matrix, data, name="input_embeddings")

In [40]:
size_lstm = 32

cell_fw = tf.contrib.rnn.LSTMCell(size_lstm)
cell_bw = tf.contrib.rnn.LSTMCell(size_lstm)

(output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
    cell_fw, cell_bw, input_embeddings,
    sequence_length=seq_lengths, dtype=tf.float32)

lstm = tf.concat([output_fw, output_bw], axis=-1)
output = tf.nn.dropout(lstm, keep_prob)

last_output = extract_axis_1(output, seq_lengths - 1)
last_output = tf.reshape(last_output, [-1, 2*size_lstm])

In [41]:
logits = tf.layers.dense(last_output, units=1,
                         kernel_initializer=tf.contrib.layers.xavier_initializer(),
                         kernel_regularizer=tf.nn.l2_loss)

Once we have the logits calculated, lets add the loss operation. Since we're doing logistic regression, the loss function is sigmoid cross entropy.

In [42]:
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
tf.summary.scalar('loss', loss)

<tf.Tensor 'loss:0' shape=() dtype=string>

In [43]:
learning_rate = 0.003
batch_size = 32
iter_num = 10000

To train our model, we iterate on the parameter values such that the loss is minimized. To reduce the loss, we have multiple Optimizers available. Let's use Gradient Descent.

In [44]:
opt = tf.train.AdamOptimizer(learning_rate)

In [45]:
goal = opt.minimize(loss)

Prediction and Accuracy utility operations.

In [46]:
# Define the accuracy
# The default threshold is 0.5, rounded off directly
prediction = tf.round(tf.sigmoid(logits), name='Prediction')
# Bool into float32 type
correct = tf.cast(tf.equal(prediction, labels), dtype=tf.float32)
# Average
accuracy = tf.reduce_mean(correct, name='Accuracy')

tf.summary.scalar('accuracy', accuracy)
# End of the definition of the model framework

<tf.Tensor 'accuracy_1:0' shape=() dtype=string>

In [47]:
merged = tf.summary.merge_all()

In [48]:
loss_trace = []
train_acc = []
test_acc = []

### Training / Evaluating the model
Up until now, we have just built the graph of model in Tensorflow. Now we will start trainig and evaluating the model. To use the model:
- Initialise the model.
- To "Train" the model: feed data and labels and process the model till the "goal" operation.
- To "Evaluate" the model: provide data and lables and process till "accuracy" operation.

Initialisation is pretty standard. We get a session object after initialisation and this session object is used to interact with the graph.

In [49]:
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [50]:
train_writer = tf.summary.FileWriter('./text_classification_glove_logs', sess.graph)

To train the model:
- Get a batch of data and labels.
- Feed data and labels to the graph.
- Check the Train and Test data accuracy.

In [51]:
# training model
for epoch in range(iter_num):
    # Generate random batch index
    batch_index = np.random.choice(len(train_X), size=batch_size)
    batch_train_X = train_X[batch_index]
    batch_train_y = np.matrix(train_y[batch_index]).T
    feed_dict = get_feed_dict(batch_train_X, batch_train_y)
    
    print(feed_dict['data'])
    print(feed_dict['seq_length'])
    _, summary = sess.run([goal, merged], feed_dict={data: feed_dict['data'], 
                                                     labels: feed_dict['labels'],
                                                     seq_lengths: feed_dict['seq_length'],
                                                     keep_prob: 0.5})

    temp_loss = sess.run(loss, feed_dict={data: feed_dict['data'],
                                          labels: feed_dict['labels'],
                                          seq_lengths: feed_dict['seq_length'],
                                          keep_prob: 0.5})

    loss_trace.append(temp_loss)
    
    train_writer.add_summary(summary, epoch)
    
    # print('epoch: {:4d} loss: {:5f}'.format(epoch + 1, temp_loss))
    # output
    if (epoch + 1) % 20 == 0:
        feed_dict = get_feed_dict(test_X, np.matrix(test_y).T)

        test_batch_acc = sess.run(accuracy, feed_dict={data: feed_dict['data'],
                                              labels: feed_dict['labels'],
                                              seq_lengths: feed_dict['seq_length'],
                                              keep_prob: 1})
        test_acc.append(test_batch_acc)
 
        print('epoch: {:4d} loss: {:5f} test_acc: {:5f}'.format(epoch + 1, temp_loss, test_batch_acc))

max_len 28
[array([13949,  2129,  9009,  2220,  2579,  4212,  8646, 14325, 14325,
       14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325,
       14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325]), array([ 8338,   913,  1673,  6894,  1076,   469,  9936,  6453, 12768,
        5343,  1829,  1378,  8701, 14325, 14325, 14325, 14325, 14325,
       14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325]), array([13189,  1349, 12890,  6317,  5900, 10593, 14247,   913, 12451,
        1316, 14254, 11551,  1132, 12777, 12745, 13860,  6732,  3027,
        4407, 13537, 14325, 14325, 14325, 14325, 14325, 14325, 14325]), array([ 1585,  6753,  9075,  8789,  5761, 12930,  8435, 10232,  9134,
        8663,  9636,  7845, 13076,  1076,   602, 14325, 14325, 14325,
       14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325]), array([ 5267,  1102,   913,  4084, 14325, 14325, 14325, 14325, 14325,
       14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325, 14325,


ValueError: setting an array element with a sequence.

In [None]:
# Visualization of the results
# loss function
plt.plot(loss_trace)
plt.title('Cross Entropy Loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

In [None]:
plt.plot(train_acc, 'b-', label='train accuracy')
plt.plot(test_acc, 'k-', label='test accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('Train and Test Accuracy')
plt.legend(loc='best')
plt.show()