In [328]:
import os
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
from nltk import bigrams
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN
from keras.layers import LSTM, GlobalMaxPooling1D
from keras.initializers import Constant
import keras.metrics
from numpy import newaxis
from keras.preprocessing.text import Tokenizer

## Reading in the Data:

We have used all the tweets from 2018 (train and test) to create the embeddings. However, balanced datasets have been used to train and test the model. The following code reads in all the data. Note, we are only considering earthquakes and floods in the following sections and we will build separate models for each earthquakes and floods. 

In [2]:
os.chdir('../10_Data/30_Balanced Tweets (Crit = High = Medium = Low)/10_2018 Train')

In [3]:
df_e1 = pd.read_csv('earthquake_TREC_2018_train_BALANCED.csv')
df_f1 = pd.read_csv('flood_TREC_2018_train_BALANCED.csv')

In [4]:
os.chdir('../15_2018 Test')

In [5]:
df_e2 = pd.read_csv('earthquake_TREC_2018_test_BALANCED.csv')
df_f2 = pd.read_csv('flood_TREC_2018_test_BALANCED.csv')

In [6]:
os.chdir('../../20_Extracted Tweets/10_2018 Train')

In [7]:
df_e1_embed = pd.read_csv('Earthquake_TREC_2018_train.csv')
df_f1_embed = pd.read_csv('flood_TREC_2018_train.csv')

In [8]:
os.chdir('../15_2018 Test')

In [9]:
df_e2_embed = pd.read_csv('Earthquake_TREC_2018_test.csv')
df_f2_embed = pd.read_csv('Floods_TREC_2018_test.csv')

## Combining dataframes
We are now combine earthquake tweets into one df and flood tweets into another df. We have subsetting on the tweets and priority columns. We do this for both the embedding dataframes and our balanced datasets, although we will not use the balanced datasets until later.

In [359]:
df_quake_e = pd.DataFrame()
df_quake_e['Tweet'] = pd.concat([df_e1_embed['Tweet'] , df_e2_embed['Tweet']])
df_quake_e['Priority'] = pd.concat([df_e1_embed['Priority'] , df_e2_embed['Priority']])

In [360]:
df_flood_e = pd.DataFrame()
df_flood_e['Tweet'] = pd.concat([df_f1_embed['Tweet'] , df_f2_embed['Tweet']])
df_flood_e['Priority'] = pd.concat([df_f1_embed['Priority'] , df_f2_embed['Priority']])

In [361]:
df_quake = pd.DataFrame()
df_quake['Tweet'] = pd.concat([df_e1['Tweet'] , df_e2['Tweet']])
df_quake['Priority'] = pd.concat([df_e1['Priority'] , df_e2['Priority']])

In [362]:
df_flood = pd.DataFrame()
df_flood['Tweet'] = pd.concat([df_f1['Tweet'] , df_f2['Tweet']])
df_flood['Priority'] = pd.concat([df_f1['Priority'] , df_f2['Priority']])

Cross checking the shapes to make sure they match. **They do match**

In [363]:
df_quake_e.shape, df_flood_e.shape, df_quake.shape, df_flood.shape

((5140, 2), (2518, 2), (100, 2), (120, 2))

## Converting to categorical (0 & 1)

We will now define a function to convert the priority to a categorical 0 & 1. This will be necessary when we train a model.

In [385]:
'''
Creating a categorical variable to keep label critical tweets as 1 and 0 otherwise
'''
def to_categorical(array):
    t = []
    for element in array:
        if element =='Critical':
            t.append(1)
        else:
            t.append(0)
        
    return (t)


## Pre-Processing

We shall now perform a series of pre processing to our tweets. This includes tokenizing them, removing stop words and lemmatising them.  

**I also wrote a line of code to remove the links in the tweets as it was throwing in a lot of gibberish into the emebddings.** The link is almost always at the end of the tweet so its relatively easy to remove. 

I left the punctuation marks in. We have to take a call on this later.

In [386]:
def preProcess(df):
    df['Tweet'] = df['Tweet'].astype('str')
    
    df['Tweet'] = df['Tweet'].apply(lambda x: re.split('http?s *: *\/\/.*', str(x))[0])
    
    token_array = []
    #for tweet in df['Tweet']:
    #    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    #    tweet = tweet.translate(translator)
    #    token_tweet = word_tokenize(tweet)
    #    token_array.append(token_tweet)
    
    for tweet in df['Tweet']:
        token_tweet = word_tokenize(tweet)
        token_array.append(token_tweet)
        
    stop_words=set(stopwords.words("english"))
    filtered_token_array=[]
    for tweet in token_array:
        filtered_tweet = []
        for word in tweet:
                if word not in stop_words:
                    filtered_tweet.append(word)
        filtered_token_array.append(filtered_tweet)
        
    lem = WordNetLemmatizer()

    lemmatized_array=[]
    for tweet in filtered_token_array:
        lemmatized_tweet = []
        for word in tweet:
            lemmatized_tweet.append(lem.lemmatize(word,'v'))
        lemmatized_array.append(lemmatized_tweet)
    
    lemmatized_array_join = []
    for element in lemmatized_array:
        lemmatized_array_join.append(' '.join(element))
        
    return (lemmatized_array_join)


The tweets have been pre processed and we now have a collection of tweets. Word2Vec takes in a list of words together contained in a bigger list (Corpus -> List of Tweet -> List of Words).

In [387]:
l_quake_e = preProcess(df_quake_e)
l_flood_e = preProcess(df_flood_e)

list_of_words_quake = []
list_of_words_flood = []

for tweet in l_quake_e:
        token_tweet = word_tokenize(tweet)
        list_of_words_quake.append(token_tweet)
        
for tweet in l_flood_e:
        token_tweet = word_tokenize(tweet)
        list_of_words_flood.append(token_tweet)
        

## Creating a Word2Vec model

We will now initialise a word2vec model. This takes in a paramter 'min_count' which is the number of minimum occurences of a word required for it be included in the embeddings. **For now we set min_count = 1**. I did this because we are creating our own embeddings (not picking from wikipedia embeddings etc) and our data is not that huge. 

In [388]:
'''
min_count = int - Ignores all words with total absolute frequency lower than this - (2, 100)

window = int - The maximum distance between the current and predicted word within a sentence. 
        E.g. window words on the left and window words on the left of our target - (2, 10)

size = int - Dimensionality of the feature vectors. - (50, 300)

sample = float - The threshold for configuring which higher-frequency words are randomly downsampled. 
        Highly influencial. - (0, 1e-5)

alpha = float - The initial learning rate - (0.01, 0.05)

min_alpha = float - Learning rate will linearly drop to min_alpha as training progresses. 
        To set it: alpha - (min_alpha * epochs) ~ 0.00

negative = int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" 
        should be drown. If set to 0, no negative sampling is used. - (5, 20)
        
workers = int - Use these many worker threads to train the model (=faster training with multicore machines)
'''
w2v_quake = Word2Vec(min_count=1,
                     window=4,
                     size=3,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)
w2v_flood = Word2Vec(min_count=1,
                     window=4,
                     size=3,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)



We will now build our vocabulary. Do not run the following code multiple times before initialising the w2v instance again from above.

In [389]:
w2v_quake.build_vocab(list_of_words_quake)
w2v_flood.build_vocab(list_of_words_flood)

INFO - 16:20:25: collecting all words and their counts
INFO - 16:20:25: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:20:25: collected 13732 word types from a corpus of 70267 raw words and 5140 sentences
INFO - 16:20:25: Loading a fresh vocabulary
INFO - 16:20:26: effective_min_count=1 retains 13732 unique words (100% of original 13732, drops 0)
INFO - 16:20:26: effective_min_count=1 leaves 70267 word corpus (100% of original 70267, drops 0)
INFO - 16:20:26: deleting the raw counts dictionary of 13732 items
INFO - 16:20:26: sample=6e-05 downsamples 669 most-common words
INFO - 16:20:26: downsampling leaves estimated 33474 word corpus (47.6% of prior 70267)
INFO - 16:20:26: estimated required memory for 13732 words and 3 dimensions: 7195568 bytes
INFO - 16:20:26: resetting layer weights
INFO - 16:20:28: collecting all words and their counts
INFO - 16:20:28: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:20:28: collected 9614 wor

The following code is to view the vocabulary that we created

In [390]:
vocabulary_quake = w2v_quake.wv.vocab
vocabulary_flood = w2v_flood.wv.vocab
vocabulary_quake

{'ã€�': <gensim.models.keyedvectors.Vocab at 0x1d2481b2390>,
 '#': <gensim.models.keyedvectors.Vocab at 0x1d2481d66d8>,
 'USGS': <gensim.models.keyedvectors.Vocab at 0x1d2481d6da0>,
 'Breakingã€': <gensim.models.keyedvectors.Vocab at 0x1d2481d6d68>,
 '‘': <gensim.models.keyedvectors.Vocab at 0x1d2481d6f28>,
 'M': <gensim.models.keyedvectors.Vocab at 0x1d2481d6f60>,
 '1.1': <gensim.models.keyedvectors.Vocab at 0x1d2481d6fd0>,
 ',': <gensim.models.keyedvectors.Vocab at 0x1d2481d6a90>,
 '28km': <gensim.models.keyedvectors.Vocab at 0x1d2481d6c50>,
 'SSW': <gensim.models.keyedvectors.Vocab at 0x1d2481d6e80>,
 'Fairbanks': <gensim.models.keyedvectors.Vocab at 0x1d2481d6d30>,
 'Alaska': <gensim.models.keyedvectors.Vocab at 0x1d2481d6e48>,
 'http': <gensim.models.keyedvectors.Vocab at 0x1d2481d6048>,
 ':': <gensim.models.keyedvectors.Vocab at 0x1d2481d6400>,
 '//t.co/hSyciQFM': <gensim.models.keyedvectors.Vocab at 0x1d26aa85208>,
 'PastHour': <gensim.models.keyedvectors.Vocab at 0x1d26aa85128>

In [391]:
w2v_quake.train(list_of_words_quake, total_examples=w2v_quake.corpus_count, epochs=30, report_delay=1)
w2v_flood.train(list_of_words_flood, total_examples=w2v_quake.corpus_count, epochs=30, report_delay=1)

INFO - 16:20:30: training model with 3 workers on 13732 vocabulary and 3 features, using sg=0 hs=0 sample=6e-05 negative=20 window=4
INFO - 16:20:30: worker thread finished; awaiting finish of 2 more threads
INFO - 16:20:30: worker thread finished; awaiting finish of 1 more threads
INFO - 16:20:30: worker thread finished; awaiting finish of 0 more threads
INFO - 16:20:30: EPOCH - 1 : training on 70267 raw words (33568 effective words) took 0.1s, 598444 effective words/s
INFO - 16:20:30: worker thread finished; awaiting finish of 2 more threads
INFO - 16:20:30: worker thread finished; awaiting finish of 1 more threads
INFO - 16:20:30: worker thread finished; awaiting finish of 0 more threads
INFO - 16:20:30: EPOCH - 2 : training on 70267 raw words (33412 effective words) took 0.1s, 431109 effective words/s
INFO - 16:20:30: worker thread finished; awaiting finish of 2 more threads
INFO - 16:20:30: worker thread finished; awaiting finish of 1 more threads
INFO - 16:20:30: worker thread fi

INFO - 16:20:32: EPOCH - 24 : training on 70267 raw words (33498 effective words) took 0.1s, 449813 effective words/s
INFO - 16:20:32: worker thread finished; awaiting finish of 2 more threads
INFO - 16:20:32: worker thread finished; awaiting finish of 1 more threads
INFO - 16:20:32: worker thread finished; awaiting finish of 0 more threads
INFO - 16:20:32: EPOCH - 25 : training on 70267 raw words (33485 effective words) took 0.1s, 503253 effective words/s
INFO - 16:20:32: worker thread finished; awaiting finish of 2 more threads
INFO - 16:20:32: worker thread finished; awaiting finish of 1 more threads
INFO - 16:20:32: worker thread finished; awaiting finish of 0 more threads
INFO - 16:20:32: EPOCH - 26 : training on 70267 raw words (33520 effective words) took 0.1s, 521864 effective words/s
INFO - 16:20:32: worker thread finished; awaiting finish of 2 more threads
INFO - 16:20:32: worker thread finished; awaiting finish of 1 more threads
INFO - 16:20:32: worker thread finished; await

INFO - 16:20:33: worker thread finished; awaiting finish of 1 more threads
INFO - 16:20:33: worker thread finished; awaiting finish of 0 more threads
INFO - 16:20:33: EPOCH - 14 : training on 48396 raw words (22217 effective words) took 0.1s, 439682 effective words/s
INFO - 16:20:33: worker thread finished; awaiting finish of 2 more threads
INFO - 16:20:33: worker thread finished; awaiting finish of 1 more threads
INFO - 16:20:33: worker thread finished; awaiting finish of 0 more threads
INFO - 16:20:33: EPOCH - 15 : training on 48396 raw words (22316 effective words) took 0.0s, 547947 effective words/s
INFO - 16:20:33: worker thread finished; awaiting finish of 2 more threads
INFO - 16:20:33: worker thread finished; awaiting finish of 1 more threads
INFO - 16:20:33: worker thread finished; awaiting finish of 0 more threads
INFO - 16:20:33: EPOCH - 16 : training on 48396 raw words (22210 effective words) took 0.0s, 445408 effective words/s
INFO - 16:20:33: worker thread finished; await

(669067, 1451880)

The following code allows us to see words closely related (positively or negatively) with any other word. In this case we put in the word 'help'.

In [392]:
w2v_quake.wv.most_similar(positive=["dead"])

INFO - 16:20:34: precomputing L2-norms of word weight vectors


[('monicamoralestv', 0.9997273087501526),
 ('//t.co/3nhE4gko', 0.9996863603591919),
 ('51km', 0.9995065927505493),
 ('//t.co/ihMyu14D', 0.9994640946388245),
 ('15m', 0.9994456171989441),
 ('NatGeo', 0.9993886351585388),
 ('17:21', 0.9993427395820618),
 ('u.p', 0.9990856647491455),
 ('RealZaidHamid_', 0.9989771842956543),
 ('33m', 0.9989227056503296)]

We will now convert these word2vec embeddings into a pandas dataframe. This can be converted into a csv for use later in maybe R. We will also explore ways to use this in an SVM model to classify our tweets. 

In [393]:
word_list_quake = []
vector_list_quake = []
for word in w2v_quake.wv.vocab:
    word_list_quake.append(word)
    vector_list_quake.append(w2v_quake.wv.get_vector(word))
    
word_list_flood = []
vector_list_flood = []
for word in w2v_flood.wv.vocab:
    word_list_flood.append(word)
    vector_list_flood.append(w2v_flood.wv.get_vector(word))

In [394]:
quake_embeddings = pd.DataFrame()
quake_embeddings['Word'] = word_list_quake
quake_embeddings['Vector'] = vector_list_quake

In [395]:
flood_embeddings = pd.DataFrame()
flood_embeddings['Word'] = word_list_flood
flood_embeddings['Vector'] = vector_list_flood

In [133]:
quake_embeddings.to_csv('Word2Vec_Earthquake.csv')
flood_embeddings.to_csv('Word2Vec_Flood.csv')

## RNN LSTM Classifier

### Creating Word2Vec Vectors for Train Data:

Before we can implement the classifier, we need to convert our sentences (from balanced df) to a word2vec representation using our word2vec model that we trained above. This was tricky. All vectors corresponding to the tweets have to be the same length as the longest one. However, this does not mean we simply pad a 0 to the end of all the tweets but we need to append an array size (100,) beacuse each word in our word2vec is in 100 dimensions.

In [396]:
def word2vec_convert(all_tweets, model):
    
    max_tweet_length = 0
    for tweet in all_tweets:
        if len(word_tokenize(tweet))>max_tweet_length:
            max_tweet_length = len(word_tokenize(tweet))
            
    whole_vec = np.zeros((len(all_tweets), max_tweet_length, 5))
    num = 0
    for tweet in all_tweets:
        tweet_vec = np.zeros(shape = (max_tweet_length, 5))
        vec = np.zeros(5)
        for i in range(len(word_tokenize(tweet))):
            try:
                vec = np.add(vec, model[word_tokenize(tweet)[i]])
                tweet_vec[i] = vec 
                num = num + 1
            except:
                pass
        whole_vec[i] = tweet_vec
    return(whole_vec, num)

Breaking the quake df into 3 different dfs. Each df with have equal number of (citical, high), (critical, med) and (critical, low} tweets. We will analyse how the model does w.r.t to each of these categories. We are then getting the respective embeddings for each df. The embeddings will be in the shape of (50, 34, 100) aka (50 tweets, max padded length 34 and 100 dims)

In [397]:
df_quake_low = df_quake[(df_quake['Priority'] == 'Critical') | (df_quake['Priority'] == 'Low')]
df_quake_med = df_quake[(df_quake['Priority'] == 'Critical') | (df_quake['Priority'] == 'Medium')]
df_quake_high = df_quake[(df_quake['Priority'] == 'Critical') | (df_quake['Priority'] == 'High')]

l_quake_l = preProcess(df_quake_low)
word2vec_convert_quake_l, num_q_l = word2vec_convert(l_quake_l, w2v_quake)

l_quake_m = preProcess(df_quake_med)
word2vec_convert_quake_m, num_q_m = word2vec_convert(l_quake_m, w2v_quake)

l_quake_h = preProcess(df_quake_high)
word2vec_convert_quake_h, num_q_h = word2vec_convert(l_quake_h, w2v_quake)

t_quake_low = to_categorical(df_quake_low['Priority'])
t_quake_med = to_categorical(df_quake_med['Priority'])
t_quake_high = to_categorical(df_quake_high['Priority'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
  from ipykernel import kernelapp as app


In [415]:
model = keras.Sequential()                    
model.add(LSTM(8, input_shape = (word2vec_convert_quake_l.shape[1], word2vec_convert_quake_l.shape[2]),
              return_sequences = False))
model.add(Dense(1,activation='sigmoid')) 

In [416]:
model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['acc', keras.metrics.Precision(), keras.metrics.Recall(), keras.metrics.AUC()])

In [417]:
history = model.fit(word2vec_convert_quake_l , t_quake_low , batch_size=5, epochs=50, validation_split=0.3, shuffle=True)

Train on 35 samples, validate on 15 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50


Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [401]:
y_pred = model.predict(history.validation_data[0])

In [402]:
y_pred

array([[0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317],
       [0.56220317]], dtype=float32)

In [287]:
word2vec_convert_quake_l.shape

(50, 34, 100)

In [339]:
word2vec_convert_quake_l[1][0].shape

(100,)

In [355]:
df_quake_low[df_quake_low['Priority'] == 'Critical'].shape

(25, 2)

In [358]:
history.on_train_batch_begin

<bound method Callback.on_train_batch_begin of <keras.callbacks.callbacks.History object at 0x000001D269253BA8>>