In [132]:
import os
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
from nltk import bigrams

## Reading in the Data:

We have used all the tweets from 2018 (train and test) to create the embeddings. However, balanced datasets have been used to train and test the model. The following code reads in all the data. Note, we are only considering earthquakes and floods in the following sections and we will build separate models for each earthquakes and floods. 

In [2]:
os.chdir('../10_Data/30_Balanced Tweets (Crit = Non-Crit)/10_2018 Train')

In [3]:
df_e1 = pd.read_csv('earthquake_TREC_2018_train_BALANCED.csv')
df_f1 = pd.read_csv('flood_TREC_2018_train_BALANCED.csv')

In [5]:
os.chdir('../15_2018 Test')

In [6]:
df_e2 = pd.read_csv('earthquake_TREC_2018_test_BALANCED.csv')
df_f2 = pd.read_csv('flood_TREC_2018_test_BALANCED.csv')

In [7]:
os.chdir('../../20_Extracted Tweets/10_2018 Train')

In [9]:
df_e1_embed = pd.read_csv('Earthquake_TREC_2018_train.csv')
df_f1_embed = pd.read_csv('flood_TREC_2018_train.csv')

In [10]:
os.chdir('../15_2018 Test')

In [12]:
df_e2_embed = pd.read_csv('Earthquake_TREC_2018_test.csv')
df_f2_embed = pd.read_csv('Floods_TREC_2018_test.csv')

## Combining dataframes
We are now combine earthquake tweets into one df and flood tweets into another df. We have subsetting on the tweets and priority columns. We do this for both the embedding dataframes and our balanced datasets, although we will not use the balanced datasets until later.

In [111]:
df_quake_e = pd.DataFrame()
df_quake_e['Tweet'] = pd.concat([df_e1_embed['Tweet'] , df_e2_embed['Tweet']])
df_quake_e['Priority'] = pd.concat([df_e1_embed['Priority'] , df_e2_embed['Priority']])

In [112]:
df_flood_e = pd.DataFrame()
df_flood_e['Tweet'] = pd.concat([df_f1_embed['Tweet'] , df_f2_embed['Tweet']])
df_flood_e['Priority'] = pd.concat([df_f1_embed['Priority'] , df_f2_embed['Priority']])

In [113]:
df_quake = pd.DataFrame()
df_quake['Tweet'] = pd.concat([df_e1['Tweet'] , df_e2['Tweet']])
df_quake['Priority'] = pd.concat([df_e1['Priority'] , df_e2['Priority']])

In [114]:
df_flood = pd.DataFrame()
df_flood['Tweet'] = pd.concat([df_f1['Tweet'] , df_f2['Tweet']])
df_flood['Priority'] = pd.concat([df_f1['Priority'] , df_f2['Priority']])

Cross checking the shapes to make sure they match. **They do match**

In [115]:
df_quake_e.shape, df_flood_e.shape, df_quake.shape, df_flood.shape

((5140, 2), (2518, 2), (50, 2), (60, 2))

## Converting to categorical (0 & 1)

We will now define a function to convert the priority to a categorical 0 & 1. This will be necessary when we train a model.

In [116]:
'''
Creating a categorical variable to keep label critical tweets as 1 and 0 otherwise
'''
def to_categorical(df_c):
    t = []
    for element in df_c['Priority']:
        if element =='Critical':
            t.append(1)
        else:
            t.append(0)
        
    t = np.array(t)
    df_c['Target'] = t

    df_c['Target'] = df_c['Target'].astype('category')
    t = df_c['Target']
    del df_c['Target']
    return (t)

t_quake_e = to_categorical(df_quake_e)
t_flood_e = to_categorical(df_flood_e)
t_quake = to_categorical(df_quake)
t_flood = to_categorical(df_flood)

## Pre-Processing

We shall now perform a series of pre processing to our tweets. This includes tokenizing them, removing stop words and lemmatising them.  

**I also wrote a line of code to remove the links in the tweets as it was throwing in a lot of gibberish into the emebddings.** The link is almost always at the end of the tweet so its relatively easy to remove. 

I left the punctuation marks in. We have to take a call on this later.

In [203]:
def preProcess(df):
    df['Tweet'] = df['Tweet'].astype('str')
    
    df['Tweet'] = df['Tweet'].apply(lambda x: re.split('http:\/\/.*', str(x))[0])
    
    token_array = []
    #for tweet in df['Tweet']:
    #    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    #    tweet = tweet.translate(translator)
    #    token_tweet = word_tokenize(tweet)
    #    token_array.append(token_tweet)
    
    for tweet in df['Tweet']:
        token_tweet = word_tokenize(tweet)
        token_array.append(token_tweet)
        
    stop_words=set(stopwords.words("english"))
    filtered_token_array=[]
    for tweet in token_array:
        filtered_tweet = []
        for word in tweet:
                if word not in stop_words:
                    filtered_tweet.append(word)
        filtered_token_array.append(filtered_tweet)
        
    lem = WordNetLemmatizer()

    lemmatized_array=[]
    for tweet in filtered_token_array:
        lemmatized_tweet = []
        for word in tweet:
            lemmatized_tweet.append(lem.lemmatize(word,'v'))
        lemmatized_array.append(lemmatized_tweet)
    
    lemmatized_array_join = []
    for element in lemmatized_array:
        lemmatized_array_join.append(' '.join(element))
        
    return (lemmatized_array_join)


The tweets have been pre processed and we now have a collection of tweets. Word2Vec takes in a list of words together contained in a bigger list (Corpus -> List of Tweet -> List of Words).

In [233]:
l_quake_e = preProcess(df_quake_e)
l_flood_e = preProcess(df_flood_e)

list_of_words_quake = []
list_of_words_flood = []

for tweet in l_quake_e:
        token_tweet = word_tokenize(tweet)
        list_of_words_quake.append(token_tweet)
        
for tweet in l_flood_e:
        token_tweet = word_tokenize(tweet)
        list_of_words_flood.append(token_tweet)
        

In [235]:
unique_words

{'mt',
 'DavidBCohen1',
 'Seventeen',
 'todayâ€™',
 'fact',
 'company',
 'electric',
 'Dhobighat',
 'GMT',
 'Keeping',
 'whoa',
 '5.4',
 '25M',
 'PrayforHumanity',
 'traumatise',
 'Mayweather',
 'UFOS',
 'bbc',
 'narcissistic',
 '1457',
 'Earthquakeâ€™s',
 '//t.co/rozuJTcPES',
 'M6.5',
 'LORD',
 '//t.co/RxBGlkCFHp',
 'FATA',
 '..kindly',
 '8.2-magnitude',
 'astroturfing',
 'peril',
 '85',
 'Sikkim',
 'Aaa',
 'Battle',
 'UK',
 'garment',
 '16000',
 'race',
 'inadequate',
 'PHOTO',
 'threaten',
 'wheeze',
 'Relieved',
 '//t.co/dWGNkHy7jA',
 'fullest',
 'migrant',
 'kr',
 'idiotic',
 'ho',
 'Thank',
 'though',
 'trans',
 'soldier',
 '//t.co/nXzZ1CDtQl',
 '//t.co/nAyUj3WQ3h',
 'READERS',
 'OttoPerezMolina',
 'AvaAddams',
 'families',
 '//t.co/ybbSlOjnzM',
 'monetary',
 'erthquake',
 'Premier',
 'ArmadaHotel',
 'kishorenepal',
 'lonvining',
 'dekho',
 'Afraid',
 '//t.co/prIJTH3AEv',
 'redcrossnepal',
 'Philippines',
 'panelists',
 'global',
 '//t.co/fDXUccjgMw',
 'wipe',
 'mobiles',
 'road'

## Creating a Word2Vec model

We will now initialise a word2vec model. This takes in a paramter 'min_count' which is the number of minimum occurences of a word required for it be included in the embeddings. **For now we set min_count = 1**. I did this because we are creating our own embeddings (not picking from wikipedia embeddings etc) and our data is not that huge. 

In [264]:
'''
min_count = int - Ignores all words with total absolute frequency lower than this - (2, 100)

window = int - The maximum distance between the current and predicted word within a sentence. 
        E.g. window words on the left and window words on the left of our target - (2, 10)

size = int - Dimensionality of the feature vectors. - (50, 300)

sample = float - The threshold for configuring which higher-frequency words are randomly downsampled. 
        Highly influencial. - (0, 1e-5)

alpha = float - The initial learning rate - (0.01, 0.05)

min_alpha = float - Learning rate will linearly drop to min_alpha as training progresses. 
        To set it: alpha - (min_alpha * epochs) ~ 0.00

negative = int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" 
        should be drown. If set to 0, no negative sampling is used. - (5, 20)
        
workers = int - Use these many worker threads to train the model (=faster training with multicore machines)
'''
w2v_quake = Word2Vec(min_count=1,
                     window=4,
                     size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)
w2v_flood = Word2Vec(min_count=1,
                     window=4,
                     size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)

We will now build our vocabulary. Do not run the following code multiple times before initialising the w2v instance again from above.

In [265]:
w2v_quake.build_vocab(list_of_words_quake)
w2v_flood.build_vocab(list_of_words_flood)

INFO - 14:01:04: collecting all words and their counts
INFO - 14:01:04: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 14:01:04: collected 12305 word types from a corpus of 64955 raw words and 5140 sentences
INFO - 14:01:04: Loading a fresh vocabulary
INFO - 14:01:04: effective_min_count=1 retains 12305 unique words (100% of original 12305, drops 0)
INFO - 14:01:04: effective_min_count=1 leaves 64955 word corpus (100% of original 64955, drops 0)
INFO - 14:01:04: deleting the raw counts dictionary of 12305 items
INFO - 14:01:04: sample=6e-05 downsamples 716 most-common words
INFO - 14:01:04: downsampling leaves estimated 31037 word corpus (47.8% of prior 64955)
INFO - 14:01:04: estimated required memory for 12305 words and 100 dimensions: 15996500 bytes
INFO - 14:01:04: resetting layer weights
INFO - 14:01:07: collecting all words and their counts
INFO - 14:01:07: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 14:01:07: collected 8305 

The following code is to view the vocabulary that we created

In [266]:
vocabulary_quake = w2v_quake.wv.vocab
vocabulary_flood = w2v_flood.wv.vocab
vocabulary_quake

{'ã€�': <gensim.models.keyedvectors.Vocab at 0x1c714cd0160>,
 '#': <gensim.models.keyedvectors.Vocab at 0x1c71142a358>,
 'USGS': <gensim.models.keyedvectors.Vocab at 0x1c71142a128>,
 'Breakingã€': <gensim.models.keyedvectors.Vocab at 0x1c714f5dc88>,
 '‘': <gensim.models.keyedvectors.Vocab at 0x1c714f5d390>,
 'M': <gensim.models.keyedvectors.Vocab at 0x1c714f5df28>,
 '1.1': <gensim.models.keyedvectors.Vocab at 0x1c714f5d860>,
 ',': <gensim.models.keyedvectors.Vocab at 0x1c714f5df60>,
 '28km': <gensim.models.keyedvectors.Vocab at 0x1c714f5d9e8>,
 'SSW': <gensim.models.keyedvectors.Vocab at 0x1c714f5de10>,
 'Fairbanks': <gensim.models.keyedvectors.Vocab at 0x1c714f5d828>,
 'Alaska': <gensim.models.keyedvectors.Vocab at 0x1c714f5db70>,
 'Earthquake': <gensim.models.keyedvectors.Vocab at 0x1c714f5d748>,
 '4.8': <gensim.models.keyedvectors.Vocab at 0x1c714f5d7f0>,
 'south': <gensim.models.keyedvectors.Vocab at 0x1c714f5da58>,
 'Bali': <gensim.models.keyedvectors.Vocab at 0x1c714f5d898>,
 'In

In [267]:
w2v_quake.train(list_of_words_quake, total_examples=w2v_quake.corpus_count, epochs=30, report_delay=1)
w2v_flood.train(list_of_words_flood, total_examples=w2v_quake.corpus_count, epochs=30, report_delay=1)

INFO - 14:01:11: training model with 3 workers on 12305 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=4
INFO - 14:01:11: worker thread finished; awaiting finish of 2 more threads
INFO - 14:01:11: worker thread finished; awaiting finish of 1 more threads
INFO - 14:01:11: worker thread finished; awaiting finish of 0 more threads
INFO - 14:01:11: EPOCH - 1 : training on 64955 raw words (31026 effective words) took 0.1s, 220733 effective words/s
INFO - 14:01:11: worker thread finished; awaiting finish of 2 more threads
INFO - 14:01:11: worker thread finished; awaiting finish of 1 more threads
INFO - 14:01:11: worker thread finished; awaiting finish of 0 more threads
INFO - 14:01:11: EPOCH - 2 : training on 64955 raw words (31021 effective words) took 0.1s, 381303 effective words/s
INFO - 14:01:11: worker thread finished; awaiting finish of 2 more threads
INFO - 14:01:11: worker thread finished; awaiting finish of 1 more threads
INFO - 14:01:11: worker thread 

INFO - 14:01:13: EPOCH - 24 : training on 64955 raw words (31122 effective words) took 0.1s, 418902 effective words/s
INFO - 14:01:13: worker thread finished; awaiting finish of 2 more threads
INFO - 14:01:13: worker thread finished; awaiting finish of 1 more threads
INFO - 14:01:13: worker thread finished; awaiting finish of 0 more threads
INFO - 14:01:13: EPOCH - 25 : training on 64955 raw words (31043 effective words) took 0.1s, 374018 effective words/s
INFO - 14:01:13: worker thread finished; awaiting finish of 2 more threads
INFO - 14:01:13: worker thread finished; awaiting finish of 1 more threads
INFO - 14:01:13: worker thread finished; awaiting finish of 0 more threads
INFO - 14:01:13: EPOCH - 26 : training on 64955 raw words (30973 effective words) took 0.1s, 383275 effective words/s
INFO - 14:01:13: worker thread finished; awaiting finish of 2 more threads
INFO - 14:01:13: worker thread finished; awaiting finish of 1 more threads
INFO - 14:01:13: worker thread finished; await

INFO - 14:01:15: worker thread finished; awaiting finish of 1 more threads
INFO - 14:01:15: worker thread finished; awaiting finish of 0 more threads
INFO - 14:01:15: EPOCH - 14 : training on 43113 raw words (19899 effective words) took 0.1s, 394685 effective words/s
INFO - 14:01:15: worker thread finished; awaiting finish of 2 more threads
INFO - 14:01:15: worker thread finished; awaiting finish of 1 more threads
INFO - 14:01:15: worker thread finished; awaiting finish of 0 more threads
INFO - 14:01:15: EPOCH - 15 : training on 43113 raw words (19906 effective words) took 0.1s, 363280 effective words/s
INFO - 14:01:15: worker thread finished; awaiting finish of 2 more threads
INFO - 14:01:15: worker thread finished; awaiting finish of 1 more threads
INFO - 14:01:15: worker thread finished; awaiting finish of 0 more threads
INFO - 14:01:15: EPOCH - 16 : training on 43113 raw words (19968 effective words) took 0.0s, 440444 effective words/s
INFO - 14:01:15: worker thread finished; await

(599970, 1293390)

The following code allows us to see words closely related (positively or negatively) with any other word. In this case we put in the word 'help'.

In [268]:
w2v_quake.wv.most_similar(positive=["dead"])

INFO - 14:01:20: precomputing L2-norms of word weight vectors


[('northern', 0.9993839263916016),
 ('rock', 0.9992939233779907),
 ('kill', 0.9991955757141113),
 ('A', 0.9991101026535034),
 ('7.9', 0.9990882873535156),
 ('Strong', 0.9990764260292053),
 ('Italy', 0.9990614652633667),
 ('Magnitude', 0.9990595579147339),
 ('Powerful', 0.9990211725234985),
 ('cnnbrk', 0.9990118741989136)]

We will now convert these word2vec embeddings into a pandas dataframe. This can be converted into a csv for use later in maybe R. We will also explore ways to use this in an SVM model to classify our tweets. 

In [269]:
word_list_quake = []
vector_list_quake = []
for word in w2v_quake.wv.vocab:
    word_list_quake.append(word)
    vector_list_quake.append(w2v_quake.wv.get_vector(word))
    
word_list_flood = []
vector_list_flood = []
for word in w2v_flood.wv.vocab:
    word_list_flood.append(word)
    vector_list_flood.append(w2v_flood.wv.get_vector(word))

In [270]:
quake_embeddings = pd.DataFrame()
quake_embeddings['Word'] = word_list_quake
quake_embeddings['Vector'] = vector_list_quake

In [271]:
flood_embeddings = pd.DataFrame()
flood_embeddings['Word'] = word_list_flood
flood_embeddings['Vector'] = vector_list_flood

In [275]:
#quake_embeddings.to_csv('Word2Vec_Earthquake.csv')
#flood_embeddings.to_csv('Word2Vec_Flood.csv')

## SVM Classifer