In [1]:
import os
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
from nltk import bigrams

In [2]:
os.getcwd()

'C:\\Users\\Vishaal\\Documents\\GitHub\\TREC_Distributed_Machine_Learning\\TREC\\30_Models'

## Reading in the Data:

We have used all the tweets from 2018 (train and test) to create the embeddings. However, balanced datasets have been used to train and test the model. The following code reads in all the data. Note, we are only considering earthquakes and floods in the following sections and we will build separate models for each earthquakes and floods. 

In [3]:
os.chdir('../10_Data/30_Balanced Tweets (Crit = High = Medium = Low)/10_2018 Train')

In [4]:
df_e1 = pd.read_csv('earthquake_TREC_2018_train_BALANCED.csv')
df_f1 = pd.read_csv('flood_TREC_2018_train_BALANCED.csv')

In [5]:
os.chdir('../15_2018 Test')

In [6]:
df_e2 = pd.read_csv('earthquake_TREC_2018_test_BALANCED.csv')
df_f2 = pd.read_csv('flood_TREC_2018_test_BALANCED.csv')

In [7]:
os.chdir('../../20_Extracted Tweets/10_2018 Train')

In [8]:
df_e1_embed = pd.read_csv('Earthquake_TREC_2018_train.csv')
df_f1_embed = pd.read_csv('flood_TREC_2018_train.csv')

In [9]:
os.chdir('../15_2018 Test')

In [10]:
df_e2_embed = pd.read_csv('Earthquake_TREC_2018_test.csv')
df_f2_embed = pd.read_csv('Floods_TREC_2018_test.csv')

## Combining dataframes
We are now combine earthquake tweets into one df and flood tweets into another df. We have subsetting on the tweets and priority columns. We do this for both the embedding dataframes and our balanced datasets, although we will not use the balanced datasets until later.

In [11]:
df_quake_e = pd.DataFrame()
df_quake_e['Tweet'] = pd.concat([df_e1_embed['Tweet'] , df_e2_embed['Tweet']])
df_quake_e['Priority'] = pd.concat([df_e1_embed['Priority'] , df_e2_embed['Priority']])

In [12]:
df_flood_e = pd.DataFrame()
df_flood_e['Tweet'] = pd.concat([df_f1_embed['Tweet'] , df_f2_embed['Tweet']])
df_flood_e['Priority'] = pd.concat([df_f1_embed['Priority'] , df_f2_embed['Priority']])

In [13]:
df_quake = pd.DataFrame()
df_quake['Tweet'] = pd.concat([df_e1['Tweet'] , df_e2['Tweet']])
df_quake['Priority'] = pd.concat([df_e1['Priority'] , df_e2['Priority']])

In [14]:
df_flood = pd.DataFrame()
df_flood['Tweet'] = pd.concat([df_f1['Tweet'] , df_f2['Tweet']])
df_flood['Priority'] = pd.concat([df_f1['Priority'] , df_f2['Priority']])

Cross checking the shapes to make sure they match. **They do match**

In [15]:
df_quake_e.shape, df_flood_e.shape, df_quake.shape, df_flood.shape

((5140, 2), (2518, 2), (100, 2), (120, 2))

## Converting to categorical (0 & 1)

We will now define a function to convert the priority to a categorical 0 & 1. This will be necessary when we train a model.

In [16]:
'''
Creating a categorical variable to keep label critical tweets as 1 and 0 otherwise
'''
def to_categorical(df_c):
    t = []
    for element in df_c['Priority']:
        if element =='Critical':
            t.append(1)
        else:
            t.append(0)
        
    t = np.array(t)
    df_c['Target'] = t

    df_c['Target'] = df_c['Target'].astype('category')
    t = df_c['Target']
    del df_c['Target']
    return (t)

t_quake_e = to_categorical(df_quake_e)
t_flood_e = to_categorical(df_flood_e)
t_quake = to_categorical(df_quake)
t_flood = to_categorical(df_flood)

## Pre-Processing

We shall now perform a series of pre processing to our tweets. This includes tokenizing them, removing stop words and lemmatising them.  

**I also wrote a line of code to remove the links in the tweets as it was throwing in a lot of gibberish into the emebddings.** The link is almost always at the end of the tweet so its relatively easy to remove. 

I left the punctuation marks in. We have to take a call on this later.

In [52]:
def preProcess(df):
    df['Tweet'] = df['Tweet'].astype('str')
    
    df['Tweet'] = df['Tweet'].apply(lambda x: re.split('http?s *: *\/\/.*', str(x))[0])
    
    token_array = []
    #for tweet in df['Tweet']:
    #    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    #    tweet = tweet.translate(translator)
    #    token_tweet = word_tokenize(tweet)
    #    token_array.append(token_tweet)
    
    for tweet in df['Tweet']:
        token_tweet = word_tokenize(tweet)
        token_array.append(token_tweet)
        
    stop_words=set(stopwords.words("english"))
    filtered_token_array=[]
    for tweet in token_array:
        filtered_tweet = []
        for word in tweet:
                if word not in stop_words:
                    filtered_tweet.append(word)
        filtered_token_array.append(filtered_tweet)
        
    lem = WordNetLemmatizer()

    lemmatized_array=[]
    for tweet in filtered_token_array:
        lemmatized_tweet = []
        for word in tweet:
            lemmatized_tweet.append(lem.lemmatize(word,'v'))
        lemmatized_array.append(lemmatized_tweet)
    
    lemmatized_array_join = []
    for element in lemmatized_array:
        lemmatized_array_join.append(' '.join(element))
        
    return (lemmatized_array_join)


The tweets have been pre processed and we now have a collection of tweets. Word2Vec takes in a list of words together contained in a bigger list (Corpus -> List of Tweet -> List of Words).

In [53]:
l_quake_e = preProcess(df_quake_e)
l_flood_e = preProcess(df_flood_e)

list_of_words_quake = []
list_of_words_flood = []

for tweet in l_quake_e:
        token_tweet = word_tokenize(tweet)
        list_of_words_quake.append(token_tweet)
        
for tweet in l_flood_e:
        token_tweet = word_tokenize(tweet)
        list_of_words_flood.append(token_tweet)
        

## Creating a Word2Vec model

We will now initialise a word2vec model. This takes in a paramter 'min_count' which is the number of minimum occurences of a word required for it be included in the embeddings. **For now we set min_count = 1**. I did this because we are creating our own embeddings (not picking from wikipedia embeddings etc) and our data is not that huge. 

In [54]:
'''
min_count = int - Ignores all words with total absolute frequency lower than this - (2, 100)

window = int - The maximum distance between the current and predicted word within a sentence. 
        E.g. window words on the left and window words on the left of our target - (2, 10)

size = int - Dimensionality of the feature vectors. - (50, 300)

sample = float - The threshold for configuring which higher-frequency words are randomly downsampled. 
        Highly influencial. - (0, 1e-5)

alpha = float - The initial learning rate - (0.01, 0.05)

min_alpha = float - Learning rate will linearly drop to min_alpha as training progresses. 
        To set it: alpha - (min_alpha * epochs) ~ 0.00

negative = int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" 
        should be drown. If set to 0, no negative sampling is used. - (5, 20)
        
workers = int - Use these many worker threads to train the model (=faster training with multicore machines)
'''
w2v_quake = Word2Vec(min_count=1,
                     window=4,
                     size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)
w2v_flood = Word2Vec(min_count=1,
                     window=4,
                     size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)

We will now build our vocabulary. Do not run the following code multiple times before initialising the w2v instance again from above.

In [55]:
w2v_quake.build_vocab(list_of_words_quake)
w2v_flood.build_vocab(list_of_words_flood)

INFO - 11:02:00: collecting all words and their counts
INFO - 11:02:00: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:02:00: collected 11096 word types from a corpus of 61156 raw words and 5140 sentences
INFO - 11:02:00: Loading a fresh vocabulary
INFO - 11:02:00: effective_min_count=1 retains 11096 unique words (100% of original 11096, drops 0)
INFO - 11:02:00: effective_min_count=1 leaves 61156 word corpus (100% of original 61156, drops 0)
INFO - 11:02:00: deleting the raw counts dictionary of 11096 items
INFO - 11:02:00: sample=6e-05 downsamples 771 most-common words
INFO - 11:02:00: downsampling leaves estimated 29229 word corpus (47.8% of prior 61156)
INFO - 11:02:00: estimated required memory for 11096 words and 100 dimensions: 14424800 bytes
INFO - 11:02:00: resetting layer weights
INFO - 11:02:02: collecting all words and their counts
INFO - 11:02:02: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:02:02: collected 8277 

The following code is to view the vocabulary that we created

In [56]:
vocabulary_quake = w2v_quake.wv.vocab
vocabulary_flood = w2v_flood.wv.vocab
vocabulary_quake

{'ã€�': <gensim.models.keyedvectors.Vocab at 0x228e98ebfd0>,
 '#': <gensim.models.keyedvectors.Vocab at 0x228ee6189b0>,
 'USGS': <gensim.models.keyedvectors.Vocab at 0x228ee618c50>,
 'Breakingã€': <gensim.models.keyedvectors.Vocab at 0x228ee615b00>,
 '‘': <gensim.models.keyedvectors.Vocab at 0x228ee615748>,
 'M': <gensim.models.keyedvectors.Vocab at 0x228ee6156a0>,
 '1.1': <gensim.models.keyedvectors.Vocab at 0x228ee615c18>,
 ',': <gensim.models.keyedvectors.Vocab at 0x228ee615860>,
 '28km': <gensim.models.keyedvectors.Vocab at 0x228ee615eb8>,
 'SSW': <gensim.models.keyedvectors.Vocab at 0x228ee615048>,
 'Fairbanks': <gensim.models.keyedvectors.Vocab at 0x228ee6154e0>,
 'Alaska': <gensim.models.keyedvectors.Vocab at 0x228ee6154a8>,
 'Earthquake': <gensim.models.keyedvectors.Vocab at 0x228ee6158d0>,
 '4.8': <gensim.models.keyedvectors.Vocab at 0x228ee615d30>,
 'south': <gensim.models.keyedvectors.Vocab at 0x228ee615c88>,
 'Bali': <gensim.models.keyedvectors.Vocab at 0x228ee615d68>,
 'In

In [57]:
w2v_quake.train(list_of_words_quake, total_examples=w2v_quake.corpus_count, epochs=30, report_delay=1)
w2v_flood.train(list_of_words_flood, total_examples=w2v_quake.corpus_count, epochs=30, report_delay=1)

INFO - 11:02:04: training model with 3 workers on 11096 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=4
INFO - 11:02:04: worker thread finished; awaiting finish of 2 more threads
INFO - 11:02:04: worker thread finished; awaiting finish of 1 more threads
INFO - 11:02:04: worker thread finished; awaiting finish of 0 more threads
INFO - 11:02:04: EPOCH - 1 : training on 61156 raw words (29091 effective words) took 0.1s, 226404 effective words/s
INFO - 11:02:04: worker thread finished; awaiting finish of 2 more threads
INFO - 11:02:04: worker thread finished; awaiting finish of 1 more threads
INFO - 11:02:04: worker thread finished; awaiting finish of 0 more threads
INFO - 11:02:04: EPOCH - 2 : training on 61156 raw words (29171 effective words) took 0.1s, 386010 effective words/s
INFO - 11:02:04: worker thread finished; awaiting finish of 2 more threads
INFO - 11:02:04: worker thread finished; awaiting finish of 1 more threads
INFO - 11:02:04: worker thread 

INFO - 11:02:06: EPOCH - 24 : training on 61156 raw words (29340 effective words) took 0.1s, 368265 effective words/s
INFO - 11:02:06: worker thread finished; awaiting finish of 2 more threads
INFO - 11:02:06: worker thread finished; awaiting finish of 1 more threads
INFO - 11:02:06: worker thread finished; awaiting finish of 0 more threads
INFO - 11:02:06: EPOCH - 25 : training on 61156 raw words (29246 effective words) took 0.1s, 446054 effective words/s
INFO - 11:02:06: worker thread finished; awaiting finish of 2 more threads
INFO - 11:02:06: worker thread finished; awaiting finish of 1 more threads
INFO - 11:02:06: worker thread finished; awaiting finish of 0 more threads
INFO - 11:02:06: EPOCH - 26 : training on 61156 raw words (29366 effective words) took 0.1s, 415310 effective words/s
INFO - 11:02:06: worker thread finished; awaiting finish of 2 more threads
INFO - 11:02:06: worker thread finished; awaiting finish of 1 more threads
INFO - 11:02:06: worker thread finished; await

INFO - 11:02:07: worker thread finished; awaiting finish of 1 more threads
INFO - 11:02:07: worker thread finished; awaiting finish of 0 more threads
INFO - 11:02:07: EPOCH - 14 : training on 43000 raw words (19917 effective words) took 0.1s, 353502 effective words/s
INFO - 11:02:07: worker thread finished; awaiting finish of 2 more threads
INFO - 11:02:07: worker thread finished; awaiting finish of 1 more threads
INFO - 11:02:07: worker thread finished; awaiting finish of 0 more threads
INFO - 11:02:07: EPOCH - 15 : training on 43000 raw words (19841 effective words) took 0.0s, 404942 effective words/s
INFO - 11:02:07: worker thread finished; awaiting finish of 2 more threads
INFO - 11:02:07: worker thread finished; awaiting finish of 1 more threads
INFO - 11:02:07: worker thread finished; awaiting finish of 0 more threads
INFO - 11:02:07: EPOCH - 16 : training on 43000 raw words (19965 effective words) took 0.1s, 370136 effective words/s
INFO - 11:02:07: worker thread finished; await

(598113, 1290000)

The following code allows us to see words closely related (positively or negatively) with any other word. In this case we put in the word 'help'.

In [58]:
w2v_quake.wv.most_similar(positive=["dead"])

INFO - 11:02:08: precomputing L2-norms of word weight vectors


[('kill', 0.9996314644813538),
 ('northern', 0.9996073246002197),
 ('A', 0.9995359778404236),
 ('7.4-magnitude', 0.9994767308235168),
 ('report', 0.9994566440582275),
 ('rock', 0.9994409680366516),
 ('Italy', 0.9994329810142517),
 ('powerful', 0.9994298219680786),
 ('6', 0.9993337392807007),
 ('least', 0.9993335008621216)]

We will now convert these word2vec embeddings into a pandas dataframe. This can be converted into a csv for use later in maybe R. We will also explore ways to use this in an SVM model to classify our tweets. 

In [59]:
word_list_quake = []
vector_list_quake = []
for word in w2v_quake.wv.vocab:
    word_list_quake.append(word)
    vector_list_quake.append(w2v_quake.wv.get_vector(word))
    
word_list_flood = []
vector_list_flood = []
for word in w2v_flood.wv.vocab:
    word_list_flood.append(word)
    vector_list_flood.append(w2v_flood.wv.get_vector(word))

In [60]:
quake_embeddings = pd.DataFrame()
quake_embeddings['Word'] = word_list_quake
quake_embeddings['Vector'] = vector_list_quake

In [61]:
flood_embeddings = pd.DataFrame()
flood_embeddings['Word'] = word_list_flood
flood_embeddings['Vector'] = vector_list_flood

In [62]:
quake_embeddings.to_csv('Word2Vec_Earthquake.csv')
flood_embeddings.to_csv('Word2Vec_Flood.csv')

## SVM Classifier

### Creating Word2Vec Vectors for Train Data:

Before we can implement the classifier, we need to convert our sentences (from balanced df) to a word2vec representation using our word2vec model that we trained above. 

In [94]:
def word2vec_convert(all_tweets, model):
    whole_vec = []
    vec = np.zeros(100)
    numw = 0
    for tweet in all_tweets:
        tweet_vec = []
        vec = np.zeros(100)
        for word in tweet:
            try:
                vec = np.add(vec, model[word])
                tweet_vec.append(vec)
                numw = numw + 1
            except:
                pass
        whole_vec.append(tweet_vec)
    return(whole_vec)

Getting the quake data into vector form using our trained embeddings. We also append it to the original df.

In [98]:
l_quake = preProcess(df_quake)
word2vec_convert_quake = word2vec_convert(l_quake, w2v_quake)
df_quake['word2vec'] = word2vec_convert_quake

  # Remove the CWD from sys.path while we load stuff.


Getting the flood data into vector form using our trained embeddings. We also append it to the original df.

In [99]:
l_flood = preProcess(df_flood)
word2vec_convert_flood = word2vec_convert(l_flood, w2v_flood)
df_flood['word2vec'] = word2vec_convert_flood

  # Remove the CWD from sys.path while we load stuff.


In [None]:
X_train, X_val, Y_train, Y_val = model_selection.train_test_split(text_lc, t_lc, test_size=0.2, random_state=100)