In [181]:
import os
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import re
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
from nltk import bigrams
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

In [2]:
os.getcwd()

'C:\\Users\\Vishaal\\Documents\\GitHub\\TREC_Distributed_Machine_Learning\\TREC\\30_Models'

## Reading in the Data:

We have used all the tweets from 2018 (train and test) to create the embeddings. However, balanced datasets have been used to train and test the model. The following code reads in all the data. Note, we are only considering earthquakes and floods in the following sections and we will build separate models for each earthquakes and floods. 

In [3]:
os.chdir('../10_Data/30_Balanced Tweets (Crit = High = Medium = Low)/10_2018 Train')

In [4]:
df_e1 = pd.read_csv('earthquake_TREC_2018_train_BALANCED.csv')
df_f1 = pd.read_csv('flood_TREC_2018_train_BALANCED.csv')

In [5]:
os.chdir('../15_2018 Test')

In [6]:
df_e2 = pd.read_csv('earthquake_TREC_2018_test_BALANCED.csv')
df_f2 = pd.read_csv('flood_TREC_2018_test_BALANCED.csv')

In [7]:
os.chdir('../../20_Extracted Tweets/10_2018 Train')

In [8]:
df_e1_embed = pd.read_csv('Earthquake_TREC_2018_train.csv')
df_f1_embed = pd.read_csv('flood_TREC_2018_train.csv')

In [9]:
os.chdir('../15_2018 Test')

In [10]:
df_e2_embed = pd.read_csv('Earthquake_TREC_2018_test.csv')
df_f2_embed = pd.read_csv('Floods_TREC_2018_test.csv')

## Combining dataframes
We are now combine earthquake tweets into one df and flood tweets into another df. We have subsetting on the tweets and priority columns. We do this for both the embedding dataframes and our balanced datasets, although we will not use the balanced datasets until later.

In [146]:
df_quake_e = pd.DataFrame()
df_quake_e['Tweet'] = pd.concat([df_e1_embed['Tweet'] , df_e2_embed['Tweet']])
df_quake_e['Priority'] = pd.concat([df_e1_embed['Priority'] , df_e2_embed['Priority']])

In [147]:
df_flood_e = pd.DataFrame()
df_flood_e['Tweet'] = pd.concat([df_f1_embed['Tweet'] , df_f2_embed['Tweet']])
df_flood_e['Priority'] = pd.concat([df_f1_embed['Priority'] , df_f2_embed['Priority']])

In [148]:
df_quake = pd.DataFrame()
df_quake['Tweet'] = pd.concat([df_e1['Tweet'] , df_e2['Tweet']])
df_quake['Priority'] = pd.concat([df_e1['Priority'] , df_e2['Priority']])

In [149]:
df_flood = pd.DataFrame()
df_flood['Tweet'] = pd.concat([df_f1['Tweet'] , df_f2['Tweet']])
df_flood['Priority'] = pd.concat([df_f1['Priority'] , df_f2['Priority']])

Cross checking the shapes to make sure they match. **They do match**

In [150]:
df_quake_e.shape, df_flood_e.shape, df_quake.shape, df_flood.shape

((5140, 2), (2518, 2), (100, 2), (120, 2))

## Converting to categorical (0 & 1)

We will now define a function to convert the priority to a categorical 0 & 1. This will be necessary when we train a model.

In [189]:
'''
Creating a categorical variable to keep label critical tweets as 1 and 0 otherwise
'''
def to_categorical(array):
    t = []
    for element in array:
        if element =='Critical':
            t.append(1)
        else:
            t.append(0)
        
    return (t)


## Pre-Processing

We shall now perform a series of pre processing to our tweets. This includes tokenizing them, removing stop words and lemmatising them.  

**I also wrote a line of code to remove the links in the tweets as it was throwing in a lot of gibberish into the emebddings.** The link is almost always at the end of the tweet so its relatively easy to remove. 

I left the punctuation marks in. We have to take a call on this later.

In [151]:
def preProcess(df):
    df['Tweet'] = df['Tweet'].astype('str')
    
    df['Tweet'] = df['Tweet'].apply(lambda x: re.split('http?s *: *\/\/.*', str(x))[0])
    
    token_array = []
    #for tweet in df['Tweet']:
    #    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    #    tweet = tweet.translate(translator)
    #    token_tweet = word_tokenize(tweet)
    #    token_array.append(token_tweet)
    
    for tweet in df['Tweet']:
        token_tweet = word_tokenize(tweet)
        token_array.append(token_tweet)
        
    stop_words=set(stopwords.words("english"))
    filtered_token_array=[]
    for tweet in token_array:
        filtered_tweet = []
        for word in tweet:
                if word not in stop_words:
                    filtered_tweet.append(word)
        filtered_token_array.append(filtered_tweet)
        
    lem = WordNetLemmatizer()

    lemmatized_array=[]
    for tweet in filtered_token_array:
        lemmatized_tweet = []
        for word in tweet:
            lemmatized_tweet.append(lem.lemmatize(word,'v'))
        lemmatized_array.append(lemmatized_tweet)
    
    lemmatized_array_join = []
    for element in lemmatized_array:
        lemmatized_array_join.append(' '.join(element))
        
    return (lemmatized_array_join)


The tweets have been pre processed and we now have a collection of tweets. Word2Vec takes in a list of words together contained in a bigger list (Corpus -> List of Tweet -> List of Words).

In [152]:
l_quake_e = preProcess(df_quake_e)
l_flood_e = preProcess(df_flood_e)

list_of_words_quake = []
list_of_words_flood = []

for tweet in l_quake_e:
        token_tweet = word_tokenize(tweet)
        list_of_words_quake.append(token_tweet)
        
for tweet in l_flood_e:
        token_tweet = word_tokenize(tweet)
        list_of_words_flood.append(token_tweet)
        

## Creating a Word2Vec model

We will now initialise a word2vec model. This takes in a paramter 'min_count' which is the number of minimum occurences of a word required for it be included in the embeddings. **For now we set min_count = 1**. I did this because we are creating our own embeddings (not picking from wikipedia embeddings etc) and our data is not that huge. 

In [153]:
'''
min_count = int - Ignores all words with total absolute frequency lower than this - (2, 100)

window = int - The maximum distance between the current and predicted word within a sentence. 
        E.g. window words on the left and window words on the left of our target - (2, 10)

size = int - Dimensionality of the feature vectors. - (50, 300)

sample = float - The threshold for configuring which higher-frequency words are randomly downsampled. 
        Highly influencial. - (0, 1e-5)

alpha = float - The initial learning rate - (0.01, 0.05)

min_alpha = float - Learning rate will linearly drop to min_alpha as training progresses. 
        To set it: alpha - (min_alpha * epochs) ~ 0.00

negative = int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" 
        should be drown. If set to 0, no negative sampling is used. - (5, 20)
        
workers = int - Use these many worker threads to train the model (=faster training with multicore machines)
'''
w2v_quake = Word2Vec(min_count=1,
                     window=4,
                     size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)
w2v_flood = Word2Vec(min_count=1,
                     window=4,
                     size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)

We will now build our vocabulary. Do not run the following code multiple times before initialising the w2v instance again from above.

In [154]:
w2v_quake.build_vocab(list_of_words_quake)
w2v_flood.build_vocab(list_of_words_flood)

INFO - 11:37:12: collecting all words and their counts
INFO - 11:37:12: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:37:12: collected 13732 word types from a corpus of 70267 raw words and 5140 sentences
INFO - 11:37:12: Loading a fresh vocabulary
INFO - 11:37:12: effective_min_count=1 retains 13732 unique words (100% of original 13732, drops 0)
INFO - 11:37:12: effective_min_count=1 leaves 70267 word corpus (100% of original 70267, drops 0)
INFO - 11:37:12: deleting the raw counts dictionary of 13732 items
INFO - 11:37:12: sample=6e-05 downsamples 669 most-common words
INFO - 11:37:12: downsampling leaves estimated 33474 word corpus (47.6% of prior 70267)
INFO - 11:37:12: estimated required memory for 13732 words and 100 dimensions: 17851600 bytes
INFO - 11:37:12: resetting layer weights
INFO - 11:37:14: collecting all words and their counts
INFO - 11:37:14: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:37:14: collected 9614 

The following code is to view the vocabulary that we created

In [155]:
vocabulary_quake = w2v_quake.wv.vocab
vocabulary_flood = w2v_flood.wv.vocab
vocabulary_quake

{'ã€�': <gensim.models.keyedvectors.Vocab at 0x228ea2f7dd8>,
 '#': <gensim.models.keyedvectors.Vocab at 0x228eed580b8>,
 'USGS': <gensim.models.keyedvectors.Vocab at 0x228eeb2fc88>,
 'Breakingã€': <gensim.models.keyedvectors.Vocab at 0x228eeb09dd8>,
 '‘': <gensim.models.keyedvectors.Vocab at 0x228eeb09f60>,
 'M': <gensim.models.keyedvectors.Vocab at 0x228eeb272b0>,
 '1.1': <gensim.models.keyedvectors.Vocab at 0x228eeb27da0>,
 ',': <gensim.models.keyedvectors.Vocab at 0x228eeb27f98>,
 '28km': <gensim.models.keyedvectors.Vocab at 0x228eeb273c8>,
 'SSW': <gensim.models.keyedvectors.Vocab at 0x228eeb27588>,
 'Fairbanks': <gensim.models.keyedvectors.Vocab at 0x228eeb27ef0>,
 'Alaska': <gensim.models.keyedvectors.Vocab at 0x228eeb27128>,
 'http': <gensim.models.keyedvectors.Vocab at 0x228eeb275c0>,
 ':': <gensim.models.keyedvectors.Vocab at 0x228f36de9b0>,
 '//t.co/hSyciQFM': <gensim.models.keyedvectors.Vocab at 0x228f36de7b8>,
 'PastHour': <gensim.models.keyedvectors.Vocab at 0x228f36de668>

In [156]:
w2v_quake.train(list_of_words_quake, total_examples=w2v_quake.corpus_count, epochs=30, report_delay=1)
w2v_flood.train(list_of_words_flood, total_examples=w2v_quake.corpus_count, epochs=30, report_delay=1)

INFO - 11:37:16: training model with 3 workers on 13732 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=4
INFO - 11:37:16: worker thread finished; awaiting finish of 2 more threads
INFO - 11:37:16: worker thread finished; awaiting finish of 1 more threads
INFO - 11:37:16: worker thread finished; awaiting finish of 0 more threads
INFO - 11:37:16: EPOCH - 1 : training on 70267 raw words (33519 effective words) took 0.1s, 313607 effective words/s
INFO - 11:37:16: worker thread finished; awaiting finish of 2 more threads
INFO - 11:37:16: worker thread finished; awaiting finish of 1 more threads
INFO - 11:37:16: worker thread finished; awaiting finish of 0 more threads
INFO - 11:37:16: EPOCH - 2 : training on 70267 raw words (33454 effective words) took 0.1s, 386996 effective words/s
INFO - 11:37:16: worker thread finished; awaiting finish of 2 more threads
INFO - 11:37:16: worker thread finished; awaiting finish of 1 more threads
INFO - 11:37:16: worker thread 

INFO - 11:37:18: EPOCH - 24 : training on 70267 raw words (33353 effective words) took 0.1s, 399584 effective words/s
INFO - 11:37:18: worker thread finished; awaiting finish of 2 more threads
INFO - 11:37:18: worker thread finished; awaiting finish of 1 more threads
INFO - 11:37:18: worker thread finished; awaiting finish of 0 more threads
INFO - 11:37:18: EPOCH - 25 : training on 70267 raw words (33478 effective words) took 0.1s, 370812 effective words/s
INFO - 11:37:19: worker thread finished; awaiting finish of 2 more threads
INFO - 11:37:19: worker thread finished; awaiting finish of 1 more threads
INFO - 11:37:19: worker thread finished; awaiting finish of 0 more threads
INFO - 11:37:19: EPOCH - 26 : training on 70267 raw words (33414 effective words) took 0.1s, 409780 effective words/s
INFO - 11:37:19: worker thread finished; awaiting finish of 2 more threads
INFO - 11:37:19: worker thread finished; awaiting finish of 1 more threads
INFO - 11:37:19: worker thread finished; await

INFO - 11:37:20: worker thread finished; awaiting finish of 1 more threads
INFO - 11:37:20: worker thread finished; awaiting finish of 0 more threads
INFO - 11:37:20: EPOCH - 14 : training on 48396 raw words (22366 effective words) took 0.1s, 378388 effective words/s
INFO - 11:37:20: worker thread finished; awaiting finish of 2 more threads
INFO - 11:37:20: worker thread finished; awaiting finish of 1 more threads
INFO - 11:37:20: worker thread finished; awaiting finish of 0 more threads
INFO - 11:37:20: EPOCH - 15 : training on 48396 raw words (22200 effective words) took 0.1s, 386631 effective words/s
INFO - 11:37:20: worker thread finished; awaiting finish of 2 more threads
INFO - 11:37:20: worker thread finished; awaiting finish of 1 more threads
INFO - 11:37:20: worker thread finished; awaiting finish of 0 more threads
INFO - 11:37:20: EPOCH - 16 : training on 48396 raw words (22321 effective words) took 0.1s, 416288 effective words/s
INFO - 11:37:20: worker thread finished; await

(668727, 1451880)

The following code allows us to see words closely related (positively or negatively) with any other word. In this case we put in the word 'help'.

In [157]:
w2v_quake.wv.most_similar(positive=["dead"])

INFO - 11:37:21: precomputing L2-norms of word weight vectors


[('Powerful', 0.9995934963226318),
 ('northern', 0.9995595216751099),
 ('kill', 0.9994736909866333),
 ('Italy', 0.9994498491287231),
 ('A', 0.9994382858276367),
 ('7.4-magnitude', 0.9994300603866577),
 ('powerful', 0.9994100332260132),
 ('rock', 0.9993358850479126),
 ('report', 0.9992691874504089),
 ('Reuters', 0.9992547631263733)]

We will now convert these word2vec embeddings into a pandas dataframe. This can be converted into a csv for use later in maybe R. We will also explore ways to use this in an SVM model to classify our tweets. 

In [158]:
word_list_quake = []
vector_list_quake = []
for word in w2v_quake.wv.vocab:
    word_list_quake.append(word)
    vector_list_quake.append(w2v_quake.wv.get_vector(word))
    
word_list_flood = []
vector_list_flood = []
for word in w2v_flood.wv.vocab:
    word_list_flood.append(word)
    vector_list_flood.append(w2v_flood.wv.get_vector(word))

In [159]:
quake_embeddings = pd.DataFrame()
quake_embeddings['Word'] = word_list_quake
quake_embeddings['Vector'] = vector_list_quake

In [160]:
flood_embeddings = pd.DataFrame()
flood_embeddings['Word'] = word_list_flood
flood_embeddings['Vector'] = vector_list_flood

In [133]:
quake_embeddings.to_csv('Word2Vec_Earthquake.csv')
flood_embeddings.to_csv('Word2Vec_Flood.csv')

## SVM Classifier

### Creating Word2Vec Vectors for Train Data:

Before we can implement the classifier, we need to convert our sentences (from balanced df) to a word2vec representation using our word2vec model that we trained above. 

In [161]:
def word2vec_convert(all_tweets, model):
    whole_vec = []
    vec = np.zeros(100)
    numw = 0
    for tweet in all_tweets:
        tweet_vec = []
        vec = np.zeros(100)
        for word in tweet:
            try:
                vec = np.add(vec, model[word])
                tweet_vec.append(vec)
                numw = numw + 1
            except:
                pass
        whole_vec.append(tweet_vec)
    return(whole_vec)

Getting the quake data into vector form using our trained embeddings. We also append it to the original df.

In [162]:
l_quake = preProcess(df_quake)
word2vec_convert_quake = word2vec_convert(l_quake, w2v_quake)
df_quake['word2vec'] = word2vec_convert_quake

  # Remove the CWD from sys.path while we load stuff.


Getting the flood data into vector form using our trained embeddings. We also append it to the original df.

In [163]:
l_flood = preProcess(df_flood)
word2vec_convert_flood = word2vec_convert(l_flood, w2v_flood)
df_flood['word2vec'] = word2vec_convert_flood

  # Remove the CWD from sys.path while we load stuff.


Breaking the quake df into 3 different dfs. Each df with have equal number of (citical, high), (critical, med) and (critical, low} tweets. We will analyse how the model does w.r.t to each of these categories.

In [178]:
df_quake_low = df_quake[(df_quake['Priority'] == 'Critical') | (df_quake['Priority'] == 'Low')]
df_quake_med = df_quake[(df_quake['Priority'] == 'Critical') | (df_quake['Priority'] == 'Medium')]
df_quake_high = df_quake[(df_quake['Priority'] == 'Critical') | (df_quake['Priority'] == 'High')]

Splitting low and critical df into train and val set. 

In [174]:
X_train_q_l, X_val_q_l, Y_train_q_l, Y_val_q_l = train_test_split(df_quake_low['word2vec'], df_quake_low['Priority'], 
                                                  test_size=0.2, random_state=100)

Defining a model. We use SGD Classifer with SVM (hinge loss accomplishes this). We also define a calibrated classifier as SVM
by default does not give use proba. 

In [185]:
clf = SGDClassifier(loss = 'hinge', alpha = 0.001, max_iter=10000, tol=1e-3
                                                       , shuffle = True, learning_rate = 'optimal', penalty='l1')
model = CalibratedClassifierCV(clf)

In [208]:
'''

'''
clf.fit(X_train_q_l, to_categorical(Y_train_q_l))
y_pred_q_l = clf.predict(np.array(X_val_q_l)[0])

ValueError: setting an array element with a sequence.