In [30]:
from datetime import datetime
from datetime import timedelta
from textblob import TextBlob
import GetOldTweets3 as got
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
from gensim import models
import keras
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import sklearn.metrics
import numpy as np
import pandas as pd
import os
import collections
import re
import string
import csv
from string import punctuation 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from matplotlib.lines import Line2D
%matplotlib inline

In [3]:
# Read the labelled training tweets and the unlabelled test tweets from the working directory.
tweet = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
test.head(3)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."


In [4]:
# One-hot encode the binary `target` column into two indicator columns:
# `disaster` is 1 where target == 1, `not_disaster` is 1 where target == 0.
#
# Fail loudly on unexpected labels. The previous row-by-row loop silently skipped any
# value other than 0/1, which only surfaced later as a length-mismatch error.
assert tweet['target'].isin([0, 1]).all(), "target must contain only 0 or 1"

tweet['disaster'] = (tweet['target'] == 1).astype('int64')
tweet['not_disaster'] = (tweet['target'] == 0).astype('int64')

# Take an explicit copy: later cells add columns to `df`, and writing into a plain
# column-slice of `tweet` triggers pandas' SettingWithCopyWarning.
df = tweet[['id', 'keyword', 'location', 'text', 'target', 'disaster', 'not_disaster']].copy()
df.head()

Unnamed: 0,id,keyword,location,text,target,disaster,not_disaster
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,1,0
2,5,,,All residents asked to 'shelter in place' are ...,1,1,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,1,0


### Data Cleaning

In [5]:
def process(tweet):
    """Normalise a raw tweet string before tokenisation.

    Steps, in order:
      1. lower-case the whole text;
      2. replace every URL (``www.`` or ``http(s)://`` prefix) with the placeholder ``URL``;
      3. replace every @-mention with the placeholder ``AT_USER``;
      4. drop the leading ``#`` of hashtags, keeping the tag text.

    The placeholders are inserted after lower-casing, so they stay upper-case in the result.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The normalised text.
    """
    tweet = tweet.lower()
    # Raw-string patterns: '\.' and '\s' inside a plain string literal are invalid
    # string escapes and emit a DeprecationWarning/SyntaxWarning on modern Python.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # URLs -> placeholder
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # @mentions -> placeholder
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # strip the '#' from hashtags
    return tweet

# Lazily-built set of tokens that tokenize() drops; filled on the first call.
_TOKENIZE_STOPWORDS = None


def tokenize(tweet):
    """Split a cleaned tweet into tokens and drop uninformative ones.

    The text is split with NLTK's ``word_tokenize``. Tokens are then discarded when they
    are an English stop word, a single punctuation character from ``string.punctuation``,
    or one of the placeholders ``AT_USER`` / ``URL``. The comparison is exact
    (case-sensitive) membership in that set.

    The stop-word set is built once and reused: rebuilding it for every tweet re-reads the
    stop-word corpus on each call, which is pure overhead when tokenising thousands of tweets.

    Parameters
    ----------
    tweet : str
        Text to tokenise.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    global _TOKENIZE_STOPWORDS
    if _TOKENIZE_STOPWORDS is None:
        _TOKENIZE_STOPWORDS = frozenset(
            stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL']
        )
    return [word for word in word_tokenize(tweet) if word not in _TOKENIZE_STOPWORDS]

In [6]:
# Normalise the raw text, then tokenise every cleaned tweet.
df['Text_Clean'] = df['text'].apply(process)
filtered_words = list(map(tokenize, df['Text_Clean']))

# Keep both the token lists and their space-joined form.
df['Text_Final'] = [' '.join(tokens) for tokens in filtered_words]
df['tokens'] = filtered_words

In [8]:
# Restrict the frame to the cleaned text, its tokens and the label columns.
model_columns = ['Text_Final', 'tokens', 'target', 'disaster', 'not_disaster']
data_train = df[model_columns]
data_train.head()

Unnamed: 0,Text_Final,tokens,target,disaster,not_disaster
0,deeds reason earthquake may allah forgive us,"[deeds, reason, earthquake, may, allah, forgiv...",1,1,0
1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]",1,1,0
2,residents asked 'shelter place notified office...,"[residents, asked, 'shelter, place, notified, ...",1,1,0
3,"13,000 people receive wildfires evacuation ord...","[13,000, people, receive, wildfires, evacuatio...",1,1,0
4,got sent photo ruby alaska smoke wildfires pou...,"[got, sent, photo, ruby, alaska, smoke, wildfi...",1,1,0


### Split data into train and test 

In [10]:
# Hold out 10% of the labelled tweets for evaluation, using a fixed random_state.
# NOTE: this rebinds `data_train`, so re-running this cell on its own splits the
# already-reduced training frame again.
data_train, data_test = train_test_split(
    data_train, test_size=0.10, random_state=42
)

In [11]:
## Build the training vocabulary; report the total word count, vocabulary size and longest sentence
all_training_words = []
training_sentence_lengths = []
for tokens in data_train["tokens"]:
    all_training_words.extend(tokens)
    training_sentence_lengths.append(len(tokens))

TRAINING_VOCAB = sorted(set(all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

64316 words total, with a vocabulary size of 15091
Max sentence length is 25


In [12]:
# Same statistics for the held-out split: total words, vocabulary size, longest sentence.
all_test_words = []
test_sentence_lengths = []
for tokens in data_test['tokens']:
    all_test_words.extend(tokens)
    test_sentence_lengths.append(len(tokens))

TEST_VOCAB = sorted(set(all_test_words))
print('%s words total, with a vocabulary size of %s' % (len(all_test_words), len(TEST_VOCAB)))
print('Max sentence length is %s' % max(test_sentence_lengths))

7302 words total, with a vocabulary size of 3716
Max sentence length is 24


### Load word2vec, vectorize and get embeddings

In [13]:
# Location of the pre-trained GoogleNews word2vec binary.
# Set the WORD2VEC_PATH environment variable to point at a local copy. The original
# machine-specific path is kept as the fallback so the existing setup keeps working.
word2vec_path = os.environ.get(
    'WORD2VEC_PATH',
    'C:/Users/william.block/Desktop/Machine Learning/machinelearning/NLP/Ferring - twitter sentiment analysis/word2vec/GoogleNews-vectors-negative300.bin.gz',
)
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [14]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Return the element-wise mean of the embedding vectors of ``tokens_list``.

    Parameters
    ----------
    tokens_list : sequence of str
        Tokens of one sentence.
    vector : mapping
        Supports ``word in vector`` and ``vector[word]``; yields one length-``k`` vector per word.
    generate_missing : bool
        When true, a token absent from ``vector`` contributes a fresh ``np.random.rand(k)``
        vector; otherwise it contributes ``np.zeros(k)``.
    k : int
        Embedding dimensionality.

    Returns
    -------
    numpy.ndarray
        The averaged vector, or ``np.zeros(k)`` when ``tokens_list`` is empty.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    # Pick the stand-in generator for out-of-vocabulary tokens once, up front.
    make_missing = np.random.rand if generate_missing else np.zeros

    word_vectors = []
    for token in tokens_list:
        if token in vector:
            word_vectors.append(vector[token])
        else:
            word_vectors.append(make_missing(k))

    return np.sum(word_vectors, axis=0) / len(word_vectors)

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Compute one averaged embedding per row of ``clean_comments``.

    Each entry of the ``'tokens'`` column is passed to ``get_average_word2vec`` together
    with ``vectors`` and ``generate_missing``.

    Returns
    -------
    list
        The averaged vectors, in row order.
    """
    return [
        get_average_word2vec(tokens, vectors, generate_missing=generate_missing)
        for tokens in clean_comments['tokens']
    ]

In [15]:
# Dimensions shared by the padding step, the embedding matrix and the CNN.
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

# Sentence-level averaged embeddings for the training split (random vectors for unknown tokens).
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

### Tokenize and Pad sequences

In [16]:
# Fit the Keras tokenizer on the training text only, then encode each training tweet as word indices.
train_texts = data_train["Text_Final"].tolist()

tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 14441 unique tokens.


In [17]:
# Bring every encoded training tweet to a common length of MAX_SEQUENCE_LENGTH entries.
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [18]:
# Embedding matrix: row `index` holds the vector of the word with that tokenizer index.
# One extra row is allocated beyond the vocabulary size (presumably for index 0 / padding — confirm).
vocab_rows = len(train_word_index) + 1
train_embedding_weights = np.zeros((vocab_rows, EMBEDDING_DIM))

for word, index in train_word_index.items():
    if word in word2vec:
        train_embedding_weights[index, :] = word2vec[word]
    else:
        # Words unknown to word2vec get a random vector.
        train_embedding_weights[index, :] = np.random.rand(EMBEDDING_DIM)

print(train_embedding_weights.shape)

(14442, 300)


In [19]:
# Encode the held-out tweets with the tokenizer fitted on the training split, then pad them.
test_texts = data_test["Text_Final"].tolist()
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

### Train CNN

In [20]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index,
            filter_sizes=(2, 3, 4, 5, 6), num_filters=200):
    """Build and compile a multi-kernel 1D CNN text classifier on frozen embeddings.

    Architecture: a non-trainable embedding layer initialised from ``embeddings`` feeds one
    ``Conv1D`` + ``GlobalMaxPooling1D`` branch per kernel size. The pooled branches are
    concatenated and passed through Dropout(0.1) -> Dense(128, relu) -> Dropout(0.2) ->
    Dense(``labels_index``, sigmoid). The model is compiled with binary cross-entropy,
    the Adam optimizer and the ``acc`` metric, and its summary is printed.

    Parameters
    ----------
    embeddings : array-like
        Initial embedding matrix, passed as the embedding layer's weights.
    max_sequence_length : int
        Length of each (padded) input sequence.
    num_words : int
        Number of rows of the embedding layer.
    embedding_dim : int
        Size of each embedding vector.
    labels_index : int
        Number of output units.
    filter_sizes : iterable of int, optional
        Kernel sizes, one convolution branch each. Defaults to the sizes previously hard-coded.
    num_filters : int, optional
        Filters per convolution branch. Defaults to the previously hard-coded 200.

    Returns
    -------
    keras.models.Model
        The compiled model.

    Raises
    ------
    ValueError
        If ``filter_sizes`` is empty.
    """
    filter_sizes = list(filter_sizes)
    if not filter_sizes:
        raise ValueError("filter_sizes must contain at least one kernel size")

    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=False)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=num_filters, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)

    # A merge layer needs at least two inputs, so a single branch is used directly.
    l_merge = convs[0] if len(convs) == 1 else concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [21]:
# Label columns used as training targets. Their order fixes the column order of the
# target matrix built from them (column 0 = 'disaster', column 1 = 'not_disaster').
label_names = ['disaster', 'not_disaster']

In [22]:
# Target matrix: one column per entry of label_names, in that order.
y_train = data_train[label_names].values

In [23]:
# Aliases for the padded training sequences and their targets, as passed to model.fit.
x_train = train_cnn_data
y_tr = y_train

In [24]:
# Instantiate the CNN: the embedding layer gets one row per tokenizer index plus one
# extra row, and the output layer gets one unit per label column.
model = ConvNet(
    train_embedding_weights,
    MAX_SEQUENCE_LENGTH,
    len(train_word_index) + 1,
    EMBEDDING_DIM,
    len(label_names),
)

W0804 13:14:48.822221  9644 deprecation_wrapper.py:119] From C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0804 13:14:49.438543  9644 deprecation_wrapper.py:119] From C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0804 13:14:49.565546  9644 deprecation_wrapper.py:119] From C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0804 13:14:49.679544  9644 deprecation_wrapper.py:119] From C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\keras\backend\tensorflow_backe

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 300)      4332600     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 49, 200)      120200      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 48, 200)      180200      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [25]:
# Training hyper-parameters consumed by model.fit.
num_epochs = 25
batch_size = 34

In [26]:
# Train the CNN, holding back 25% of the training data for validation.
hist = model.fit(
    x_train,
    y_tr,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.25,
    shuffle=True,
)

Train on 5138 samples, validate on 1713 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


### Test CNN

In [27]:
# Score the held-out split; each row of `predictions` has one value per output unit.
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [28]:
# Map the index of the strongest output unit to a target value:
# unit 0 -> target 1, unit 1 -> target 0.
labels = [1, 0]
prediction_labels = [labels[np.argmax(p)] for p in predictions]

In [32]:
# Per-class precision/recall/F1 of the predicted labels against the held-out `target` column.
print(sklearn.metrics.classification_report(data_test['target'].tolist(), prediction_labels)) 

              precision    recall  f1-score   support

           0       0.78      0.81      0.80       426
           1       0.75      0.71      0.73       336

    accuracy                           0.77       762
   macro avg       0.77      0.76      0.76       762
weighted avg       0.77      0.77      0.77       762



### Submission

In [33]:
# Clean and tokenise the competition test tweets with the same helpers used for training.
test['new_clean_text'] = test['text'].apply(process)
filtered_words = [tokenize(sen) for sen in test['new_clean_text']]

# Replace the cleaned text with its space-joined filtered tokens.
test['new_clean_text'] = [' '.join(sen) for sen in filtered_words]

# Encode with the tokenizer fitted on the training split and pad to the model's input length.
tweet_sequences = tokenizer.texts_to_sequences(test["new_clean_text"].tolist())
tweet_data = pad_sequences(tweet_sequences, maxlen=MAX_SEQUENCE_LENGTH)

predictions = model.predict(tweet_data, batch_size=1024, verbose=1)

# Value of the first output unit for every tweet.
cnn_pred = [p[0] for p in predictions]



In [40]:
# Map the index of the strongest output unit to a target value:
# unit 0 -> target 1, unit 1 -> target 0.
labels = [1, 0]
prediction_labels = [labels[np.argmax(p)] for p in predictions]

# Attach the predicted targets and keep only the columns written to the submission file.
test['target'] = prediction_labels
submission = test[['id', 'target']]
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [41]:
# Write the id/target pairs to the submission file, without the DataFrame index column.
submission.to_csv('cnn_v2_submission.csv', index = False)