## Common imports

In [0]:
#based on code accessed on 21/10/2019 from https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456 
import pandas as pd
import numpy as np

## Upload data

In [0]:
! wget -cq https://github.com/ScottJK-20190706/tweet_classifier/blob/master/pickle_files//train_data_formatted.pickle?raw=true
! wget -cq https://github.com/ScottJK-20190706/tweet_classifier/blob/master/pickle_files/eval_data_formatted.pickle?raw=true
train_data = pd.read_pickle('train_data_formatted.pickle?raw=true')
eval_data = pd.read_pickle('eval_data_formatted.pickle?raw=true')
all_data = train_data.append(eval_data).reset_index(drop=True)

## Tokenise each tweet

In [4]:
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

tweet_lines = list()
lines = all_data['Tweet'].values.tolist()

for line in lines:
  tokens = word_tokenize(line)
  tokens = [w.lower() for w in tokens]
  table = str.maketrans('','',string.punctuation)
  stripped = [w.translate(table) for w in tokens]
  words = [word for word in stripped if word.isalpha()]
  stop_words = set(stopwords.words('english'))
  words = [w for w in words if not w in stop_words]
  tweet_lines.append(words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Convert each tweet into array of integers corresponding to each word

In [7]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(tweet_lines)
sequences = tokenizer_obj.texts_to_sequences(tweet_lines)

word_index = tokenizer_obj.word_index
print('found %s unique tokens' % len(word_index))

max_length = len(max(tweet_lines,key=len))
tweet_pad = pad_sequences(sequences, maxlen=max_length)

found 7426 unique tokens


## Obtain the word embeddings

In [9]:
!pip install chakin #word embeddings
import chakin
chakin.search(lang='English')
chakin.download(number=21, save_dir='/tmp/') # select word2vec.GoogleNews

Collecting chakin
  Downloading https://files.pythonhosted.org/packages/ca/3f/ca2f63451c0ab47970a6ab1d39d96118e70b6e73125529cea767c31368a3/chakin-0.0.8-py3-none-any.whl
Installing collected packages: chakin
Successfully installed chakin-0.0.8


Test:   0% |-                                      | ETA:   0:01:18  20.1 MiB/s

                   Name  Dimension  ... Language    Author
2          fastText(en)        300  ...  English  Facebook
11         GloVe.6B.50d         50  ...  English  Stanford
12        GloVe.6B.100d        100  ...  English  Stanford
13        GloVe.6B.200d        200  ...  English  Stanford
14        GloVe.6B.300d        300  ...  English  Stanford
15       GloVe.42B.300d        300  ...  English  Stanford
16      GloVe.840B.300d        300  ...  English  Stanford
17    GloVe.Twitter.25d         25  ...  English  Stanford
18    GloVe.Twitter.50d         50  ...  English  Stanford
19   GloVe.Twitter.100d        100  ...  English  Stanford
20   GloVe.Twitter.200d        200  ...  English  Stanford
21  word2vec.GoogleNews        300  ...  English    Google

[12 rows x 7 columns]


Test: 100% ||                                      | Time:  0:00:23  67.8 MiB/s


'/tmp/GoogleNews-vectors-negative300.bin.gz'

In [0]:
import gzip
import shutil
with gzip.open('/tmp/GoogleNews-vectors-negative300.bin.gz', 'rb') as f_in:
    with open('/tmp/GoogleNews-vectors-negative300.bin', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [11]:
from datetime import datetime
print(f'Beginning loading!')
current_time = datetime.now()
from gensim import models
w = models.KeyedVectors.load_word2vec_format('/tmp/GoogleNews-vectors-negative300.bin', binary=True)
print('Loading took time ', datetime.now() - current_time)

Beginning loading!


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Loading took time  0:00:40.634987


## Create the embedding matrix
Integer values in tweet_pad align with rows in embedding_matrix

In [0]:
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words,300))

for word, i in word_index.items():
  if i > num_words:
    continue
  try:
    embedding_vector = w[word]
  except:
    embedding_vector = np.zeros(300)
  #if embedding_vector is not None:
  embedding_matrix[i] = embedding_vector

## Create train and eval

In [0]:
X_train_pad = tweet_pad[:train_data.shape[0]]
y_train = train_data.loc[:,'class'].values
X_test_pad = tweet_pad[train_data.shape[0]:]
y_test = eval_data.loc[:,'class'].values

## Build RNN model

In [90]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

model = Sequential()
embedding_layer = Embedding(num_words,
                           300,
                           embeddings_initializer=Constant(embedding_matrix),
                           input_length = max_length,
                           trainable = False)
model.add(embedding_layer)
model.add(GRU(units=64, dropout=0.4, recurrent_dropout=0.4))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 42, 300)           2228100   
_________________________________________________________________
gru_15 (GRU)                 (None, 64)                70080     
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 65        
Total params: 2,298,245
Trainable params: 70,145
Non-trainable params: 2,228,100
_________________________________________________________________
None


## Train Model

In [91]:
model.fit(X_train_pad, y_train, batch_size=32, epochs=25, validation_data=(X_test_pad,y_test), verbose=2)

Train on 4083 samples, validate on 1021 samples
Epoch 1/25
 - 13s - loss: 0.3404 - acc: 0.8704 - val_loss: 0.2501 - val_acc: 0.8952
Epoch 2/25
 - 5s - loss: 0.2221 - acc: 0.9162 - val_loss: 0.2065 - val_acc: 0.9177
Epoch 3/25
 - 5s - loss: 0.1876 - acc: 0.9324 - val_loss: 0.1742 - val_acc: 0.9373
Epoch 4/25
 - 6s - loss: 0.1754 - acc: 0.9356 - val_loss: 0.1627 - val_acc: 0.9393
Epoch 5/25
 - 5s - loss: 0.1548 - acc: 0.9412 - val_loss: 0.1541 - val_acc: 0.9412
Epoch 6/25
 - 6s - loss: 0.1561 - acc: 0.9388 - val_loss: 0.1430 - val_acc: 0.9422
Epoch 7/25
 - 5s - loss: 0.1423 - acc: 0.9488 - val_loss: 0.1366 - val_acc: 0.9461
Epoch 8/25
 - 6s - loss: 0.1224 - acc: 0.9552 - val_loss: 0.1276 - val_acc: 0.9549
Epoch 9/25
 - 5s - loss: 0.1265 - acc: 0.9552 - val_loss: 0.1172 - val_acc: 0.9579
Epoch 10/25
 - 6s - loss: 0.1117 - acc: 0.9581 - val_loss: 0.1238 - val_acc: 0.9549
Epoch 11/25
 - 6s - loss: 0.1045 - acc: 0.9625 - val_loss: 0.1080 - val_acc: 0.9657
Epoch 12/25
 - 6s - loss: 0.0953 - a

<keras.callbacks.History at 0x7f5e7b3bf2b0>

## Evaluation metrics

In [92]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, auc, roc_curve, accuracy_score #metrics

predict_proba = model.predict(x=X_test_pad)
predict_class = predict_proba.copy()
predict_class[predict_class >= 0.5] = 1
predict_class[predict_class < 0.5] = 0

conf_matrix = confusion_matrix(y_test,predict_class) #build confusion matrix
precision = precision_score(y_test,predict_class) #calculate precision
recall = recall_score(y_test,predict_class) #calculate recall
f1 = f1_score(y_test,predict_class) #calculate f1
fpr, tpr, thresholds = roc_curve(y_test,predict_class)
auc_score = auc(fpr, tpr) #calculate auc
accuracy = accuracy_score(y_test,predict_class) #calculate accuracy
print(conf_matrix)
print('precision = ' + str(precision))
print('recall = ' + str(recall))
print('f1 = ' + str(f1))
print('auc = ' + str(auc_score))
print('accuracy = ' + str(accuracy))

metrics = pd.DataFrame({'acc': accuracy, 
                        'auc_sc': auc_score, 
                        'bp': 'rnn', 
                        'f_1': f1,
                        'fn': conf_matrix[1][0],
                        'fp':conf_matrix[0][1],
                        'name':'rnn',
                        'p': precision,
                        'r': recall,
                        'tf': 'rnn_w2v',
                        'tn': conf_matrix[0][0],
                        'tp': conf_matrix[1][1]},
                        index=[0])

prediction_summary = pd.DataFrame({'tweet_id': eval_data['Tweet ID'].values,
                                   'pred': predict_class.tolist(),
                                   'model': 'rnn_w2v',
                                   'file': 'eval_data'})


[[819  12]
 [ 27 163]]
precision = 0.9314285714285714
recall = 0.8578947368421053
f1 = 0.8931506849315068
auc = 0.9217271518145544
accuracy = 0.9618021547502449


## Authenticate location for saving files

In [45]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location
import tensorflow as tf

FILE_OUTPUT_DIR = 'w2v_rnn_files'#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'dissertation_bucket' #@param {type:"string"}

if USE_BUCKET:
  FILE_OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, FILE_OUTPUT_DIR)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(FILE_OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(FILE_OUTPUT_DIR)
print('***** File output directory: {} *****'.format(FILE_OUTPUT_DIR))

***** File output directory: gs://dissertation_bucket/w2v_rnn_files *****


## Save metrics and predictions per tweet

In [47]:
!pip install gcsfs #google cloud storage
metrics.to_csv('gs://dissertation_bucket/w2v_rnn_files/metrics.csv')
prediction_summary.to_pickle('/tmp/prediction_summary.pickle')
!gsutil cp /tmp/prediction_summary.pickle gs://dissertation_bucket/w2v_rnn_files/

Collecting gcsfs
[?25l  Downloading https://files.pythonhosted.org/packages/ab/92/0297f2813cb240c52e90f8587420149970565800e019e1b08ef5ad28b6d9/gcsfs-0.3.1.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 2.0MB/s 
Building wheels for collected packages: gcsfs
  Building wheel for gcsfs (setup.py) ... [?25l[?25hdone
  Created wheel for gcsfs: filename=gcsfs-0.3.1-py2.py3-none-any.whl size=17936 sha256=87acc7d3107b1e58e1d122e531af2f4b4d74ebf90765ac9c9f0961afad7a1151
  Stored in directory: /root/.cache/pip/wheels/9d/2b/6f/86954f0d8caa1173841e62bb780dc0f8693bd268e04a267682
Successfully built gcsfs
Installing collected packages: gcsfs
Successfully installed gcsfs-0.3.1
Copying file:///tmp/prediction_summary.pickle [Content-Type=application/octet-stream]...
/ [1 files][ 42.3 KiB/ 42.3 KiB]                                                
Operation completed over 1 objects/42.3 KiB.                                     
