## Common imports

In [0]:
#based on code accessed on 21/10/2019 from https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456 
import pandas as pd
import numpy as np

## Upload data

In [0]:
! wget -cq https://github.com/ScottJK-20190706/tweet_classifier/blob/master/pickle_files//train_data_formatted.pickle?raw=true
! wget -cq https://github.com/ScottJK-20190706/tweet_classifier/blob/master/pickle_files/eval_data_formatted.pickle?raw=true
train_data = pd.read_pickle('train_data_formatted.pickle?raw=true')
eval_data = pd.read_pickle('eval_data_formatted.pickle?raw=true')
all_data = train_data.append(eval_data).reset_index(drop=True)

## Tokenise each tweet

In [0]:
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

tweet_lines = list()
lines = all_data['Tweet'].values.tolist()

for line in lines:
  tokens = word_tokenize(line)
  tokens = [w.lower() for w in tokens]
  table = str.maketrans('','',string.punctuation)
  stripped = [w.translate(table) for w in tokens]
  words = [word for word in stripped if word.isalpha()]
  stop_words = set(stopwords.words('english'))
  words = [w for w in words if not w in stop_words]
  tweet_lines.append(words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Convert each tweet into array of integers corresponding to each word

In [0]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(tweet_lines)
sequences = tokenizer_obj.texts_to_sequences(tweet_lines)

word_index = tokenizer_obj.word_index
print('found %s unique tokens' % len(word_index))

max_length = len(max(tweet_lines,key=len))
tweet_pad = pad_sequences(sequences, maxlen=max_length)

found 7426 unique tokens


## Obtain the word embeddings

In [0]:
!pip install chakin #word embeddings
import chakin
chakin.search(lang='English')
chakin.download(number=2, save_dir='/tmp/') # select fastText(en)

Collecting chakin
  Downloading https://files.pythonhosted.org/packages/ca/3f/ca2f63451c0ab47970a6ab1d39d96118e70b6e73125529cea767c31368a3/chakin-0.0.8-py3-none-any.whl
Installing collected packages: chakin
Successfully installed chakin-0.0.8
                   Name  Dimension  ... Language    Author
2          fastText(en)        300  ...  English  Facebook
11         GloVe.6B.50d         50  ...  English  Stanford
12        GloVe.6B.100d        100  ...  English  Stanford
13        GloVe.6B.200d        200  ...  English  Stanford
14        GloVe.6B.300d        300  ...  English  Stanford
15       GloVe.42B.300d        300  ...  English  Stanford
16      GloVe.840B.300d        300  ...  English  Stanford
17    GloVe.Twitter.25d         25  ...  English  Stanford
18    GloVe.Twitter.50d         50  ...  English  Stanford
19   GloVe.Twitter.100d        100  ...  English  Stanford
20   GloVe.Twitter.200d        200  ...  English  Stanford
21  word2vec.GoogleNews        300  ...  English 

Test: 100% ||                                      | Time:  0:00:28  45.0 MiB/s


'/tmp/cc.en.300.vec.gz'

In [0]:
import gzip
import shutil
with gzip.open('/tmp/cc.en.300.vec.gz', 'rb') as f_in:
    with open('cc.en.300.vec', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [0]:
#code adapted from https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
def loadFasttextModel(FasttextFile):
    print("Loading Fasttext Model")
    f = open(FasttextFile,'r')
    model = {}
    for line in f:
        splitLine = line.split(' ')
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [0]:
w = loadFasttextModel('cc.en.300.vec')

Loading Fasttext Model
Done. 2000000  words loaded!


## Create the embedding matrix
Integer values in tweet_pad align with rows in embedding_matrix

In [0]:
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words,300))

for word, i in word_index.items():
  if i > num_words:
    continue
  try:
    embedding_vector = w[word]
  except:
    embedding_vector = np.zeros(300)
  #if embedding_vector is not None:
  embedding_matrix[i] = embedding_vector

## Create train and eval

In [0]:
X_train_pad = tweet_pad[:train_data.shape[0]]
y_train = train_data.loc[:,'class'].values
X_test_pad = tweet_pad[train_data.shape[0]:]
y_test = eval_data.loc[:,'class'].values

## Build RNN model

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

model = Sequential()
embedding_layer = Embedding(num_words,
                           300,
                           embeddings_initializer=Constant(embedding_matrix),
                           input_length = max_length,
                           trainable = False)
model.add(embedding_layer)
model.add(GRU(units=64, dropout=0.4, recurrent_dropout=0.4))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())





Using TensorFlow backend.




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 42, 300)           2228100   
_________________________________________________________________
gru_1 (GRU)                  (None, 64)                70080     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,298,245
Trainable params: 70,145
Non-trainable params: 2,228,100
_________________________________________________________________
None


## Train Model

In [0]:
model.fit(X_train_pad, y_train, batch_size=32, epochs=25, validation_data=(X_test_pad,y_test), verbose=2)




Train on 4083 samples, validate on 1021 samples
Epoch 1/25





 - 7s - loss: 0.3624 - acc: 0.8680 - val_loss: 0.2487 - val_acc: 0.8913
Epoch 2/25
 - 5s - loss: 0.2363 - acc: 0.9148 - val_loss: 0.2050 - val_acc: 0.9246
Epoch 3/25
 - 5s - loss: 0.1998 - acc: 0.9251 - val_loss: 0.1960 - val_acc: 0.9236
Epoch 4/25
 - 5s - loss: 0.1901 - acc: 0.9290 - val_loss: 0.1798 - val_acc: 0.9324
Epoch 5/25
 - 5s - loss: 0.1830 - acc: 0.9351 - val_loss: 0.1784 - val_acc: 0.9295
Epoch 6/25
 - 5s - loss: 0.1631 - acc: 0.9363 - val_loss: 0.1726 - val_acc: 0.9344
Epoch 7/25
 - 5s - loss: 0.1610 - acc: 0.9429 - val_loss: 0.1730 - val_acc: 0.9334
Epoch 8/25
 - 5s - loss: 0.1542 - acc: 0.9412 - val_loss: 0.1608 - val_acc: 0.9461
Epoch 9/25
 - 5s - loss: 0.1576 - acc: 0.9400 - val_loss: 0.1566 - val_acc: 0.9432
Epoch 10/25
 - 5s - loss: 0.1551 - acc: 0.9415 - val_loss: 0.1653 - val_acc: 0.9403
Epoch 11/25
 - 5s - loss: 0.1420 - acc: 0.9454 - val_loss: 0.1554 - val_acc: 0.9442
Epoch 12/25
 - 5s - loss: 0.1

<keras.callbacks.History at 0x7fe7d8b94198>

## Evaluation metrics

In [0]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, auc, roc_curve, accuracy_score #metrics

predict_proba = model.predict(x=X_test_pad)
predict_class = predict_proba.copy()
predict_class[predict_class >= 0.5] = 1
predict_class[predict_class < 0.5] = 0

conf_matrix = confusion_matrix(y_test,predict_class) #build confusion matrix
precision = precision_score(y_test,predict_class) #calculate precision
recall = recall_score(y_test,predict_class) #calculate recall
f1 = f1_score(y_test,predict_class) #calculate f1
fpr, tpr, thresholds = roc_curve(y_test,predict_class)
auc_score = auc(fpr, tpr) #calculate auc
accuracy = accuracy_score(y_test,predict_class) #calculate accuracy
print(conf_matrix)
print('precision = ' + str(precision))
print('recall = ' + str(recall))
print('f1 = ' + str(f1))
print('auc = ' + str(auc_score))
print('accuracy = ' + str(accuracy))

metrics = pd.DataFrame({'acc': accuracy, 
                        'auc_sc': auc_score, 
                        'bp': 'rnn', 
                        'f_1': f1,
                        'fn': conf_matrix[1][0],
                        'fp':conf_matrix[0][1],
                        'name':'rnn',
                        'p': precision,
                        'r': recall,
                        'tf': 'rnn_fasttext',
                        'tn': conf_matrix[0][0],
                        'tp': conf_matrix[1][1]},
                        index=[0])

prediction_summary = pd.DataFrame({'tweet_id': eval_data['Tweet ID'].values,
                                   'pred': predict_class.tolist(),
                                   'model': 'rnn_fasttext',
                                   'file': 'eval_data'})


[[820  11]
 [ 41 149]]
precision = 0.93125
recall = 0.7842105263157895
f1 = 0.8514285714285714
auc = 0.8854867312686048
accuracy = 0.9490695396669931


## Authenticate location for saving files

In [0]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location
import tensorflow as tf

FILE_OUTPUT_DIR = 'fasttext_rnn_files'#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'dissertation_bucket' #@param {type:"string"}

if USE_BUCKET:
  FILE_OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, FILE_OUTPUT_DIR)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(FILE_OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(FILE_OUTPUT_DIR)
print('***** File output directory: {} *****'.format(FILE_OUTPUT_DIR))

***** File output directory: gs://dissertation_bucket/fasttext_rnn_files *****


## Save metrics and predictions per tweet

In [0]:
!pip install gcsfs #google cloud storage
metrics.to_csv('gs://dissertation_bucket/fasttext_rnn_files/metrics.csv')
prediction_summary.to_pickle('/tmp/prediction_summary.pickle')
!gsutil cp /tmp/prediction_summary.pickle gs://dissertation_bucket/fasttext_rnn_files/

Collecting gcsfs
[?25l  Downloading https://files.pythonhosted.org/packages/ab/92/0297f2813cb240c52e90f8587420149970565800e019e1b08ef5ad28b6d9/gcsfs-0.3.1.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 3.3MB/s 
Building wheels for collected packages: gcsfs
  Building wheel for gcsfs (setup.py) ... [?25l[?25hdone
  Created wheel for gcsfs: filename=gcsfs-0.3.1-py2.py3-none-any.whl size=17936 sha256=15b843ca52886a3c184d86d627c37e2a78ae9e1e4d6c95dd612ce99be4b97c87
  Stored in directory: /root/.cache/pip/wheels/9d/2b/6f/86954f0d8caa1173841e62bb780dc0f8693bd268e04a267682
Successfully built gcsfs
Installing collected packages: gcsfs
Successfully installed gcsfs-0.3.1
Copying file:///tmp/prediction_summary.pickle [Content-Type=application/octet-stream]...
/ [1 files][ 42.3 KiB/ 42.3 KiB]                                                
Operation completed over 1 objects/42.3 KiB.                                     
