## Common imports

In [0]:
#based on code accessed on 21/10/2019 from https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456 
import pandas as pd
import numpy as np

## Upload data

In [0]:
! wget -cq https://github.com/ScottJK-20190706/tweet_classifier/blob/master/pickle_files//train_data_formatted.pickle?raw=true
! wget -cq https://github.com/ScottJK-20190706/tweet_classifier/blob/master/pickle_files/eval_data_formatted.pickle?raw=true
train_data = pd.read_pickle('train_data_formatted.pickle?raw=true')
eval_data = pd.read_pickle('eval_data_formatted.pickle?raw=true')
all_data = train_data.append(eval_data).reset_index(drop=True)

## Tokenise each tweet

In [3]:
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

tweet_lines = list()
lines = all_data['Tweet'].values.tolist()

for line in lines:
  tokens = word_tokenize(line)
  tokens = [w.lower() for w in tokens]
  table = str.maketrans('','',string.punctuation)
  stripped = [w.translate(table) for w in tokens]
  words = [word for word in stripped if word.isalpha()]
  stop_words = set(stopwords.words('english'))
  words = [w for w in words if not w in stop_words]
  tweet_lines.append(words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Convert each tweet into array of integers corresponding to each word

In [5]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(tweet_lines)
sequences = tokenizer_obj.texts_to_sequences(tweet_lines)

word_index = tokenizer_obj.word_index
print('found %s unique tokens' % len(word_index))

max_length = len(max(tweet_lines,key=len))
tweet_pad = pad_sequences(sequences, maxlen=max_length)

found 7426 unique tokens


In [7]:
len(tweet_pad)

5104

## Obtain the word embeddings

In [8]:
!pip install chakin #word embeddings
import chakin
chakin.search(lang='English')
chakin.download(number=16, save_dir='/tmp/') # select GloVe.840B.300d

Collecting chakin
  Downloading https://files.pythonhosted.org/packages/ca/3f/ca2f63451c0ab47970a6ab1d39d96118e70b6e73125529cea767c31368a3/chakin-0.0.8-py3-none-any.whl
Installing collected packages: chakin
Successfully installed chakin-0.0.8
                   Name  Dimension  ... Language    Author
2          fastText(en)        300  ...  English  Facebook
11         GloVe.6B.50d         50  ...  English  Stanford
12        GloVe.6B.100d        100  ...  English  Stanford
13        GloVe.6B.200d        200  ...  English  Stanford
14        GloVe.6B.300d        300  ...  English  Stanford
15       GloVe.42B.300d        300  ...  English  Stanford
16      GloVe.840B.300d        300  ...  English  Stanford
17    GloVe.Twitter.25d         25  ...  English  Stanford
18    GloVe.Twitter.50d         50  ...  English  Stanford
19   GloVe.Twitter.100d        100  ...  English  Stanford
20   GloVe.Twitter.200d        200  ...  English  Stanford
21  word2vec.GoogleNews        300  ...  English 

Test: 100% ||                                      | Time:  0:16:52   2.0 MiB/s


'/tmp/glove.840B.300d.zip'

In [0]:
from zipfile import ZipFile
with ZipFile('/tmp/glove.840B.300d.zip', 'r') as zipObj:
   # Extract all the contents of zip file in current directory
  zipObj.extractall()

In [0]:
#code adapted from https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split(' ')
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [11]:
w = loadGloveModel('glove.840B.300d.txt')

Loading Glove Model
Done. 2196016  words loaded!


## Create the embedding matrix
Integer values in tweet_pad align with rows in embedding_matrix

In [0]:
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words,300))

for word, i in word_index.items():
  if i > num_words:
    continue
  try:
    embedding_vector = w[word]
  except:
    embedding_vector = np.zeros(300)
  #if embedding_vector is not None:
  embedding_matrix[i] = embedding_vector

## 10 stratified splits of the data.  Train model and assess 10 times.

In [26]:
model_run = []
name = []
tn = []
fp = []
fn = []
tp = []
p = []
r = []
f_1 = []
auc_sc = []
acc = []

n = 0

from sklearn.model_selection import StratifiedShuffleSplit
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, auc, roc_curve, accuracy_score #metrics


sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(all_data['Tweet'], all_data['class']):

  ##### CREATE TRAIN AND EVAL #####
  X_train_pad = tweet_pad[train_index]
  y_train = all_data.loc[train_index,'class'].values
  X_test_pad = tweet_pad[test_index]
  y_test = all_data.loc[test_index,'class'].values


  ##### BUILD RNN MODEL #####
  model = Sequential()
  embedding_layer = Embedding(num_words,
                            300,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length = max_length,
                            trainable = False)
  model.add(embedding_layer)
  model.add(GRU(units=64, dropout=0.4, recurrent_dropout=0.4))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  print(model.summary())

  ##### TRAIN MODEL #####
  model.fit(X_train_pad, y_train, batch_size=32, epochs=25, validation_data=(X_test_pad,y_test), verbose=2)

  ##### EVALUATION METRICS #####
  predict_proba = model.predict(x=X_test_pad)
  predict_class = predict_proba.copy()
  predict_class[predict_class >= 0.5] = 1
  predict_class[predict_class < 0.5] = 0

  conf_matrix = confusion_matrix(y_test,predict_class) #build confusion matrix
  precision = precision_score(y_test,predict_class) #calculate precision
  recall = recall_score(y_test,predict_class) #calculate recall
  f1 = f1_score(y_test,predict_class) #calculate f1
  fpr, tpr, thresholds = roc_curve(y_test,predict_class)
  auc_score = auc(fpr, tpr) #calculate auc
  accuracy = accuracy_score(y_test,predict_class) #calculate accuracy
  print(conf_matrix)
  print('precision = ' + str(precision))
  print('recall = ' + str(recall))
  print('f1 = ' + str(f1))
  print('auc = ' + str(auc_score))
  print('accuracy = ' + str(accuracy))

  model_run = np.append(model_run,n)
  name = np.append(name,'rnn_glove')
  tn = np.append(tn,conf_matrix[0][0])
  fp = np.append(fp,conf_matrix[0][1])
  fn = np.append(fn,conf_matrix[1][0])
  tp = np.append(tp,conf_matrix[1][1])
  p = np.append(p,precision)
  r = np.append(r,recall)
  f_1 = np.append(f_1,f1)
  auc_sc = np.append(auc_sc,auc_score)
  acc = np.append(acc,accuracy)

  n = n+1

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 42, 300)           2228100   
_________________________________________________________________
gru_3 (GRU)                  (None, 64)                70080     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 2,298,245
Trainable params: 70,145
Non-trainable params: 2,228,100
_________________________________________________________________
None
Train on 4083 samples, validate on 1021 samples
Epoch 1/25
 - 7s - loss: 0.3112 - acc: 0.8771 - val_loss: 0.1863 - val_acc: 0.9216
Epoch 2/25
 - 5s - loss: 0.1852 - acc: 0.9292 - val_loss: 0.1344 - val_acc: 0.9471
Epoch 3/25
 - 5s - loss: 0.1521 - acc: 0.9439 - val_loss: 0.1273 - val_acc: 0.9500
Epoch 4/25
 - 5s - loss: 0.1310 - acc: 0.9515 - val_loss: 0.1179 - 

In [0]:
metrics = pd.DataFrame({
  'model_run': model_run,  
  'name' : name,
  'tn' : tn,
  'fp' : fp,
  'fn' : fn,
  'tp' : tp,
  'p' : p,
  'r' : r,
  'f_1' : f_1,
  'auc_sc' : auc_sc,
  'acc' : acc  
})

In [28]:
metrics

Unnamed: 0,model_run,name,tn,fp,fn,tp,p,r,f_1,auc_sc,acc
0,0.0,rnn_glove,831.0,22.0,11.0,157.0,0.877095,0.934524,0.904899,0.954366,0.967679
1,1.0,rnn_glove,834.0,19.0,16.0,152.0,0.888889,0.904762,0.896755,0.941244,0.96572
2,2.0,rnn_glove,836.0,17.0,25.0,143.0,0.89375,0.85119,0.871951,0.91563,0.958864
3,3.0,rnn_glove,840.0,13.0,28.0,140.0,0.915033,0.833333,0.872274,0.909047,0.959843
4,4.0,rnn_glove,843.0,10.0,18.0,150.0,0.9375,0.892857,0.914634,0.940567,0.972576
5,5.0,rnn_glove,840.0,13.0,23.0,145.0,0.917722,0.863095,0.889571,0.923927,0.96474
6,6.0,rnn_glove,835.0,18.0,18.0,150.0,0.892857,0.892857,0.892857,0.935878,0.96474
7,7.0,rnn_glove,842.0,11.0,20.0,148.0,0.930818,0.880952,0.905199,0.934028,0.969638
8,8.0,rnn_glove,836.0,17.0,16.0,152.0,0.899408,0.904762,0.902077,0.942416,0.967679
9,9.0,rnn_glove,834.0,19.0,19.0,149.0,0.886905,0.886905,0.886905,0.932315,0.962782


## Authenticate location for saving files

In [29]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location
import tensorflow as tf

FILE_OUTPUT_DIR = 'glove_rnn_assess_files'#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'dissertation_bucket' #@param {type:"string"}

if USE_BUCKET:
  FILE_OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, FILE_OUTPUT_DIR)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(FILE_OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(FILE_OUTPUT_DIR)
print('***** File output directory: {} *****'.format(FILE_OUTPUT_DIR))

***** File output directory: gs://dissertation_bucket/glove_rnn_assess_files *****


## Save metrics and predictions per tweet

In [30]:
!pip install gcsfs #google cloud storage
metrics.to_csv('gs://dissertation_bucket/glove_rnn_assess_files/metrics.csv')

Collecting gcsfs
[?25l  Downloading https://files.pythonhosted.org/packages/ab/92/0297f2813cb240c52e90f8587420149970565800e019e1b08ef5ad28b6d9/gcsfs-0.3.1.tar.gz (43kB)
[K     |███████▋                        | 10kB 14.8MB/s eta 0:00:01[K     |███████████████▏                | 20kB 2.4MB/s eta 0:00:01[K     |██████████████████████▊         | 30kB 3.5MB/s eta 0:00:01[K     |██████████████████████████████▎ | 40kB 2.2MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.1MB/s 
Building wheels for collected packages: gcsfs
  Building wheel for gcsfs (setup.py) ... [?25l[?25hdone
  Created wheel for gcsfs: filename=gcsfs-0.3.1-py2.py3-none-any.whl size=17936 sha256=373d34d42d22d4097e10cf97f4dccebee08b0be03e0238ef7b6f02c619bdcd10
  Stored in directory: /root/.cache/pip/wheels/9d/2b/6f/86954f0d8caa1173841e62bb780dc0f8693bd268e04a267682
Successfully built gcsfs
Installing collected packages: gcsfs
Successfully installed gcsfs-0.3.1
