## Common Imports

In [0]:
import tensorflow as tf
import pandas as pd

## Authorise access to Google Cloud Storage

In [9]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location

OUTPUT_DIR = ''#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = False #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'dissertation_bucket' #@param {type:"string"}

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

***** Model output directory: gs://dissertation_bucket/ *****


## Collate the metrics from the embeddings, CNN, and BERT experiments

In [10]:
#******fasttext 300 CNN******
!gsutil cp gs://dissertation_bucket/fasttext300_cnn_files/prediction_summary.pickle /tmp/
preds_per_tweet = pd.read_pickle('/tmp/prediction_summary.pickle')
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/fasttext300_cnn_files/metrics.csv /tmp/
metrics = pd.read_csv('/tmp/metrics.csv')


#******glove 300 CNN******
!gsutil cp gs://dissertation_bucket/glove300_840_cnn_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/glove300_840_cnn_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******w2v CNN******
!gsutil cp gs://dissertation_bucket/w2v_cnn_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/w2v_cnn_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******w2v RNN******
!gsutil cp gs://dissertation_bucket/w2v_rnn_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/w2v_rnn_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******glove RNN******
!gsutil cp gs://dissertation_bucket/glove_rnn_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/glove_rnn_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******fasttext RNN******
!gsutil cp gs://dissertation_bucket/fasttext_rnn_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/fasttext_rnn_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******BERT LARGE******
!gsutil cp gs://dissertation_bucket/bert_large_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/bert_large_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)

#******BERT BASE******
!gsutil cp gs://dissertation_bucket/bert_base_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/bert_base_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******classify_embeddings_fasttext******
!gsutil cp gs://dissertation_bucket/classify_embeddings_fasttext/df.pickle /tmp/
df = pd.read_pickle('/tmp/df.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/classify_embeddings_fasttext/classifications.pickle /tmp/
df = pd.read_pickle('/tmp/classifications.pickle')
metrics = metrics.append(df)


#******classify_embeddings_glove******
!gsutil cp gs://dissertation_bucket/classify_embeddings_glove/df.pickle /tmp/
df = pd.read_pickle('/tmp/df.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/classify_embeddings_glove/classifications.pickle /tmp/
df = pd.read_pickle('/tmp/classifications.pickle')
metrics = metrics.append(df)


#******classify_embeddings_glove_twitter200******
!gsutil cp gs://dissertation_bucket/classify_embeddings_glove_twitter200/df.pickle /tmp/
df = pd.read_pickle('/tmp/df.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/classify_embeddings_glove_twitter200/classifications.pickle /tmp/
df = pd.read_pickle('/tmp/classifications.pickle')
metrics = metrics.append(df)


#******classify_embeddings_w2v******
!gsutil cp gs://dissertation_bucket/classify_embeddings_w2v/df.pickle /tmp/
df = pd.read_pickle('/tmp/df.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/classify_embeddings_w2v/classifications.pickle /tmp/
df = pd.read_pickle('/tmp/classifications.pickle')
metrics = metrics.append(df)

#******model comparisons******
!gsutil cp gs://dissertation_bucket/bert_large_assess_files/metrics.csv /tmp/
compare_bert = pd.read_csv('/tmp/metrics.csv')

!gsutil cp gs://dissertation_bucket/glove_rnn_assess_files/metrics.csv /tmp/
compare_rnn = pd.read_csv('/tmp/metrics.csv')
compare = compare_bert.append(compare_rnn)




Copying gs://dissertation_bucket/fasttext300_cnn_files/prediction_summary.pickle...
/ [0 files][    0.0 B/ 74.5 KiB]                                                / [1 files][ 74.5 KiB/ 74.5 KiB]                                                
Operation completed over 1 objects/74.5 KiB.                                     
(1021, 4)
Copying gs://dissertation_bucket/fasttext300_cnn_files/metrics.csv...
/ [1 files][  210.0 B/  210.0 B]                                                
Operation completed over 1 objects/210.0 B.                                      
Copying gs://dissertation_bucket/glove300_840_cnn_files/prediction_summary.pickle...
/ [1 files][ 74.5 KiB/ 74.5 KiB]                                                
Operation completed over 1 objects/74.5 KiB.                                     
(1021, 4)
(2042, 4)
Copying gs://dissertation_bucket/glove300_840_cnn_files/metrics.csv...
/ [1 files][  211.0 B/  211.0 B]                                                
Operatio

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Copying gs://dissertation_bucket/glove_rnn_files/prediction_summary.pickle...
/ [0 files][    0.0 B/ 42.3 KiB]                                                / [1 files][ 42.3 KiB/ 42.3 KiB]                                                
Operation completed over 1 objects/42.3 KiB.                                     
(1021, 4)
(5105, 4)
Copying gs://dissertation_bucket/glove_rnn_files/metrics.csv...
/ [1 files][  172.0 B/  172.0 B]                                                
Operation completed over 1 objects/172.0 B.                                      
Copying gs://dissertation_bucket/fasttext_rnn_files/prediction_summary.pickle...
/ [1 files][ 42.3 KiB/ 42.3 KiB]                                                
Operation completed over 1 objects/42.3 KiB.                                     
(1021, 4)
(6126, 4)
Copying gs://dissertation_bucket/fasttext_rnn_files/metrics.csv...
/ [1 files][  164.0 B/  164.0 B]                                                
Operation complete

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Copying gs://dissertation_bucket/bert_base_files/prediction_summary.pickle...
/ [0 files][    0.0 B/ 38.6 KiB]                                                / [1 files][ 38.6 KiB/ 38.6 KiB]                                                
Operation completed over 1 objects/38.6 KiB.                                     
(1021, 4)
(8168, 4)
Copying gs://dissertation_bucket/bert_base_files/metrics.csv...
/ [1 files][  179.0 B/  179.0 B]                                                
Operation completed over 1 objects/179.0 B.                                      


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Copying gs://dissertation_bucket/classify_embeddings_fasttext/df.pickle...
/ [1 files][478.0 KiB/478.0 KiB]                                                
Operation completed over 1 objects/478.0 KiB.                                    
(10210, 4)
(18378, 4)
Copying gs://dissertation_bucket/classify_embeddings_fasttext/classifications.pickle...
/ [1 files][  2.2 KiB/  2.2 KiB]                                                
Operation completed over 1 objects/2.2 KiB.                                      


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Copying gs://dissertation_bucket/classify_embeddings_glove/df.pickle...
/ [0 files][    0.0 B/478.0 KiB]                                                / [1 files][478.0 KiB/478.0 KiB]                                                
Operation completed over 1 objects/478.0 KiB.                                    
(10210, 4)
(28588, 4)
Copying gs://dissertation_bucket/classify_embeddings_glove/classifications.pickle...
/ [1 files][  2.1 KiB/  2.1 KiB]                                                
Operation completed over 1 objects/2.1 KiB.                                      


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Copying gs://dissertation_bucket/classify_embeddings_glove_twitter200/df.pickle...
/ [1 files][478.1 KiB/478.1 KiB]                                                
Operation completed over 1 objects/478.1 KiB.                                    
(10210, 4)
(38798, 4)
Copying gs://dissertation_bucket/classify_embeddings_glove_twitter200/classifications.pickle...
/ [1 files][  2.2 KiB/  2.2 KiB]                                                
Operation completed over 1 objects/2.2 KiB.                                      


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Copying gs://dissertation_bucket/classify_embeddings_w2v/df.pickle...
/ [1 files][478.0 KiB/478.0 KiB]                                                
Operation completed over 1 objects/478.0 KiB.                                    
(10210, 4)
(49008, 4)
Copying gs://dissertation_bucket/classify_embeddings_w2v/classifications.pickle...
/ [1 files][  2.1 KiB/  2.1 KiB]                                                
Operation completed over 1 objects/2.1 KiB.                                      


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Copying gs://dissertation_bucket/bert_large_assess_files/metrics.csv...
/ [1 files][  1.3 KiB/  1.3 KiB]                                                
Operation completed over 1 objects/1.3 KiB.                                      
Copying gs://dissertation_bucket/glove_rnn_assess_files/metrics.csv...
/ [1 files][  1.3 KiB/  1.3 KiB]                                                
Operation completed over 1 objects/1.3 KiB.                                      


## Pickle the predictions per tweet and the metrics for each experiment

In [0]:
#pickle
metrics.to_pickle('/tmp/metrics_colab.pickle')
preds_per_tweet.to_pickle('/tmp/preds_per_tweet_colab.pickle')
compare.to_pickle('/tmp/compare.pickle')

## Clone the github repository

In [12]:
!git clone https://github.com/ScottJK-20190706/tweet_classifier


Cloning into 'tweet_classifier'...
remote: Enumerating objects: 87, done.[K
remote: Counting objects:   1% (1/87)[Kremote: Counting objects:   2% (2/87)[Kremote: Counting objects:   3% (3/87)[Kremote: Counting objects:   4% (4/87)[Kremote: Counting objects:   5% (5/87)[Kremote: Counting objects:   6% (6/87)[Kremote: Counting objects:   8% (7/87)[Kremote: Counting objects:   9% (8/87)[Kremote: Counting objects:  10% (9/87)[Kremote: Counting objects:  11% (10/87)[Kremote: Counting objects:  12% (11/87)[Kremote: Counting objects:  13% (12/87)[Kremote: Counting objects:  14% (13/87)[Kremote: Counting objects:  16% (14/87)[Kremote: Counting objects:  17% (15/87)[Kremote: Counting objects:  18% (16/87)[Kremote: Counting objects:  19% (17/87)[Kremote: Counting objects:  20% (18/87)[Kremote: Counting objects:  21% (19/87)[Kremote: Counting objects:  22% (20/87)[Kremote: Counting objects:  24% (21/87)[Kremote: Counting objects:  25% (22/87)[Kremote: 

## Push the files to github

In [13]:
#copy files to the repo
!cp /tmp/preds_per_tweet_colab.pickle tweet_classifier/results
!cp /tmp/metrics_colab.pickle tweet_classifier/results
!cp /tmp/compare.pickle tweet_classifier/results

#change directory to the repo
import os
os.chdir('tweet_classifier/results')

#set username and password
!git config --global user.email "scott.kilgariff@gmail.com" #
!git config --global user.name "ScottJK-20190706"

#add
!git add preds_per_tweet_colab.pickle
!git add metrics_colab.pickle
!git add compare.pickle

#commit
!git commit --message="Add predictions per tweet from colab"
!git commit --message="Add metrics from colab"
!git commit --message="Add model comaprison from colab"

#push
!git remote set-url origin https://ScottJK-20190706:K1l64r1ff@github.com/ScottJK-20190706/tweet_classifier.git
!git push origin

[master cb49198] Add predictions per tweet from colab
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 results/compare.pickle
On branch master
Your branch is ahead of 'origin/master' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
On branch master
Your branch is ahead of 'origin/master' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
Counting objects: 4, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 1.78 KiB | 1.78 MiB/s, done.
Total 4 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/ScottJK-20190706/tweet_classifier.git
   bc4e8a0..cb49198  master -> master
