## Common Imports

In [0]:
import tensorflow as tf
import pandas as pd

## Authorise access to Google Cloud Storage

In [8]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location

OUTPUT_DIR = ''#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = False #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'dissertation_bucket' #@param {type:"string"}

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

***** Model output directory: gs://dissertation_bucket/ *****


## Collate the metrics from the embeddings, CNN, and BERT experiments

In [9]:
#******fasttext 300 CNN******
!gsutil cp gs://dissertation_bucket/fasttext300_cnn_files/prediction_summary.pickle /tmp/
preds_per_tweet = pd.read_pickle('/tmp/prediction_summary.pickle')
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/fasttext300_cnn_files/metrics.csv /tmp/
metrics = pd.read_csv('/tmp/metrics.csv')


#******glove 300 CNN******
!gsutil cp gs://dissertation_bucket/glove300_840_cnn_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/glove300_840_cnn_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******w2v CNN******
!gsutil cp gs://dissertation_bucket/w2v_cnn_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/w2v_cnn_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******w2v RNN******
!gsutil cp gs://dissertation_bucket/w2v_rnn_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/w2v_rnn_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******glove RNN******
!gsutil cp gs://dissertation_bucket/glove_rnn_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/glove_rnn_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******fasttext RNN******
!gsutil cp gs://dissertation_bucket/fasttext_rnn_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/fasttext_rnn_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******BERT******
!gsutil cp gs://dissertation_bucket/bert_files/prediction_summary.pickle /tmp/
df = pd.read_pickle('/tmp/prediction_summary.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/bert_files/metrics.csv /tmp/
df = pd.read_csv('/tmp/metrics.csv')
metrics = metrics.append(df)


#******classify_embeddings_fasttext******
!gsutil cp gs://dissertation_bucket/classify_embeddings_fasttext/df.pickle /tmp/
df = pd.read_pickle('/tmp/df.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/classify_embeddings_fasttext/classifications.pickle /tmp/
df = pd.read_pickle('/tmp/classifications.pickle')
metrics = metrics.append(df)


#******classify_embeddings_glove******
!gsutil cp gs://dissertation_bucket/classify_embeddings_glove/df.pickle /tmp/
df = pd.read_pickle('/tmp/df.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/classify_embeddings_glove/classifications.pickle /tmp/
df = pd.read_pickle('/tmp/classifications.pickle')
metrics = metrics.append(df)


#******classify_embeddings_glove_twitter200******
!gsutil cp gs://dissertation_bucket/classify_embeddings_glove_twitter200/df.pickle /tmp/
df = pd.read_pickle('/tmp/df.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/classify_embeddings_glove_twitter200/classifications.pickle /tmp/
df = pd.read_pickle('/tmp/classifications.pickle')
metrics = metrics.append(df)


#******classify_embeddings_w2v******
!gsutil cp gs://dissertation_bucket/classify_embeddings_w2v/df.pickle /tmp/
df = pd.read_pickle('/tmp/df.pickle')
preds_per_tweet = preds_per_tweet.append(df)
print(df.shape)
print(preds_per_tweet.shape)

!gsutil cp gs://dissertation_bucket/classify_embeddings_w2v/classifications.pickle /tmp/
df = pd.read_pickle('/tmp/classifications.pickle')
metrics = metrics.append(df)




Copying gs://dissertation_bucket/fasttext300_cnn_files/prediction_summary.pickle...
/ [0 files][    0.0 B/ 74.5 KiB]                                                / [1 files][ 74.5 KiB/ 74.5 KiB]                                                
Operation completed over 1 objects/74.5 KiB.                                     
(1021, 4)
Copying gs://dissertation_bucket/fasttext300_cnn_files/metrics.csv...
/ [1 files][  210.0 B/  210.0 B]                                                
Operation completed over 1 objects/210.0 B.                                      
Copying gs://dissertation_bucket/glove300_840_cnn_files/prediction_summary.pickle...
/ [1 files][ 74.5 KiB/ 74.5 KiB]                                                
Operation completed over 1 objects/74.5 KiB.                                     
(1021, 4)
(2042, 4)
Copying gs://dissertation_bucket/glove300_840_cnn_files/metrics.csv...
/ [1 files][  211.0 B/  211.0 B]                                                
Operatio

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Copying gs://dissertation_bucket/glove_rnn_files/prediction_summary.pickle...
/ [0 files][    0.0 B/ 42.3 KiB]                                                / [1 files][ 42.3 KiB/ 42.3 KiB]                                                
Operation completed over 1 objects/42.3 KiB.                                     
(1021, 4)
(5105, 4)
Copying gs://dissertation_bucket/glove_rnn_files/metrics.csv...
/ [1 files][  172.0 B/  172.0 B]                                                
Operation completed over 1 objects/172.0 B.                                      
Copying gs://dissertation_bucket/fasttext_rnn_files/prediction_summary.pickle...
/ [1 files][ 42.3 KiB/ 42.3 KiB]                                                
Operation completed over 1 objects/42.3 KiB.                                     
(1021, 4)
(6126, 4)
Copying gs://dissertation_bucket/fasttext_rnn_files/metrics.csv...
/ [1 files][  164.0 B/  164.0 B]                                                
Operation complete

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


(1021, 4)
(7147, 4)
Copying gs://dissertation_bucket/bert_files/metrics.csv...
/ [1 files][  192.0 B/  192.0 B]                                                
Operation completed over 1 objects/192.0 B.                                      
Copying gs://dissertation_bucket/classify_embeddings_fasttext/df.pickle...
/ [1 files][478.0 KiB/478.0 KiB]                                                
Operation completed over 1 objects/478.0 KiB.                                    


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


(10210, 4)
(17357, 4)
Copying gs://dissertation_bucket/classify_embeddings_fasttext/classifications.pickle...
/ [1 files][  2.2 KiB/  2.2 KiB]                                                
Operation completed over 1 objects/2.2 KiB.                                      


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Copying gs://dissertation_bucket/classify_embeddings_glove/df.pickle...
/ [0 files][    0.0 B/478.0 KiB]                                                / [1 files][478.0 KiB/478.0 KiB]                                                
Operation completed over 1 objects/478.0 KiB.                                    


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


(10210, 4)
(27567, 4)
Copying gs://dissertation_bucket/classify_embeddings_glove/classifications.pickle...
/ [1 files][  2.1 KiB/  2.1 KiB]                                                
Operation completed over 1 objects/2.1 KiB.                                      


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Copying gs://dissertation_bucket/classify_embeddings_glove_twitter200/df.pickle...
/ [0 files][    0.0 B/478.1 KiB]                                                / [1 files][478.1 KiB/478.1 KiB]                                                
Operation completed over 1 objects/478.1 KiB.                                    


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


(10210, 4)
(37777, 4)
Copying gs://dissertation_bucket/classify_embeddings_glove_twitter200/classifications.pickle...
/ [1 files][  2.2 KiB/  2.2 KiB]                                                
Operation completed over 1 objects/2.2 KiB.                                      


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Copying gs://dissertation_bucket/classify_embeddings_w2v/df.pickle...
/ [0 files][    0.0 B/478.0 KiB]                                                / [1 files][478.0 KiB/478.0 KiB]                                                
Operation completed over 1 objects/478.0 KiB.                                    


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


(10210, 4)
(47987, 4)
Copying gs://dissertation_bucket/classify_embeddings_w2v/classifications.pickle...
/ [1 files][  2.1 KiB/  2.1 KiB]                                                
Operation completed over 1 objects/2.1 KiB.                                      


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


## Pickle the predictions per tweet and the metrics for each experiment

In [0]:
#pickle
metrics.to_pickle('/tmp/metrics_colab.pickle')
preds_per_tweet.to_pickle('/tmp/preds_per_tweet_colab.pickle')

## Clone the github repository

In [11]:
!git clone https://github.com/ScottJK-20190706/tweet_classifier


Cloning into 'tweet_classifier'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects:   2% (1/39)[Kremote: Counting objects:   5% (2/39)[Kremote: Counting objects:   7% (3/39)[Kremote: Counting objects:  10% (4/39)[Kremote: Counting objects:  12% (5/39)[Kremote: Counting objects:  15% (6/39)[Kremote: Counting objects:  17% (7/39)[Kremote: Counting objects:  20% (8/39)[Kremote: Counting objects:  23% (9/39)[Kremote: Counting objects:  25% (10/39)[Kremote: Counting objects:  28% (11/39)[Kremote: Counting objects:  30% (12/39)[Kremote: Counting objects:  33% (13/39)[Kremote: Counting objects:  35% (14/39)[Kremote: Counting objects:  38% (15/39)[Kremote: Counting objects:  41% (16/39)[Kremote: Counting objects:  43% (17/39)[Kremote: Counting objects:  46% (18/39)[Kremote: Counting objects:  48% (19/39)[Kremote: Counting objects:  51% (20/39)[Kremote: Counting objects:  53% (21/39)[Kremote: Counting objects:  56% (22/39)[Kremote: 

## Push the files to github

In [12]:
#copy files to the repo
!cp /tmp/preds_per_tweet_colab.pickle tweet_classifier/results
!cp /tmp/metrics_colab.pickle tweet_classifier/results

#change directory to the repo
import os
os.chdir('tweet_classifier/results')

#set username and password
!git config --global user.email "scott.kilgariff@gmail.com" #
!git config --global user.name "ScottJK-20190706"

#add
!git add preds_per_tweet_colab.pickle
!git add metrics_colab.pickle

#commit
!git commit --message="Add predictions per tweet from colab"
!git commit --message="Add metrics from colab"

#push
!git remote set-url origin https://ScottJK-20190706:K1l64r1ff@github.com/ScottJK-20190706/tweet_classifier.git
!git push origin

[master d50eee8] Add predictions per tweet from colab
 2 files changed, 0 insertions(+), 0 deletions(-)
On branch master
Your branch is ahead of 'origin/master' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
Counting objects: 5, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 48.69 KiB | 1.62 MiB/s, done.
Total 5 (delta 3), reused 0 (delta 0)
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/ScottJK-20190706/tweet_classifier.git
   8191704..d50eee8  master -> master
