In [0]:
%tensorflow_version 1.x
import tensorflow as tf
# Ask TensorFlow for the name of the GPU device and stop the notebook early
# unless it is exactly the first GPU.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [0]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

--2020-04-14 16:44:16--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 216.58.193.208, 2607:f8b0:4007:80c::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|216.58.193.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip.1’


2020-04-14 16:44:17 (329 MB/s) - ‘uncased_L-12_H-768_A-12.zip.1’ saved [407727028/407727028]

Archive:  uncased_L-12_H-768_A-12.zip
replace uncased_L-12_H-768_A-12/bert_model.ckpt.meta? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/vocab.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/bert_model.ckpt.index? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/bert_config.jso

In [0]:
import os
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the header-less, latin-encoded CSV and discard the columns at
# positions 1-4, which leaves exactly the label and text columns.
df = pd.read_csv(
    'data/dataset.csv', sep=',', header=None, encoding='latin'
).drop(columns=[1, 2, 3, 4])

# Give the two remaining columns readable names.
df.columns = ['label', 'text']

# Binarise the label: 0 stays 0, every other value becomes 1.
df['label'] = df['label'].ne(0).astype('int64')

# Patterns used by the preprocessing helpers below. A hashtag / mention only
# counts when the '#' / '@' is at the start of the text or follows whitespace,
# so e-mail-like tokens such as "a@b.com" are left alone.
hashtags = re.compile(r"^#\S+|\s#\S+")
mentions = re.compile(r"^@\S+|\s@\S+")
urls = re.compile(r"https?://\S+")


def process_text(text):
    """Normalise one text string.

    Every hashtag is replaced by the token ``hashtag`` and every @-mention by
    the token ``entity``; the result is stripped of surrounding whitespace and
    lower-cased.

    Args:
        text: the raw text (a ``str``).

    Returns:
        The normalised, lower-case string.
    """
    text = hashtags.sub(' hashtag', text)
    text = mentions.sub(' entity', text)
    return text.strip().lower()


def match_expr(pattern, string):
    """Return True when ``pattern`` matches anywhere in ``string``, else False.

    Args:
        pattern: a compiled regular expression.
        string: the text to search.
    """
    return pattern.search(string) is not None


def get_data_wo_urls(dataset):
    """Return the rows of ``dataset`` whose ``text`` contains no URL.

    A URL is anything matched by the module-level ``urls`` pattern
    (lower-case ``http://`` or ``https://`` followed by non-whitespace).

    Args:
        dataset: a DataFrame with a ``text`` column of strings.

    Returns:
        A DataFrame with the same columns and the original index labels of the
        rows that were kept.
    """
    # Cast to bool so the mask stays a boolean indexer even for an empty frame;
    # indexing with an empty list would select zero *columns* instead.
    has_url = dataset.text.apply(lambda value: match_expr(urls, value)).astype(bool)
    return dataset[~has_url]

# Normalise every text (hashtag / mention placeholders, strip, lower-case).
df.text = df.text.apply(process_text)

TRAIN_SIZE = 0.75
VAL_SIZE = 0.05
dataset_count = len(df)

# Two-stage split: first carve off the test share (whatever is left after
# train + val), then split the remainder into train and validation so that the
# overall proportions are TRAIN_SIZE / VAL_SIZE / rest.
df_train_val, df_test = train_test_split(df, test_size=1-TRAIN_SIZE-VAL_SIZE, random_state=42)
df_train, df_val = train_test_split(df_train_val, test_size=VAL_SIZE / (VAL_SIZE + TRAIN_SIZE), random_state=42)

# Note: the TRAIN size printed here is taken before rows containing URLs are
# removed from the training split below.
print("TRAIN size:", len(df_train))
print("VAL size:", len(df_val))
print("TEST size:", len(df_test))

# Only the training split is filtered; validation and test keep URL rows.
df_train = get_data_wo_urls(df_train)

# Make sure the output directory exists so the writes below do not fail on a
# fresh environment.
os.makedirs('dataset', exist_ok=True)

# Shuffle the training rows with a fixed seed so that train.tsv is identical
# across re-runs, matching the seeded splits above. Files are written as
# tab-separated values without index or header row.
df_train.sample(frac=1.0, random_state=42).reset_index(drop=True).to_csv('dataset/train.tsv', sep='\t', index=False, header=False)
df_val.to_csv('dataset/dev.tsv', sep='\t', index=False, header=False)
df_test.to_csv('dataset/test.tsv', sep='\t', index=False, header=False)


TRAIN size: 1200000
VAL size: 80000
TEST size: 320000


In [0]:
# Install the BertLibrary package that provides BertFTModel (imported in the next cell).
# NOTE(review): the version is not pinned, so a re-run may install a different release.
!pip install BertLibrary



In [0]:
from BertLibrary import BertFTModel
import numpy as np

# Configure a fine-tuning model on top of the checkpoint unpacked earlier
# (directory 'uncased_L-12_H-768_A-12', checkpoint 'bert_model.ckpt') with two
# string labels '0' and '1'. Checkpoints are written under 'output'.
# NOTE(review): do_lower_case=False is used with an "uncased" checkpoint; the
# input text was already lower-cased by the preprocessing cell — confirm that
# this setting is intended.
# NOTE(review): num_train_steps=30000 here, but train_from_file below is called
# with 35000 and the evaluated checkpoint is model.ckpt-35000 — confirm which
# value BertLibrary actually uses for the training schedule.
ft_model = BertFTModel( model_dir='uncased_L-12_H-768_A-12',
                        ckpt_name="bert_model.ckpt",
                        labels=['0','1'],
                        lr=1e-05,
                        num_train_steps=30000,
                        num_warmup_steps=1000,
                        ckpt_output_dir='output',
                        save_check_steps=1000,
                        do_lower_case=False,
                        max_seq_len=50,
                        batch_size=32,
                        )

# Obtain the trainer and evaluator objects from the configured model.
ft_trainer =  ft_model.get_trainer()
ft_evaluator = ft_model.get_evaluator()
# Train from the files in the 'dataset' directory (the preprocessing cell writes
# train.tsv / dev.tsv / test.tsv there); the second argument is presumably the
# number of training steps — TODO confirm against the BertLibrary API.
ft_trainer.train_from_file('dataset', 35000)
# Evaluate the checkpoint saved at step 35000 on the data in 'dataset'.
ft_evaluator.evaluate_from_file('dataset', checkpoint="output/model.ckpt-35000")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/l

In [0]:
from BertLibrary import BertFTModel
import numpy as np

# Re-create the model from the fine-tuned checkpoint 'model.ckpt-35000' stored
# under '/content/drive/My Drive/bert'; all other settings repeat the training
# configuration.
# NOTE(review): this path looks like a mounted Google Drive folder, but no
# mount step is visible in this notebook — confirm the drive is mounted and the
# checkpoint was copied there before running this cell.
# NOTE(review): do_lower_case=False is kept identical to the training cell even
# though the base checkpoint is "uncased" — confirm this is intended.
ft_model = BertFTModel( model_dir='/content/drive/My Drive/bert',
                        ckpt_name="model.ckpt-35000",
                        labels=['0','1'],
                        lr=1e-05,
                        num_train_steps=30000,
                        num_warmup_steps=1000,
                        ckpt_output_dir='output',
                        save_check_steps=1000,
                        do_lower_case=False,
                        max_seq_len=50,
                        batch_size=32,
                        )


INFO:tensorflow:Using config: {'_model_dir': 'output', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': device_count {
  key: "GPU"
  value: 1
}
gpu_options {
  per_process_gpu_memory_fraction: 0.5
  allow_growth: true
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f4b50cf0c88>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [0]:
# Build the prediction callable from the model configured in the previous cell.
predictor = ft_model.get_predictor()

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (?, 50)
INFO:tensorflow:  name = input_mask, shape = (?, 50)
INFO:tensorflow:  name = label_ids, shape = (?, 1)
INFO:tensorflow:  name = segment_ids, shape = (?, 50)
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/bias:0, sh

In [0]:
import time

# Negative examples, already in preprocessed form (lower-cased, mentions
# replaced by "entity").
sentences = ["school email won't open  and i have geography stuff on there to revise! *stupid school* :'(",
             "entity apparently you dont have time for ur fans!!!!!!!!!!",
             "i don't feel good",
             "flatmates are still in the bathroom! arggh!!",
             "time to sleep... long day again tomorrow"]

# Time a single batched prediction. perf_counter() is a monotonic,
# high-resolution clock meant for measuring short intervals, unlike time.time(),
# which follows the adjustable wall clock.
start = time.perf_counter()
# One row is printed per sentence; the two columns presumably follow the
# labels=['0','1'] order — TODO confirm.
print(predictor(sentences))
end = time.perf_counter()

print("Time: ", end - start)

[[0.9863046  0.01369547]
 [0.83784825 0.16215174]
 [0.99776137 0.00223866]
 [0.9607379  0.03926216]
 [0.9567858  0.04321426]]
Time:  0.020629405975341797


In [0]:
# Positive examples, already in preprocessed form (lower-cased, mentions
# replaced by "entity").
sentences = ["i'm feeling quite sleepy today, wish i could stay in bed today...but ok! is my last year, so let's go to school",
             "if anyone wanted to attend tedmed but can't make the date, i'll happily take your place, and will live-blog it for you too",
             "first night in myers. just not the same w/out lydia!  but i'm actually excited about this summer!",
             "listening to the nutcracker.  who says it's only a christmas thing?!",
             "can't stop listening to the clips of entity new album lines, vines and trying times its so amazing can't wait for the whole thing"]

# Time a single batched prediction. perf_counter() is a monotonic,
# high-resolution clock meant for measuring short intervals, unlike time.time(),
# which follows the adjustable wall clock.
start = time.perf_counter()
# One row is printed per sentence; the two columns presumably follow the
# labels=['0','1'] order — TODO confirm.
print(predictor(sentences))
end = time.perf_counter()

print("Time: ", end - start)

[[0.3869416  0.6130584 ]
 [0.02177608 0.978224  ]
 [0.53217596 0.467824  ]
 [0.08622759 0.9137724 ]
 [0.00701665 0.99298334]]
Time:  0.019965171813964844
