In [2]:
# Authentication for loading data from Google Drive
# Import packages
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import drive
from os import path

In [3]:
# Authenticate User
# Run Colab's user authentication, then hand the application-default
# credentials to PyDrive so a GoogleDrive client can be created.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `auth_drive` is not referenced by any later cell shown in this
# notebook (files are read through the mounted drive instead) — confirm it is needed.
auth_drive = GoogleDrive(gauth)

In [4]:
# Directory under which Google Drive is mounted; all data paths are built from it.
DRIVE_PATH = '/content/drive'
drive.mount(DRIVE_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Folder holding the cleaned train/test CSVs; the submission file is also written here.
DATA_PATH = path.join(DRIVE_PATH, 'My Drive', 'LinkedIn_Articles', 'Datasets', 'Twitter_Real_or_Not')
# NOTE(review): OUTPUT_PATH is defined but not used by any cell shown in this notebook.
OUTPUT_PATH = path.join(DRIVE_PATH, 'My Drive', 'LinkedIn_Articles', 'NLP & EDA')

In [6]:
!pip install bert-for-tf2



In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import bert
from keras import Model
from keras import layers
from keras import Input
from keras.optimizers import Adam
from keras.losses import binary_crossentropy

In [8]:
# Load the cleaned train and test splits, using the `id` column as the index.
df_train, df_test = (
    pd.read_csv(path.join(DATA_PATH, csv_name), index_col='id')
    for csv_name in ('train_cleaned.csv', 'test_cleaned.csv')
)

In [9]:
# Fixed length of every encoded tweet (including the [CLS] and [SEP] markers);
# also used as the default sequence length of the model's input layers.
MAX_LEN = 256

In [10]:
# Tokenizer class shipped with the bert-for-tf2 package.
BertTokenizer = bert.bert_tokenization.FullTokenizer
# Pre-trained BERT encoder loaded from TF Hub; trainable=False keeps its weights frozen.
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/2', trainable=False)
# Read the vocabulary file and casing flag from the loaded module itself so the
# tokenizer is configured consistently with the encoder.
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [11]:
# Sanity check: show the word-piece tokens produced for the first training tweet.
tokenizer.tokenize(df_train['text_cleaned'].iloc[0])

['Our',
 'Dee',
 '##ds',
 'are',
 'the',
 'Reason',
 'of',
 'this',
 '#',
 'earthquake',
 'May',
 'AL',
 '##LA',
 '##H',
 'For',
 '##gi',
 '##ve',
 'us',
 'all']

In [12]:
# Sanity check: show the vocabulary ids of the first training tweet's tokens.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df_train['text_cleaned'].iloc[0]))

[3458,
 9115,
 3680,
 1132,
 1103,
 21642,
 1104,
 1142,
 108,
 8386,
 1318,
 18589,
 10783,
 3048,
 1370,
 5389,
 2707,
 1366,
 1155]

In [13]:
# def tokenize_tweets(tweet):
  # return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweet))

In [14]:
# train_ids = [tokenize_tweets(tweet) for tweet in df_train['text_cleaned']]

In [15]:
# train_ids[1]

[4089, 1783, 1485, 2001, 6413, 2176, 17784, 5276, 119, 1803]

In [16]:
def encode_tweets(tweets, max_len=MAX_LEN):
  """Convert tweet strings into fixed-length BERT input arrays.

  Each tweet is tokenized with the module-level `tokenizer`, truncated so it
  fits together with the `[CLS]` and `[SEP]` markers, converted to vocabulary
  ids and right-padded with 0 up to `max_len`.

  Args:
    tweets: Iterable of tweet strings (e.g. a pandas Series).
    max_len: Length of every encoded row, including `[CLS]` and `[SEP]`.
      Must be at least 2.

  Returns:
    Tuple `(tokens, masks, segments)` of int32 arrays, each of shape
    `(number_of_tweets, max_len)` (also for an empty input):
      tokens: vocabulary ids, 0 in the padded positions.
      masks: 1 for real tokens (markers included), 0 for padding.
      segments: all zeros.

  Raises:
    ValueError: If `max_len` is smaller than 2.
  """
  # With fewer than two slots the slice below would go negative and rows
  # would no longer have a consistent length.
  if max_len < 2:
    raise ValueError('max_len must be >= 2 to fit [CLS] and [SEP], got %d' % max_len)

  tweets = list(tweets)
  shape = (len(tweets), max_len)

  # Pre-allocating zero-filled arrays gives padding for free, keeps the 2-D
  # shape even when there are no tweets, and matches the int32 model inputs.
  tokens = np.zeros(shape, dtype=np.int32)
  masks = np.zeros(shape, dtype=np.int32)
  segments = np.zeros(shape, dtype=np.int32)

  for row, tweet in enumerate(tweets):
    # Reserve two positions for the [CLS] and [SEP] markers.
    pieces = tokenizer.tokenize(tweet)[:max_len - 2]
    input_seq = ['[CLS]'] + pieces + ['[SEP]']
    ids = tokenizer.convert_tokens_to_ids(input_seq)

    tokens[row, :len(ids)] = ids
    masks[row, :len(ids)] = 1

  return tokens, masks, segments

In [17]:
# Encode the training tweets into the (tokens, masks, segments) arrays.
train_encoded = encode_tweets(df_train['text_cleaned'])

In [18]:
# Encode the test tweets the same way as the training tweets.
test_encoded = encode_tweets(df_test['text_cleaned'])

In [27]:
def build_bert_model(max_seq_length=MAX_LEN):
  """Build and compile the binary classifier on top of the BERT layer.

  The model takes the three BERT inputs (word ids, attention mask, segment
  ids), runs them through the module-level `bert_layer`, keeps the vector at
  the first sequence position, and passes it through two small LSTM layers
  and a sigmoid output unit.

  Args:
    max_seq_length: Length of each of the three integer input sequences.

  Returns:
    A compiled Keras `Model` with inputs
    `[input_word_ids, input_mask, segment_ids]` and a single sigmoid output,
    using Adam (learning rate 0.001), binary cross-entropy loss and accuracy
    as the metric.
  """
  input_word_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
  input_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
  segment_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
  pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

  # Keep only the vector at the first sequence position.
  bert_output = sequence_output[:, 0, :]

  # Read the hidden size from the BERT output instead of hard-coding 1024, so
  # the head keeps working if `bert_layer` is swapped for another model size.
  hidden_size = bert_output.shape[-1]

  # NOTE(review): after this reshape the LSTM stack sees a sequence of length 1,
  # so it cannot model token order — consider feeding `sequence_output` to the
  # LSTM, or replacing the LSTMs with Dense layers.
  additional_layers = layers.Reshape((1, hidden_size))(bert_output)
  additional_layers = layers.LSTM(10, return_sequences=True)(additional_layers)
  additional_layers = layers.LSTM(10)(additional_layers)
  additional_layers = layers.Dense(1, activation='sigmoid')(additional_layers)

  model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=additional_layers)

  # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
  model.compile(optimizer=Adam(learning_rate=0.001), metrics=['accuracy'], loss=binary_crossentropy)

  return model

In [28]:
# Instantiate the compiled classifier with the default sequence length.
nn = build_bert_model()

In [29]:
# Print the layer-by-layer overview of the model.
nn.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 256)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 333579265   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [None]:
# Train for 5 epochs in batches of 32, holding out 20% of the training rows
# for validation.
nn.fit(
    x=train_encoded,
    y=df_train['target'],
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5


In [None]:
# `predict_classes` is only available on Sequential models, while `nn` is built
# with the functional `Model(inputs=..., outputs=...)` API. Threshold the
# sigmoid probabilities at 0.5 to obtain the 0/1 class labels instead.
test_target = (nn.predict(test_encoded) > 0.5).astype('int32')

In [None]:
# Display the predictions as a flat 1-D array.
test_target.flatten()

In [None]:
# Pair every test id with its predicted label and save the table as a CSV
# (without the DataFrame index) in the data folder.
submission = pd.DataFrame(dict(id=df_test.index, target=test_target.flatten()))
submission.to_csv(
    path.join(DATA_PATH, 'submission_bert.csv'),
    index=False,
)