In [1]:
!pip install bert
!pip install transformers
!pip install tf-models-official
# !pip install bert-for-tf2

Collecting bert
  Downloading bert-2.2.0.tar.gz (3.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting erlastic (from bert)
  Downloading erlastic-2.0.0.tar.gz (6.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bert, erlastic
  Building wheel for bert (setup.py) ... [?25l[?25hdone
  Created wheel for bert: filename=bert-2.2.0-py3-none-any.whl size=3745 sha256=4e34e34a8e457d3f7bc1e957cfc9e0ff6b395537c6d1979244102972ca7b707e
  Stored in directory: /root/.cache/pip/wheels/55/82/8d/a9bad0b8280eb858aa3dcb4e617ee5a1653fdeb239e1e8c3fe
  Building wheel for erlastic (setup.py) ... [?25l[?25hdone
  Created wheel for erlastic: filename=erlastic-2.0.0-py3-none-any.whl size=6780 sha256=c13121ca425772c743cfaaa3ab442db0a564bdc3e4f9b2a5c31da682d4398821
  Stored in directory: /root/.cache/pip/wheels/63/ea/24/ab8ff86604f1a87ca69a06af89bb7e080a5e064fbf5581423f
Successfully built bert erlastic
Installing collected packages: erlastic,

In [2]:
# !pip install --upgrade pip setuptools wheel
# !pip install bert-for-tf2==0.14.4

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import random
import pandas as pd
import numpy as np
# from keras import backend as K
from tensorflow.keras import backend as K
from sklearn.metrics import classification_report


import bert
# from bert import tokenization
from transformers import BertTokenizer

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
def recall_m(y_true, y_pred):
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(tf.squeeze(y_pred, axis=-1), tf.float32)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(tf.squeeze(y_pred, axis=-1), tf.float32)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

def dataset_embedding(dataset_path, tokenizer, batch_size=32):
    dataset = pd.read_csv(dataset_path)[["tweet", "sarcastic"]]
    dataset = dataset[dataset['tweet'].notna()]

    tokenized_tweets = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweet)) for tweet in dataset['tweet']]

    tweets_with_len = [[tweet, dataset['sarcastic'].iloc[i], len(tweet)] for i, tweet in enumerate(tokenized_tweets)]
    random.Random(42).shuffle(tweets_with_len)

    tweets_with_len.sort(key=lambda x: x[2])
    sorted_tweets_labels = [(tweet_lab[0], tweet_lab[1]) for tweet_lab in tweets_with_len] # remove tweet len
    processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_tweets_labels, output_types=(tf.int32, tf.int32))

    return processed_dataset.padded_batch(batch_size, padded_shapes=((None, ), ()))

# def dataset_embedding(dataset_path, tokenizer, batch_size=32):
#     dataset = pd.read_csv(dataset_path)[["tweet", "sarcastic"]]
#     dataset = dataset[dataset['tweet'].notna()]
#     tokenized_tweets = [
#         tokenizer.convert_tokens_to_ids(
#             tokenizer.tokenize(tweet)
#         )
#         for tweet in dataset['tweet']
#     ]

#     tweets_with_len = [
#         [tweet, dataset['sarcastic'].iloc[i], len(tweet)]
#         for i, tweet in enumerate(tokenized_tweets)
#     ]
#     random.Random(42).shuffle(tweets_with_len)
#     tweets_with_len.sort(key=lambda x: x[2])

#     sorted_tweets_labels = [
#         (tweet_lab[0], tweet_lab[1]) for tweet_lab in tweets_with_len
#     ]

#     processed_dataset = tf.data.Dataset.from_generator(
#         lambda: sorted_tweets_labels,
#         output_types=(tf.int32, tf.int32)
#     )
#     return processed_dataset.padded_batch(batch_size, padded_shapes=((None, ), ()))


# def prepare_datasets(train_path, test_path):
#     BertTokenizer = bert.bert_tokenization.FullTokenizer
#     bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=False)
#     vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
#     to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
#     tokenizer = BertTokenizer(vocabulary_file, to_lower_case)


#     dataset_train = dataset_embedding(train_path, tokenizer)
#     dataset_test = dataset_embedding(test_path, tokenizer)

#     return dataset_train, dataset_test, tokenizer

def prepare_datasets(train_path, test_path):
    bert_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
        trainable=True
    )
    vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

    vocabulary_file_str = vocabulary_file.decode("utf-8")
    do_lower_case_bool = bool(to_lower_case)

    tokenizer = BertTokenizer(
        vocab_file=vocabulary_file_str,
        do_lower_case=do_lower_case_bool
    )

    dataset_train = dataset_embedding(train_path, tokenizer)
    dataset_test = dataset_embedding(test_path, tokenizer)

    return dataset_train, dataset_test, tokenizer



In [8]:
train_path = '/content/drive/My Drive/Colab Notebooks/6812_gwp/data/Train_Dataset.csv'
test_path = '/content/drive/My Drive/Colab Notebooks/6812_gwp/data/Test_Dataset.csv'

train_data, test_data, tokenizer = prepare_datasets(train_path, test_path)

lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(tokenizer.vocab), 128),
    tf.keras.layers.LSTM(64, return_sequences=True, dropout=0.3),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(64, activation='relu')),
    tf.keras.layers.LSTM(64, return_sequences=True, dropout=0.3),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(64, activation='relu')),
    tf.keras.layers.LSTM(64, dropout=0.3),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_m])
lstm.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy', f1_m]
)
print(lstm.summary())

lstm.fit(train_data, epochs=10, validation_data=test_data, class_weight={1:4, 0:1})

loss_test, acc_test, f1_test = lstm.evaluate(test_data)
print("Loss:", loss_test, "Accuracy:", acc_test, "F1:", f1_test)


None
Epoch 1/10
    217/Unknown [1m18s[0m 48ms/step - accuracy: 0.3284 - f1_m: 0.3630 - loss: 1.1908



[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 55ms/step - accuracy: 0.3282 - f1_m: 0.3631 - loss: 1.1908 - val_accuracy: 0.1429 - val_f1_m: 0.2337 - val_loss: 0.7453
Epoch 2/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - accuracy: 0.2526 - f1_m: 0.3893 - loss: 1.1703 - val_accuracy: 0.5679 - val_f1_m: 0.2124 - val_loss: 0.7194
Epoch 3/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 62ms/step - accuracy: 0.5659 - f1_m: 0.4882 - loss: 1.0425 - val_accuracy: 0.4521 - val_f1_m: 0.2359 - val_loss: 1.0282
Epoch 4/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - accuracy: 0.6506 - f1_m: 0.5801 - loss: 0.8836 - val_accuracy: 0.6086 - val_f1_m: 0.2199 - val_loss: 0.9474
Epoch 5/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 48ms/step - accuracy: 0.8042 - f1_m: 0.6672 - loss: 0.7537 - v

In [9]:
all_labels = []
all_preds = []

for x_batch, y_batch in test_data:
    preds = lstm.predict(x_batch)
    preds = (preds > 0.5).astype(int).flatten()
    all_preds.extend(preds)
    all_labels.extend(y_batch.numpy())

print(classification_report(all_labels, all_preds))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/s

In [11]:
Blstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(tokenizer.vocab), 128),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(128, activation='relu')),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(128, activation='relu')),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

Blstm.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy', f1_m]
)
print(Blstm.summary())

Blstm.fit(train_data, epochs=10, validation_data=test_data, class_weight={1:4, 0:1})

loss_test, acc_test, f1_test = Blstm.evaluate(test_data)
print("Loss:", loss_test, "Accuracy:", acc_test, "F1:", f1_test)


None
Epoch 1/10
    217/Unknown [1m150s[0m 447ms/step - accuracy: 0.3482 - f1_m: 0.3540 - loss: 1.1896



[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 477ms/step - accuracy: 0.3479 - f1_m: 0.3541 - loss: 1.1896 - val_accuracy: 0.1429 - val_f1_m: 0.2337 - val_loss: 0.7470
Epoch 2/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 467ms/step - accuracy: 0.2568 - f1_m: 0.3915 - loss: 1.1674 - val_accuracy: 0.5364 - val_f1_m: 0.2379 - val_loss: 0.7851
Epoch 3/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 461ms/step - accuracy: 0.6408 - f1_m: 0.5490 - loss: 0.9228 - val_accuracy: 0.5664 - val_f1_m: 0.2277 - val_loss: 0.8409
Epoch 4/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 471ms/step - accuracy: 0.8323 - f1_m: 0.7036 - loss: 0.6981 - val_accuracy: 0.4971 - val_f1_m: 0.2332 - val_loss: 1.2221
Epoch 5/10
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 465ms/step - accuracy: 0.8120 - f1_m: 0.7234 - loss

In [12]:
all_labels = []
all_preds = []

for x_batch, y_batch in test_data:
    preds = Blstm.predict(x_batch)
    preds = (preds > 0.5).astype(int).flatten()
    all_preds.extend(preds)
    all_labels.extend(y_batch.numpy())

print(classification_report(all_labels, all_preds))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/s