In [None]:
!pip install tqdm  >> /dev/null

In [None]:
!pip install bert-for-tf2 >> /dev/null

In [None]:
!pip install sentencepiece >> /dev/null

In [None]:
import os
import math
import datetime

from tqdm import tqdm

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
from sklearn import model_selection
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib import rc
from sklearn.utils import resample

from sklearn.metrics import confusion_matrix, classification_report

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [None]:
train = pd.read_csv("/content/train_cleaned_v2.2.csv")
test = pd.read_csv("/content/valid_cleaned_v2.2.csv")

In [None]:
test.head()

Unnamed: 0,level_0,text,updated
0,0,love this,1
1,1,mymoon straight,-1
2,2,her face,-1
3,3,i be not cry,-1
4,4,not to democrats,-1


In [None]:
header_list = ["text","intent","buffer"]
train = pd.read_csv("https://raw.githubusercontent.com/T-I-P/Hope-Speech-Detection/master/English/english_hope_train.csv", '\t',header=None,names=header_list)
test = pd.read_csv("https://raw.githubusercontent.com/T-I-P/Hope-Speech-Detection/master/English/english_hope_dev.csv",'\t',header=None,names=header_list)

In [None]:
train.isnull().sum()

Unnamed: 0    0
text          3
updated       0
dtype: int64

In [None]:
train = train.dropna(axis=0)

In [None]:
test.isnull().sum()

index         0
Unnamed: 0    0
text          0
updated       0
dtype: int64

In [None]:
train = train.reset_index()
test = test.reset_index()

In [None]:
train = train.drop(labels=['Unnamed: 0','index'], axis=1)
test = test.drop(labels=['Unnamed: 0','index'],axis =1)

In [None]:
test = test.drop(labels=['level_0'],axis =1)

In [None]:
train = train[train['updated']!=-1]
test = test[test['updated']!=-1]

In [None]:
train.shape

(10007, 3)

In [None]:
train.text = train.text.str.strip()

In [None]:
test.text = test.text.str.strip()

In [None]:
train['length'] = train['text'].str.split().str.len()

In [None]:
test['length'] = test['text'].str.split().str.len()

In [None]:
train = train.sort_values(by='length')

In [None]:
test = test.sort_values(by='length')

In [None]:
train.shape

(21055, 2)

In [None]:
train.to_csv('train_original_sorted.csv')

In [None]:
train['intent'] = train['intent'].replace(to_replace ="Non_hope_speech", value =0)
train['intent'] = train['intent'].replace(to_replace ="Hope_speech", value =1)
train['intent'].unique()

test['intent'] = test['intent'].replace(to_replace ="Non_hope_speech", value =0)
test['intent'] = test['intent'].replace(to_replace ="Hope_speech", value =1)
test['intent'].unique()

array([0, 1])

In [None]:
train =train.dropna(axis=0)

In [None]:
train.reset_index(drop=True,inplace=True)

In [None]:
test.reset_index(drop=True,inplace=True)

In [None]:
train = train[1389:]

In [None]:
test = test[37:]

In [None]:
test.head()

Unnamed: 0,text,intent,length
37,all the time,0,3
38,read the bible,0,3
39,well tony dennis,0,3
40,get brain sir,0,3
41,burn the witch,0,3


In [None]:
train, test  = model_selection.train_test_split(train ,test_size=0.3,shuffle=True)

In [None]:
train.shape

(14738, 2)

In [None]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

--2021-04-27 12:09:09--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.13.80, 172.253.115.128, 172.253.122.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.13.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2021-04-27 12:09:11 (249 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [407727028/407727028]



In [None]:
# Extract the checkpoint archive into the current working directory.
!unzip uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [None]:
# Directory that holds the pretrained checkpoint; no error if it already exists.
os.makedirs("model", exist_ok=True)

In [None]:
# Move the extracted checkpoint folder under model/.
!mv uncased_L-12_H-768_A-12/ model

In [None]:
# Locations of the checkpoint pieces inside model/<bert_model_name>/.
bert_model_name="uncased_L-12_H-768_A-12"
bert_ckpt_dir = os.path.join("model/", bert_model_name)
# Path prefix handed to load_stock_weights (see create_model).
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")

In [None]:
class IntentDetectionData:
  """Turns labelled text frames into fixed-width token-id matrices.

  Attributes set by ``__init__``:
    tokenizer:        tokenizer used to split text and map tokens to ids.
    classes:          list of label values; a label's position in this list is
                      its integer class id.
    max_seq_len:      width of the padded id matrices: the longest encoded
                      example seen in train/test, capped by the ``max_seq_len``
                      constructor argument.
    train_x, test_x:  integer arrays of shape (n_examples, max_seq_len).
    train_y, test_y:  1-D arrays of class ids aligned with the rows of x.
  """
  DATA_COLUMN = "text"      # column holding the raw text
  LABEL_COLUMN = "updated"  # column holding the label value

  def __init__(self, train, test, tokenizer: "FullTokenizer", classes, max_seq_len=192):
    """Encode and pad both frames.

    Args:
      train, test: DataFrames containing DATA_COLUMN and LABEL_COLUMN.
      tokenizer:   object exposing ``tokenize`` and ``convert_tokens_to_ids``.
      classes:     list of every label value that occurs in the frames.
      max_seq_len: upper bound on the encoded sequence length.

    Raises:
      ValueError: if a row's label is not present in ``classes``.
    """
    self.tokenizer = tokenizer
    self.max_seq_len = 0  # raised by _prepare to the longest encoded example
    self.classes = classes

    # Order each frame by character length of its text, shortest first.
    train, test = map(lambda df: df.reindex(df[IntentDetectionData.DATA_COLUMN].str.len().sort_values().index), [train, test])

    ((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self._prepare, [train, test])

    print("max seq_len", self.max_seq_len)
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

  def _prepare(self, df):
    """Encode every row of ``df``.

    Returns:
      (x, y): ``x`` is a list of variable-length id lists, each wrapped in
      [CLS] ... [SEP]; ``y`` is a 1-D array of class ids. ``x`` is kept as a
      plain list because the rows are ragged: wrapping them in ``np.array``
      fails on recent NumPy, and when all rows happen to share one length it
      yields a 2-D array that breaks list-style padding.
    """
    x, y = [], []
    texts = df[IntentDetectionData.DATA_COLUMN]
    labels = df[IntentDetectionData.LABEL_COLUMN]

    for text, label in tqdm(zip(texts, labels), total=len(df)):
      tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
      token_ids = list(self.tokenizer.convert_tokens_to_ids(tokens))
      self.max_seq_len = max(self.max_seq_len, len(token_ids))
      x.append(token_ids)
      y.append(self.classes.index(label))

    return x, np.array(y)

  def _pad(self, ids):
    """Right-pad (with id 0) or truncate each id sequence to ``max_seq_len``.

    The sequences already contain [CLS]/[SEP], so nothing is reserved for them
    here: a sequence that fits is kept whole. A sequence that is too long is
    cut to ``max_seq_len`` and its final id (the [SEP] marker) is written into
    the last slot so the example stays terminated.

    Args:
      ids: iterable of id sequences (lists or 1-D arrays).

    Returns:
      Integer array of shape (len(ids), max_seq_len).
    """
    width = self.max_seq_len
    padded = np.zeros((len(ids), width), dtype=int)
    for row, input_ids in enumerate(ids):
      input_ids = list(input_ids)
      if len(input_ids) > width:
        last_id = input_ids[-1]
        input_ids = input_ids[:width]
        if input_ids:
          input_ids[-1] = last_id
      padded[row, :len(input_ids)] = input_ids
    return padded

In [None]:
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

In [None]:
tokenizer.tokenize("these tiktoks radiate gay chaotic energy and i love it")

['these',
 'ti',
 '##kt',
 '##ok',
 '##s',
 'ra',
 '##dia',
 '##te',
 'gay',
 'chaotic',
 'energy',
 'and',
 'i',
 'love',
 'it']

In [None]:
# Example: tokenize a short string and display the vocabulary ids of its tokens.
tokens = tokenizer.tokenize("Non-Hope")
tokenizer.convert_tokens_to_ids(tokens)

[2512, 1011, 3246]

In [None]:
def create_model(max_seq_len, bert_ckpt_file, bert_config_path=None, num_classes=None):
  """Build a BERT classifier and load the pretrained encoder weights.

  The network is: input ids -> BERT encoder -> first-position vector ->
  Dropout(0.5) -> Dense(768, tanh) -> Dropout(0.5) -> Dense(num_classes,
  softmax). Because the last layer applies softmax, the model outputs class
  probabilities, so a loss consuming them should not be configured with
  ``from_logits=True``.

  Args:
    max_seq_len:      length of the padded input id sequences.
    bert_ckpt_file:   checkpoint path prefix passed to ``load_stock_weights``.
    bert_config_path: path to the BERT config JSON; defaults to the
                      notebook-level ``bert_config_file``.
    num_classes:      number of output classes; defaults to ``len(classes)``
                      using the notebook-level ``classes`` list.

  Returns:
    An uncompiled ``keras.Model`` mapping (batch, max_seq_len) int32 ids to
    (batch, num_classes) probabilities.
  """
  # Fall back to the notebook globals so existing two-argument calls still work.
  if bert_config_path is None:
    bert_config_path = bert_config_file
  if num_classes is None:
    num_classes = len(classes)

  with tf.io.gfile.GFile(bert_config_path, "r") as reader:
    bc = StockBertConfig.from_json_string(reader.read())
  bert_params = map_stock_config_to_params(bc)
  bert_params.adapter_size = None
  # Named bert_layer so the imported `bert` module is not shadowed.
  bert_layer = BertModelLayer.from_params(bert_params, name="bert")

  input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
  bert_output = bert_layer(input_ids)

  print("bert shape", bert_output.shape)

  # Take the encoder output at position 0 as the sequence representation.
  cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
  cls_out = keras.layers.Dropout(0.5)(cls_out)
  hidden = keras.layers.Dense(units=768, activation="tanh")(cls_out)
  hidden = keras.layers.Dropout(0.5)(hidden)
  probabilities = keras.layers.Dense(units=num_classes, activation="softmax")(hidden)

  model = keras.Model(inputs=input_ids, outputs=probabilities)
  model.build(input_shape=(None, max_seq_len))

  # Weights can only be loaded once the layer's variables have been built.
  load_stock_weights(bert_layer, bert_ckpt_file)

  return model

In [None]:
# Label values in sorted order; a label's position in this list is the integer
# class id produced by IntentDetectionData. unique() returns values in order of
# first appearance, so sorting keeps the ids independent of row order. The
# column name comes from the class so the two cannot drift apart.
classes = sorted(train[IntentDetectionData.LABEL_COLUMN].unique().tolist())

# Encode both frames; encoded sequences are capped at 128 token ids.
data = IntentDetectionData(train, test, tokenizer, classes, max_seq_len=128)

10007it [00:03, 3326.07it/s]
1663it [00:00, 2947.29it/s]


max seq_len 67


In [None]:
# Shape of the padded training id matrix: (n_examples, data.max_seq_len).
data.train_x.shape

(14738, 73)

In [None]:
# Inspect one encoded example: token ids followed by zero padding.
data.train_x[56]

array([ 101, 8840, 2140, 3524,  102,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [None]:
# Build the classifier for the padded sequence length and load the pretrained
# BERT weights from the checkpoint.
model = create_model(data.max_seq_len, bert_ckpt_file)

bert shape (None, 67, 768)
Done loading 196 BERT weights from: model/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7f7e5e720210> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights


In [None]:

# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 67)]              0         
_________________________________________________________________
bert (BertModelLayer)        (None, 67, 768)           108890112 
_________________________________________________________________
lambda (Lambda)              (None, 768)               0         
_________________________________________________________________
dropout (Dropout)            (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 768)               590592    
_________________________________________________________________
dropout_1 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1538  

In [None]:
# create_model ends in a Dense layer with softmax activation, so the model
# emits class probabilities. The loss must therefore treat its input as
# probabilities (from_logits=False) rather than apply softmax a second time.
model.compile(
  optimizer=keras.optimizers.Adam(1e-5),
  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [None]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(train['updated']),
                                                 train['updated'])
weight = {i : class_weights[i] for i in range(2)}

In [None]:
class_weights

array([0.84504307, 1.22454723])

In [None]:
# One TensorBoard run directory per launch, named by timestamp. %S is seconds
# (00-59); lowercase %s is a platform-specific extension, not a standard
# strftime directive.
log_dir = "log/intent_detection/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

# NOTE(review): Keras takes the validation_split fraction from the end of the
# arrays before shuffling, and IntentDetectionData orders examples by text
# length, so the validation subset is the longest 30% of texts — confirm this
# is intended or shuffle the arrays first.
history = model.fit(
  x=data.train_x,
  y=data.train_y,
  class_weight = weight,
  validation_split=0.3,
  batch_size=16,
  shuffle=True,
  epochs=10,
  callbacks=[tensorboard_callback]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
 29/438 [>.............................] - ETA: 2:52 - loss: 0.0225 - acc: 0.9892

KeyboardInterrupt: ignored

In [None]:
# evaluate() returns [loss, acc] for the compiled loss and metric.
test_loss, test_acc = model.evaluate(data.test_x, data.test_y)
print("test acc", test_acc)

test acc 0.8496692776679993


In [None]:
y_pred = model.predict(data.test_x).argmax(axis=-1)

In [None]:
print(classification_report(data.test_y, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.85      0.83       696
           1       0.89      0.85      0.87       967

    accuracy                           0.85      1663
   macro avg       0.84      0.85      0.85      1663
weighted avg       0.85      0.85      0.85      1663



In [None]:
# Label values; the list position of a label is its class id.
classes

[-1, 1, 0]

In [None]:
cm = confusion_matrix(data.test_y, y_pred)
df_cm = pd.DataFrame(cm, index=classes, columns=classes)

In [None]:
print(cm)

[[591 105]
 [145 822]]


In [None]:
# Distinct label values present in the training frame.
train["updated"].unique()

array([-1,  1,  0])