In [1]:
import os
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import math
import random
import codecs
import gc


from keras.models import Sequential,Model
from keras.layers import Dense, Embedding, LSTM,Flatten,SpatialDropout1D,Dropout,Input
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D


# BERT files
# Listing the directory raises if the BERT scripts dataset is not mounted;
# the returned list itself is discarded here.
os.listdir("../input/pretrained-bert-including-scripts/master/bert-master")
# Put the bert-master scripts first on the module search path
# (presumably so the `run_classifier` / `tokenization` imports in this cell resolve there).
sys.path.insert(0, '../input/pretrained-bert-including-scripts/master/bert-master')
# IPython shell escape: copy the keras_bert package into the working directory.
!cp -r '../input/kerasbert/keras_bert' '/kaggle/working'
# Directory with the pretrained uncased L-12 / H-768 / A-12 BERT files.
BERT_PRETRAINED_DIR = '../input/pretrained-bert-including-scripts/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12'

from keras_bert.keras_bert import Tokenizer
from keras_bert.keras_bert import get_base_dict, get_model, gen_batch_inputs
from keras_bert.keras_bert import load_trained_model_from_checkpoint

from run_classifier import *
import tokenization




Using TensorFlow backend.


In [None]:
def _normalize_comments(comments):
    """Strip leading/trailing whitespace, then replace each newline with a space."""
    trimmed = comments.replace({r'\s+$': '', r'^\s+': ''}, regex=True)
    return trimmed.replace(r'\n',  ' ', regex=True)


train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

train['comment_text'] = _normalize_comments(train['comment_text'])
test['comment_text'] = _normalize_comments(test['comment_text'])

x_train = preprocess(train['comment_text'])
# Binarise the target: values of 0.5 and above become class 1, everything else class 0.
is_positive = train['target'] >= 0.5
y_train = np.where(is_positive, 1, 0)

# Release the training frame before the test comments are preprocessed.
del train
gc.collect()
x_test = preprocess(test['comment_text'])


In [3]:
# Paths of the pretrained BERT artefacts inside BERT_PRETRAINED_DIR.
config_file, dict_file, checkpoint_file = (
    os.path.join(BERT_PRETRAINED_DIR, artefact_name)
    for artefact_name in ('bert_config.json', 'vocab.txt', 'bert_model.ckpt')
)
bert_model = load_trained_model_from_checkpoint(config_file, checkpoint_file, seq_len=32)


# Build the token -> id mapping: each vocabulary line is assigned the
# dictionary size at the moment it is read.
token_dict = {}
with codecs.open(dict_file, 'r', 'utf8') as vocab_reader:
    for vocab_line in vocab_reader:
        token_dict[vocab_line.strip()] = len(token_dict)
tokenizer = Tokenizer(token_dict)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [4]:
# Classifier head that consumes BERT feature sequences of shape (32, 768):
# two stacked bidirectional LSTMs, then a dense stack ending in a 2-way softmax.
model = Sequential([
    Bidirectional(CuDNNLSTM(128, return_sequences=True), input_shape=(32, 768)),
    SpatialDropout1D(0.3),
    Bidirectional(CuDNNLSTM(128, return_sequences=True)),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line after the table).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 32, 256)           919552    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 32, 256)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 32, 256)           395264    
_________________________________________________________________
flatten_1 (Flatten)          (None, 8192)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               4194816   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
__________

In [5]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that yields (BERT features, one-hot labels) batches.

    Each batch of texts is tokenized with the module-level ``tokenizer`` and
    run through the module-level ``bert_model``; labels are expanded to
    two-column one-hot rows.
    """

    def __init__(self, dataX, dataY, batch_size=1, shuffle=True, max_len=32):
        """Store the data and prepare the (optionally shuffled) sample order.

        Args:
            dataX: indexable collection of input texts.
            dataY: indexable collection of labels, same length as ``dataX``.
            batch_size: number of samples per batch.
            shuffle: whether to shuffle the sample order now and after every epoch.
            max_len: token length passed to ``tokenizer.encode``. Defaults to 32;
                presumably it has to match the sequence length ``bert_model``
                was loaded with -- confirm before changing it.

        Raises:
            AssertionError: if ``dataX`` and ``dataY`` differ in length.
        """
        self.batch_size = batch_size
        self.dataX = dataX
        self.dataY = dataY
        self.max_len = max_len
        # Verify that the training data and the labels have the same number of items.
        assert len(self.dataX) == len(self.dataY), \
            "dataX and dataY must have the same length"
        self.indexes = np.arange(len(self.dataX))
        self.shuffle = shuffle
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        """Return the number of batches per epoch (the last batch may be smaller)."""
        return math.ceil(len(self.dataX) / float(self.batch_size))

    def __getitem__(self, index):
        """Build batch number ``index`` and return it as ``(X, y)``."""
        # Positions of the samples that belong to this batch.
        batch_indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        # Look up the texts and labels for those positions.
        batch_X = [self.dataX[k] for k in batch_indexes]
        batch_Y = [self.dataY[k] for k in batch_indexes]

        return self.data_generation(batch_X, batch_Y)

    def on_epoch_end(self):
        """Reshuffle the sample order after each epoch when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def data_generation(self, batch_X, batch_Y):
        """Encode a batch of texts with BERT and one-hot encode its labels.

        Args:
            batch_X: list of texts.
            batch_Y: list of labels; a label equal to 1 becomes ``[0, 1]``,
                any other label becomes ``[1, 0]``.

        Returns:
            Tuple of (``bert_model.predict`` output for the batch, label array).
        """
        indices = []
        segments = []
        for text in batch_X:
            index, segment = tokenizer.encode(first=text, max_len=self.max_len)
            indices.append(index)
            segments.append(segment)

        word_vec = bert_model.predict([np.array(indices), np.array(segments)])
        y = [[0, 1] if label == 1 else [1, 0] for label in batch_Y]

        return word_vec, np.array(y)

In [6]:
# Training batches: 64 comments per batch, reshuffled between epochs.
dataGen = DataGenerator(
    x_train,
    y_train,
    batch_size=64,
    shuffle=True,
)


In [7]:
# Sanity check: pull only the first batch and show the shape of its features.
first_batch = next(iter(dataGen), None)
if first_batch is not None:
    batch_features, _ = first_batch
    print(batch_features.shape)

(64, 32, 768)


In [8]:
# Train on the generator for one pass over the data (the default epoch count, stated explicitly).
model.fit_generator(dataGen, epochs=1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1
 5966/28202 [=====>........................] - ETA: 53:50 - loss: 0.1998 - acc: 0.9337

In [9]:
import tqdm

# Tokenize every test comment to 32 tokens, then split the
# (token ids, segment ids) pairs into two parallel lists.
encoded_pairs = [
    tokenizer.encode(first=comment, max_len=32)
    for comment in tqdm.tqdm(x_test)
]
indices = [token_ids for token_ids, _ in encoded_pairs]
segments = [segment_ids for _, segment_ids in encoded_pairs]

# Run the whole test set through BERT in a single predict call.
test_data = [np.array(indices), np.array(segments)]
test_vec = bert_model.predict(test_data)



100%|██████████| 97320/97320 [01:13<00:00, 1331.03it/s]


In [10]:
# Display the shape of the BERT features computed for the test set.
test_vec.shape

(97320, 32, 768)

In [11]:
# Drop the intermediates that are no longer needed, then force a collection pass.
del x_test, dataGen, test_data, indices, segments
gc.collect()

0

In [12]:
# Column 1 of the softmax output is the probability of the positive class.
class_probabilities = model.predict_proba(test_vec, batch_size=32)
predictions = class_probabilities[:, 1]

submission = pd.DataFrame({'id': test['id'], 'prediction': predictions})
submission.to_csv('submission.csv', index=False)
