In [None]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
!wget --quiet https://github.com/spongebob-shortpants/nlp/blob/main/tokenization.py.txt

In [None]:
!pip install tensorflow_addons

Collecting tensorflow_addons
[?25l  Downloading https://files.pythonhosted.org/packages/74/e3/56d2fe76f0bb7c88ed9b2a6a557e25e83e252aec08f13de34369cd850a0b/tensorflow_addons-0.12.1-cp37-cp37m-manylinux2010_x86_64.whl (703kB)
[K     |▌                               | 10kB 13.4MB/s eta 0:00:01[K     |█                               | 20kB 18.8MB/s eta 0:00:01[K     |█▍                              | 30kB 12.8MB/s eta 0:00:01[K     |█▉                              | 40kB 9.8MB/s eta 0:00:01[K     |██▎                             | 51kB 7.1MB/s eta 0:00:01[K     |██▉                             | 61kB 7.2MB/s eta 0:00:01[K     |███▎                            | 71kB 7.2MB/s eta 0:00:01[K     |███▊                            | 81kB 7.8MB/s eta 0:00:01[K     |████▏                           | 92kB 7.9MB/s eta 0:00:01[K     |████▋                           | 102kB 8.2MB/s eta 0:00:01[K     |█████▏                          | 112kB 8.2MB/s eta 0:00:01[K     |█████▋      

In [None]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 8.8MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

import tensorflow_hub as hub
import keras
import tokenization
import tensorflow_addons as tfa

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text) -> list`` and
            ``convert_tokens_to_ids(tokens) -> list``.
        max_len: Length of every output row, including the two special
            tokens. Must be at least 2.

    Returns:
        Tuple ``(token_ids, pad_masks, segment_ids)`` of integer arrays, each
        of shape ``(number of texts, max_len)``. ``pad_masks`` is 1 on real
        tokens and 0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, since the special
            tokens alone would not fit.
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Leave room for the two special tokens added below.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    def _as_matrix(rows):
        # Explicit dtype and shape keep an empty input as a (0, max_len)
        # integer array instead of a shape-(0,) float array.
        return np.array(rows, dtype=int).reshape(len(rows), max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

In [None]:
def build_model(bert_layer, max_len=512, num_classes=20):
    """Build and compile a softmax classifier on top of a BERT layer.

    Args:
        bert_layer: Callable layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            whose second element is indexed here as
            ``(batch, position, hidden)``.
        max_len: Sequence length of the three integer inputs.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        A compiled Keras ``Model`` with three int32 inputs of shape
        ``(max_len,)`` and a ``num_classes``-way softmax output, trained with
        AdamW and categorical cross-entropy (expects one-hot labels).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Classify from the vector at position 0, i.e. the [CLS] slot.
    clf_output = sequence_output[:, 0, :]
    out = Dense(num_classes, activation='softmax')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)

    # `learning_rate` is the current spelling of the deprecated `lr` argument.
    optimizer = tfa.optimizers.AdamW(learning_rate=1e-5, weight_decay=2e-6, clipvalue=700)

    model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
%%time
# Load the BERT encoder from TF Hub as a Keras layer; trainable=True asks for
# its weights to be updated (fine-tuned) together with the classifier head.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 22.1 s, sys: 4.18 s, total: 26.3 s
Wall time: 34.8 s


In [None]:
# Colab-only: mount Google Drive so the CSV files under
# /content/gdrive/My Drive/ can be read by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read the raw tables from the mounted Drive. `names=` assigns the column
# names explicitly.
# NOTE(review): assumes train/text/test CSVs have no header row, otherwise the
# header would be read as a data row -- confirm against the files.
train = pd.read_csv('/content/gdrive/My Drive/train.csv', names=['label', 'node idx'])
text  = pd.read_csv('/content/gdrive/My Drive/text.csv', names=['paper id', 'title', 'abstract'])
test  = pd.read_csv('/content/gdrive/My Drive/test.csv', names=['node idx'])
# The mapping file takes its column names from its first row; the merges below
# need them to be 'node idx' and 'paper id'.
node2paper = pd.read_csv('/content/gdrive/My Drive/nodeid2paperid.csv', header=0)

# Attach title/abstract to each labelled node: node idx -> paper id -> text.
train = pd.merge(train, node2paper, on='node idx')
train = pd.merge(train, text, on='paper id')

# Same join for the unlabelled test nodes.
# NOTE(review): pd.merge defaults to an inner join, so nodes without a mapping
# or text row are dropped silently -- verify that no test rows are lost.
test = pd.merge(test, node2paper, on='node idx')
test = pd.merge(test, text, on='paper id')

# Number of target classes (used for one-hot labels and the prediction buffer).
NUM_CLASSES = 20
# Number of folds whose predictions are averaged at inference time.
KFOLD = 1

In [None]:
# Debug-sized subsample: only the first N_TRAIN_SAMPLES rows are kept for
# training. Set N_TRAIN_SAMPLES to None to train on the full training set.
N_TRAIN_SAMPLES = 10
train = train[:N_TRAIN_SAMPLES]
train

Unnamed: 0,label,node idx,paper id,title,abstract
0,4,0,9657784,evasion attacks against machine learning at te...,"In security-sensitive applications, the succes..."
1,5,1,39886162,how hard is computing parity with noisy commun...,We show a tight lower bound of $\Omega(N \log\...
2,8,3,121432379,a promise theory perspective on data networks,Networking is undergoing a transformation thro...
3,6,6,1444859417,webvrgis based city bigdata 3d visualization a...,This paper shows the WEBVRGIS platform overlyi...
4,4,7,1483430697,information theoretic authentication and secre...,"In the splitting model, information theoretic ..."
5,3,8,1486601621,whealth transforming telehealth services,A worldwide increase in proportions of older p...
6,16,14,1528301850,a bi level view of inpainting based image comp...,"Inpainting based image compression approaches,..."
7,19,19,1546946208,electrical structure based pmu placement in el...,Recent work on complex networks compared the t...
8,14,21,1551937652,detecting simultaneous integer relations for s...,An algorithm which either finds an nonzero int...
9,3,29,1578902217,time critical social mobilization,The World Wide Web is commonly seen as a platf...


In [None]:
# Build the tokenizer from the assets shipped with the loaded hub module, so
# the vocabulary and casing match the BERT weights being used.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Encode the training abstracts into (token ids, masks, segment ids) arrays of
# length 128. The test abstracts are encoded in their own cell below, so they
# are not encoded a second time here.
train_input = bert_encode(train.abstract.values, tokenizer, max_len=128)
# One-hot encode the integer labels to match the categorical cross-entropy
# loss; tf.keras is used for consistency with the rest of the model code.
train_labels = tf.keras.utils.to_categorical(train.label.values, NUM_CLASSES)

In [None]:
# Encode the test abstracts with the same tokenizer and sequence length (128)
# that the model is built with.
test_input = bert_encode(test.abstract.values, tokenizer, max_len=128)

In [None]:
# Build the classifier with max_len=128; this must equal the max_len passed to
# bert_encode, since it fixes the shape of the model's three inputs.
model = build_model(bert_layer, max_len=128)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [None]:

# Stop when validation accuracy has not improved for 2 consecutive epochs and
# roll the model back to the best epoch's weights, so the model that is saved
# and used for prediction is the best one seen, not the last one trained.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=2,
    verbose=0,
    restore_best_weights=True,
)

# validation_split=0.2 holds out a fraction of the provided samples for the
# val_accuracy metric monitored above.
train_history = model.fit(
    train_input,
    train_labels,
    validation_split=0.2,
    epochs=8,
    batch_size=16,
    callbacks=[early_stopping],
)

Epoch 1/8
Epoch 2/8
Epoch 3/8


In [None]:
# Persist the trained model to model.h5 in the working directory.
model.save('model.h5')
# To reuse a previous run instead of retraining, load the saved weights:
#model.load_weights('model.h5')

In [None]:
# Accumulator for class probabilities, one row per test sample.
proba_test = np.zeros((len(test), NUM_CLASSES))
# Each model contributes 1/KFOLD of its prediction, i.e. the accumulator holds
# the average over folds; only this single model is added here.
proba_test += model.predict(test_input, verbose=1, batch_size=16) / KFOLD
# Predicted class = index of the highest probability in each row.
sub_test = np.argmax(proba_test, axis=1)



In [None]:
# Assemble the submission table in one step: the test node index as `id` and
# the predicted class as `label`, then write it without the DataFrame index.
sub = pd.DataFrame({'id': test['node idx'], 'label': sub_test})
sub.to_csv('submission.csv', index=False)