In [55]:
import pandas as pd
import numpy as np 
import tensorflow as tf
from transformers import TFRobertaModel
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate,Dense,Dot,BatchNormalization,Dropout
from tokenizers import ByteLevelBPETokenizer,processors,Tokenizer
import random
random.seed(0)
np.random.seed(0)
from transformers import RobertaConfig
from tqdm import tqdm
import time
print(tf.__version__)

2.1.0


In [56]:
tf.test.is_gpu_available()

True

In [106]:
df=pd.read_csv("pairs_data.csv")
print(df.columns)
df['common'][df['common']==0]=-1
df.common.value_counts()

Index(['log1', 'log2', 'common'], dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


 1    37800
-1    30502
Name: common, dtype: int64

In [58]:
df.shape

(68302, 3)

In [59]:
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=['training_text.txt'], vocab_size=25000, min_frequency=15, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
tokenizer.post_processor = processors.RobertaProcessing(
  sep=("</s>", tokenizer.token_to_id("</s>")),
  cls=("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.save(".", "tokenizer")

['./tokenizer-vocab.json', './tokenizer-merges.txt']

In [60]:
MAX_LEN=200

In [61]:
tokenizer.enable_truncation(max_length=MAX_LEN)

In [62]:
def generateIds(output):
    ids=output.ids
    ids=ids[0:MAX_LEN-2]
    ids=[tokenizer.token_to_id('<s>')]+ids+[tokenizer.token_to_id('</s>')]
    attentionmask=[1]*len(ids)
    
    final_ids=ids+[tokenizer.token_to_id('<pad>')]*(MAX_LEN-len(ids))
    final_attention_mask=attentionmask+[0]*(MAX_LEN-len(ids))
    return final_ids,final_attention_mask


def tokenize(x):
    output=tokenizer.encode(x)
    final_ids,final_attention_mask=generateIds(output)
    return tf.convert_to_tensor(final_ids,tf.int64),tf.convert_to_tensor(final_attention_mask,tf.int64)

In [65]:
def batchTokenize(data):
    toks=[]
    atts=[]
    for i in tqdm(data):
        x,y=tokenize(i)
        toks.append(tf.reshape(x,[1,MAX_LEN]))
        atts.append(tf.reshape(y,[1,MAX_LEN]))
    toks=tf.concat(toks,0)
    atts=tf.concat(atts,0)
    print(toks.shape)
    print(atts.shape)
    return (toks,atts)

In [107]:
%%time
output=batchTokenize(df.log1)
toks1_input=output[0]
atts1_input=output[1]

100%|██████████| 68302/68302 [00:37<00:00, 1803.88it/s]


(68302, 200)
(68302, 200)
CPU times: user 45.4 s, sys: 4.44 s, total: 49.8 s
Wall time: 38.2 s


In [108]:
%%time
output=batchTokenize(df.log2)
toks2_input=output[0]
atts2_input=output[1]

100%|██████████| 68302/68302 [00:36<00:00, 1855.16it/s]


(68302, 200)
(68302, 200)
CPU times: user 44.5 s, sys: 4.16 s, total: 48.7 s
Wall time: 37.1 s


In [109]:
print(toks1_input.shape,atts1_input.shape,toks2_input.shape,atts2_input.shape)

(68302, 200) (68302, 200) (68302, 200) (68302, 200)


In [110]:
config= RobertaConfig()
config.num_hidden_layers=2
config.attention_probs_dropout_prob=0.3
config.vocab_size=tokenizer.get_vocab_size()

In [111]:
roberta = TFRobertaModel(config)

In [112]:
config

RobertaConfig {
  "architectures": null,
  "attention_probs_dropout_prob": 0.3,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 2,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": null,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab

In [113]:
target_values=df.common

In [114]:
toks1=Input(shape=(MAX_LEN,), dtype='int64')
atts1=Input(shape=(MAX_LEN,),dtype="int64")
out1=roberta(inputs={'input_ids':toks1,'attention_mask':atts1})
toks2=Input(shape=(MAX_LEN,), dtype='int64')
atts2=Input(shape=(MAX_LEN,),dtype="int64")
out2=roberta(inputs={'input_ids':toks2,'attention_mask':atts2})
mean1=tf.reduce_mean(out1[0],1)
mean2=tf.reduce_mean(out2[0],1)

#########Comment this block if objective is cosine similarity calculation
cosine_similarity=Dot(axes=1,normalize=True)
preds=cosine_similarity([mean1,mean2])

#######Uncomment This block if objective is classification

# diff=tf.math.subtract(mean1,mean2)
# diff=tf.abs(diff)
# merged = concatenate([mean1,mean2,diff])
# merged = BatchNormalization()(merged)
# merged = Dropout(0.1)(merged)
# preds = Dense(1, activation='sigmoid')(merged)

(None, 1)


In [115]:
model = Model(inputs=[toks1,atts1,toks2,atts2], outputs=preds)
# model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
model.compile(loss='mse', optimizer='nadam', metrics=['mse'])

In [116]:
model.fit([toks1_input,atts1_input,toks2_input,atts2_input],target_values,
                  epochs=2, batch_size=64,shuffle=True,validation_split=0.1)

Train on 61471 samples, validate on 6831 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fce0b7f6080>

In [126]:
model.save("SBERT/model/",include_optimizer=False)

INFO:tensorflow:Assets written to: /home/opsmxuser/SBERT/model/assets
