In [None]:
!pip install transformers

In [5]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [6]:
# Read both splits from disk; the training read is capped at TRAIN_ROW_LIMIT rows.
TRAIN_ROW_LIMIT=100000
train_df=pd.read_csv(filepath_or_buffer='snli_train.csv',nrows=TRAIN_ROW_LIMIT)
valid_df=pd.read_csv(filepath_or_buffer='snli_valid.csv')

In [7]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,similarity,sentence1,sentence2
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse."
3,neutral,Children smiling and waving at camera,They are smiling at their parents
4,entailment,Children smiling and waving at camera,There are children present


In [8]:
# (rows, columns) of the training frame as loaded.
train_shape=train_df.shape
print(train_shape)

(35722, 3)


In [9]:
# Show one training example (row label 1): both sentences and its label.
example_row=train_df.loc[1]
print(f"Sentence1: {example_row['sentence1']}")
print(f"Sentence2: {example_row['sentence2']}")
print(f"Similarity: {example_row['similarity']}")

Sentence1: A person on a horse jumps over a broken down airplane.
Sentence2: A person is at a diner, ordering an omelette.
Similarity: contradiction


Preprocessing

In [10]:
# Per-column count of missing entries in the training frame.
missing_per_column=train_df.isna().sum()
print('Number of missing_values')
print(missing_per_column)

Number of missing_values
similarity    0
sentence1     0
sentence2     1
dtype: int64


In [11]:
#Drop the rows with NaN values
# Reassign rather than mutate in place so the cell is idempotent on re-run,
# and clean the validation split too: a NaN sentence there would otherwise
# be handed to the tokenizer later on.
train_df=train_df.dropna(axis=0)
valid_df=valid_df.dropna(axis=0)

In [12]:
#Count of labels (training split)
train_label_counts=train_df['similarity'].value_counts()
print(train_label_counts)

entailment       11910
contradiction    11887
neutral          11884
-                   40
Name: similarity, dtype: int64


In [13]:
#Count of labels (validation split)
valid_label_counts=valid_df['similarity'].value_counts()
print(valid_label_counts)

entailment       3329
contradiction    3278
neutral          3235
-                 158
Name: similarity, dtype: int64


In [14]:
#To remove the rows with '-' in the similarity column
# Build the keep-masks first, then filter and renumber the index from 0.
train_keep=train_df['similarity'].ne('-')
valid_keep=valid_df['similarity'].ne('-')
train_df=train_df.loc[train_keep].reset_index(drop=True)
valid_df=valid_df.loc[valid_keep].reset_index(drop=True)

In [15]:
# Integer class id for each of the three similarity labels.
LABEL_TO_ID={'contradiction':0,'entailment':1,'neutral':2}


def encode_labels(similarity):
    """Map a Series of similarity strings to integer class ids.

    Parameters
    ----------
    similarity : pandas.Series
        Values expected to be keys of ``LABEL_TO_ID``.

    Returns
    -------
    pandas.Series
        int64 Series with the same index as ``similarity``.

    Raises
    ------
    ValueError
        If any value (including NaN or '-') is not a key of ``LABEL_TO_ID``.
        The previous lambda silently mapped every such value to 2 ("neutral").
    """
    unknown=set(similarity.unique())-set(LABEL_TO_ID)
    if unknown:
        # str() keeps sorting safe when NaN and strings are mixed.
        raise ValueError(
            f"Unexpected similarity labels: {sorted(map(str,unknown))}")
    return similarity.map(LABEL_TO_ID).astype('int64')


train_df['label']=encode_labels(train_df['similarity'])

valid_df['label']=encode_labels(valid_df['similarity'])

In [16]:
# Preview the first five rows again, now including the integer `label` column.
train_df.head(n=5)

Unnamed: 0,similarity,sentence1,sentence2,label
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,2
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
3,neutral,Children smiling and waving at camera,They are smiling at their parents,2
4,entailment,Children smiling and waving at camera,There are children present,1


In [17]:
# Plain Python lists of each sentence column, one pair of lists per split.
train_sentence1=train_df['sentence1'].tolist()
train_sentence2=train_df['sentence2'].tolist()

valid_sentence1=valid_df['sentence1'].tolist()
valid_sentence2=valid_df['sentence2'].tolist()

In [18]:
from transformers import BertTokenizer

# Load the pretrained tokenizer by checkpoint name.
TOKENIZER_NAME='bert-base-uncased'
tokenizer=BertTokenizer.from_pretrained(TOKENIZER_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [19]:
# Encode the training sentence pairs; padding=True pads every example to the
# longest one in this call, and segment ids plus attention mask are returned.
train_tokenizer_kwargs=dict(
    padding=True,
    truncation=True,
    return_token_type_ids=True,
    return_attention_mask=True,
)
train_encodings=tokenizer(train_sentence1,train_sentence2,**train_tokenizer_kwargs)

In [20]:
# Encode the validation sentence pairs with the same tokenizer options as the
# training pairs (padding is to the longest example in this call).
valid_tokenizer_kwargs=dict(
    padding=True,
    truncation=True,
    return_token_type_ids=True,
    return_attention_mask=True,
)
valid_encodings=tokenizer(valid_sentence1,valid_sentence2,**valid_tokenizer_kwargs)

In [21]:
# List the fields the tokenizer produced for the training pairs.
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [22]:
# Token-id length of the second training example (padding included).
second_example_ids=train_encodings['input_ids'][1]
len(second_example_ids)

162

In [24]:
from transformers import TFBertModel

# Load the pretrained TensorFlow BERT encoder by checkpoint name.
BERT_CHECKPOINT='bert-base-uncased'
bert_model=TFBertModel.from_pretrained(BERT_CHECKPOINT)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [29]:
# Three int32 inputs of variable sequence length. Their names match the keys
# of the tokenizer encodings, which are fed to the model as a dict.
input_ids=tf.keras.layers.Input(shape=(None,),dtype=tf.int32,name='input_ids')

token_type_ids=tf.keras.layers.Input(shape=(None,),dtype=tf.int32,name='token_type_ids')

attention_mask=tf.keras.layers.Input(shape=(None,),dtype=tf.int32,name='attention_mask')

# Freeze BERT so that only the layers added below are trained.
bert_model.trainable=False
bert_outputs=bert_model(input_ids,token_type_ids=token_type_ids,attention_mask=attention_mask)
# Take the first output by index instead of tuple-unpacking: unpacking assumed
# the call returns exactly two values, which breaks when the model returns a
# dict-style output object (iteration then yields field names) or more fields.
sequence_output=bert_outputs[0]

# NOTE(review): attention_mask is only passed to BERT; the LSTM and the pooling
# layers below also see the padded timesteps. Consider masking them - confirm.
lstm=tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(64,return_sequences=True))(sequence_output)

# Summarise the per-token LSTM outputs with both average and max pooling.
avg_pool=tf.keras.layers.GlobalAveragePooling1D()(lstm)
max_pool=tf.keras.layers.GlobalMaxPooling1D()(lstm)
concat=tf.keras.layers.concatenate([avg_pool,max_pool])
dropout=tf.keras.layers.Dropout(0.3)(concat)
# Three-way softmax over the label ids 0, 1, 2.
output=tf.keras.layers.Dense(3,activation='softmax')(dropout)

model=tf.keras.models.Model(
    inputs=[input_ids,token_type_ids,attention_mask],outputs=output)

# Sparse loss because the targets are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',metrics=['acc'])

model.summary()

Model: "functional_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, None, 768),  109482240   input_ids[0][0]                  
                                                                 attention_mask[0][0]  

In [30]:
# Label tensors built from the integer `label` column of each split.
y_train=tf.constant(train_df['label'].to_numpy())
y_val=tf.constant(valid_df['label'].to_numpy())

In [31]:
# Sanity check: number of encoded training examples vs. shape of the label tensor.
n_train_encoded=len(train_encodings['input_ids'])
print(n_train_encoded)
print(y_train.shape)

35681
(35681,)


In [32]:
# Inspect the training label tensor (values, shape and dtype).
print(y_train)

tf.Tensor([2 0 1 ... 1 2 0], shape=(35681,), dtype=int64)


In [33]:
# Build the training pipeline: (features dict, label) pairs, shuffled with a
# buffer of BUFFER_SIZE elements and grouped into batches of 32.
BUFFER_SIZE=100000
train_features=dict(train_encodings)
train_dataset=tf.data.Dataset.from_tensor_slices((train_features,y_train))
train_dataset=train_dataset.shuffle(BUFFER_SIZE)
train_dataset=train_dataset.batch(32)

In [34]:
# Build the validation pipeline: (features dict, label) pairs in batches of 32,
# kept in their original order (no shuffle).
valid_features=dict(valid_encodings)
validation_dataset=tf.data.Dataset.from_tensor_slices((valid_features,y_val)).batch(32)

In [36]:
# Pull one batch from the training pipeline to inspect its structure.
first_batch=next(iter(train_dataset))
first_batch

({'attention_mask': <tf.Tensor: shape=(32, 162), dtype=int32, numpy=
  array([[1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         ...,
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
  'input_ids': <tf.Tensor: shape=(32, 162), dtype=int32, numpy=
  array([[ 101, 1037, 2210, ...,    0,    0,    0],
         [ 101, 1037, 2450, ...,    0,    0,    0],
         [ 101, 1996, 2775, ...,    0,    0,    0],
         ...,
         [ 101, 2235, 2316, ...,    0,    0,    0],
         [ 101, 1037, 2450, ...,    0,    0,    0],
         [ 101, 2045, 2003, ...,    0,    0,    0]], dtype=int32)>,
  'token_type_ids': <tf.Tensor: shape=(32, 162), dtype=int32, numpy=
  array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dty

In [37]:
# Train for three epochs, evaluating on the validation pipeline after each one.
history=model.fit(
    x=train_dataset,
    validation_data=validation_dataset,
    epochs=3,
)

Epoch 1/3








Epoch 2/3
Epoch 3/3
