In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from transformers import BertModel, BertConfig, BertTokenizer, RobertaModel



In [None]:
# Remove the currently installed transformers so the pinned version in the next cell replaces it.
# NOTE(review): this cell sits after the cell that already does `import transformers`;
# presumably the kernel has to be restarted after reinstalling — confirm.
!pip uninstall transformers -y

In [None]:
# Pin transformers to 2.11.0; the tokenizer call in the data generator uses the
# `pad_to_max_length` keyword, so keep this version in sync with that code.
!pip install transformers==2.11.0

In [2]:
# Show which transformers release is actually imported in this kernel.
transformers.__version__

'2.11.0'

In [3]:
# Load the train / dev / test sentence files; the first CSV column becomes the index.
df, df_dev, df_test = (
    pd.read_csv(data_csv, index_col=0)
    for data_csv in (
        '../input/semeval/subtaskA_data_all.csv',
        '../input/semeval/subtaskA_dev_data.csv',
        '../input/semeval/subtaskA_test_data.csv',
    )
)

In [4]:
# Load the matching answer files. Column names are supplied explicitly
# ("id", "Class") and the "id" column is used as the index.
df1, df1_dev, df1_test = (
    pd.read_csv(answers_csv, index_col=0, names=["id", "Class"])
    for answers_csv in (
        '../input/semeval/subtaskA_answers_all.csv',
        '../input/semeval/subtaskA_gold_answers.csv',
        '../input/semeval/subtaskA_gold_answers 2.csv',
    )
)

In [5]:
# Put each split's sentences and its labels side by side (column-wise concat,
# rows are matched on the shared index).
result = pd.concat([df, df1], axis=1)
result_dev = pd.concat([df_dev, df1_dev], axis=1)
result_test = pd.concat([df_test, df1_test], axis=1)

In [6]:
# Preview the merged training frame (sentence columns plus `Class`).
result

Unnamed: 0_level_0,sent0,sent1,Class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,He poured orange juice on his cereal.,He poured milk on his cereal.,0
1,He drinks apple.,He drinks milk.,0
2,Jeff ran a mile today,"Jeff ran 100,000 miles today",1
3,A mosquito stings me,I sting a mosquito,1
4,A niece is a person.,A giraffe is a person.,1
...,...,...,...
9995,Mark ate a big bitter cherry pie,Mark ate a big sweet cherry pie,0
9996,Gloria wears a cat on her head,Gloria wears a hat on her head,0
9997,Harry went to the barbershop to have his hair cut,Harry went to the barbershop to have his glass...,1
9998,Reilly is sleeping on the couch,Reilly is sleeping on the window,1


In [7]:
# One-hot encode the `Class` column of every split into two columns.
y_train, y_dev, y_test = (
    tf.keras.utils.to_categorical(split.Class, num_classes=2)
    for split in (result, result_dev, result_test)
)

In [8]:
# Notebook-wide training configuration.
max_length = 128  # Token length passed to the tokenizer (`max_length`) and used as the model's input width.
batch_size = 32  # Default batch size of the data generator defined below.
epochs = 10  # Number of epochs for the first `model.fit` call.

# Labels in our dataset.
# NOTE(review): this name is not referenced by the later visible cells (the
# generator has its own `labels` argument) — confirm it is still needed.
labels = [0,1]

In [9]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of BERT-encoded sentence pairs.

    Args:
        sentence_pairs: Array of sentence pairs, one pair per row. It is
            indexed with an integer index array and converted with
            `.tolist()`, so a NumPy array is expected.
        labels: Array of labels aligned row-for-row with `sentence_pairs`.
            Only read when `include_targets=True`.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data (once at construction
            and again after every epoch).
        include_targets: boolean, whether to include the labels.

    Returns:
        Each batch is a tuple `([input_ids, attention_mask, token_type_ids], labels)`
        (or just `[input_ids, attention_mask, token_type_ids]`
        if `include_targets=False`).
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use bert-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        # One seeded generator for the lifetime of the object: the order is
        # reproducible across runs, yet every epoch draws a new permutation
        # instead of re-applying the same one from a freshly seeded generator.
        self._rng = np.random.RandomState(42)
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch. Rounded up so the final,
        # possibly smaller, batch is served too; rounding down would silently
        # skip the trailing samples (and give zero batches for small inputs).
        return (len(self.sentence_pairs) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index `idx`; the slice is simply shorter for
        # the last batch when the data is not a multiple of `batch_size`.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        # `max_length` is the notebook-level constant, read at call time.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            self._rng.shuffle(self.indexes)


In [14]:
# Create the model under a distribution strategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model.
    bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
    # Freeze the BERT model to reuse the pretrained features without modifying them.
    bert_model.trainable = False

    sequence_output, pooled_output = bert_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
    # `tf.keras.layers.LSTM` picks the cuDNN kernel on GPU by itself with these
    # default arguments and still runs on CPU, unlike the GPU-only
    # `compat.v1` CuDNNLSTM layer.
    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )(sequence_output)
    # Applying hybrid pooling approach to bi_lstm sequence output.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])
    dropout = tf.keras.layers.Dropout(0.3)(concat)
    output = tf.keras.layers.Dense(2, activation="softmax")(dropout)
    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    # The head is a 2-way softmax trained on one-hot targets, so categorical
    # cross-entropy is the matching loss. For exactly two classes it yields
    # the same value as binary cross-entropy, but it stays correct if more
    # classes are added.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        metrics=["acc"],
    )


print(f"Strategy: {strategy}")
model.summary()

Strategy: <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7f6234389a10>
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 128)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel)   ((None, 128, 768), ( 109482240   input_ids[0][0]          

In [15]:
# Sentence pairs of each split as 2-column string arrays.
train_pairs = result[["sent0", "sent1"]].values.astype("str")
valid_pairs = result_dev[["sent0", "sent1"]].values.astype("str")

# Training batches are shuffled; validation batches keep the file order.
train_data = BertSemanticDataGenerator(
    train_pairs, y_train, batch_size=batch_size, shuffle=True
)
valid_data = BertSemanticDataGenerator(
    valid_pairs, y_dev, batch_size=batch_size, shuffle=False
)

In [16]:
# First training phase: `bert_model.trainable` was set to False when the model
# was built, so only the layers stacked on top of BERT are updated here.
# Batches come from the generators and are prepared by 20 worker processes.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=20,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
# Unfreeze the bert_model.
bert_model.trainable = True
# Recompile the model to make the change effective.
# A small learning rate (1e-5) is used now that the BERT weights are trainable.
# The output layer is a 2-way softmax fed with one-hot targets, so categorical
# cross-entropy is the matching loss (for two classes it gives the same value
# as binary cross-entropy, and it stays correct if classes are added).
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 128)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel)   ((None, 128, 768), ( 109482240   input_ids[0][0]                  
                                                                 attention_masks[0][0]      

In [18]:
# Second training phase: 5 epochs on the recompiled model with BERT unfrozen.
# NOTE(review): this rebinds `history`, so the history object of the first
# `fit` call is no longer reachable — keep a separate name if it is needed.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=5,
    use_multiprocessing=True,
    workers=25,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
# Serve the whole test split as one unshuffled batch so that every row is
# evaluated and predictions stay in file order. The batch size is taken from
# the frame instead of being hard-coded, so no row is dropped when the split
# does not contain exactly 1000 sentences.
# NOTE(review): a single batch keeps the original behaviour but needs enough
# accelerator memory for the full test split.
test_data = BertSemanticDataGenerator(
    result_test[["sent0", "sent1"]].values.astype("str"),
    y_test,
    batch_size=len(result_test),
    shuffle=False,
)
model.evaluate(test_data, verbose=1)



[0.4714723825454712, 0.8579999804496765]

In [25]:
# Class probabilities for the test generator (one row of two softmax outputs per sample).
predictions = model.predict(test_data)

In [26]:
# Sanity check: number of prediction rows produced for the test split.
print(len(predictions))

1000


In [27]:
# Split every prediction row into its two class probabilities,
# one Python list per output column.
pred0 = [row[0] for row in predictions]
pred1 = [row[1] for row in predictions]

In [35]:
# Inspect the probability pair predicted for the first test sample.
predictions[0]

array([9.9999917e-01, 8.0528844e-07], dtype=float32)

In [39]:
# First test row (sentences and gold `Class`) to compare against the prediction above.
result_test.iloc[0]

sent0     He loves to stroll at the park with his bed
sent1    He loves to stroll at the park with his dog.
Class                                               0
Name: 1175, dtype: object

In [64]:
# Collect both probability lists in one frame whose columns are named after the class.
# NOTE: `df` is rebound here; it no longer refers to the training sentences.
column_names = ["0", "1"]
df = pd.DataFrame({"0": pred0, "1": pred1}, columns=column_names)

In [65]:
# Preview the per-class probability columns.
df

Unnamed: 0,0,1
0,9.999992e-01,8.052884e-07
1,9.986173e-01,1.382704e-03
2,9.999846e-01,1.535250e-05
3,9.999962e-01,3.841644e-06
4,3.787434e-07,9.999996e-01
...,...,...
995,9.202700e-01,7.973003e-02
996,3.406400e-08,1.000000e+00
997,9.999784e-01,2.158790e-05
998,9.999689e-01,3.110559e-05


In [66]:
def func(r):
    """Pick the predicted class for one row of class scores.

    Args:
        r: Mapping-like row with the scores stored under the keys '0' and '1'.

    Returns:
        1 when the score under '1' is strictly greater than the one under '0',
        otherwise 0 (ties therefore resolve to class 0).
    """
    return 1 if r['1'] > r['0'] else 0

# Predicted class is 1 only where the class-1 probability is strictly larger
# than the class-0 probability (ties resolve to 0). The comparison runs on the
# whole columns at once instead of calling a Python function per row.
df['Class'] = (df['1'] > df['0']).astype(int)
# Keep only the label column. `columns=` is used because passing the axis
# positionally (`df.drop([...], 1)`) is rejected by pandas 2.0 and later.
df = df.drop(columns=['1', '0'])

In [67]:
# Preview the predicted `Class` column.
df

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,1
...,...
995,0
996,1
997,0
998,0


In [68]:
# Re-read the test file with a default integer index so that its `id` column
# lines up row by row with the predictions, then store it as column 'in'.
temp = pd.read_csv('../input/semeval/subtaskA_test_data.csv')
df = df.assign(**{'in': temp['id']})

In [69]:
# Submission layout: id first, predicted class second.
df = df.loc[:, ['in', 'Class']]

In [70]:
# Preview the final (id, Class) submission frame.
df

Unnamed: 0,in,Class
0,1175,0
1,452,0
2,275,0
3,869,0
4,50,1
...,...,...
995,1114,0
996,8,1
997,1945,0
998,1053,0


In [71]:
# Write the submission file: one "id,Class" line per test sentence, without
# the DataFrame index and without a header row.
df.to_csv('subtaskA_answers.csv', index = False, header = False)