In [None]:
import sys
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): the CSV files loaded later in this notebook are read via relative
# paths, not from /content/drive — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torch torchvision transformers



In [None]:
!pip install emoji



In [None]:
# Load the train / dev / test splits from CSV files in the working directory.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'dev.csv', 'test.csv')
)

In [None]:
# Preview the training split (columns shown in the output: text, label, id, plus
# two "Unnamed" index columns that came along from the CSV).
train_df

Unnamed: 0.1,Unnamed: 0,text,label,id
0,0,4. Can eating garlic help prevent infection wi...,0,1250219300389974016
1,1,French police chief killed himself after #Char...,1,554886875303780352
2,2,Coronavirus disease (COVID-19) advice for the ...,0,1237901309011021825
3,3,Ottawa police confirm that there were multiple...,0,524958128392376320
4,4,if the primary focus of a government isn't to ...,0,1239295488677085185
...,...,...,...,...
1802,1802,Desperate Ted Cruz Claims Planned Parenthood S...,1,671181758692507648
1803,1803,"""Thoughts and prayers are not enough."" Pres. O...",1,672513234419638273
1804,1804,Police have surrounded this building where the...,0,553508098825261056
1805,1805,@Kirstenjoyweiss @MattFabrication @prestone85 ...,0,1249479605582327808


In [None]:
from transformers import BertTokenizer
import tensorflow as tf 
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel
from keras import backend as k_backend

# Shared tokenizer for the 'bert-base-uncased' checkpoint, with lower-casing
# enabled; bert_encode reads this module-level object.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
def bert_encode(data, max_seq_length):
    """Tokenize every text in ``data`` into fixed-length BERT inputs.

    Args:
        data: object exposing a ``text`` attribute that iterates over the raw
            strings (e.g. a pandas DataFrame with a ``text`` column).
        max_seq_length: length every sequence is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of NumPy arrays, one row per
        text, in the same order as ``data.text``.
    """
    input_ids = []
    attention_masks = []
    # Iterate over the values rather than `data.text[i]`: indexing a Series with
    # an integer is a *label* lookup, which raises KeyError (or silently picks a
    # different row) whenever the frame's index is not exactly 0..n-1, e.g. after
    # filtering or shuffling.
    for text in data.text:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_seq_length,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

In [None]:
def recall(y_true, y_pred):
    """Recall over the given tensors: true positives / (actual positives + epsilon).

    Products and labels are clipped to [0, 1] and rounded before being summed;
    the backend epsilon keeps the division defined when there are no positives.
    """
    eps = k_backend.epsilon()
    hit_mask = k_backend.round(k_backend.clip(y_true * y_pred, 0, 1))
    actual_mask = k_backend.round(k_backend.clip(y_true, 0, 1))
    return k_backend.sum(hit_mask) / (k_backend.sum(actual_mask) + eps)

def precision(y_true, y_pred):
    """Precision over the given tensors: true positives / (predicted positives + epsilon).

    Products and predictions are clipped to [0, 1] and rounded before being
    summed; the backend epsilon keeps the division defined when nothing is
    predicted positive.
    """
    eps = k_backend.epsilon()
    hit_mask = k_backend.round(k_backend.clip(y_true * y_pred, 0, 1))
    predicted_mask = k_backend.round(k_backend.clip(y_pred, 0, 1))
    return k_backend.sum(hit_mask) / (k_backend.sum(predicted_mask) + eps)
    
def f1(y_true, y_pred):
    """F1 score: harmonic mean of precision() and recall() on the same tensors.

    The backend epsilon in the denominator avoids dividing by zero when both
    precision and recall are zero.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    ratio = (p * r) / (p + r + k_backend.epsilon())
    return 2 * ratio

In [None]:
def create_model(bert_model, max_seq_length=300):
    """Build and compile a binary classifier on top of a BERT encoder.

    Args:
        bert_model: encoder called as ``bert_model([input_ids, attention_masks])``
            (e.g. a ``TFBertModel``); element 1 of its output feeds the head.
        max_seq_length: token length of both inputs. It must equal the length
            used when encoding the data with ``bert_encode``. Defaults to 300,
            the value that was previously hard-coded.

    Returns:
        A compiled ``tf.keras`` model taking ``[input_ids, attention_masks]`` and
        producing one sigmoid output per example, trained with binary
        cross-entropy and reporting accuracy, recall, precision and f1.
    """
    input_ids = tf.keras.Input(shape=(max_seq_length,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_seq_length,), dtype='int32')

    encoder_outputs = bert_model([input_ids, attention_masks])
    # Element 1 is used as the per-example feature vector — presumably the
    # pooled [CLS] representation; TODO confirm for the installed transformers version.
    features = encoder_outputs[1]

    # Small classification head: Dense(32) -> Dropout(0.3) -> sigmoid unit.
    hidden = tf.keras.layers.Dense(32, activation='relu')(features)
    hidden = tf.keras.layers.Dropout(0.3)(hidden)
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=probability)
    model.compile(
        # `lr` is a deprecated alias that newer Keras releases reject;
        # `learning_rate` is the supported argument name.
        optimizer=Adam(learning_rate=6e-6),
        loss='binary_crossentropy',
        metrics=['accuracy', recall, precision, f1],
    )
    return model

In [None]:
# Load the pretrained 'bert-base-uncased' encoder and wrap it in the classifier
# built by create_model.
TFbert_model = TFBertModel.from_pretrained('bert-base-uncased')
model = create_model(TFbert_model)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Encode each split to length-300 id / attention-mask arrays (300 matches the
# input length the model is built with).
(
    (train_input_ids, train_attention_masks),
    (dev_input_ids, dev_attention_masks),
    (test_input_ids, test_attention_masks),
) = (bert_encode(split_df, 300) for split_df in (train_df, dev_df, test_df))

# (inputs, labels) pair handed to model.fit as validation data.
dev_data = ([dev_input_ids, dev_attention_masks], dev_df.label)



In [None]:
# Early stopping must watch the *validation* loss: the training loss keeps
# decreasing while the model overfits, so monitoring 'loss' almost never
# triggers and gives no protection. restore_best_weights asks Keras to roll
# back to the best epoch's weights when training is stopped early.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
history = model.fit(
    [train_input_ids, train_attention_masks],
    train_df.label,
    validation_data=dev_data,
    callbacks=[callback],
    epochs=8,
    batch_size=4,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Persist the trained model to the ".TFbert_final" directory under the current
# working directory.
# NOTE(review): the leading dot makes this a hidden directory — possibly a typo
# for "./TFbert_final". Confirm before changing; anything that loads the model
# depends on this exact path.
model.save(".TFbert_final")



INFO:tensorflow:Assets written to: .TFbert_final/assets


INFO:tensorflow:Assets written to: .TFbert_final/assets


In [None]:
# Predict on the dev split and round the model outputs to integer 0/1 labels.
pre_dev_labels = np.round(
    model.predict([dev_input_ids, dev_attention_masks])
).astype(int)

In [None]:
# Dev-set accuracy of the rounded predictions against the gold labels.
import sklearn.metrics as metrics
print(metrics.accuracy_score(dev_df.label, pre_dev_labels))

0.9378151260504202


In [None]:
# Dev-set metrics on the hard 0/1 predictions. Precision and recall use
# scikit-learn's default binary averaging (positive class), while F1 is
# macro-averaged over both classes.
print(metrics.precision_score(dev_df.label, pre_dev_labels))
print(metrics.recall_score(dev_df.label, pre_dev_labels))
print(metrics.f1_score(dev_df.label, pre_dev_labels, average='macro'))

# ROC AUC is a ranking metric and must be computed from the continuous scores.
# Feeding it the rounded labels collapses the curve to a single operating point
# and under-reports the model. pre_dev_labels has already been rounded, so the
# raw scores are predicted again here.
dev_scores = model.predict([dev_input_ids, dev_attention_masks]).ravel()
print(metrics.roc_auc_score(dev_df.label, dev_scores))

0.8623188405797102
0.8686131386861314
0.9125086934922999
0.913564211264463


In [None]:
# For Kaggle: predict on the test split and round the outputs to integer 0/1 labels.
result = np.round(
    model.predict([test_input_ids, test_attention_masks])
).astype(int)

In [None]:
# Write the submission file: a header line, then one "<row number>,<label>" line
# per prediction, numbered from 0 in row-major order.
with open('test.predictions_TF.txt', 'w') as output:
  output.write('Id,Predicted\n')
  output.writelines(
    str(row_id) + ',' + str(label) + '\n'
    for row_id, label in enumerate(np.ravel(result))
  )


In [None]:
# For Task 2: load the COVID dataset from 'covid.csv' in the working directory;
# the following cells encode it and label it with the trained model.
covid_df = pd.read_csv('covid.csv')

In [None]:
# Encode to 300 tokens, the sequence length the model was built and trained with
# (create_model declares Input(shape=(300,))). Encoding to 512 feeds the model an
# input shape it was not constructed for, which Keras reports as an incompatible
# input shape (a warning on older releases, a ValueError on newer ones).
covid_input_ids,covid_attention_masks = bert_encode(covid_df,300)



In [None]:
# Predict on the COVID data and round the outputs to integer 0/1 labels.
result_covid = np.round(
    model.predict([covid_input_ids, covid_attention_masks])
).astype(int)

In [None]:
# Write the COVID predictions: a header line, then one "<row number>,<label>" line
# per prediction, numbered from 0 in row-major order.
with open('covid.predictions.txt', 'w') as output:
  output.write('Id,Predicted\n')
  output.writelines(
    str(row_id) + ',' + str(label) + '\n'
    for row_id, label in enumerate(np.ravel(result_covid))
  )

In [None]:
#covid_labels = np.argmax(result_covid, axis = -1)
#covid_output = pd.DataFrame({'id':covid_df.id,'target':covid_labels})
#covid_output.to_csv("covid_labels.csv",index=False,sep=',')