In [None]:
import sys
import pandas as pd
import numpy as np

In [None]:
!pip install torch torchvision transformers



In [None]:
!pip install emoji



In [None]:
# Read the three dataset splits from CSV files in the working directory.
train_df, dev_df, test_df = map(
    pd.read_csv,
    ('train_sr.csv', 'dev_sr.csv', 'test_sr.csv'),
)

In [None]:
# Preview the training split.
# NOTE(review): in the rendered output consecutive rows share the same `text`
# and `index` value — presumably one source item expanded into several rows;
# verify against the data-preparation step.
train_df

Unnamed: 0.1,Unnamed: 0,text,label,index
0,0,5. Can regularly rinsing your nose with saline...,0,0
1,1,5. Can regularly rinsing your nose with saline...,0,0
2,2,5. Can regularly rinsing your nose with saline...,0,0
3,3,5. Can regularly rinsing your nose with saline...,0,0
4,4,5. Can regularly rinsing your nose with saline...,0,0
...,...,...,...,...
22680,22680,@lynneSimpkin I can help! 👩‍🏫\n9am: Socialism ...,0,1894
22681,22681,@lynneSimpkin I can help! 👩‍🏫\n9am: Socialism ...,0,1894
22682,22682,@lynneSimpkin I can help! 👩‍🏫\n9am: Socialism ...,0,1894
22683,22683,@lynneSimpkin I can help! 👩‍🏫\n9am: Socialism ...,0,1894


In [None]:
from transformers import BertTokenizer
import tensorflow as tf 
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel
from keras import backend as k_backend

# Load the pretrained `bert-base-uncased` tokenizer with lower-casing enabled.
# `bert_encode` reads this module-level name, so this cell must run first.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
def bert_encode(data, maximum_sequence_length, text_tokenizer=None):
    """Tokenize every entry of ``data.text`` into fixed-length BERT inputs.

    Args:
        data: Object exposing a ``text`` attribute that can be iterated to
            yield strings (the notebook passes pandas DataFrames).
        maximum_sequence_length: Length every sequence is padded or truncated to.
        text_tokenizer: Optional object providing ``encode_plus``. When omitted,
            the module-level ``tokenizer`` is used.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of NumPy arrays holding one
        row per input text.
    """
    active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
    input_ids = []
    attention_masks = []
    # Iterate over the values instead of ``data.text[i]``: a label-based lookup
    # raises KeyError as soon as the frame's index is not exactly 0..n-1
    # (for example after filtering or shuffling without reset_index).
    for text in data.text:
        encoded = active_tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=maximum_sequence_length,
            # Replaces the deprecated ``pad_to_max_length=True``.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

In [None]:
def recall(y_true, y_pred):
    """Batch-wise recall metric built from Keras backend ops.

    Both the product ``y_true * y_pred`` and ``y_true`` are clipped to [0, 1]
    and rounded before being summed; the backend epsilon keeps the division
    defined when the batch contains no positives.
    """
    hit_mask = k_backend.round(k_backend.clip(y_true * y_pred, 0, 1))
    positive_mask = k_backend.round(k_backend.clip(y_true, 0, 1))
    hit_count = k_backend.sum(hit_mask)
    positive_count = k_backend.sum(positive_mask)
    return hit_count / (positive_count + k_backend.epsilon())

def precision(y_true, y_pred):
    """Batch-wise precision metric built from Keras backend ops.

    Both the product ``y_true * y_pred`` and ``y_pred`` are clipped to [0, 1]
    and rounded before being summed; the backend epsilon keeps the division
    defined when nothing in the batch is predicted positive.
    """
    hit_mask = k_backend.round(k_backend.clip(y_true * y_pred, 0, 1))
    flagged_mask = k_backend.round(k_backend.clip(y_pred, 0, 1))
    hit_count = k_backend.sum(hit_mask)
    flagged_count = k_backend.sum(flagged_mask)
    return hit_count / (flagged_count + k_backend.epsilon())

def f1(y_true, y_pred):
    """Batch-wise F1 metric: harmonic mean of ``precision`` and ``recall``.

    Args:
        y_true: Ground-truth labels tensor.
        y_pred: Predicted scores tensor.

    Returns:
        ``2 * p * r / (p + r + epsilon)`` where ``p`` and ``r`` come from the
        ``precision`` and ``recall`` functions defined in this notebook.
    """
    precision_value = precision(y_true, y_pred)
    recall_value = recall(y_true, y_pred)
    # The Keras backend is imported as `k_backend`; the former `K.epsilon()`
    # referenced an undefined name and raised NameError when the metric ran.
    return 2 * ((precision_value * recall_value) / (precision_value + recall_value + k_backend.epsilon()))

In [None]:
def create_model(bert_model, max_length=300, learning_rate=6e-6):
    """Build and compile a binary classifier on top of a BERT encoder.

    Args:
        bert_model: Encoder that is called with ``[input_ids, attention_masks]``
            and whose output is indexable (the notebook passes a ``TFBertModel``).
        max_length: Token length of both inputs. It must equal the sequence
            length used when encoding the data. Defaults to 300.
        learning_rate: Learning rate for the Adam optimizer. Defaults to 6e-6.

    Returns:
        A compiled Keras model with a single sigmoid output, trained with binary
        cross-entropy and reporting accuracy, recall, precision and f1.
    """
    input_ids = tf.keras.Input(shape=(max_length,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_length,), dtype='int32')
    output = bert_model([input_ids, attention_masks])
    # Element 1 of the encoder output feeds the classification head.
    # NOTE(review): for TFBertModel this is presumably the pooled output —
    # confirm against the installed transformers version.
    output = output[1]
    output = tf.keras.layers.Dense(32, activation='relu')(output)
    output = tf.keras.layers.Dropout(0.3)(output)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(output)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # optimizers no longer accept.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy', recall, precision, f1],
    )
    return model

In [None]:
# Load the pretrained `bert-base-uncased` encoder and wrap it with the
# classification head defined in create_model.
TFbert_model = TFBertModel.from_pretrained('bert-base-uncased')
model = create_model(TFbert_model)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Encode every split at the same token length; this value has to match the
# input shape that create_model declares for the network.
ENCODE_LENGTH = 300
train_input_ids, train_attention_masks = bert_encode(train_df, ENCODE_LENGTH)
dev_input_ids, dev_attention_masks = bert_encode(dev_df, ENCODE_LENGTH)
test_input_ids, test_attention_masks = bert_encode(test_df, ENCODE_LENGTH)

# (inputs, labels) pair in the form model.fit expects for validation_data.
dev_inputs = [dev_input_ids, dev_attention_masks]
dev_data = (dev_inputs, dev_df.label)



In [None]:
# Early stopping watches the dev-set loss: `validation_data` is passed to fit,
# so `val_loss` is available, whereas the training `loss` keeps decreasing while
# the model overfits and would practically never trigger the callback.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(
    [train_input_ids, train_attention_masks],
    train_df.label,
    validation_data=dev_data,
    callbacks=[callback],
    epochs=7,
    batch_size=16,
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [None]:
# Colab-only step: mount Google Drive at /content/drive so the trained model
# can be saved there by the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist the trained model to the mounted Drive folder (requires the mount above).
model.save("/content/drive/MyDrive/Colab/tfmodel_sr1")



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab/tfmodel_sr1/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab/tfmodel_sr1/assets


In [None]:
# Also save a copy under a path relative to the current working directory.
model.save("./pure_bert/pure_bert_v11")



INFO:tensorflow:Assets written to: ./pure_bert/pure_bert_v11/assets


INFO:tensorflow:Assets written to: ./pure_bert/pure_bert_v11/assets


In [None]:
# Row-level scores for the dev split; the model ends in a single sigmoid unit,
# so each encoded row yields one score.
result_dev = model.predict([dev_input_ids, dev_attention_masks])

In [None]:
# Collapse row-level predictions into one prediction per `index` group:
# scores and labels are averaged over all rows sharing the same `index` value,
# and groups are emitted in ascending `index` order.
# The model's output has a trailing axis of size 1, so flatten it to plain
# scalars first; otherwise every aggregated score stays a 1-element array.
pred_scores = np.ravel(result_dev)
index_list = list(dev_df['index'])
label_all = list(dev_df['label'])

dev_by_index = (
    pd.DataFrame({'group': index_list, 'score': pred_scores, 'label': label_all})
    .groupby('group', sort=True)[['score', 'label']]
    .mean()
)

proba_list = dev_by_index['score'].tolist()
# Truncate the mean label to an int, as the original per-group int() cast did.
label_list = dev_by_index['label'].astype(int).tolist()
# A group is predicted positive when its mean score exceeds 0.5.
pred_labels = (dev_by_index['score'] > 0.5).astype(int).tolist()

print(len(proba_list))
print(len(label_list))

537
537


In [None]:
import sklearn.metrics as metrics

# Group-level dev metrics: the label-based scores compare the thresholded
# predictions, while ROC AUC is computed from the averaged scores.
for label_metric in (metrics.accuracy_score, metrics.precision_score, metrics.recall_score):
    print(label_metric(label_list, pred_labels))
print(metrics.roc_auc_score(label_list, proba_list))

0.9757914338919925
0.9636363636363636
0.9217391304347826
0.9930970533690502


In [None]:
# Row-level macro F1 on the dev split.
# `pre_dev_labels` was never defined in this notebook (NameError on a fresh
# kernel); derive it from the row-level scores with the same 0.5 threshold
# used for the aggregated predictions.
pre_dev_labels = (np.ravel(result_dev) > 0.5).astype(int)
print(metrics.f1_score(dev_df.label, pre_dev_labels, average='macro'))

In [None]:
# Row-level scores for the test split (one sigmoid output per encoded row).
result_test = model.predict([test_input_ids, test_attention_masks])

In [None]:
# Collapse row-level test scores into one 0/1 prediction per `index` group,
# emitted in ascending `index` order.
# The model's output has a trailing axis of size 1, so flatten it to plain
# scalars before averaging instead of accumulating 1-element arrays.
pred_scores = np.ravel(result_test)
index_list = list(test_df['index'])

test_by_index = (
    pd.DataFrame({'group': index_list, 'score': pred_scores})
    .groupby('group', sort=True)['score']
    .mean()
)

# A group is predicted positive when its mean score exceeds 0.5.
pred_labels = (test_by_index > 0.5).astype(int).tolist()
  

In [None]:
# Write the submission file: a header line followed by one "<row number>,<label>"
# line per aggregated prediction, numbered from 0 in list order.
with open('test.predictions_tf1.txt', 'w') as output:
    output.write('Id,Predicted\n')
    output.writelines(
        str(row_id) + ',' + str(label) + '\n'
        for row_id, label in enumerate(pred_labels)
    )

In [None]:
# result = model.predict([test_input_ids, test_attention_masks])
# result = np.round(result).astype(int)

In [None]:
# with open('test.predictions.txt', 'w') as output:
#   output.write('Id,Predicted\n')
#   counter = 0
#   for elem in list(result):
#     for elem2 in list(elem):
#       output.write(str(counter) + ',' + str(elem2)+'\n')
#       counter+=1

In [None]:
# Task 2: load the COVID dataset that is scored below with the trained model.
covid_df = pd.read_csv('covid.csv')

In [None]:
# Encode at 300 tokens: create_model declares both inputs with shape (300,)
# and the model was trained at that length, so the previous 512-token encoding
# did not match the network's input shape.
covid_input_ids, covid_attention_masks = bert_encode(covid_df, 300)



In [None]:
# Score the COVID rows and turn each sigmoid output into a 0/1 integer by
# rounding. Unlike the dev/test cells, no per-`index` averaging is applied here.
result_covid = model.predict([covid_input_ids, covid_attention_masks])
result_covid = np.round(result_covid).astype(int)

In [None]:
# Write the COVID predictions: a header line followed by one
# "<row number>,<label>" line per value of result_covid, walked row by row
# and numbered from 0.
with open('covid.predictions.txt', 'w') as output:
    output.write('Id,Predicted\n')
    flattened = (value for row in result_covid for value in row)
    for row_id, label in enumerate(flattened):
        output.write(str(row_id) + ',' + str(label) + '\n')

In [None]:
#covid_labels = np.argmax(result_covid, axis = -1)
#covid_output = pd.DataFrame({'id':covid_df.id,'target':covid_labels})
#covid_output.to_csv("covid_labels.csv",index=False,sep=',')