## **Install the necessary package**

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 2.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 15.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 28.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

## **Import the required libraries**

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
import pandas as pd
import os
import shutil

In [None]:
# Open Colab's upload dialog; in the recorded run data.npz and test.npz were uploaded.
from google.colab import files
uploaded = files.upload()

Saving data.npz to data.npz
Saving test.npz to test.npz


### **We create a training dataset and a validation dataset with an 80/20 split from the uploaded `data.npz` (our "aclImdb/train" data).**

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
# Open the uploaded archive: key 'a' feeds the text column and key 'b' the label column below.
ss_data = np.load('data.npz')
# Split texts and labels in a single call so every text stays paired with its
# label by construction, instead of relying on two calls sharing a seed.
train_feat, val_feat, train_lab, val_lab = train_test_split(
    ss_data['a'], ss_data['b'], test_size=0.2, random_state=42, shuffle=True
)

In [None]:
# Cast via float first, then int — presumably the stored labels are not plain
# integers; TODO confirm.
labelcol = train_lab.astype('float')
train_lab = labelcol.astype('int')

# Build the frame column by column so each column keeps its own dtype
# (transposing a list of the two arrays leaves both columns as generic objects).
train_data = pd.DataFrame({'DATA_COLUMN': train_feat, 'LABEL_COLUMN': train_lab})
train_data.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,I rented this movie for a few laughs. I had ne...,1
1,"Besides being boring, the scenes were oppressi...",1
2,This definitely is NOT the intellectual film w...,0
3,If you are going to attempt building tension i...,1
4,"Fox's ""The True Story Of Jesse James"" (1957) i...",1


In [None]:
# Cast via float first, then int — presumably the stored labels are not plain
# integers; TODO confirm.
labelcol = val_lab.astype('float')
val_lab = labelcol.astype('int')

# Build the frame column by column so each column keeps its own dtype
# (transposing a list of the two arrays leaves both columns as generic objects).
val_data = pd.DataFrame({'DATA_COLUMN': val_feat, 'LABEL_COLUMN': val_lab})
val_data.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,I am amazed that movies like this can still be...,0
1,"""Mad Dog Time""...""Trigger Happy"" whatever you ...",1
2,We tend to forget that the master/slave contex...,0
3,I read some previous comments stating that thi...,0
4,"THE RUNNING MAN, along with TOTAL RECALL, is m...",0


In [None]:
# Open the uploaded test archive (its arrays are read under the keys 'a' and 'b').
test_data = np.load('test.npz')

In [None]:
# Read from a freshly opened archive under its own name: this cell rebinds
# `test_data` to a DataFrame below, so indexing `test_data['a']` would fail
# if the cell were executed a second time.
test_npz = np.load('test.npz')
test_feat = test_npz['a']
# Cast via float first, then int — presumably the stored labels are not plain
# integers; TODO confirm.
labelcol = test_npz['b'].astype('float')
test_lab = labelcol.astype('int')

# Build the frame column by column so each column keeps its own dtype
# (transposing a list of the two arrays leaves both columns as generic objects).
test_data = pd.DataFrame({'DATA_COLUMN': test_feat, 'LABEL_COLUMN': test_lab})
test_data.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,"I found this movie really hard to sit through,...",1
1,The movie starts off with Reeve (Ekin) and his...,1
2,I had a VERY hard time sitting through this fi...,1
3,"I'm not a big fan of musicals, but I was alway...",0
4,I honestly fail to understand why people love ...,1


## **We create functions to convert the training and validation data into the formats BERT and TensorFlow expect**

In [None]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
  """Wrap every row of two DataFrames in a transformers ``InputExample``.

  Args:
    train: DataFrame of training rows.
    test: DataFrame of evaluation rows (this notebook passes its validation split).
    DATA_COLUMN: Name of the column holding the text.
    LABEL_COLUMN: Name of the column holding the label.

  Returns:
    A pair ``(train_examples, test_examples)`` of pandas Series holding one
    ``InputExample`` per input row, in row order.
  """
  def to_example(row):
    # Single-sentence input: only text_a is populated and no guid is assigned.
    return InputExample(guid=None,
                        text_a=row[DATA_COLUMN],
                        text_b=None,
                        label=row[LABEL_COLUMN])

  # Convert the frames passed in as arguments rather than the notebook-level
  # train_data / val_data, so the result always reflects the caller's input.
  train_InputExamples = train.apply(to_example, axis=1)
  validation_InputExamples = test.apply(to_example, axis=1)

  return train_InputExamples, validation_InputExamples
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as a ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects with ``text_a`` and ``label`` attributes
            (e.g. ``InputExample``). All of them are tokenized eagerly.
        tokenizer: Tokenizer object providing ``encode_plus``.
        max_length: Every sequence is truncated or padded to this many tokens.

    Returns:
        An unbatched dataset yielding ``(inputs, label)`` pairs, where ``inputs``
        is a dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``
        (declared as int32) and ``label`` is declared as int64.
    """
    features = []  # InputFeatures, one per example, replayed by the generator below

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # upper bound used for both truncation and padding
            truncation=True,
            # Replaces the deprecated `pad_to_max_length=True`: pad every
            # sequence up to `max_length`.
            padding="max_length",
            return_token_type_ids=True,
            return_attention_mask=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Replay the pre-computed features in the (inputs, label) layout declared below.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


In [None]:
# Build one InputExample by hand to show the fields used for a single-text example.
InputExample(guid=None,
             text_a = "Hello, world",
             text_b = None,
             label = 1)

InputExample(guid=None, text_a='Hello, world', text_b=None, label=1)

In [None]:
# Column names used by the train/validation/test DataFrames in this notebook.
DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

# Produce one InputExample per row for the training and validation splits.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train_data, 
                                                                           val_data, 
                                                                           DATA_COLUMN, 
                                                                           LABEL_COLUMN)

## **Create the BERT model**

In [None]:
# One checkpoint name for both objects, so the tokenizer can never drift from the model.
MODEL_NAME = "google/bert_uncased_L-2_H-128_A-2"

# from_pt=True loads the checkpoint's PyTorch weights into the TensorFlow model class.
model = TFBertForSequenceClassification.from_pretrained(MODEL_NAME, from_pt=True)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/382 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
# Print the model's layers and parameter counts.
model.summary()

Model: "tf_bert_for_sequence_classification_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  4385920   
                                                                 
 dropout_61 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  258       
                                                                 
Total params: 4,386,178
Trainable params: 4,386,178
Non-trainable params: 0
_________________________________________________________________


## **Now we create the training and validation data using the functions**

In [None]:
# Wrap the training and validation rows as InputExamples.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train_data, val_data, DATA_COLUMN, LABEL_COLUMN)

# Tokenize the training examples, then shuffle through a 100-element buffer and batch by 32.
# NOTE(review): repeat(2) emits the whole batched dataset twice, so one Keras epoch
# over `training_data` passes over the training examples two times.
training_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
training_data = training_data.shuffle(100).batch(32).repeat(2)

# Validation examples are only batched: no shuffling and no repetition.
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)



## **Compile and train the model**

In [None]:
# Adam with gradient-norm clipping at 1.0.
# from_logits=True makes the loss apply softmax itself — assumes the model emits
# raw logits (the prediction cell also softmaxes the outputs); TODO confirm.
# Labels are integer class ids, hence the "sparse" loss and accuracy metric.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

# Train for one Keras epoch, evaluating on the validation dataset at the end of it.
model.fit(training_data, epochs=1, validation_data=validation_data)



<keras.callbacks.History at 0x7fcca2ef41d0>

## **Test the model prediction**

In [None]:
# NOTE(review): `labels` is not used below. The sample rows displayed earlier in this
# notebook suggest label 1 may mark negative reviews — verify the mapping before using it.
labels = ['Negative','Positive']

# Number of texts sent through the model per forward pass.
PRED_BATCH_SIZE = 50

test_texts = [str(text) for text in test_data['DATA_COLUMN'].tolist()]

batch_predictions = []
# Walk the texts in fixed-size slices. Slicing copes with a short final batch,
# so the test set size does not have to be exactly 500 batches of 50.
for start in range(0, len(test_texts), PRED_BATCH_SIZE):
  tf_batch = tokenizer(test_texts[start:start + PRED_BATCH_SIZE], max_length=128,
                       padding=True, truncation=True, return_tensors='tf')
  tf_outputs = model(tf_batch)
  # Softmax keeps the ordering of the scores, so argmax over the softmaxed
  # outputs selects the predicted class id for each text.
  tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
  batch_predictions.append(tf.argmax(tf_predictions, axis=1).numpy())

# One flat array of predicted class ids in test-set order; concatenating once at
# the end avoids re-copying the growing array on every iteration.
arr1 = np.concatenate(batch_predictions) if batch_predictions else np.array([], dtype=np.int64)

[1 1 1 ... 1 1 0]


# **Compute Test Accuracy, Precision, Recall and F1 Score**

# **Get the Confusion Matrix (TP, FP, TN, FN)**

# **Accuracy = (TP + TN)/(TP + FP + TN + FN)**
# **Precision = TP/(TP + FP)**
# **Recall = TP/(TP + FN)**
# **F1_Score = 2 * (Precision*Recall)/(Precision+Recall) = TP/(TP + (FP+FN)/2)** 

## **You can use the sklearn package to compute the above**

In [None]:
from sklearn import metrics

In [None]:
import numpy as np
# Ground-truth class ids as an integer array, in the same row order as the
# predictions collected in arr1.
actual_labels = test_data['LABEL_COLUMN'].astype(int).to_numpy()
predicted_labels = arr1

print(metrics.confusion_matrix(actual_labels, predicted_labels))

# Each entry pairs the printed caption with the sklearn scorer producing its value.
for caption, scorer in (
    ("Test Accuracy: ", metrics.accuracy_score),
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("F1_Score: ", metrics.f1_score),
):
    print(caption, scorer(actual_labels, predicted_labels))

[[10497  2003]
 [ 2907  9593]]
Test Accuracy:  0.8036
Precision:  0.8272680234563643
Recall:  0.76744
F1_Score:  0.7962317397078353
