# Imports

In [7]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import sklearn
from tqdm import tqdm

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

  from .autonotebook import tqdm as notebook_tqdm


# Load and Preprocess Data

In [6]:
# Load the IMDB reviews CSV from the notebook's working directory.
df=pd.read_csv("./IMDB Dataset.csv")
# Display one randomly drawn row as a sanity check (unseeded, so the row differs between runs).
df.sample()

Unnamed: 0,review,sentiment
24068,If you are having trouble sleeping or just wan...,negative


In [8]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint
# (the same checkpoint name the model cell loads).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading config.json: 100%|██████████| 570/570 [00:00<00:00, 304kB/s]
Downloading tf_model.h5: 100%|██████████| 511M/511M [00:49<00:00, 10.8MB/s] 
2022-09-04 20:34:36.760111: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-04 20:34:36.764804: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on 

In [15]:
def cat2num(value):
    """Encode a sentiment label as an integer class id.

    Args:
        value: A raw label string ('positive' or anything else), or a label
            that has already been encoded as 0/1.

    Returns:
        1 for 'positive' (or an already-encoded 1), otherwise 0.
    """
    if isinstance(value, str):
        return 1 if value == 'positive' else 0
    # Already-encoded labels pass through unchanged, so re-running the cell
    # that applies this function does not silently relabel every row as 0.
    return 1 if value == 1 else 0
    
# Replace the string sentiment labels with integer class ids.
df['sentiment'] = df['sentiment'].apply(cat2num)

# Positional split: rows [0, 45000) -> train, [45000, 49000) -> test, remainder -> validation.
train, test, val = df.iloc[:45000], df.iloc[45000:49000], df.iloc[49000:]

# Data Processing

In [17]:
def _frame_to_input_examples(frame, review, sentiment):
    """Build one InputExample per row of `frame` from its text and label columns."""
    return frame.apply(
        lambda row: InputExample(guid=None, text_a=row[review], label=row[sentiment]),
        axis=1,
    )


def convert_data_to_examples(train, test, val, review, sentiment):
    """Wrap every row of the three data splits in a transformers InputExample.

    Args:
        train: DataFrame holding the training split.
        test: DataFrame holding the test split.
        val: DataFrame holding the validation split.
        review: Name of the column containing the review text.
        sentiment: Name of the column containing the label.

    Returns:
        A 3-tuple (train, test, validation) where each element is the result
        of a row-wise ``DataFrame.apply`` producing one InputExample per row
        (``guid`` is always None, ``text_a`` is the review, ``label`` the sentiment).
    """
    # The three splits are processed identically, so share one helper instead
    # of repeating the same apply/lambda three times.
    train_InputExamples, test_InputExamples, validation_InputExamples = (
        _frame_to_input_examples(split, review, sentiment)
        for split in (train, test, val)
    )
    return train_InputExamples, test_InputExamples, validation_InputExamples

# Build the InputExample collections for all three splits in one call.
(
    train_InputExamples,
    test_InputExamples,
    validation_InputExamples,
) = convert_data_to_examples(train, test, val, review='review', sentiment='sentiment')

In [11]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize InputExamples and expose them as a ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects with ``text_a`` (the text to encode)
            and ``label`` attributes.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        An unbatched ``tf.data.Dataset`` yielding ``(inputs, label)`` pairs,
        where ``inputs`` is a dict with int32 ``input_ids``,
        ``attention_mask`` and ``token_type_ids`` vectors and ``label`` is an
        int64 scalar.
    """
    features = []

    for e in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            # `pad_to_max_length=True` is deprecated; this is the supported
            # way to pad every sequence up to `max_length`.
            padding="max_length",
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Replays the pre-tokenized features each time the dataset is iterated.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


# Names of the text and label columns in the reviews DataFrame.
# NOTE(review): neither constant is referenced in the visible notebook cells — the
# convert_data_to_examples call passes the literal strings instead; confirm before removing.
DATA_COLUMN = 'review'
LABEL_COLUMN = 'sentiment'

In [12]:
# Tokenize the training examples and build the shuffled, batched input pipeline.
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
# No .repeat() here: the number of passes over the data is controlled solely by
# `epochs=` in model.fit. The previous `.repeat(2)` made every Keras epoch
# iterate the training set twice, doubling the effective epoch count and the
# measured fine-tuning time.
train_data = train_data.shuffle(100).batch(32)

100%|██████████| 45000/45000 [03:16<00:00, 228.43it/s]


In [18]:
# Tokenize the test examples and batch them (no shuffling).
test_data = convert_examples_to_tf_dataset(
    list(test_InputExamples),
    tokenizer,
).batch(32)

100%|██████████| 4000/4000 [00:18<00:00, 221.81it/s]


In [19]:
# Tokenize the validation examples and batch them (no shuffling).
validation_data = convert_examples_to_tf_dataset(
    list(validation_InputExamples),
    tokenizer,
).batch(32)

100%|██████████| 1000/1000 [00:04<00:00, 213.97it/s]


# Model

In this example we measure how long fine-tuning takes over `EPOCHS` epochs.

In [1]:
# Number of epochs passed to model.fit when fine-tuning.
EPOCHS = 2

In [23]:
# Load the pretrained "bert-base-uncased" weights into a sequence-classification model.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

# Training configuration: Adam with gradient-norm clipping, a sparse
# cross-entropy loss computed on raw logits, and accuracy as the tracked metric.
bert_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
bert_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
bert_accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=bert_optimizer, loss=bert_loss, metrics=[bert_accuracy])

# Fine-tune for EPOCHS epochs, evaluating on the validation split after each epoch.
model.fit(train_data, epochs=EPOCHS, validation_data=validation_data)

Epoch 1/2


: 

: 