In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn
from transformers import BertTokenizer

In [2]:
pip install transformers --quiet

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
cd Data

/home/Data


In [3]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,scenario,label
0,[POWER] After accomplishing every task I cross...,1
1,"[CONFORMITY] when i was a young boy, i told on...",-1
2,[BENEVOLENCE] getting mad at my mom for not le...,-1
3,[BENEVOLENCE] sleeping with my friend's roomat...,-1
4,[BENEVOLENCE] laughing about a boy's disgrace,-1


### The data in train combines all this individual csv data as well. So we good!

In [5]:
train_final = train.copy()

In [6]:
train_final['label'] = train_final['label'] + 1

In [7]:
train_final.tail()

Unnamed: 0,scenario,label
22803,[UNIVERSALISM] I want to intentionally have an...,1
22804,[UNIVERSALISM] I love being friends with talen...,1
22805,[UNIVERSALISM] I think I'm a racist,1
22806,[UNIVERSALISM] I keep deleting my old facebook...,1
22807,[UNIVERSALISM] I've been a subscriber to a pod...,1


In [8]:
train_final.describe()

Unnamed: 0,label
count,22808.0
mean,0.777841
std,0.653946
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,2.0


In [9]:
train['label'] = train['label'] + 1
train.head()

Unnamed: 0,scenario,label
0,[POWER] After accomplishing every task I cross...,2
1,"[CONFORMITY] when i was a young boy, i told on...",0
2,[BENEVOLENCE] getting mad at my mom for not le...,0
3,[BENEVOLENCE] sleeping with my friend's roomat...,0
4,[BENEVOLENCE] laughing about a boy's disgrace,0


In [10]:
train['label'].value_counts()

1    11929
0     7973
2     2906
Name: label, dtype: int64

In [11]:
seq_len = 350
num_samples = len(train)

X_ids = np.zeros((num_samples, seq_len))
X_mask = np.zeros((num_samples, seq_len))

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

for i, sent in enumerate(train['scenario']):
    tokens = tokenizer.encode_plus(sent, max_length = seq_len, truncation=True, padding='max_length',
                                   add_special_tokens=True, return_tensors='tf')
    X_ids[i, :] = tokens['input_ids']
    X_mask[i, :] = tokens['attention_mask']

2022-06-02 16:57:54.153923: I tensorflow/core/platform/cpu_feature_guard.cc:152] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-02 16:57:54.737912: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14807 MB memory:  -> device: 0, name: Quadro RTX 5000, pci bus id: 0000:1d:00.0, compute capability: 7.5


In [13]:
array = train['label'].values
array

array([2, 0, 0, ..., 1, 1, 1])

In [14]:
labels = np.zeros((num_samples, array.max()+1))

In [15]:
labels[np.arange(num_samples), array] = 1

In [16]:
# One Hot Encoding!
labels

array([[0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [17]:
dataset = tf.data.Dataset.from_tensor_slices((X_ids, X_mask, labels))

dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(350,), dtype=tf.float64, name=None), TensorSpec(shape=(350,), dtype=tf.float64, name=None), TensorSpec(shape=(3,), dtype=tf.float64, name=None))>

In [18]:
def map_function(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

In [19]:
dataset = dataset.map(map_function)

In [20]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(350,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(350,), dtype=tf.float64, name=None)}, TensorSpec(shape=(3,), dtype=tf.float64, name=None))>

In [21]:
batch_size = 8

dataset = dataset.shuffle(4000).batch(batch_size, drop_remainder=True)

dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(8, 350), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(8, 350), dtype=tf.float64, name=None)}, TensorSpec(shape=(8, 3), dtype=tf.float64, name=None))>

In [22]:
split = 0.9

size = int((num_samples/batch_size) * split)

In [23]:
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

In [24]:
from transformers import TFAutoModel

bert = TFAutoModel.from_pretrained('bert-base-cased')
bert.summary()

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [25]:
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

embeddings = bert.bert(input_ids, attention_mask=mask)[1]   #<- This[1] is to get the 3d tensors pooled into 2d

x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(3, activation='softmax', name='outputs')(x)

In [26]:
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

model.layers[2].trainable=True
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 350)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 350)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPooling(last_hidd               'attention_mask[0][0]']         
                                en_state=(None, 350                                               
                                , 768),                                                       

In [27]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
accuracy = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [28]:
model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])

In [29]:
history = model.fit(
                    train_ds,
                    validation_data=val_ds,
                    epochs=6
                    )

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [31]:
model.save('value_model_350')

2022-06-02 19:01:39.872960: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: value_model_350/assets


INFO:tensorflow:Assets written to: value_model_350/assets
