## I. Environment Setup

In [1]:
# !pip install datasets
# !pip install transformers
# !pip install pycaret
# !pip install tensorflow
# !pip install sklearn

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pycaret
import transformers
from transformers import BertTokenizer
from transformers import TFBertModel
from transformers import TFDistilBertModel, DistilBertTokenizerFast
from tqdm.auto import tqdm

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
import torch.nn as nn

## II. Load DataSet


In [3]:
from datasets import load_dataset

dataset = load_dataset("liar")

train_set = pd.DataFrame(dataset['train'])
test_set = pd.DataFrame(dataset['test'])
validation_set = pd.DataFrame(dataset['validation'])

combined_data = pd.concat([train_set, test_set, validation_set], ignore_index=True)
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

print("total samples", combined_data.shape)
# View the first entry in the training set
print(train_set.iloc[0])


total samples (12836, 14)
id                                                              2635.json
label                                                                   0
statement               Says the Annies List political group supports ...
subject                                                          abortion
speaker                                                      dwayne-bohac
job_title                                            State representative
state_info                                                          Texas
party_affiliation                                              republican
barely_true_counts                                                    0.0
false_counts                                                          1.0
half_true_counts                                                      0.0
mostly_true_counts                                                    0.0
pants_on_fire_counts                                                  0.0
context     

In [4]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12836 entries, 0 to 12835
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    12836 non-null  object 
 1   label                 12836 non-null  int64  
 2   statement             12836 non-null  object 
 3   subject               12836 non-null  object 
 4   speaker               12836 non-null  object 
 5   job_title             12836 non-null  object 
 6   state_info            12836 non-null  object 
 7   party_affiliation     12836 non-null  object 
 8   barely_true_counts    12836 non-null  float64
 9   false_counts          12836 non-null  float64
 10  half_true_counts      12836 non-null  float64
 11  mostly_true_counts    12836 non-null  float64
 12  pants_on_fire_counts  12836 non-null  float64
 13  context               12836 non-null  object 
dtypes: float64(5), int64(1), object(8)
memory usage: 1.4+ MB


## III. Preprocess and Tokenize Data

In [5]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

token = tokenizer.encode_plus(
    train_set['statement'].iloc[0],
    max_length=64,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors='tf'
)

In [6]:
X_input_ids = np.zeros((len(combined_data), 64))
X_attn_masks = np.zeros((len(combined_data), 64))
X_input_ids.shape

(12836, 64)

In [7]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['statement'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=64,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [8]:
X_input_ids, X_attn_masks = generate_training_data(combined_data, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [9]:
number_of_unique_labels = len(combined_data['label'].unique())

labels = np.zeros((len(combined_data), number_of_unique_labels))
labels[np.arange(len(combined_data)), combined_data['label'].values] = 1 # one-hot encoded target tensor

labels

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [10]:
# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1)

<_TakeDataset element_spec=(TensorSpec(shape=(64,), dtype=tf.float64, name=None), TensorSpec(shape=(64,), dtype=tf.float64, name=None), TensorSpec(shape=(6,), dtype=tf.float64, name=None))>

In [11]:
def LabelDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [12]:
dataset = dataset.map(LabelDatasetMapFunction)

In [13]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True)

total_batches = len(dataset)
print(total_batches)

# Calculate the size of each subset
train_batches = int(total_batches * 0.7)
val_batches = int(total_batches * 0.15)
test_batches = total_batches - train_batches - val_batches


# Split the dataset
train_dataset = dataset.take(train_batches)
val_dataset = dataset.skip(train_batches).take(val_batches)
test_dataset = dataset.skip(train_batches + val_batches)

# Print the sizes to verify
print("Training Set Size:", len(train_dataset))
print("Validation Set Size:", len(val_dataset))
print("Test Set Size:", len(test_dataset))


802
Training Set Size: 561
Validation Set Size: 120
Test Set Size: 121


## IV. Model

In [14]:
# model = TFBertModel.from_pretrained('bert-base-cased')
model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [15]:
input_ids = tf.keras.layers.Input(shape=(64,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(64,), name='attention_mask', dtype='int32')

embedding_layer = model(input_ids, attention_mask=attn_masks)[0]
pooled_output = tf.keras.layers.GlobalAveragePooling1D()(embedding_layer)
relu_layer = tf.keras.layers.Dense(128, activation='relu', name='relu_layer')(pooled_output)
output_layer = tf.keras.layers.Dense(6, activation='softmax', name='output_layer')(relu_layer)

liar_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
liar_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 64)]                 0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 64)]                 0         []                            
 )                                                                                                
                                                                                                  
 tf_distil_bert_model (TFDi  TFBaseModelOutput(last_hid   6636288   ['input_ids[0][0]',           
 stilBertModel)              den_state=(None, 64, 768),   0          'attention_mask[0][0]']      
                              hidden_states=None, atten                                       

In [16]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.CategoricalCrossentropy()
accuracy_score = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [17]:
liar_model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy_score])


In [18]:
class CustomCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        print(f"End of epoch {epoch}")
        print(f"Training loss: {logs.get('loss')}, Training accuracy: {logs.get('accuracy')}")
        print(f"Validation loss: {logs.get('val_loss')}, Validation accuracy: {logs.get('val_accuracy')}")


In [None]:
# Training the model
hist = liar_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=4,
    callbacks=[CustomCallback()]
)

# Plotting training and validation accuracy
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(hist.history['accuracy'], label='Train Accuracy')
plt.plot(hist.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy over Epochs')
plt.legend()

# Plotting training and validation loss
plt.subplot(1, 2, 2)
plt.plot(hist.history['loss'], label='Train Loss')
plt.plot(hist.history['val_loss'], label='Validation Loss')
plt.title('Loss over Epochs')
plt.legend()

plt.show()

Epoch 1/4
 83/561 [===>..........................] - ETA: 1:05:03 - loss: 1.7488 - accuracy: 0.2357

In [None]:
liar_model.save('BERT_liar_model')

## V. Prediction


In [None]:
liar_model = tf.keras.models.load_model('BERT_liar_model')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=64,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes=['False', 'Half-True', 'Mostly-True', 'True', 'Barely-True', 'Pants-Fire']):
    probs = model.predict(processed_data)[0]
    return classes[np.argmax(probs)]

In [None]:
input_text = input('Enter news article title: ')
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(liar_model, processed_data=processed_data)
print(f"Predicted Truthfullness: {result}")