<a href="https://colab.research.google.com/github/RobinSmits/FakeNews-Generator-And-Detector/blob/main/FakeNews_Generator_And_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.mixed_precision import experimental as mixed_precision
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

!pip install transformers==3.5.0
from transformers import *

Collecting transformers==3.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/9c/34/fb092588df61bf33f113ade030d1cbe74fb73a0353648f8dd938a223dce7/transformers-3.5.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 5.3MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 9.9MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 23.9MB/s 
[?25hCollecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (

In [None]:
# Make Google Drive available inside the Colab runtime
from google.colab import drive

drive.mount('/content/drive')

# Folder on the mounted drive used by this notebook; created if missing
WORK_DIR = '/content/drive/My Drive/fake_news'
os.makedirs(WORK_DIR, exist_ok=True)

Mounted at /content/drive


In [None]:
# Set strategy choice
USE_GPU = True
USE_CPU = False

# Select the distribution strategy. Exactly one branch runs, so `strategy` is
# always bound. CPU is checked first so that it takes precedence when both
# flags are enabled.
if USE_CPU:
    strategy = tf.distribute.OneDeviceStrategy(device = "/cpu:0")
elif USE_GPU:
    strategy = tf.distribute.OneDeviceStrategy(device = "/gpu:0")
else:
    # Neither flag is set: use TensorFlow's current (default) strategy
    strategy = tf.distribute.get_strategy()

# Constants
MAX_LEN = 512
VERBOSE = 1

# Batch sizes scale with the number of replicas the strategy keeps in sync
GENERATE_BATCH_SIZE = 38 * strategy.num_replicas_in_sync
PREDICT_BATCH_SIZE = 16 * strategy.num_replicas_in_sync
print(f'Predict Batch Size: {PREDICT_BATCH_SIZE}')
print(f'Generate Batch Size: {GENERATE_BATCH_SIZE}')

Predict Batch Size: 16
Generate Batch Size: 38


In [None]:
# T5 checkpoint name and the task prefix placed in front of each input text
t5_size = 't5-base'
task_name = 'generate fake news: '
print(f'T5 Model Type: {t5_size}')
print(f'T5 Task Name: {task_name}')

# Configuration and tokenizer for the selected checkpoint
t5_config = T5Config.from_pretrained(t5_size)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_size, return_dict = True)

T5 Model Type: t5-base
T5 Task Name: generate fake news: 


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




In [None]:
# RoBERTa checkpoint used for the detector
roberta_type = 'roberta-base'
print(f'RoBERTa Model Type: {roberta_type}')

# Two output labels: the dataset built later uses 0 for real and 1 for generated text
roberta_config = RobertaConfig.from_pretrained(roberta_type, num_labels = 2)

# Tokenizer options are collected first and then passed through unchanged
roberta_tokenizer_options = dict(return_dict = True,
                                 add_prefix_space = True,
                                 do_lower_case = True)
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_type, **roberta_tokenizer_options)

RoBERTa Model Type: roberta-base


In [None]:
# Load the AG News test split. `split` is given as a list, so the result is
# indexed to obtain the single requested dataset.
ag_news_ds, info = tfds.load(
    'ag_news_subset',
    split = ['test'],
    with_info = True,
    shuffle_files = True,
    as_supervised = False,
)
test_ds = ag_news_ds[0]

# Show the feature schema reported by the dataset metadata
print(info.features)

# Number of examples in the test split according to the metadata
total_samples = info.splits['test'].num_examples
print(f'Total Samples: {total_samples}')

INFO:absl:Load dataset info from /root/tensorflow_datasets/ag_news_subset/1.0.0
INFO:absl:Reusing dataset ag_news_subset (/root/tensorflow_datasets/ag_news_subset/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split ['test'], from /root/tensorflow_datasets/ag_news_subset/1.0.0


FeaturesDict({
    'description': Text(shape=(), dtype=tf.string),
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=4),
    'title': Text(shape=(), dtype=tf.string),
})
Total Samples: 7600


In [None]:
# Map and Decode Split(s)
def decode_example(example):
    """Decode one dataset example and return its (title, description) pair.

    Decoding is delegated to `info.features.decode_example`; the 'label'
    feature of the example is not returned.
    """
    decoded_example = info.features.decode_example(example)
    return decoded_example['title'], decoded_example['description']

# Map from the originally loaded split rather than from `test_ds` itself, so
# re-running this cell does not try to decode an already-mapped dataset.
test_ds = ag_news_ds[0].map(decode_example, num_parallel_calls = tf.data.experimental.AUTOTUNE)

In [None]:
class KerasTFT5ForConditionalGeneration(TFT5ForConditionalGeneration):
    """TFT5ForConditionalGeneration with custom Keras train and test steps.

    Both steps read the model inputs from the first element of the incoming
    batch, take the targets from its 'labels' entry, and use the first two
    outputs of the forward pass as loss and logits. A running mean of the
    loss is kept in `loss_tracker`.
    """

    def __init__(self, *args, log_dir = None, cache_dir = None, **kwargs):
        """Initialise the base model and the running-mean loss metric.

        `log_dir` and `cache_dir` are accepted but not used in this class and
        are not forwarded to the base constructor.
        """
        super().__init__(*args, **kwargs)
        self.loss_tracker = tf.keras.metrics.Mean(name='loss')

    @tf.function
    def train_step(self, data):
        """Run one optimisation step.

        Args:
            data: batch whose first element supports `['labels']` lookup and
                is passed to the model as its input.

        Returns:
            Dict mapping each metric name in `self.metrics` to its current value.
        """
        x = data[0]
        # Flatten targets to a single column for the compiled metrics
        y = tf.reshape(x['labels'], [-1, 1])

        with tf.GradientTape() as tape:
            outputs = self(x, training = True)
            loss = tf.reduce_mean(outputs[0])
            logits = outputs[1]

        # Gradients are computed after leaving the tape context so the
        # gradient computation itself is not recorded on the tape.
        grads = tape.gradient(loss, self.trainable_variables)

        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y, logits)

        return {m.name: m.result() for m in self.metrics}

    def test_step(self, data):
        """Run one evaluation step.

        Args:
            data: batch whose first element supports `['labels']` lookup and
                is passed to the model as its input.

        Returns:
            Dict mapping each metric name in `self.metrics` to its current value.
        """
        x = data[0]
        y = tf.reshape(x["labels"], [-1, 1])

        outputs = self(x, training = False)
        loss = tf.reduce_mean(outputs[0])
        logits = outputs[1]

        self.loss_tracker.update_state(loss)
        # update_state() only accumulates; the reported values are read from
        # the metric objects afterwards, mirroring train_step.
        self.compiled_metrics.update_state(y, logits)

        return {m.name: m.result() for m in self.metrics}

In [None]:
# Instantiate and compile the T5 generator inside the strategy scope
with strategy.scope():
    model = KerasTFT5ForConditionalGeneration.from_pretrained(t5_size, config = t5_config)
    model.compile(
        optimizer = tf.keras.optimizers.Adam(),
        metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(name = 'accuracy')],
    )

# Print the layer overview
model.summary()

# Restore the weights stored in the working folder
t5_weights_file = WORK_DIR + '/t5_base_model.h5'
model.load_weights(t5_weights_file)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=892146080.0, style=ProgressStyle(descri…




All model checkpoint layers were used when initializing KerasTFT5ForConditionalGeneration.

Some layers of KerasTFT5ForConditionalGeneration were not initialized from the model checkpoint at t5-base and are newly initialized: ['loss']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "keras_tf_t5for_conditional_generation_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
shared (TFSharedEmbeddings)  multiple                  24674304  
_________________________________________________________________
encoder (TFT5MainLayer)      multiple                  84954240  
_________________________________________________________________
decoder (TFT5MainLayer)      multiple                  113275392 
_________________________________________________________________
loss (Mean)                  multiple                  2         
Total params: 222,903,938
Trainable params: 222,903,936
Non-trainable params: 2
_________________________________________________________________


In [None]:
# Collect titles and descriptions as Python strings
titles, descriptions = [], []

# Iterate the dataset as NumPy values; each sample is a (title, description)
# pair of byte strings that is decoded as UTF-8.
generate_ds_numpy = tfds.as_numpy(test_ds)

samples = zip(range(total_samples), generate_ds_numpy)
for _, (title_bytes, description_bytes) in tqdm(samples, total = total_samples):
    titles.append(title_bytes.decode('utf-8'))
    descriptions.append(description_bytes.decode('utf-8'))

# Assemble the frame; 'generated' starts empty and is filled in later
df = pd.DataFrame({'title': titles, 'description': descriptions})
df['generated'] = ''

# Quick look at the result
print(df.head())
print(df.shape)

HBox(children=(FloatProgress(value=0.0, max=7600.0), HTML(value='')))


                                               title  ... generated
0      Philippine Rebels Free Troops, Talks in Doubt  ...          
1               Carolina's Davis Done for the Season  ...          
2          New Rainbow Six Franchise for Spring 2005  ...          
3                          Kiwis heading for big win  ...          
4  Shelling, shooting resumes in breakaway Georgi...  ...          

[5 rows x 3 columns]
(7600, 3)


In [None]:
# Generate one text per title, processing the titles in batches of
# GENERATE_BATCH_SIZE. Slicing by batch start means a final, smaller batch is
# generated too, so `generated` always ends up with one entry per row of df.
generated = []
all_titles = df['title'].tolist()

with tqdm(total = len(all_titles)) as progress:
    for batch_start in range(0, len(all_titles), GENERATE_BATCH_SIZE):
        batch_titles = all_titles[batch_start:batch_start + GENERATE_BATCH_SIZE]

        # Prep input text: task prefix followed by the title
        text_list = [task_name + title for title in batch_titles]

        # Batch Encode with Special Tokens (padded to the longest text in the batch)
        textlist_encoded = t5_tokenizer.batch_encode_plus(text_list, add_special_tokens = True, max_length = MAX_LEN, padding = True, truncation = True, return_tensors = 'tf')

        input_ids = textlist_encoded['input_ids']

        # Generate FakeNews
        # NOTE(review): do_sample is not passed; if it resolves to False the
        # top_p / top_k / temperature settings have no effect - confirm intent.
        generated_fakenews = model.generate(input_ids,
                                            max_length = MAX_LEN,
                                            top_p = 0.95,
                                            top_k = 256,
                                            temperature = 1.1,
                                            num_beams = 1,
                                            num_return_sequences = 1,
                                            repetition_penalty = 1.1)

        # Decode every generated token sequence back to text
        generated.extend(t5_tokenizer.decode(token_ids) for token_ids in generated_fakenews.numpy())

        progress.update(len(batch_titles))

# Generate Final File
df['generated'] = generated
df.to_csv(WORK_DIR + '/generated_fake_news_final.csv')
print(df.head())

HBox(children=(FloatProgress(value=0.0, max=7600.0), HTML(value='')))


                                               title  ...                                          generated
0      Philippine Rebels Free Troops, Talks in Doubt  ...  Philippine rebels freed more than 2,000 troops...
1               Carolina's Davis Done for the Season  ...  CHARLOTTE, NC (Sports Network) - Carolina's Je...
2          New Rainbow Six Franchise for Spring 2005  ...  The Rainbow Six franchise will be released in ...
3                          Kiwis heading for big win  ...  The Kiwis are heading for a big win in the fir...
4  Shelling, shooting resumes in breakaway Georgi...  ...  AFP - A new round of shelling and shootings in...

[5 rows x 3 columns]


### RoBERTa FakeNews Detector

In [None]:
def create_dataset(df):
    """Tokenize real and generated texts into a labelled tf.data.Dataset.

    For every row of `df` two consecutive entries are produced: the
    'description' text with label 0, followed by the 'generated' text with
    label 1. Token ids and attention masks are written into zero-initialised
    int32 arrays of width MAX_LEN.

    Returns:
        A dataset of ({'input_ids', 'attention_mask'}, label) elements.
    """
    number_of_samples = df.shape[0]
    n_rows = 2 * number_of_samples

    # Output buffers; positions beyond the encoded length stay 0
    input_ids = np.zeros((n_rows, MAX_LEN), dtype = 'int32')
    input_masks = np.zeros((n_rows, MAX_LEN), dtype = 'int32')
    labels = np.zeros((n_rows, ), dtype = 'int32')

    def fill_row(position, text, label):
        # Encode one text and copy ids, mask and label into the buffers
        encoded = roberta_tokenizer.encode_plus(text, add_special_tokens = True, max_length = MAX_LEN, truncation = True)
        ids = encoded['input_ids']
        mask = encoded['attention_mask']
        input_ids[position, :len(ids)] = ids
        input_masks[position, :len(mask)] = mask
        labels[position] = label

    # Even positions receive the real text, the following odd position the fake one
    positioned_rows = zip(range(0, n_rows, 2), df.iterrows())
    for position, (_, row) in tqdm(positioned_rows, total = number_of_samples):
        fill_row(position, row['description'], 0)
        fill_row(position + 1, row['generated'], 1)

    features = {'input_ids': input_ids, 'attention_mask': input_masks}
    return tf.data.Dataset.from_tensor_slices((features, labels))

In [None]:
# Show Sizes
print(f'Test DF Shape: {df.shape}')

# Create the test dataset: batch first, then repeat indefinitely, then prefetch
test_dataset = create_dataset(df)
test_dataset = test_dataset.batch(PREDICT_BATCH_SIZE)
test_dataset = test_dataset.repeat(-1)
test_dataset = test_dataset.prefetch(128)

# Steps: two entries per DataFrame row. Ceiling division so that a final,
# smaller batch is still counted and one pass covers every entry.
total_test_rows = df.shape[0] * 2
test_steps = (total_test_rows + PREDICT_BATCH_SIZE - 1) // PREDICT_BATCH_SIZE
print(f'Val Steps: {test_steps}')

Test DF Shape: (7600, 3)


HBox(children=(FloatProgress(value=0.0, max=7600.0), HTML(value='')))


Val Steps: 950


In [None]:
def build_model():
    """Create and compile the RoBERTa sequence classifier.

    The model is instantiated from `roberta_type` with `roberta_config` inside
    the strategy scope and compiled with Adam, sparse categorical
    cross-entropy on logits, and a sparse categorical accuracy metric.

    Returns:
        The compiled model.
    """
    with strategy.scope():
        classifier = TFRobertaForSequenceClassification.from_pretrained(roberta_type, config = roberta_config)
        classifier.compile(
            optimizer = tf.keras.optimizers.Adam(),
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
            metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
        )

    return classifier

In [None]:
# Build the RoBERTa detector; note that this rebinds the name `model`
model = build_model()

# Print the layer overview
model.summary()

# Restore the weights stored in the working folder
roberta_weights_file = WORK_DIR + '/roberta_base_model.h5'
model.load_weights(roberta_weights_file)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=657434796.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124645632 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  592130    
Total params: 125,237,762
Trainable params: 125,237,762
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Evaluate Dataset
# The result is stored under a name that does not shadow the builtin `eval`;
# index 1 is read as the accuracy value below.
eval_results = model.evaluate(test_dataset, steps = test_steps, verbose = 1)
print(f'Detection Accuracy: {eval_results[1] * 100}%')

Detection Accuracy: 96.78289294242859%


In [None]:
# Run the detector over the test dataset
preds = model.predict(test_dataset, steps = test_steps, verbose = 1)

# Raw model output as returned by predict()
print(preds)

# Probabilities: softmax over the first element of the prediction output
logits = preds[0]
probabilities = tf.nn.softmax(logits).numpy()
print(probabilities)

(array([[ 4.468864 , -4.2316957],
       [-4.1424737,  4.145637 ],
       [ 4.5235734, -4.275705 ],
       ...,
       [-4.31774  ,  4.21413  ],
       [-0.7884399,  0.7092131],
       [-3.9932837,  3.8474443]], dtype=float32),)
[[9.9983346e-01 1.6646489e-04]
 [2.5142592e-04 9.9974853e-01]
 [9.9984920e-01 1.5081915e-04]
 ...
 [1.9704727e-04 9.9980301e-01]
 [1.8277581e-01 8.1722414e-01]
 [3.9322794e-04 9.9960679e-01]]
