In [None]:
import numpy as np
import os
import pandas as pd
from tqdm.notebook import tqdm
from urllib.request import urlopen
import tarfile
import tensorflow as tf
import tensorflow_datasets as tfds
import transformers as tfs
import sentencepiece

In [3]:
# Constants
MAX_LEN = 512     # Max number of tokens for T5 to use.
EPOCHS = 1
VERBOSE = 1

# Set Actions
PERFORM_TRAINING = False
GENERATE_TEXT = True

# Batch Size
GENERATE_BATCH_SIZE = 30
BATCH_SIZE = 4

# Learning Rate
LR = 1e-4

# Set T5 Type
t5_size = 't5-base'

# Set T5 Task Name
task_name = 'generate fake news: '

# Set T5 Config
t5_config = tfs.T5Config.from_pretrained(t5_size)

# Set T5 Tokenizer
t5_tokenizer = tfs.T5Tokenizer.from_pretrained(t5_size, return_dict = True)

Train Batch Size: 4
Generate Batch Size: 30
Learning Rate: 0.0001
T5 Model Type: t5-base
T5 Task Name: generate fake news: 


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
# AG News Subset Download URL from TFDS
AGNEWSSUBSET_URL = 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbUDNpeUdjb0wxRms'
AGNEWSSUBSET_DIR = '/tmp/agnewssubet/'

# Download Tar.Gz File and Extract
with urlopen(AGNEWSSUBSET_URL) as targzstream:
    thetarfile = tarfile.open(fileobj = targzstream, mode = "r|gz")
    thetarfile.extractall(AGNEWSSUBSET_DIR)

# List Dataset files
agnewssubset_files = os.listdir(AGNEWSSUBSET_DIR + 'ag_news_csv/')
print(agnewssubset_files)

# Load Train Csv
df = pd.read_csv(AGNEWSSUBSET_DIR + 'ag_news_csv/train.csv', names = ['label', 'title', 'description'])
df = df.sample(frac = 1.0, random_state = 42) # Shuffle all the rows
df.head()

['train.csv', 'test.csv', 'classes.txt', 'readme.txt']


Unnamed: 0,label,title,description
71787,3,"BBC set for major shake-up, claims newspaper","London - The British Broadcasting Corporation, the world #39;s biggest public broadcaster, is to cut almost a quarter of its 28 000-strong workforce, in the biggest shake-up in its 82-year history, The Times newspaper in London said on Monday."
67218,3,Marsh averts cash crunch,Embattled insurance broker #39;s banks agree to waive clause that may have prevented access to credit. NEW YORK (Reuters) - Marsh amp; McLennan Cos.
54066,2,"Jeter, Yankees Look to Take Control (AP)",AP - Derek Jeter turned a season that started with a terrible slump into one of the best in his accomplished 10-year career.
7168,4,Flying the Sun to Safety,"When the Genesis capsule comes back to Earth with its samples of the sun, helicopter pilots will be waiting for it, ready to snag it out of the sky."
29618,3,Stocks Seen Flat as Nortel and Oil Weigh,"NEW YORK (Reuters) - U.S. stocks were set to open near unchanged on Thursday after a warning from technology bellwether Nortel Networks Corp. &lt;A HREF=""http://www.investor.reuters.com/FullQuote.aspx?ticker=NT.N target=/stocks/quickinfo/fullquote""&..."


In [5]:
# Split into 2 equal 60K rows sets...one for training and one for generating
train_df = df.iloc[:60000,:]
generate_df = df.iloc[60000:,:]

# Placeholder for new 'generated_description' column in the generation part.
generate_df['generated_description'] = ''

# Total Samples
total_train_samples = train_df.shape[0]
total_generate_samples = generate_df.shape[0]

# Save Train_Df
train_df.to_csv('t5_train_df_news.csv')

# Summary
print(f'Total Samples for Training: {total_train_samples}')
print(f'Total Samples for Generation: {total_generate_samples}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  generate_df['generated_description'] = ''


Total Samples for Training: 60000
Total Samples for Generation: 60000


In [6]:
# Train: Show Input and Output Samples encoded
for index, row in train_df[:2].iterrows():

    # Get title and description as strings
    title = row['title']
    description = row['description']

    # Encode with special tokens and use maximum length
    input_encoded = t5_tokenizer.encode_plus(title, add_special_tokens = True, max_length = MAX_LEN, truncation = True, padding = 'max_length')
    output_encoded = t5_tokenizer.encode_plus(description, add_special_tokens = True, max_length = MAX_LEN, truncation = True, padding = 'max_length')

    # Print...
    print(f'Title: {title}')
    print(f'Input - Title Encoded: {input_encoded}')
    print(f'Description: {description}')
    print(f'Output - Description Encoded: {output_encoded}\n')

Title: BBC set for major shake-up, claims newspaper
Input - Title Encoded: {'input_ids': [9938, 356, 21, 779, 8944, 18, 413, 6, 3213, 8468, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [7]:
# Generate: Show Input and Output Samples encoded
for index, row in generate_df[:2].iterrows():

    # Get title and description as strings
    title = row['title']
    description = row['description']

    # Encode with special tokens and use maximum length
    input_encoded = t5_tokenizer.encode_plus(title, add_special_tokens = True, max_length = MAX_LEN, truncation = True, padding = 'max_length')
    output_encoded = t5_tokenizer.encode_plus(description, add_special_tokens = True, max_length = MAX_LEN, truncation = True, padding = 'max_length')

    # Print...
    print(f'Title: {title}')
    print(f'Input - Title Encoded: {input_encoded}')
    print(f'Description: {description}')
    print(f'Output - Description Encoded: {output_encoded}\n')

Title: Besieging holy sites: past lessons
Input - Title Encoded: {'input_ids': [493, 19247, 53, 15273, 1471, 10, 657, 5182, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [8]:
# Placeholders input
input_ids = np.zeros((total_train_samples, MAX_LEN), dtype='int32')
input_masks = np.zeros((total_train_samples, MAX_LEN), dtype='int32')

# Placeholders output
output_ids = np.zeros((total_train_samples, MAX_LEN), dtype='int32')
output_masks = np.zeros((total_train_samples, MAX_LEN), dtype='int32')

# Process Train DF dataframe
for index, row in tqdm(zip(range(total_train_samples), train_df.iterrows()), total = total_train_samples):

    # Get title and description as strings
    title = row[1]['title']
    description = row[1]['description']

    # Process Input
    input_encoded = t5_tokenizer.encode_plus(task_name + title, add_special_tokens = True, max_length = MAX_LEN, truncation = True, padding = 'max_length')
    input_ids_sample = input_encoded['input_ids']
    input_ids[index,:] = input_ids_sample
    attention_mask_sample = input_encoded['attention_mask']
    input_masks[index,:] = attention_mask_sample

    # Process Output
    output_encoded = t5_tokenizer.encode_plus(description, add_special_tokens = True, max_length = MAX_LEN, truncation = True, padding = 'max_length')
    output_ids_sample = output_encoded['input_ids']
    output_ids[index,:] = output_ids_sample
    attention_mask_sample = output_encoded['attention_mask']
    output_masks[index,:] = attention_mask_sample

  0%|          | 0/60000 [00:00<?, ?it/s]

In [9]:
class KerasTFT5ForConditionalGeneration(tfs.TFT5ForConditionalGeneration):
    def __init__(self, *args, log_dir=None, cache_dir= None, **kwargs):
        super().__init__(*args, **kwargs)

        self.loss_tracker= tf.keras.metrics.Mean(name='loss')

    @tf.function
    def train_step(self, data):
        x = data[0]
        y = x['labels']
        y = tf.reshape(y, [-1, 1])
        with tf.GradientTape() as tape:
            outputs = self(x, training=True)
            loss = outputs[0]
            logits = outputs[1]
            loss = tf.reduce_mean(loss)
            grads = tape.gradient(loss, self.trainable_variables)

        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        self.loss_tracker.update_state(loss)
        self.compiled_metrics.update_state(y, logits)
        metrics = {m.name: m.result() for m in self.metrics}

        return metrics

    def test_step(self, data):
      x = data[0]
      y = x["labels"]
      y = tf.reshape(y, [-1, 1])
      output = self(x, training=False)
      loss = output[0]
      loss = tf.reduce_mean(loss)
      logits = output[1]

      self.loss_tracker.update_state(loss)
      self.compiled_metrics.update_state(y, logits)

      return {m.name: m.result() for m in self.metrics}

In [10]:
class SaveModel(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs = None):
        print("\nSave Model Weights")

        # Save the entire model as a SavedModel.
        self.model.save_weights('t5_base_model.h5')

In [11]:
# Perform training only if specified
if PERFORM_TRAINING:

    # Create Model
    with strategy.scope():
        model = KerasTFT5ForConditionalGeneration.from_pretrained(t5_size, config = t5_config)
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
                      metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(name = 'accuracy')])

    # Summary
    model.summary()

    # Set Input
    input_data = {'input_ids': input_ids, 'labels': output_ids, 'attention_mask': input_masks, 'decoder_attention_mask': output_masks}

    # Fit Model
    model.fit(input_data,
              epochs = EPOCHS,
              batch_size = BATCH_SIZE,
              verbose = VERBOSE,
              shuffle = True,
              callbacks = [SaveModel()],
              use_multiprocessing = False,
              workers = 4)

In [12]:
if GENERATE_TEXT:
    # Create Model
    with strategy.scope():
        model = KerasTFT5ForConditionalGeneration.from_pretrained(t5_size, config = t5_config)
        model.compile(optimizer = tf.keras.optimizers.Adam(),
                      metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(name = 'accuracy')])

    # Summary
    model.summary()

    # Load Weights
    model.load_weights('t5_base_model_TEST.h5')

2023-12-12 14:10:40.531831: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /sw/pkgs/arc/python/3.10.4/lib:/sw/pkgs/arc/gcc/10.3.0/lib64:/sw/pkgs/arc/cuda/12.1.1/lib64
2023-12-12 14:10:40.531927: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /sw/pkgs/arc/python/3.10.4/lib:/sw/pkgs/arc/gcc/10.3.0/lib64:/sw/pkgs/arc/cuda/12.1.1/lib64
2023-12-12 14:10:40.531996: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /sw/pkgs/arc/python/3.10.4/lib:/sw/pkgs/arc/gcc/10.3.0/lib64:/sw/pkgs/arc/cuda/12.

Model: "keras_tf_t5for_conditional_generation"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
shared (Embedding)           multiple                  24674304  
_________________________________________________________________
encoder (TFT5MainLayer)      multiple                  109628544 
_________________________________________________________________
decoder (TFT5MainLayer)      multiple                  137949312 
Total params: 222,903,554
Trainable params: 222,903,552
Non-trainable params: 2
_________________________________________________________________


In [None]:
if GENERATE_TEXT:
    generated = []
    new_generated_df = generate_df[:1000]
    for index, row in new_generated_df.iterrows():

        # Get title and description as strings
        title = row['title']
        description = row['description']

        print(f'\n\n========= Sample:  {index}')
        print(f'Title: {title}')
        print(f'Description: {description}')

        # Encode with Special Tokens
        input_encoded = t5_tokenizer.encode_plus(task_name + title, add_special_tokens = True, max_length = MAX_LEN, truncation = True, padding = 'max_length', return_tensors = 'tf')

        # Generate FakeNews
        generated_fakenews = model.generate(input_encoded['input_ids'],
                                          attention_mask = input_encoded['attention_mask'],
                                          max_length = MAX_LEN,
                                          top_p = 0.96,
                                          top_k = 256,
                                          temperature = 1.3,
                                          num_beams = 2,
                                          num_return_sequences = 1,
                                          repetition_penalty = 1.3,
                                          length_penalty = 1.3)

        for mapping in generated_fakenews.numpy():
            generated_description = t5_tokenizer.decode(mapping, skip_special_tokens = True)
            generated.append(generated_description)
    
    new_generated_df['generated_description'] = generated
    new_generated_df.to_csv('t5_generated_fake_news.csv')
    new_generated_df.head()



Title: Besieging holy sites: past lessons
Description: The standoff at one of Islam's holiest shrines parallels one at the Church of the Nativity in 2002.


2023-12-12 14:13:13.821253: I tensorflow/compiler/xla/service/service.cc:170] XLA service 0x3a25d5b0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-12-12 14:13:13.821292: I tensorflow/compiler/xla/service/service.cc:178]   StreamExecutor device (0): Host, Default Version
2023-12-12 14:13:13.830019: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:263] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-12-12 14:13:13.847631: I tensorflow/compiler/jit/xla_compilation_cache.cc:478] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




Title: Spain sprouts WiMax network
Description: Europe appears to be fertile ground for new WiMax networks. Spain is the latest country to embrace the emerging high-end broadband wireless technology, following recent deployments in France, Ireland, and the U.K.


Title: Oracle sets new deadline on PeopleSoft bid
Description: Oracle has again pushed back the expiration date on its offer for PeopleSoft stock.


Title: Nations demand study of quotas
Description: Governments from nearly 30 countries demanded Friday that the World Trade Organization undertake an urgent review of what might happen when textile quotas end Jan. 1. The 
