In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Update your dataset path accordingly
liar_data_path = '/content/train.tsv'

import pandas as pd

def load_liar(path):
    return pd.read_csv(path, sep='\t', header=None, names=[
        'id', 'label', 'statement', 'subject', 'speaker', 'speaker_job',
        'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
        'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'
    ])

df = load_liar(liar_data_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from sklearn.model_selection import train_test_split

def split_dataset(df):
    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
    return train_df, val_df


In [3]:
!pip install transformers -q

from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow as tf

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode_data(texts, labels, max_length=128):
    encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_length, return_tensors="tf")
    return encodings, tf.convert_to_tensor(labels)

def train_classifier(texts, labels):
    encodings, labels_tensor = encode_data(texts, labels)

    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(set(labels)))
    model.compile(optimizer=Adam(learning_rate=2e-5),
                  loss=SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    model.fit(encodings['input_ids'], labels_tensor, epochs=2, batch_size=8)
    return model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate_classifier(model, texts, labels):
    encodings, _ = encode_data(texts, labels)
    preds = model.predict(encodings['input_ids']).logits
    pred_labels = tf.argmax(preds, axis=1).numpy()
    acc = accuracy_score(labels, pred_labels)
    f1 = f1_score(labels, pred_labels, average='weighted')
    return acc, f1


In [5]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow as tf

# Use tf.keras.optimizers.Adam instead of keras.optimizers.Adam
def train_classifier(texts, labels):
    encodings, labels_tensor = encode_data(texts, labels)

    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(set(labels)))
    # Changed here to use tf.keras.optimizers.Adam
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                  loss=SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    model.fit(encodings['input_ids'], labels_tensor, epochs=2, batch_size=8)
    return model

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the dataset (use either real or fake news statements for training the generator)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['statement'])  # Fit tokenizer on your fake news data

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df['statement'])
max_seq_length = max([len(seq) for seq in sequences])  # Get max length of sequences
sequences_padded = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

# Now, `sequences_padded` is ready to be used for SeqGAN training


In [7]:
from tensorflow.keras.layers import Flatten

def build_discriminator(vocab_size, embedding_dim, sequence_length):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length))
    model.add(LSTM(128))
    model.add(Dense(1, activation='sigmoid'))  # Output: Real or Fake (binary classification)
    return model


In [8]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

def train_seqgan(generator, discriminator, real_data, fake_data, batch_size=32):
    # Optimizers
    g_optimizer = Adam(lr=0.0002, beta_1=0.5)
    d_optimizer = Adam(lr=0.0002, beta_1=0.5)

    for epoch in range(10000):  # Number of epochs to train
        # Train Discriminator (Real + Fake News)
        real_labels = tf.ones((batch_size, 1))  # Real labels
        fake_labels = tf.zeros((batch_size, 1))  # Fake labels

        with tf.GradientTape() as d_tape:
            real_output = discriminator(real_data)
            fake_output = discriminator(fake_data)
            d_loss_real = tf.keras.losses.binary_crossentropy(real_labels, real_output)
            d_loss_fake = tf.keras.losses.binary_crossentropy(fake_labels, fake_output)
            d_loss = d_loss_real + d_loss_fake

        d_gradients = d_tape.gradient(d_loss, discriminator.trainable_variables)
        d_optimizer.apply_gradients(zip(d_gradients, discriminator.trainable_variables))

        # Train Generator
        with tf.GradientTape() as g_tape:
            fake_output = discriminator(fake_data)
            g_loss = tf.keras.losses.binary_crossentropy(real_labels, fake_output)

        g_gradients = g_tape.gradient(g_loss, generator.trainable_variables)
        g_optimizer.apply_gradients(zip(g_gradients, generator.trainable_variables))

        if epoch % 100 == 0:
            print(f"Epoch {epoch}: D Loss = {d_loss.numpy()}, G Loss = {g_loss.numpy()}")


In [9]:
def generate_fake_news(generator, num_samples=10):
    fake_news = []
    for _ in range(num_samples):
        generated_sequence = generator.predict(generate_input)  # Your input to the generator here
        fake_news.append(tokenizer.sequences_to_texts(generated_sequence))
    return fake_news


In [10]:
!pip install transformers datasets -q


In [11]:
from transformers import T5Tokenizer
import pandas as pd

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_t5_data(df, input_col='statement', label_col='label'):
    input_texts = ['classify: ' + str(text) for text in df[input_col]]
    label_texts = ['real' if label == 1 else 'fake' for label in df[label_col]]
    return input_texts, label_texts


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
def tokenize_t5_data(input_texts, target_texts, tokenizer, max_length=128):
    input_encodings = tokenizer(input_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    target_encodings = tokenizer(target_texts, padding=True, truncation=True, max_length=16, return_tensors="pt")

    input_encodings['labels'] = target_encodings['input_ids']
    return input_encodings


In [13]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [14]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

class T5FakeNewsDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return len(self.encodings['input_ids'])
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Call preprocess_t5_data to get train_inputs and train_labels
train_inputs, train_labels = preprocess_t5_data(df)  # Assuming 'df' is your training DataFrame
val_inputs, val_labels = preprocess_t5_data(df)  # Assuming 'df' is your validation DataFrame

train_encodings = tokenize_t5_data(train_inputs, train_labels, tokenizer)
val_encodings = tokenize_t5_data(val_inputs, val_labels, tokenizer)

train_dataset = T5FakeNewsDataset(train_encodings)
val_dataset = T5FakeNewsDataset(val_encodings)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # The 'evaluation_strategy' argument has been replaced with 'eval_strategy'.
    eval_strategy='epoch',
    logging_dir='./logs',
    save_total_limit=1,
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.0002,0.0
2,0.0005,0.0


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=2560, training_loss=0.08641776070189736, metrics={'train_runtime': 13114.5158, 'train_samples_per_second': 1.562, 'train_steps_per_second': 0.195, 'total_flos': 692950023536640.0, 'train_loss': 0.08641776070189736, 'epoch': 2.0})

In [15]:
def predict_t5(text):
    input_text = "classify: " + text
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output = model.generate(input_ids)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example
print(predict_t5("The earth is flat and the moon is made of cheese."))


fake


In [16]:
def predict_t5(text, model, tokenizer):
    input_text = "classify: " + text
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output = model.generate(input_ids)
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [17]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate_t5(model, tokenizer, val_df):
    predictions = []
    for text in val_df['statement']:
        pred = predict_t5(text, model, tokenizer)
        predictions.append(pred.strip().lower())

    true_labels = ['real' if l == 1 else 'fake' for l in val_df['label']]

    acc = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, pos_label='real')

    print(f"T5 Validation Accuracy: {acc:.4f}")
    print(f"T5 F1 Score: {f1:.4f}")


In [19]:
from sklearn.model_selection import train_test_split

def split_dataset(df):
    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
    return train_df, val_df

# Call split_dataset to create train_df and val_df
train_df, val_df = split_dataset(df)  # Assuming 'df' is your original DataFrame

# ... (Rest of the code, including evaluate_t5 and its call)

In [20]:
evaluate_t5(model, tokenizer, val_df)


T5 Validation Accuracy: 1.0000
T5 F1 Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
model.save_pretrained("/content/t5_fake_news_classifier")
tokenizer.save_pretrained("/content/t5_fake_news_classifier")

('/content/t5_fake_news_classifier/tokenizer_config.json',
 '/content/t5_fake_news_classifier/special_tokens_map.json',
 '/content/t5_fake_news_classifier/spiece.model',
 '/content/t5_fake_news_classifier/added_tokens.json')