# SpotFake: Twitter Fake News Detection - Kaggle TPU v5e-8 Edition

This notebook is optimized for Kaggle with TPU v5e-8 support.

**Important Setup Steps:**
1. In Kaggle Settings: Select **TPU v5e-8** as accelerator
2. Add your dataset to Kaggle Datasets and set `DATASET_PATH` (Configuration cell) to its location under `/kaggle/input/`
3. Run all cells in order

This notebook provides:
1. Loading and preprocessing data (text + images)
2. Building the multimodal model (BERT + VGG19)
3. Training the model with TPU acceleration
4. Making predictions on new inputs (text + image)

## 1. TPU Configuration and Setup

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt
%matplotlib inline

import cv2
from os import listdir
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import tensorflow as tf
import tensorflow_hub as hub
from transformers import BertTokenizer
from tensorflow.keras import backend as K

import gc

print("TensorFlow version:", tf.__version__)
print("✓ All imports successful!")

In [None]:
# TPU Detection and Initialization
def setup_tpu():
    """Return a distribution strategy, preferring a TPU when one is reachable.

    Returns:
        tuple: ``(strategy, using_tpu)``. ``strategy`` is a
        ``tf.distribute.TPUStrategy`` when the TPU resolver connects,
        otherwise the current default strategy; ``using_tpu`` is the
        matching boolean flag.
    """
    try:
        # Only the calls that signal "no TPU" via ValueError belong in here.
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
        strategy = tf.distribute.TPUStrategy(tpu)
    except ValueError:
        print("⚠ TPU not found. Using default strategy (GPU/CPU)")
        return tf.distribute.get_strategy(), False

    print("✓ TPU initialized successfully!")
    # The cluster spec is only printed for information. Use .get() so a spec
    # without a 'worker' job (presumably the case for a locally attached
    # TPU - confirm on the target runtime) cannot abort setup with KeyError.
    workers = tpu.cluster_spec().as_dict().get('worker', 'local')
    print(f"  TPU devices: {workers}")
    print(f"  Number of replicas: {strategy.num_replicas_in_sync}")
    return strategy, True

# Resolve the distribution strategy once; later cells reuse `strategy`
# (model construction scope) and `using_tpu` (batch-size selection).
strategy, using_tpu = setup_tpu()

# Set random seeds for reproducibility (NumPy and TensorFlow RNGs)
np.random.seed(42)
tf.random.set_seed(42)

# Suppress TF warnings: only messages at ERROR level and above are logged
tf.get_logger().setLevel('ERROR')

# Report which accelerator path was taken and how many replicas are in sync
print(f"\nRunning on: {'TPU' if using_tpu else 'GPU/CPU'}")
print(f"Number of accelerators: {strategy.num_replicas_in_sync}")

In [None]:
# Configuration
# TF Hub BERT module that get_news_model loads through hub.KerasLayer.
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
# Token length used for tokenizer padding/truncation and for the shape of the
# model's three text inputs.
max_seq_length = 23
# Resize target passed to preprocess_single_image by predict_fake_news.
# NOTE(review): get_news_model hard-codes 224 for its image input, so these
# values must stay at 224 unless the model definition is changed as well.
img_length = 224
img_width = 224
# NOTE(review): not referenced by any other cell visible in this notebook.
img_channels = 3

# Kaggle paths - adjust these based on your dataset location
# If you uploaded the dataset to Kaggle, it will be in /kaggle/input/
# The cells below expect train_posts.txt, test_posts.txt, images_train/ and
# images_test/ directly under this directory.
DATASET_PATH = '/kaggle/input/spotfake-twitter'  # Change this to your dataset name
# For testing locally, uncomment:
# DATASET_PATH = 'dataset/twitter'

print(f"Dataset path: {DATASET_PATH}")

## 2. Helper Functions

In [None]:
# Progress callback
def live():
    """Build a Keras callback that prints one metrics line after each epoch."""

    def _report(epoch, logs):
        # Metrics missing from `logs` fall back to 0 so the float format
        # specs below always receive a number.
        loss = logs.get('loss', 0)
        acc = logs.get('accuracy', 0)
        val_loss = logs.get('val_loss', 0)
        val_acc = logs.get('val_accuracy', 0)
        # Keras epochs are zero-based; display them one-based.
        print(
            f"Epoch {epoch + 1}: loss={loss:.4f}, "
            f"acc={acc:.4f}, "
            f"val_loss={val_loss:.4f}, "
            f"val_acc={val_acc:.4f}"
        )

    return tf.keras.callbacks.LambdaCallback(on_epoch_end=_report)

### Text Preprocessing Functions

In [None]:
class PaddingInputExample(object):
    """Fake example for padding.

    Marker type with no attributes: ``convert_single_example`` detects it
    with ``isinstance`` and returns all-zero features and label 0.
    """

class InputExample(object):
    """Container for one text example used in sequence classification.

    The constructor stores its arguments unchanged as attributes:
    ``guid`` (identifier), ``text_a`` (primary text), ``text_b`` (optional
    second text) and ``label`` (optional target).
    """

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

def create_tokenizer_from_hub_module():
    """Return the pretrained ``bert-base-uncased`` tokenizer.

    Despite the function name, the tokenizer is loaded through
    ``BertTokenizer.from_pretrained`` rather than from the TF Hub module.
    """
    return BertTokenizer.from_pretrained('bert-base-uncased')

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Convert one example into fixed-length BERT input features.

    Args:
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask`` (the notebook passes a Hugging Face
            ``BertTokenizer``).
        example: An ``InputExample`` or a ``PaddingInputExample``.
        max_seq_length: Length every returned sequence is padded or
            truncated to.

    Returns:
        tuple: ``(input_ids, input_mask, segment_ids, label)``. The first
        three are lists of ``max_seq_length`` ints; ``label`` is
        ``example.label``, or 0 for a padding example.
    """
    if isinstance(example, PaddingInputExample):
        # Padding examples carry no text: all-zero features, label 0.
        zeros = [0] * max_seq_length
        return list(zeros), list(zeros), list(zeros), 0

    # Request plain Python lists (no `return_tensors`). Building a TF tensor
    # per example only to convert it straight back to a list is wasted work.
    encoding = tokenizer(
        example.text_a,
        truncation=True,
        padding='max_length',
        max_length=max_seq_length,
    )

    input_ids = list(encoding['input_ids'])
    input_mask = list(encoding['attention_mask'])
    # Only text_a is encoded, so every position is assigned segment 0.
    segment_ids = [0] * max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Turn a collection of examples into stacked NumPy feature arrays.

    Args:
        tokenizer: Tokenizer forwarded to ``convert_single_example``.
        examples: Iterable of examples, converted in order.
        max_seq_length: Sequence length forwarded to
            ``convert_single_example``.

    Returns:
        tuple: ``(input_ids, input_masks, segment_ids, labels)`` as NumPy
        arrays; ``labels`` is reshaped to a single column.
    """
    converted = [
        convert_single_example(tokenizer, item, max_seq_length)
        for item in tqdm(examples, desc="Converting examples to features")
    ]

    def _column(position):
        # Collect the same tuple slot from every converted example.
        return np.array([row[position] for row in converted])

    return _column(0), _column(1), _column(2), _column(3).reshape(-1, 1)

def convert_text_to_examples(texts, labels):
    """Pair each text with its label and wrap the pair in an ``InputExample``.

    A text that is not already a ``str`` is treated as a sequence of strings
    and joined with single spaces. Pairing stops at the shorter of the two
    inputs (``zip`` semantics).
    """

    def _as_text(value):
        if isinstance(value, str):
            return value
        return " ".join(value)

    return [
        InputExample(guid=None, text_a=_as_text(text), text_b=None, label=label)
        for text, label in zip(texts, labels)
    ]

def preprocess_text_input(text, tokenizer, max_seq_length=23):
    """Tokenize one text and return batch-of-one feature arrays.

    Returns:
        tuple: ``(input_ids, input_mask, segment_ids)``, each a NumPy array
        with a leading batch dimension of 1.
    """
    # The label is irrelevant at prediction time; 0 is a placeholder.
    single = InputExample(guid=None, text_a=text, text_b=None, label=0)
    ids, mask, segments, _unused_label = convert_single_example(
        tokenizer, single, max_seq_length
    )
    # Wrap each feature list in an outer list to add the batch dimension.
    return tuple(np.array([feature]) for feature in (ids, mask, segments))

print("✓ Text preprocessing functions defined")

### Image Preprocessing Functions

In [None]:
def read_and_process_image(list_of_images, length=224, width=224):
    """Read image files and resize them into one stacked array.

    Args:
        list_of_images: Iterable of image file paths.
        length: First component of the size tuple passed to ``cv2.resize``.
        width: Second component of the size tuple passed to ``cv2.resize``.

    Returns:
        numpy.ndarray: ``np.array`` over the resized images, in input order.

    Raises:
        ValueError: If ``cv2.imread`` cannot read one of the paths (same
            error and message style as ``preprocess_single_image``).
    """
    X = []
    for image in tqdm(list_of_images, desc="Processing images"):
        img = cv2.imread(image, cv2.IMREAD_COLOR)
        if img is None:
            # Name the offending path instead of failing later inside
            # cv2.resize. Skipping the file is not an option: the training
            # cell pairs images with text features and labels by position.
            raise ValueError(f"Could not read image from {image}")
        X.append(cv2.resize(img, (length, width), interpolation=cv2.INTER_CUBIC))
    return np.array(X)

def preprocess_single_image(image_path, length=224, width=224):
    """Load one image and return it as a channels-first batch of size 1.

    Args:
        image_path: Path of the image file to read.
        length: First component of the size tuple passed to ``cv2.resize``.
        width: Second component of the size tuple passed to ``cv2.resize``.

    Returns:
        numpy.ndarray: The resized image with its channel axis moved to the
        front and a leading batch axis of length 1.

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for ``image_path``.
    """
    raw = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if raw is None:
        raise ValueError(f"Could not read image from {image_path}")
    # Resize, then move the channel axis (index 2) to position 0.
    channels_first = np.rollaxis(
        cv2.resize(raw, (length, width), interpolation=cv2.INTER_CUBIC), 2, 0
    )
    # Prepend the batch dimension.
    return channels_first[np.newaxis, ...]

print("✓ Image preprocessing functions defined")

## 3. Model Definition (TPU-Compatible)

In [None]:
def get_news_model(params):
    """Build and compile the multimodal (text + image) fake news classifier.

    Reads the module-level ``bert_path`` and ``max_seq_length``.

    Args:
        params (dict): Hyperparameters. Keys read here:
            ``text_no_hidden_layer`` / ``text_hidden_neurons``,
            ``vis_no_hidden_layer`` / ``vis_hidden_neurons`` and
            ``final_no_hidden_layer`` / ``final_hidden_neurons`` (count and
            width of each Dense+Dropout stack), ``dropout`` (rate used by
            every Dropout layer), ``repr_size`` (width of each modality's
            representation) and ``optimizer`` (a callable invoked with no
            arguments to create the optimizer).

    Returns:
        A compiled Keras model taking
        ``[input_ids, input_masks, segment_ids, image]`` and producing one
        sigmoid output per example, trained with binary cross-entropy.
    """
    # Reset Keras global state before building a fresh model.
    tf.keras.backend.clear_session()
    
    # BERT encoder function
    # NOTE(review): the hub.KerasLayer is instantiated inside the function
    # wrapped by the Lambda layer below, i.e. each time the Lambda is called.
    # Confirm that Keras tracks/serializes the hub weights as intended.
    def bert_encode(input_ids, input_mask, segment_ids):
        bert_layer = hub.KerasLayer(
            bert_path,
            trainable=False,
            signature="tokens",
            signature_outputs_as_dict=True,
        )
        bert_inputs = {
            "input_ids": input_ids, 
            "input_mask": input_mask, 
            "segment_ids": segment_ids
        }
        bert_outputs = bert_layer(bert_inputs)
        # Only the pooled sentence-level output is used downstream.
        return bert_outputs["pooled_output"]

    # Text input branch: three int32 inputs of length max_seq_length
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype=tf.int32)
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks", dtype=tf.int32)
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids", dtype=tf.int32)
    
    # output_shape declares the 768-wide pooled BERT vector to Keras.
    bert_output = tf.keras.layers.Lambda(
        lambda inputs: bert_encode(inputs[0], inputs[1], inputs[2]),
        output_shape=(768,),
        name="bert_encoding"
    )([in_id, in_mask, in_segment])

    # Optional Dense+Dropout stack on top of the BERT output (the `> 0`
    # guard is redundant with range(), which is empty for 0).
    if params['text_no_hidden_layer'] > 0:
        for i in range(params['text_no_hidden_layer']):
            bert_output = tf.keras.layers.Dense(params['text_hidden_neurons'], activation='relu')(bert_output)
            bert_output = tf.keras.layers.Dropout(params['dropout'])(bert_output)

    text_repr = tf.keras.layers.Dense(params['repr_size'], activation='relu')(bert_output)

    # Image input branch (VGG19), ImageNet weights, frozen, no classifier head
    conv_base = tf.keras.applications.VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    conv_base.trainable = False

    # Images arrive channels-first (3, 224, 224) and are transposed to the
    # channels-last layout that conv_base was built with.
    # NOTE(review): no VGG19-specific pixel preprocessing is applied here or
    # in the image helper functions - confirm this is intentional.
    input_image = tf.keras.layers.Input(shape=(3, 224, 224))
    transposed_image = tf.keras.layers.Lambda(lambda x: tf.transpose(x, [0, 2, 3, 1]))(input_image)
    base_output = conv_base(transposed_image)
    flat = tf.keras.layers.Flatten()(base_output)

    # Optional Dense+Dropout stack on top of the flattened VGG19 features
    if params['vis_no_hidden_layer'] > 0:
        for i in range(params['vis_no_hidden_layer']):
            flat = tf.keras.layers.Dense(params['vis_hidden_neurons'], activation='relu')(flat)
            flat = tf.keras.layers.Dropout(params['dropout'])(flat)

    visual_repr = tf.keras.layers.Dense(params['repr_size'], activation='relu')(flat)

    # Classifier (combine text + image): concatenation gives 2 * repr_size
    combine_repr = tf.keras.layers.concatenate([text_repr, visual_repr])
    com_drop = tf.keras.layers.Dropout(params['dropout'])(combine_repr)

    # Optional Dense+Dropout stack on the fused representation
    if params['final_no_hidden_layer'] > 0:
        for i in range(params['final_no_hidden_layer']):
            com_drop = tf.keras.layers.Dense(params['final_hidden_neurons'], activation='relu')(com_drop)
            com_drop = tf.keras.layers.Dropout(params['dropout'])(com_drop)

    # Single sigmoid unit; the notebook encodes 'real' as 1 and others as 0.
    prediction = tf.keras.layers.Dense(1, activation='sigmoid')(com_drop)

    # Input order here defines the order of the arrays passed to fit/predict.
    model = tf.keras.models.Model(inputs=[in_id, in_mask, in_segment, input_image], outputs=prediction)
    # params['optimizer'] is called with no arguments, i.e. default settings.
    model.compile(loss='binary_crossentropy', optimizer=params['optimizer'](), metrics=['accuracy'])
    
    return model

print("✓ Model definition ready (TPU-compatible)")

## 4. Load and Preprocess Data

In [None]:
# Load datasets
def get_df(file):
    """Load a tab-separated posts file into a pandas DataFrame.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The parsed table, with the first row as header.
    """
    frame = pd.read_csv(filepath_or_buffer=file, sep="\t")
    return frame

# Both splits are tab-separated text files stored directly under DATASET_PATH.
train_df = get_df(f'{DATASET_PATH}/train_posts.txt')
test_df = get_df(f'{DATASET_PATH}/test_posts.txt')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
# Last expression of the cell: the notebook renders the first rows.
train_df.head()

In [None]:
# Extract first image ID
def return_first_image(row):
    """Return the first ID from the row's comma-separated ``image_id`` field.

    Surrounding whitespace is stripped; a value without a comma is returned
    whole (stripped).
    """
    first_id, _sep, _rest = row['image_id'].partition(',')
    return first_id.strip()

# progress_apply (registered by tqdm.pandas()) calls the function once per
# row because axis=1, showing a progress bar while it runs.
train_df['first_image_id'] = train_df.progress_apply(return_first_image, axis=1)
test_df['first_image_id'] = test_df.progress_apply(return_first_image, axis=1)

In [None]:
# Drop posts whose first image has no file in the matching image folder.
# Folder entries are reduced to the text before their first '.', which is
# the form the image IDs in the post files are compared against.
images_train_dataset = train_df['first_image_id'].tolist()
images_train_folder = [
    file_name.split('.')[0].strip()
    for file_name in listdir(f'{DATASET_PATH}/images_train')
]
images_train_not_available = set(images_train_dataset).difference(images_train_folder)
# This ID is excluded explicitly even when a file for it exists on disk.
images_train_not_available.add('boston_fake_10')

images_test_dataset = [
    id_field.split(',')[0].strip()
    for id_field in test_df['image_id'].tolist()
]
images_test_folder = [
    file_name.split('.')[0].strip()
    for file_name in listdir(f'{DATASET_PATH}/images_test/')
]
images_test_not_available = set(images_test_dataset).difference(images_test_folder)

# Keep only the rows whose first image ID is not in the "missing" sets.
train_df = train_df[~train_df['first_image_id'].isin(images_train_not_available)]
test_df = test_df[~test_df['first_image_id'].isin(images_test_not_available)]

print(f"After filtering - Train: {train_df.shape}, Test: {test_df.shape}")

In [None]:
# Pull the text, image IDs and labels out of the filtered frames.
train_text = train_df['post_text'].tolist()
test_text = test_df['post_text'].tolist()

train_images = train_df['first_image_id'].tolist()
test_images = test_df['first_image_id'].tolist()

# Binary target: 1 for the label 'real', 0 for every other label value.
trainY = [int(tag == 'real') for tag in train_df['label'].tolist()]
testY = [int(tag == 'real') for tag in test_df['label'].tolist()]

print(f"Data counts: {len(train_text)} train, {len(test_text)} test")

### Process Text Data

In [None]:
# Initialize tokenizer (shared by training-data conversion and inference)
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format (text paired with its 0/1 label)
train_examples = convert_text_to_examples(train_text, trainY)
test_examples = convert_text_to_examples(test_text, testY)

# Convert to features: each call returns input IDs, attention masks and
# segment IDs padded/truncated to max_seq_length, plus labels as one column.
(train_input_ids, train_input_masks, train_segment_ids, trainY_processed
) = convert_examples_to_features(tokenizer, train_examples, max_seq_length=max_seq_length)

(test_input_ids, test_input_masks, test_segment_ids, testY_processed
) = convert_examples_to_features(tokenizer, test_examples, max_seq_length=max_seq_length)

print(f"Text features shape: {train_input_ids.shape}")

### Process Image Data

In [None]:
# Group image base names by file extension, across both image folders.
images = listdir(f'{DATASET_PATH}/images_train/') + listdir(f'{DATASET_PATH}/images_test/')
jpg, png, jpeg, gif = [], [], [], []

valid_extensions = {'jpg', 'png', 'jpeg', 'gif'}
# Lookup table from lower-cased extension to the list collecting its names.
_names_by_extension = {'jpg': jpg, 'png': png, 'jpeg': jpeg, 'gif': gif}
for i in images:
    # Ignore entries without any dot as well as hidden (dot-prefixed) files.
    if '.' in i and not i.startswith('.'):
        pieces = i.split('.')
        # Base name is the text before the first dot; the extension is the
        # text after the last dot, compared case-insensitively.
        name, ext = pieces[0], pieces[-1].lower()
        if ext in valid_extensions:
            _names_by_extension[ext].append(name)

def get_extension_of_file(file_name):
    """Return the dotted extension recorded for an image base name.

    The module-level ``jpg``, ``png`` and ``jpeg`` lists are checked in that
    order; a name found in none of them falls through to ``'.gif'``.
    """
    for known_names, suffix in ((jpg, '.jpg'), (png, '.png'), (jpeg, '.jpeg')):
        if file_name in known_names:
            return suffix
    return '.gif'

print(f"Found: {len(jpg)} jpg, {len(png)} png, {len(jpeg)} jpeg, {len(gif)} gif")

In [None]:
# Build full image paths: <folder>/<image id><recorded extension>
train_image_paths = [
    f'{DATASET_PATH}/images_train/{image_id}{get_extension_of_file(image_id)}'
    for image_id in train_images
]
test_image_paths = [
    f'{DATASET_PATH}/images_test/{image_id}{get_extension_of_file(image_id)}'
    for image_id in test_images
]

In [None]:
# Process images: read and resize every file into one stacked array per split
print("Processing images (this may take a while)...")
train_imagesX = read_and_process_image(train_image_paths)
test_imagesX = read_and_process_image(test_image_paths)

# Convert to (batch, channels, height, width) format: axis 3 is moved to
# position 1, matching the channels-first image Input in get_news_model.
train_imagesX = np.rollaxis(train_imagesX, 3, 1)
test_imagesX = np.rollaxis(test_imagesX, 3, 1)

print(f"Image data shape: {train_imagesX.shape}")
print("✓ Image preprocessing complete")

## 5. Model Training with TPU

In [None]:
# Best hyperparameters (from hyperparameter search)
params_final = dict(
    # Text branch: number and width of the Dense layers after BERT.
    text_no_hidden_layer=1,
    text_hidden_neurons=768,
    # Rate shared by every Dropout layer in the model.
    dropout=0.4,
    # Width of the per-modality representation before concatenation.
    repr_size=32,
    # Image branch: number and width of the Dense layers after VGG19.
    vis_no_hidden_layer=1,
    vis_hidden_neurons=2742,
    # Fusion head: number and width of the Dense layers after concatenation.
    final_no_hidden_layer=1,
    final_hidden_neurons=35,
    # Passed as a class; get_news_model instantiates it with no arguments.
    optimizer=tf.keras.optimizers.Adam,
)

print("Model parameters:")
for k, v in params_final.items():
    print(f"  {k}: {v}")

In [None]:
# Build model within TPU strategy scope so that the model and optimizer are
# created under the strategy chosen by setup_tpu().
with strategy.scope():
    model = get_news_model(params_final)
    # get_news_model instantiates the optimizer with no arguments, so the
    # learning rate is overridden here after compilation.
    model.optimizer.learning_rate.assign(0.0005)
    print(f"Learning rate set to: {model.optimizer.learning_rate.numpy()}")

model.summary()

In [None]:
# Setup checkpoint callback: a file is written only when val_accuracy
# improves (save_best_only with mode='max'); the epoch number and the
# validation accuracy are embedded in the file name.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'model-{epoch:03d}-{val_accuracy:.6f}.h5', 
    verbose=1, 
    monitor='val_accuracy',
    save_best_only=True, 
    mode='max'
)

# Early stopping to prevent overfitting: stop after 5 epochs without a
# val_accuracy improvement and restore the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    restore_best_weights=True
)

print("✓ Callbacks configured")

In [None]:
# Pick the global batch size for the detected accelerator (TPUs work best
# with larger batch sizes). TPU v5e-8 has 8 cores, so the TPU batch size is
# kept divisible by 8.
batch_size = 256 if using_tpu else 128  # 256 = 32 per core * 8 cores
epochs = 10  # same epoch budget on TPU and on GPU/CPU

print("Training configuration:")
print(f"  Batch size: {batch_size}")
print(f"  Epochs: {epochs}")
print(f"  Using: {'TPU' if using_tpu else 'GPU/CPU'}")

In [None]:
# Train the model
print("Starting training...\n")

# Inputs are passed in the order the model's inputs were declared:
# input IDs, attention masks, segment IDs, channels-first images.
# NOTE(review): the test split is used as validation data, and both the
# checkpoint and early-stopping callbacks monitor val_accuracy, so the test
# metrics reported later are not from a held-out set - confirm this is
# intended or carve a validation split out of the training data.
history = model.fit(
    [train_input_ids, train_input_masks, train_segment_ids, train_imagesX], 
    trainY_processed,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    shuffle=True,
    validation_data=(
        [test_input_ids, test_input_masks, test_segment_ids, test_imagesX],
        testY_processed
    ),
    callbacks=[live(), checkpoint, early_stopping]
)

print("\n✓ Training completed!")

In [None]:
# Plot training history: accuracy on the left axis, loss on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# One entry per panel: axis, history keys, legend labels, title, y-label.
_panels = (
    (ax1, 'accuracy', 'val_accuracy', 'Train Accuracy', 'Val Accuracy',
     'Model Accuracy', 'Accuracy'),
    (ax2, 'loss', 'val_loss', 'Train Loss', 'Val Loss',
     'Model Loss', 'Loss'),
)
for _ax, _train_key, _val_key, _train_label, _val_label, _title, _ylabel in _panels:
    _ax.plot(history.history[_train_key], label=_train_label)
    _ax.plot(history.history[_val_key], label=_val_label)
    _ax.set_title(_title)
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.legend()
    _ax.grid(True)

plt.tight_layout()
plt.show()

print(f"Best validation accuracy: {max(history.history['val_accuracy']):.4f}")

## 6. Model Evaluation

In [None]:
# Evaluate current model on the test split
test_predictions = model.predict(
    [test_input_ids, test_input_masks, test_segment_ids, test_imagesX]
)
# Threshold the sigmoid scores at 0.5 (1 = real, 0 = fake), one int per row.
test_predictions_binary = (test_predictions.ravel() >= 0.5).astype(int).tolist()

print("Test Set Evaluation:")
print(f"Accuracy:  {accuracy_score(testY_processed, test_predictions_binary):.4f}")
# average=None makes each scorer return one value per class.
for _caption, _scorer in (
    ("F1 Score:  ", f1_score),
    ("Precision: ", precision_score),
    ("Recall:    ", recall_score),
):
    print(f"{_caption}{_scorer(testY_processed, test_predictions_binary, average=None)}")

## 7. Save Model

In [None]:
# Save the entire model to a single HDF5 file in the working directory
# NOTE(review): the model contains Lambda layers whose functions are defined
# in get_network code of this notebook (get_news_model); reloading presumably
# needs those definitions and globals such as bert_path available - confirm.
model.save('spotfake_tpu_model.h5')
print("✓ Model saved to 'spotfake_tpu_model.h5'")

# To load the model later:
# loaded_model = tf.keras.models.load_model('spotfake_tpu_model.h5', custom_objects={'KerasLayer': hub.KerasLayer})

## 8. Inference Functions

In [None]:
def predict_fake_news(text, image_path, model, tokenizer, threshold=0.5):
    """Classify one post (text plus image) as REAL or FAKE.

    Uses the module-level ``max_seq_length``, ``img_length`` and
    ``img_width`` settings for preprocessing.

    Args:
        text (str): The post text.
        image_path (str): Path to the image file.
        model: Trained Keras model.
        tokenizer: BERT tokenizer.
        threshold (float): Scores at or above this value are labelled
            REAL (default 0.5).

    Returns:
        dict: ``label`` ("REAL" or "FAKE"), ``confidence`` (the score for
        REAL, one minus the score for FAKE), ``raw_score`` (the model
        output), plus the ``text`` and ``image_path`` that were passed in.
    """
    # Text features come back as (input_ids, input_masks, segment_ids).
    text_inputs = preprocess_text_input(
        text, tokenizer, max_seq_length=max_seq_length
    )
    image_data = preprocess_single_image(image_path, length=img_length, width=img_width)

    # Model inputs: the three text arrays followed by the image batch.
    batch = [*text_inputs, image_data]
    prediction = model.predict(batch, verbose=0)[0][0]

    if prediction >= threshold:
        label, confidence = "REAL", prediction
    else:
        label, confidence = "FAKE", 1 - prediction

    return {
        'label': label,
        'confidence': float(confidence),
        'raw_score': float(prediction),
        'text': text,
        'image_path': image_path
    }

print("✓ Inference function ready")

In [None]:
# Example prediction: arbitrary sample text paired with the first test image
sample_text = "Breaking news: Major event happening now!"
sample_image_path = test_image_paths[0]

result = predict_fake_news(sample_text, sample_image_path, model, tokenizer)

# Assemble the report first, then emit it with a single print call.
_rule = "=" * 50
_report_lines = [
    "\n" + _rule,
    "PREDICTION RESULT",
    _rule,
    f"Text: {result['text']}",
    f"Image: {result['image_path']}",
    f"\nPrediction: {result['label']}",
    f"Confidence: {result['confidence']:.2%}",
    f"Raw Score: {result['raw_score']:.4f}",
    _rule,
]
print("\n".join(_report_lines))

# Display the image (OpenCV loads BGR; matplotlib expects RGB)
img = cv2.imread(sample_image_path)
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.figure(figsize=(8, 6))
plt.imshow(img_rgb)
plt.title(f"Prediction: {result['label']} ({result['confidence']:.2%} confidence)")
plt.axis('off')
plt.show()

## Summary

This notebook has been optimized for Kaggle TPU v5e-8:

### Key TPU Optimizations:
1. **TPU Detection & Strategy**: Automatic TPU detection and distribution strategy
2. **Batch Size**: Increased to 256 on TPU (32 per core × 8 cores); 128 on GPU/CPU
3. **Model Scope**: Model created within `strategy.scope()`
4. **Data Paths**: Configured for Kaggle's `/kaggle/input/` structure

### To Use This Notebook:
1. Upload your SpotFake dataset to Kaggle Datasets
2. Update `DATASET_PATH` variable with your dataset name
3. In Notebook Settings: Select **TPU v5e-8** as accelerator
4. Run all cells in order

### Expected Performance:
- Training speed: roughly 3-5x faster than GPU on large batches (a rough estimate; not benchmarked in this notebook)
- Best for: Large datasets with batch sizes ≥256
- Model accuracy: Similar to GPU/CPU training