In [1]:
import kagglehub
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import torch
import re
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split



In [2]:
# Download dataset from Kaggle
path = kagglehub.dataset_download("jruvika/fake-news-detection")

print("Path to dataset files:", path)



Path to dataset files: /Users/rayarawajba/.cache/kagglehub/datasets/jruvika/fake-news-detection/versions/1


In [3]:
# Introduction:
# The following code demonstrates how to process and batch text data for a poem classification task using PyTorch.
# It covers loading and preprocessing the dataset, tokenizing and numericalizing the text, creating a custom Dataset,
# and batching the data with padding for use in neural network models. This setup is essential for training
# deep learning models on variable-length text sequences.

In [4]:
# Load the training dataset into a DataFrame
df = pd.read_csv(path + "/data.csv")
print("Dataset loaded successfully.")

print(df.head())  # Display the first few rows of the dataset

Dataset loaded successfully.
                                                URLs  \
0  http://www.bbc.com/news/world-us-canada-414191...   
1  https://www.reuters.com/article/us-filmfestiva...   
2  https://www.nytimes.com/2017/10/09/us/politics...   
3  https://www.reuters.com/article/us-mexico-oil-...   
4  http://www.cnn.com/videos/cnnmoney/2017/10/08/...   

                                            Headline  \
0         Four ways Bob Corker skewered Donald Trump   
1  Linklater's war veteran comedy speaks to moder...   
2  Trump’s Fight With Corker Jeopardizes His Legi...   
3  Egypt's Cheiron wins tie-up with Pemex for Mex...   
4        Jason Aldean opens 'SNL' with Vegas tribute   

                                                Body  Label  
0  Image copyright Getty Images\nOn Sunday mornin...      1  
1  LONDON (Reuters) - “Last Flag Flying”, a comed...      1  
2  The feud broke into public view last week when...      1  
3  MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin

In [5]:
# Explore the dataset
print(df.info())
print(df.describe())
print(df.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URLs      4009 non-null   object
 1   Headline  4009 non-null   object
 2   Body      3988 non-null   object
 3   Label     4009 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 125.4+ KB
None
             Label
count  4009.000000
mean      0.466949
std       0.498969
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000
Index(['URLs', 'Headline', 'Body', 'Label'], dtype='object')


In [6]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rayarawajba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rayarawajba/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rayarawajba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rayarawajba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Function to preprocess text data
def preprocess_text(text):
    try:
        # Check if text is not NaN and is a string
        if isinstance(text, str):
            # Remove leading and trailing whitespace
            text = text.strip()
            # Remove HTML tags
            text = re.sub(r'<.*?>', '', text)
            # Remove URLs
            text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
            # Remove email addresses
            text = re.sub(r'\S+@\S+', '', text)
            # Remove non-ASCII characters
            text = re.sub(r'[^\x00-\x7F]+', ' ', text)
            # Remove extra whitespace
            text = re.sub(r'\s+', ' ', text)
            # Remove digits
            text = re.sub(r'\d+', '', text)
            # Remove leading and trailing whitespace again
            text = text.strip()
            # Remove special characters
            text = re.sub(r'[^\w\s]', '', text)
            # Remove extra spaces
            text = re.sub(r'\s+', ' ', text)
            # Remove single characters
            text = re.sub(r'\b\w\b', '', text)
            # Convert to lowercase
            text = text.lower()
            # Remove punctuation
            text = text.translate(str.maketrans('', '', string.punctuation))
            # Tokenize the text
            tokens = word_tokenize(text)
            # Remove stopwords
            stop_words = set(stopwords.words('english'))
            tokens = [word for word in tokens if word not in stop_words]
            # Remove numbers
            tokens = [word for word in tokens if not word.isdigit()]
            # Remove extra spaces
            text = ' '.join(tokens)
            # Remove special characters
            text = re.sub(r'\W+', ' ', text)
        else:
            text = ""
    except Exception as e:
        print(f"Error processing text: {text}. Error: {e}")
        text = ""
    return text


In [8]:
# Preprocess the dataset
preprocessed_df = df.copy()
preprocessed_df['Body'] = preprocessed_df['Body'].apply(preprocess_text)
print("Preprocessing complete. Sample preprocessed data:")
print(preprocessed_df['Body'].head())

Preprocessing complete. Sample preprocessed data:
0    image copyright getty images sunday morning do...
1    london reuters last flag flying comedydrama vi...
2    feud broke public view last week mr corker sai...
3    mexico city reuters egypt cheiron holdings lim...
4    country singer jason aldean performing las veg...
Name: Body, dtype: object


In [9]:
# Tokenize the preprocessed text
preprocessed_df['Tokens'] = preprocessed_df['Body'].apply(word_tokenize)
print("Tokenization complete. Sample tokens:")
print(preprocessed_df['Tokens'].head())

Tokenization complete. Sample tokens:
0    [image, copyright, getty, images, sunday, morn...
1    [london, reuters, last, flag, flying, comedydr...
2    [feud, broke, public, view, last, week, mr, co...
3    [mexico, city, reuters, egypt, cheiron, holdin...
4    [country, singer, jason, aldean, performing, l...
Name: Tokens, dtype: object


In [10]:
# Count the total number of tokens in the dataset
tokens = []
for token_list in preprocessed_df['Tokens']:
    tokens.extend(token_list)
print("Total tokens in dataset:", len(tokens))

Total tokens in dataset: 1092149


In [11]:
unique_tokens = list(set(tokens)) # Sorting here is optional but ensures consistent ID assignment
PAD_ID = 0
UNK_ID = 1
# Create a mapping from tokens to IDs
# This mapping will be used to convert tokens to numerical IDs for model input
# Ensure that special tokens are included in the mapping
word_to_id = {"<pad>": PAD_ID, "<unk>": UNK_ID} # Special token IDs
next_id = 2 

for token in unique_tokens:
    if token not in word_to_id: # Ensure special tokens are not overwritten if they happen to be in the text
        word_to_id[token] = next_id
        next_id += 1

vocab_size = len(word_to_id)
print(f"Vocabulary Size: {vocab_size}")
# print(word_to_id)

# id_to_word mapping for debugging/reverse lookup
id_to_word = {v: k for k, v in word_to_id.items()}

Vocabulary Size: 49205


In [12]:
preprocessed_df['Numerical_Tokens'] = preprocessed_df['Tokens'].apply(
    lambda token_list: [word_to_id.get(token, word_to_id["<unk>"]) for token in token_list]
)

print(preprocessed_df[['Tokens', 'Numerical_Tokens']])

#  Numericalize the 'Label' column in the DataFrame ---
# Get all unique labels
unique_labels = sorted(preprocessed_df['Label'].unique().tolist()) 

# Create a mapping from string label to integer ID
label_to_id = {label: i for i, label in enumerate(unique_labels)}
print(label_to_id)
# Create a mapping from integer ID to string label
id_to_label = {i: label for label, i in label_to_id.items()}
num_classes = len(label_to_id)

preprocessed_df['Numerical_Label'] = preprocessed_df['Label'].map(label_to_id)


                                                 Tokens  \
0     [image, copyright, getty, images, sunday, morn...   
1     [london, reuters, last, flag, flying, comedydr...   
2     [feud, broke, public, view, last, week, mr, co...   
3     [mexico, city, reuters, egypt, cheiron, holdin...   
4     [country, singer, jason, aldean, performing, l...   
...                                                 ...   
4004  [trends, watch, readers, think, story, fact, a...   
4005  [trump, jr, soon, give, minute, speech, reader...   
4006                                                 []   
4007  [shanghai, reuters, china, said, plans, accept...   
4008  [vice, president, mike, pence, leaves, nfl, ga...   

                                       Numerical_Tokens  
0     [19187, 20100, 19944, 29186, 94, 10804, 37021,...  
1     [40821, 28437, 27032, 42569, 39144, 42717, 245...  
2     [36544, 39399, 43034, 35529, 27032, 27545, 419...  
3     [25052, 8186, 28437, 23970, 13108, 29580, 4691...  
4

In [13]:
class TextDataset(Dataset):
    def __init__(self, numerical_tokens, numerical_labels):
        self.numerical_tokens = numerical_tokens
        self.numerical_labels = numerical_labels

    def __len__(self):
        return len(self.numerical_tokens)

    def __getitem__(self, idx):
        # Return numerical tokens as a PyTorch tensor and the label
        # We convert to tensor here, but padding happens in collate_fn
        token_ids = torch.tensor(self.numerical_tokens.iloc[idx], dtype=torch.long)
        label = torch.tensor(self.numerical_labels.iloc[idx], dtype=torch.long)
        return token_ids, label

In [14]:
# min_required_seq_len = max(model.kernel_sizes)
def collate_fn(batch):
    # `batch` is a list of tuples, where each tuple is (token_ids, label)
    # e.g., [(tensor([3,2,4]), 0), (tensor([5,6,7,8]), 1)]

    # Separate token IDs and labels
    token_ids_list = [item[0] for item in batch]
    labels_list = [item[1] for item in batch]

    # Pad the token_ids to the max length in the current batch
    # `batch_first=True` means the output tensor will be (batch_size, sequence_length)
    padded_token_ids = pad_sequence(token_ids_list,
                                    batch_first=True,
                                    padding_value=PAD_ID) 

    # Stack labels into a single tensor
    labels = torch.stack(labels_list)

    return padded_token_ids, labels

In [15]:
train_df, test_df = train_test_split(
    preprocessed_df,
    test_size=0.2,    # 20% for testing
    random_state=42,  # A common seed for reproducibility
    stratify=preprocessed_df['Label'] # Stratify by the original string label column for balanced splits
)

In [16]:
train_dataset = TextDataset(train_df['Numerical_Tokens'], train_df['Numerical_Label'])
test_dataset = TextDataset(test_df['Numerical_Tokens'], test_df['Numerical_Label'])

In [17]:
batch_size = 4 # Choose your batch size
train_dataloader = DataLoader(train_dataset,
                        batch_size=batch_size,
                        shuffle=True, # Shuffle for training
                        collate_fn=collate_fn)

print(f"\nDataLoader created with batch_size={batch_size}. Iterating through a few batches:")

# --- 4. Iterate through the DataLoader to see the padded batches ---
for i, (batch_tokens, batch_labels) in enumerate(train_dataloader):
    print(f"\n--- Batch {i+1} ---")
    print("Padded Token IDs (shape, content):")
    print(batch_tokens.shape)
    print(batch_tokens)
    print("Labels (shape, content):")
    print(batch_labels.shape)
    print(batch_labels)

    if i >= 1: # Just show a couple of batches
        break

# This `batch_tokens` tensor (e.g., shape [batch_size, max_seq_len_in_batch])
# is what you directly feed into your PyTorch nn.Embedding layer.


DataLoader created with batch_size=4. Iterating through a few batches:

--- Batch 1 ---
Padded Token IDs (shape, content):
torch.Size([4, 221])
tensor([[ 2386, 42569, 38431, 15803, 44974, 34175, 43518, 41755, 19374,  6590,
         46658, 28149, 47506, 46552,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0, 

In [18]:
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, pad_idx):
        super(TextCNN, self).__init__()

        # 1. Embedding Layer
        # pad_idx tells the embedding layer to not update the embedding for this index (PAD_ID)
        # and it will output zeros for that index.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # 2. Convolutional Layers (usually multiple filters with different kernel sizes)
        # These capture n-gram features of different lengths
        self.kernel_sizes = [3, 4, 5] # Example: capture 3-gram, 4-gram, 5-gram features
        self.num_filters = 100        # Number of filters (feature detectors) per kernel size

        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, # Input channels are the embedding dimension
                      out_channels=self.num_filters,
                      kernel_size=k)
            for k in self.kernel_sizes
        ])

        # 3. Fully Connected (Dense) Layer for classification
        # Sum of num_filters for each kernel size, as we concatenate their outputs
        self.fc = nn.Linear(len(self.kernel_sizes) * self.num_filters, num_classes)

        # Dropout for regularization (to prevent overfitting)
        self.dropout = nn.Dropout(0.5) # Example dropout rate

    def forward(self, text):
        # text shape: (batch_size, sequence_length)

        # Pass through embedding layer
        embedded = self.embedding(text)
        # embedded shape: (batch_size, sequence_length, embedding_dim)

        # PyTorch Conv1d expects input in (batch_size, channels, sequence_length)
        # So we permute the dimensions
        embedded = embedded.permute(0, 2, 1)
        # embedded shape: (batch_size, embedding_dim, sequence_length)

        # Apply convolutions and ReLU activation
        # For each conv layer, apply it, then apply ReLU, then apply global max pooling
        # The pooling operation extracts the most important feature from each filter's output
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        # conved[i] shape: (batch_size, num_filters, output_sequence_length)

        # Apply global max pooling over the sequence dimension
        # This takes the maximum value from each filter's output across the entire sequence
        pooled = [F.max_pool1d(c, c.shape[2]).squeeze(2) for c in conved]
        # pooled[i] shape: (batch_size, num_filters)

        # Concatenate the pooled outputs from all kernel sizes
        cat = self.dropout(torch.cat(pooled, dim=1))
        # cat shape: (batch_size, num_filters * len(kernel_sizes))

        # Pass through the fully connected layer
        output = self.fc(cat)
        # output shape: (batch_size, num_classes)

        return output

In [19]:
# Define the model
embedding_dim = 100 
model = TextCNN(vocab_size, embedding_dim, num_classes, PAD_ID)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [20]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train() # Set model to training mode
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad() # Clear previous gradients
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

In [21]:
num_epochs = 5 # Set the number of epochs for training
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
loss: 1.575658 [    4/ 3207]
loss: 0.289116 [  404/ 3207]
loss: 0.500863 [  804/ 3207]
loss: 1.317698 [ 1204/ 3207]
loss: 0.140407 [ 1604/ 3207]
loss: 0.212157 [ 2004/ 3207]
loss: 0.678218 [ 2404/ 3207]
loss: 4.055592 [ 2804/ 3207]
loss: 0.002178 [ 3204/ 3207]
Epoch 2
-------------------------------
loss: 2.183415 [    4/ 3207]
loss: 0.000081 [  404/ 3207]
loss: 2.028860 [  804/ 3207]
loss: 0.008639 [ 1204/ 3207]
loss: 0.233213 [ 1604/ 3207]
loss: 0.770672 [ 2004/ 3207]
loss: 0.003216 [ 2404/ 3207]
loss: 0.010467 [ 2804/ 3207]
loss: 0.591287 [ 3204/ 3207]
Epoch 3
-------------------------------
loss: 2.396940 [    4/ 3207]
loss: 0.358429 [  404/ 3207]
loss: 0.019943 [  804/ 3207]
loss: 0.647841 [ 1204/ 3207]
loss: 0.000765 [ 1604/ 3207]
loss: 0.005887 [ 2004/ 3207]
loss: 0.001492 [ 2404/ 3207]
loss: 4.329236 [ 2804/ 3207]
loss: 0.000229 [ 3204/ 3207]
Epoch 4
-------------------------------
loss: 2.681771 [    4/ 3207]
loss: 0.016275 [  404/ 3207]

In [22]:
def evaluate_model(model, dataloader, device):
    model.eval() # Ensure model is in evaluation mode
    total_loss = 0
    all_preds = []
    all_labels = []

    # No gradient calculation needed during evaluation
    with torch.no_grad():
        for batch_tokens, batch_labels in dataloader:
            batch_tokens = batch_tokens.to(device)
            batch_labels = batch_labels.to(device)

            # Forward pass
            outputs = model(batch_tokens)

            # Calculate loss (optional for testing, but good for understanding)
            loss_fn = nn.CrossEntropyLoss() # Use the same loss function as training
            loss = loss_fn(outputs, batch_labels)
            total_loss += loss.item()

            # Get predictions (the class with the highest probability/logit)
            _, predicted = torch.max(outputs.data, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_preds)
    
    # Generate a classification report (precision, recall, f1-score per class)
    report = classification_report(all_labels, all_preds, target_names=list(label_to_id.keys()), output_dict=True)
    
    # Generate confusion matrix
    cm = confusion_matrix(all_labels, all_preds)

    return avg_loss, accuracy, report, cm

In [23]:
# Run the evaluation
train_loss, train_accuracy, train_report, train_confusion_matrix = evaluate_model(model, train_dataloader, device=torch.device('cpu'))

print(f"\n--- Test Results on Training Dataset ---")
print(f"Test Loss: {train_loss:.4f}")
print(f"Test Accuracy: {train_accuracy:.4f}")
print("\nClassification Report:")
# Convert dict to string for pretty print
print(pd.DataFrame(train_report).transpose())
print("\nConfusion Matrix:")
print(train_confusion_matrix)


--- Test Results on Training Dataset ---
Test Loss: 0.0810
Test Accuracy: 0.9738

Classification Report:
              precision    recall  f1-score      support
0              0.997551  0.953189  0.974865  1709.000000
1              0.949174  0.997330  0.972656  1498.000000
accuracy       0.973807  0.973807  0.973807     0.973807
macro avg      0.973362  0.975259  0.973761  3207.000000
weighted avg   0.974954  0.973807  0.973833  3207.000000

Confusion Matrix:
[[1629   80]
 [   4 1494]]


In [24]:
# The following code demonstrates how to evaluate a trained TextCNN model on a test dataset using PyTorch.
# It sets up a DataLoader for batching and padding, runs the evaluation loop, and prints out key metrics
# such as test loss, accuracy, classification report, and confusion matrix. This process helps assess
# the model's performance on unseen data and provides insights into its predictive capabilities.

In [25]:
# Instantiate your dataloader for the test dataset
test_batch_size = 8 
test_dataloader = DataLoader(test_dataset,
                             batch_size=test_batch_size,
                             shuffle=False, 
                             collate_fn=collate_fn)

print("Test DataLoader ready.")

Test DataLoader ready.


In [26]:
# Run the evaluation
test_loss, test_accuracy, test_report, test_confusion_matrix = evaluate_model(model, test_dataloader, device=torch.device('cpu'))

print(f"\n--- Test Results ---")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
# Convert dict to string for pretty print
print(pd.DataFrame(test_report).transpose())
print("\nConfusion Matrix:")
print(test_confusion_matrix)


--- Test Results ---
Test Loss: 0.1926
Test Accuracy: 0.9514

Classification Report:
              precision    recall  f1-score     support
0              0.997442  0.911215  0.952381  428.000000
1              0.907543  0.997326  0.950318  374.000000
accuracy       0.951372  0.951372  0.951372    0.951372
macro avg      0.952493  0.954271  0.951350  802.000000
weighted avg   0.955519  0.951372  0.951419  802.000000

Confusion Matrix:
[[390  38]
 [  1 373]]


In [27]:
# Export the trained model to a file
torch.save(model.state_dict(), "textcnn_model.pth")
print("Model exported to textcnn_model.pth")

Model exported to textcnn_model.pth


In [28]:
import pickle

# Save the word_to_id mapping
with open('word_to_id.pkl', 'wb') as f:
    pickle.dump(word_to_id, f)
# Save the id_to_label mapping
with open('id_to_label.pkl', 'wb') as f:
    pickle.dump(id_to_label, f)