In [1]:
import os
import pickle
import re
import string

import kagglehub
import nltk
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader



In [2]:
# Download dataset from Kaggle
# The returned value is the local directory that holds the downloaded files; the loading
# cell below builds the CSV path from `path`.
path = kagglehub.dataset_download("muhammadimran112233/liar-twitter-dataset")

print("Path to dataset files:", path)



Path to dataset files: /Users/rayarawajba/.cache/kagglehub/datasets/muhammadimran112233/liar-twitter-dataset/versions/1


In [3]:
# Introduction:
# The following code demonstrates how to process and batch text data for a statement-veracity
# (fake-news) classification task on the LIAR dataset using PyTorch.
# It covers loading and preprocessing the dataset, tokenizing and numericalizing the text, creating a custom Dataset,
# and batching the data with padding for use in neural network models. This setup is essential for training
# deep learning models on variable-length text sequences.

In [4]:
# Load the dataset CSV from the download directory into a DataFrame.
# os.path.join builds the file path portably instead of hand-concatenating a "/" separator.
df = pd.read_csv(os.path.join(path, "Liar_Dataset.csv"))
print("Dataset loaded successfully.")

print(df.head())  # Display the first few rows of the dataset


Dataset loaded successfully.
    [ID].json       label                                          statement  \
0  11972.json        TRUE  Building a wall on the U.S.-Mexico border will...   
1  11685.json       FALSE  Wisconsin is on pace to double the number of l...   
2  11096.json       FALSE  Says John McCain has done nothing to help the ...   
3   5209.json   half-true  Suzanne Bonamici supports a plan that will cut...   
4   9524.json  pants-fire  When asked by a reporter whether hes at the ce...   

                                          subject(s)  \
0                                        immigration   
1                                               jobs   
2                    military,veterans,voting-record   
3  medicare,message-machine-2012,campaign-adverti...   
4  campaign-finance,legal-issues,campaign-adverti...   

                            speaker   speaker's job title state info  \
0                        rick-perry              Governor      Texas   
1        

In [5]:
# Explore the dataset: per-column dtypes and non-null counts, summary statistics, column names.
# DataFrame.info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() appended a stray "None" line to the output.
df.info()
print(df.describe())
print(df.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12787 entries, 0 to 12786
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   [ID].json             12787 non-null  object
 1   label                 12787 non-null  object
 2   statement             12787 non-null  object
 3   subject(s)            12787 non-null  object
 4   speaker               12787 non-null  object
 5   speaker's job title   9221 non-null   object
 6   state info            10038 non-null  object
 7   party affiliation     12787 non-null  object
 8   barely true counts    12787 non-null  int64 
 9   false counts          12787 non-null  int64 
 10  half true counts      12787 non-null  int64 
 11  mostly true counts    12787 non-null  int64 
 12  pants on fire counts  12787 non-null  int64 
 13  venue                 12658 non-null  object
dtypes: int64(5), object(9)
memory usage: 1.4+ MB
None
       barely true counts  false cou

In [6]:
# Download NLTK resources.
# A single loop replaces four copy-pasted calls and, because the cell no longer ends in an
# expression, stops the last call's return value (True) from being echoed as cell output.
# NOTE(review): 'wordnet' is not referenced by any cell visible in this notebook - confirm it is still needed.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rayarawajba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rayarawajba/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rayarawajba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rayarawajba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    cached = getattr(_english_stop_words, "_cached", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cached = cached
    return cached


# Function to preprocess text data
def preprocess_text(text):
    """Clean one raw statement and return it as a space-separated string of tokens.

    The cleaning steps run in a fixed order: strip markup/URLs/e-mail addresses, drop
    non-ASCII characters, digits, punctuation and single-character words, lowercase,
    tokenize with NLTK and remove English stop words.

    Args:
        text: The raw statement. Any value that is not a ``str`` (for example NaN for a
            missing cell) is treated as empty.

    Returns:
        The cleaned text, or ``""`` when the input is not a string or when any step
        raises (the error is printed rather than propagated, so a bad row does not
        abort a whole ``Series.apply``).
    """
    try:
        # Check if text is not NaN and is a string
        if not isinstance(text, str):
            return ""

        # Remove leading and trailing whitespace
        text = text.strip()
        # Remove HTML tags
        text = re.sub(r'<.*?>', '', text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)
        # Replace runs of non-ASCII characters with a single space
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
        # Collapse whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove digits
        text = re.sub(r'\d+', '', text)
        text = text.strip()
        # Remove everything that is neither a word character nor whitespace
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text)
        # Remove single-character words
        text = re.sub(r'\b\w\b', '', text)
        # Convert to lowercase
        text = text.lower()
        # Strip any remaining punctuation; this also removes underscores, which \w kept above
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Tokenize, then filter. The stop-word set is fetched from a cache: loading it
        # from the corpus on every call repeated the same work for each row.
        stop_words = _english_stop_words()
        tokens = [
            word for word in word_tokenize(text)
            if word not in stop_words and not word.isdigit()
        ]

        # Re-join with single spaces and normalise any leftover non-word characters
        text = re.sub(r'\W+', ' ', ' '.join(tokens))
    except Exception as e:
        print(f"Error processing text: {text}. Error: {e}")
        text = ""
    return text


In [8]:
# Build the working frame: a copy of the raw data whose 'statement' column holds the cleaned
# text. `df` itself is left untouched.
preprocessed_df = df.assign(statement=df['statement'].apply(preprocess_text))
print("Preprocessing complete. Sample preprocessed data:")
print(preprocessed_df['statement'].head())

Preprocessing complete. Sample preprocessed data:
0    building wall usmexico border take literally y...
1            wisconsin pace double number layoffs year
2              says john mccain done nothing help vets
3    suzanne bonamici supports plan cut choice medi...
4    asked reporter whether hes center criminal sch...
Name: statement, dtype: object


In [9]:
# Split every cleaned statement into a list of word tokens, stored in a new 'Tokens' column
preprocessed_df['Tokens'] = preprocessed_df['statement'].map(word_tokenize)
print("Tokenization complete. Sample tokens:")
print(preprocessed_df['Tokens'].head())

Tokenization complete. Sample tokens:
0    [building, wall, usmexico, border, take, liter...
1     [wisconsin, pace, double, number, layoffs, year]
2      [says, john, mccain, done, nothing, help, vets]
3    [suzanne, bonamici, supports, plan, cut, choic...
4    [asked, reporter, whether, hes, center, crimin...
Name: Tokens, dtype: object


In [10]:
# Flatten the per-statement token lists into one list and report the overall token count
tokens = [
    token
    for statement_tokens in preprocessed_df['Tokens']
    for token in statement_tokens
]
print("Total tokens in dataset:", len(tokens))

Total tokens in dataset: 134804


In [11]:
# Sort the unique tokens so every token receives the same ID on every run. Iterating a bare
# set of strings is not order-stable between interpreter sessions (string hashing is
# randomized), which made the ID assignment differ from one kernel restart to the next.
unique_tokens = sorted(set(tokens))
PAD_ID = 0
UNK_ID = 1
# Mapping from token to integer ID, used to turn token lists into model input.
# The special tokens are registered first so they always own IDs 0 and 1.
word_to_id = {"<pad>": PAD_ID, "<unk>": UNK_ID}  # Special token IDs
next_id = 2

for token in unique_tokens:
    if token not in word_to_id:  # Never overwrite a special token that happens to occur in the text
        word_to_id[token] = next_id
        next_id += 1

vocab_size = len(word_to_id)
print(f"Vocabulary Size: {vocab_size}")

# id_to_word mapping for debugging/reverse lookup
id_to_word = {v: k for k, v in word_to_id.items()}

Vocabulary Size: 13669


In [12]:
# Encode each token list as vocabulary IDs; a token missing from the vocabulary maps to <unk>
preprocessed_df['Numerical_Tokens'] = preprocessed_df['Tokens'].map(
    lambda words: [word_to_id.get(word, word_to_id["<unk>"]) for word in words]
)

# Encode the string labels as integers. Sorting the distinct labels fixes which ID each
# label receives.
unique_labels = sorted(preprocessed_df['label'].unique().tolist())

# label -> ID, and the inverse ID -> label for decoding predictions
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))
print(label_to_id)
id_to_label = dict(enumerate(unique_labels))
num_classes = len(label_to_id)

preprocessed_df['Numerical_Label'] = preprocessed_df['label'].map(label_to_id)



{'FALSE': 0, 'TRUE': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'pants-fire': 5}


In [13]:
class TextDataset(Dataset):
    """Dataset that pairs each numericalized statement with its integer label.

    Both constructor arguments are indexed positionally with ``.iloc``, so they are
    expected to be pandas Series of equal length.
    """

    def __init__(self, numerical_tokens, numerical_labels):
        self.numerical_tokens = numerical_tokens
        self.numerical_labels = numerical_labels

    @staticmethod
    def _to_long_tensor(value):
        """Wrap a Python/NumPy value in an int64 tensor."""
        return torch.tensor(value, dtype=torch.long)

    def __len__(self):
        return len(self.numerical_tokens)

    def __getitem__(self, idx):
        # Sequences are returned at their own length; padding to a common length is left
        # to the DataLoader's collate function.
        return (
            self._to_long_tensor(self.numerical_tokens.iloc[idx]),
            self._to_long_tensor(self.numerical_labels.iloc[idx]),
        )

In [14]:
kernel_sizes = [3, 4, 5]  # Convolution widths the batches must be long enough for
min_required_seq_len = max(kernel_sizes)  # Shortest sequence length a batch may have


def collate_fn(batch):
    """Combine (token_ids, label) samples into one padded batch.

    Every sequence is first extended with PAD_ID up to ``min_required_seq_len``; the
    sequences are then right-padded with PAD_ID to the longest one in the batch.

    Args:
        batch: Sequence of samples, each indexable as (token_ids tensor, label tensor).

    Returns:
        Tuple ``(padded_token_ids, labels)``: the padded IDs with the batch dimension
        first, and the labels stacked into a single tensor.
    """

    def at_least_min_length(seq):
        # Append PAD_ID entries only when the sequence is shorter than the minimum
        shortfall = min_required_seq_len - len(seq)
        if shortfall <= 0:
            return seq
        filler = torch.full((shortfall,), PAD_ID, dtype=seq.dtype)
        return torch.cat([seq, filler], dim=0)

    sequences = [at_least_min_length(sample[0]) for sample in batch]
    targets = [sample[1] for sample in batch]

    # The batch's longest sequence may exceed min_required_seq_len; pad everything to it
    padded_token_ids = pad_sequence(sequences, batch_first=True, padding_value=PAD_ID)
    labels = torch.stack(targets)
    return padded_token_ids, labels

In [15]:
# Hold out 20% of the rows for testing. Stratifying on the original string labels keeps the
# class proportions alike in both parts; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    preprocessed_df,
    stratify=preprocessed_df['label'],
    random_state=42,
    test_size=0.2,
)

In [16]:
# Wrap each split's numericalized tokens and labels in a TextDataset
train_dataset, test_dataset = (
    TextDataset(split['Numerical_Tokens'], split['Numerical_Label'])
    for split in (train_df, test_df)
)

In [17]:
batch_size = 4  # Number of samples per training batch
train_dataloader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,  # pads each batch to a common length
    shuffle=True,           # reshuffle the training samples every epoch
    batch_size=batch_size,
)

print(f"\nDataLoader created with batch_size={batch_size}. Iterating through a few batches:")

# Preview the first two padded batches. Pairing with range(2) stops the loop after the
# second batch without requesting a third one from the loader.
for i, (batch_tokens, batch_labels) in zip(range(2), train_dataloader):
    print(f"\n--- Batch {i+1} ---")
    print("Padded Token IDs (shape, content):")
    print(batch_tokens.shape, batch_tokens, sep="\n")
    print("Labels (shape, content):")
    print(batch_labels.shape, batch_labels, sep="\n")

# A `batch_tokens` tensor of shape [batch_size, max_seq_len_in_batch] is exactly what gets
# fed into the model's nn.Embedding layer.


DataLoader created with batch_size=4. Iterating through a few batches:

--- Batch 1 ---
Padded Token IDs (shape, content):
torch.Size([4, 14])
tensor([[  118,  7110,  8900,  5402, 12407,  6090,  3263,     0,     0,     0,
             0,     0,     0,     0],
        [ 2617,  4834,  1243,  7493,  2256, 10992,  3474,  7345,  9205,  1224,
         10414,     0,     0,     0],
        [13147,  2745,  5316,  2151,  8009,  4751, 13083,  3309,  3205,  4816,
          6778,  7139,  7256, 13378],
        [ 8904,   696,  6757, 10799,  5406,  4352,  3086, 12845,  4401,  6150,
           510,     0,     0,     0]])
Labels (shape, content):
torch.Size([4])
tensor([4, 2, 0, 3])

--- Batch 2 ---
Padded Token IDs (shape, content):
torch.Size([4, 13])
tensor([[ 2106,  9069,  3319, 12392, 12628,  7110, 12283, 11233,  5424,     0,
             0,     0,     0],
        [10806, 11216,   759,   962,  1455,  8721, 12386, 12699,     0,     0,
             0,     0,     0],
        [13147, 10449,  9944,  63

In [None]:
class TextCNN(nn.Module):
    """Convolutional text classifier.

    Pipeline: embedding -> one Conv1d branch per kernel size -> ReLU -> max over the
    sequence dimension -> concatenate -> dropout -> linear layer producing class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        num_classes: Number of output logits.
        pad_idx: ID of the padding token; its embedding is not updated by training.
        kernel_sizes: Widths of the parallel convolutions (default 3-, 4- and 5-grams).
        num_filters: Output channels of every convolution.
        dropout_rate: Dropout probability applied before the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, pad_idx,
                 kernel_sizes=(3, 4, 5), num_filters=100, dropout_rate=0.5):
        super().__init__()

        # 1. Embedding layer; padding_idx keeps the padding token's vector out of training
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # 2. Convolutional layers, one per n-gram width
        self.kernel_sizes = list(kernel_sizes)
        self.num_filters = num_filters

        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim,  # channels are the embedding dimensions
                      out_channels=self.num_filters,
                      kernel_size=k)
            for k in self.kernel_sizes
        ])

        # 3. Classifier over the concatenated pooled features of all branches
        self.fc = nn.Linear(len(self.kernel_sizes) * self.num_filters, num_classes)

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, text):
        """Compute class logits for a batch of token IDs.

        Args:
            text: Integer tensor of shape (batch_size, sequence_length).

        Returns:
            Tensor of shape (batch_size, num_classes) holding unnormalized logits.
        """
        # A convolution cannot run on a sequence shorter than its kernel. Right-pad short
        # batches with the padding ID so the model does not depend on the caller having
        # padded the batch to the widest kernel beforehand.
        min_len = max(self.kernel_sizes)
        if text.size(1) < min_len:
            pad_value = self.embedding.padding_idx
            if pad_value is None:
                pad_value = 0
            text = F.pad(text, (0, min_len - text.size(1)), value=pad_value)

        embedded = self.embedding(text)
        # embedded shape: (batch_size, sequence_length, embedding_dim)

        # Conv1d expects (batch_size, channels, sequence_length)
        embedded = embedded.permute(0, 2, 1)

        # conved[i] shape: (batch_size, num_filters, output_sequence_length)
        conved = [F.relu(conv(embedded)) for conv in self.convs]

        # Global max pooling keeps each filter's strongest response over the sequence
        # pooled[i] shape: (batch_size, num_filters)
        pooled = [F.max_pool1d(c, c.shape[2]).squeeze(2) for c in conved]

        # cat shape: (batch_size, num_filters * len(kernel_sizes))
        cat = self.dropout(torch.cat(pooled, dim=1))

        # output shape: (batch_size, num_classes)
        return self.fc(cat)

In [None]:
# Define the model
# Seed PyTorch's global RNG before building the model so the initial weights are the same on
# every run; 42 matches the seed already used for the train/test split.
torch.manual_seed(42)
embedding_dim = 50  # Size of each learned word vector
model = TextCNN(vocab_size, embedding_dim, num_classes, PAD_ID)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [20]:
def train_loop(dataloader, model, loss_fn, optimizer, device=None):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Yields ``(X, y)`` batches; must expose ``.dataset`` for the progress total.
        model: Module to train; switched to training mode here.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Optional device each batch is moved to before the forward pass. The
            model itself is not moved; the caller must already have placed it there.
            ``None`` (the default) leaves the batches where they are.

    Returns:
        The mean of the per-batch losses as a float, or NaN if the dataloader yielded
        no batches.
    """
    size = len(dataloader.dataset)
    model.train()  # Set model to training mode
    running_loss = 0.0
    num_batches = 0
    for batch, (X, y) in enumerate(dataloader):
        if device is not None:
            X, y = X.to(device), y.to(device)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()  # Clear previous gradients
        loss.backward()
        optimizer.step()

        # Keep the scalar under its own name instead of rebinding `loss` to a float
        batch_loss = loss.item()
        running_loss += batch_loss
        num_batches += 1

        if batch % 100 == 0:
            current = (batch + 1) * len(X)
            print(f"loss: {batch_loss:>7f} [{current:>5d}/{size:>5d}]")

    return running_loss / num_batches if num_batches else float("nan")

In [21]:
num_epochs = 20 # Number of full passes over the training data
# Each iteration prints an epoch header and runs one pass of train_loop over
# train_dataloader, reusing the same model, loss function and optimizer throughout.
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
loss: 1.693548 [    4/10229]
loss: 2.288349 [  404/10229]
loss: 2.215557 [  804/10229]
loss: 2.210623 [ 1204/10229]
loss: 2.546705 [ 1604/10229]
loss: 1.176772 [ 2004/10229]
loss: 1.731690 [ 2404/10229]
loss: 1.651572 [ 2804/10229]
loss: 2.712379 [ 3204/10229]
loss: 2.136947 [ 3604/10229]
loss: 3.821948 [ 4004/10229]
loss: 1.969606 [ 4404/10229]
loss: 1.749651 [ 4804/10229]
loss: 1.681575 [ 5204/10229]
loss: 3.037511 [ 5604/10229]
loss: 2.095672 [ 6004/10229]
loss: 1.766688 [ 6404/10229]
loss: 1.728169 [ 6804/10229]
loss: 2.288031 [ 7204/10229]
loss: 1.140062 [ 7604/10229]
loss: 2.990820 [ 8004/10229]
loss: 2.337970 [ 8404/10229]
loss: 1.420963 [ 8804/10229]
loss: 2.042612 [ 9204/10229]
loss: 2.419693 [ 9604/10229]
loss: 1.982548 [10004/10229]
Epoch 2
-------------------------------
loss: 3.155347 [    4/10229]
loss: 2.978479 [  404/10229]
loss: 3.180850 [  804/10229]
loss: 2.707742 [ 1204/10229]
loss: 2.768564 [ 1604/10229]
loss: 1.818096 [ 2004

In [22]:
def evaluate_model(model, dataloader, device):
    """Evaluate ``model`` on every batch of ``dataloader`` and collect metrics.

    Args:
        model: Classifier returning one row of logits per sample. It is switched to
            evaluation mode but not moved; it must already live on ``device``.
        dataloader: Yields ``(token_ids, labels)`` batches.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Tuple ``(avg_loss, accuracy, report, cm)``: the mean of the per-batch
        cross-entropy losses, the overall accuracy, the classification report as a dict
        and the confusion matrix. Report rows and matrix axes follow the class order of
        the module-level ``label_to_id`` mapping.
    """
    model.eval()  # Ensure model is in evaluation mode
    # Built once; the loss module holds no per-batch state
    loss_fn = nn.CrossEntropyLoss()
    total_loss = 0
    all_preds = []
    all_labels = []

    # No gradient calculation needed during evaluation
    with torch.no_grad():
        for batch_tokens, batch_labels in dataloader:
            batch_tokens = batch_tokens.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_tokens)
            total_loss += loss_fn(outputs, batch_labels).item()

            # Predicted class = index of the largest logit
            predicted = outputs.argmax(dim=1)

            all_preds.extend(predicted.cpu().tolist())
            all_labels.extend(batch_labels.cpu().tolist())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_preds)

    # Passing the class IDs explicitly keeps the report rows and matrix shape fixed at one
    # entry per known class, even when a class never occurs in this split's labels or
    # predictions (otherwise target_names no longer lines up with the inferred classes).
    class_names = list(label_to_id.keys())
    class_ids = list(label_to_id.values())

    # Classification report (precision, recall, f1-score per class)
    report = classification_report(all_labels, all_preds, labels=class_ids,
                                   target_names=class_names, output_dict=True)

    # Confusion matrix
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

    return avg_loss, accuracy, report, cm

In [23]:
# Run the evaluation on the training split, to see how well the model fits data it has seen.
train_loss, train_accuracy, train_report, train_confusion_matrix = evaluate_model(model, train_dataloader, device=torch.device('cpu'))

# These are training-set metrics, so they are labelled "Train" rather than "Test"
print("\n--- Results on Training Dataset ---")
print(f"Train Loss: {train_loss:.4f}")
print(f"Train Accuracy: {train_accuracy:.4f}")
print("\nClassification Report:")
# Turn the report dict into a DataFrame (one row per class/average) for tabular printing
print(pd.DataFrame(train_report).transpose())
print("\nConfusion Matrix:")
print(train_confusion_matrix)


--- Test Results on Training Dataset ---
Test Loss: 0.2864
Test Accuracy: 0.9203

Classification Report:
              precision    recall  f1-score       support
FALSE          0.870177  0.957064  0.911555   2003.000000
TRUE           0.927536  0.896468  0.911737   1642.000000
barely-true    0.962155  0.891795  0.925640   1682.000000
half-true      0.929303  0.907187  0.918112   2101.000000
mostly-true    0.924365  0.927662  0.926011   1963.000000
pants-fire     0.928987  0.952267  0.940483    838.000000
accuracy       0.920325  0.920325  0.920325      0.920325
macro avg      0.923754  0.922074  0.922256  10229.000000
weighted avg   0.921870  0.920325  0.920391  10229.000000

Confusion Matrix:
[[1917   13   20   26   16   11]
 [  64 1472   11   47   33   15]
 [  87   16 1500   32   35   12]
 [  72   36   11 1906   60   16]
 [  46   44   10   35 1821    7]
 [  17    6    7    5    5  798]]


In [24]:
# The following code demonstrates how to evaluate a trained TextCNN model on a test dataset using PyTorch.
# It sets up a DataLoader for batching and padding, runs the evaluation loop, and prints out key metrics
# such as test loss, accuracy, classification report, and confusion matrix. This process helps assess
# the model's performance on unseen data and provides insights into its predictive capabilities.

In [25]:
# Build the DataLoader for the held-out test split. Order is kept fixed (no shuffling) and
# batches are padded by the same collate function used for training.
test_batch_size = 8
test_dataloader = DataLoader(
    test_dataset,
    collate_fn=collate_fn,
    shuffle=False,
    batch_size=test_batch_size,
)

print("Test DataLoader ready.")

Test DataLoader ready.


In [26]:
# Evaluate the trained model on the held-out test split (on CPU)
test_loss, test_accuracy, test_report, test_confusion_matrix = evaluate_model(
    model,
    test_dataloader,
    device=torch.device('cpu'),
)

print(f"\n--- Test Results ---")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
# The report dict is laid out as a table with one row per class/average
print(pd.DataFrame(test_report).T)
print("\nConfusion Matrix:", test_confusion_matrix, sep="\n")


--- Test Results ---
Test Loss: 11.0489
Test Accuracy: 0.2189

Classification Report:
              precision    recall  f1-score      support
FALSE          0.222362  0.353293  0.272938   501.000000
TRUE           0.173585  0.111922  0.136095   411.000000
barely-true    0.235955  0.150000  0.183406   420.000000
half-true      0.248299  0.277567  0.262118   526.000000
mostly-true    0.217391  0.203666  0.210305   491.000000
pants-fire     0.153846  0.133971  0.143223   209.000000
accuracy       0.218921  0.218921  0.218921     0.218921
macro avg      0.208573  0.205070  0.201347  2558.000000
weighted avg   0.215538  0.218921  0.211405  2558.000000

Confusion Matrix:
[[177  57  46 107  81  33]
 [131  46  29  90  84  31]
 [126  35  63  99  72  25]
 [144  52  55 146  98  31]
 [145  62  49 101 100  34]
 [ 73  13  25  45  25  28]]


In [27]:
# Hyperparameters and special-token IDs that are saved next to the weights.
# NOTE(review): 'unk_idx' is not a parameter of TextCNN.__init__ as defined in this notebook,
# so TextCNN(**model_args) would raise a TypeError - confirm how the loading code uses this dict.
model_args = dict(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    num_classes=num_classes,
    pad_idx=PAD_ID,
    unk_idx=UNK_ID,
)

In [28]:
# Export the trained model to a file
# Only the state dict (parameter tensors) is written, not the TextCNN class itself, so the
# model has to be re-instantiated before these weights can be loaded back into it.
torch.save(model.state_dict(), "textcnn_model.pth")
print("Model exported to textcnn_model.pth")

Model exported to textcnn_model.pth


In [29]:
# Save the lookup tables and model arguments: the word_to_id mapping, the id_to_label mapping
# and the model_args dict. `pickle` is imported in the notebook's import cell, not here.
# NOTE: only unpickle these files from a trusted location - loading a pickle can execute
# arbitrary code.
for filename, artifact in (
    ('word_to_id.pkl', word_to_id),
    ('id_to_label.pkl', id_to_label),
    ('model_args.pkl', model_args),
):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)