In [None]:
import pickle
import re
import string

import kagglehub
import nltk
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

In [None]:
# Fetch the dataset from Kaggle; `path` is the local location kagglehub reports for the files
dataset_handle = "muhammadimran112233/liar-twitter-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)



In [None]:
# Introduction:
# The following code demonstrates how to process and batch text data for a news classification task using PyTorch.
# It covers loading and preprocessing the dataset, tokenizing and numericalizing the text, creating a custom Dataset,
# and batching the data with padding for use in neural network models.

In [None]:
# Read the dataset CSV from the downloaded directory into a DataFrame
csv_location = path + "/Liar_Dataset.csv"
df = pd.read_csv(csv_location)
print("Dataset loaded successfully.")

# Preview the first few rows
print(df.head())


In [None]:
# Fetch every NLTK resource this notebook requests, one after another
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Cache for NLTK's English stop-word set so the corpus is read once, not once per row
_STOP_WORDS = None


def _get_stop_words():
    """Return NLTK's English stop words as a set, building it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = set(stopwords.words('english'))
    return _STOP_WORDS


# Function to preprocess text data
def preprocess_text(text):
    """Clean one raw statement and return it as a space-separated string of tokens.

    The text is stripped of HTML tags, URLs, e-mail addresses, non-ASCII
    characters, digits, punctuation and single-character words, lowercased,
    tokenized with NLTK's ``word_tokenize`` and filtered of English stop words.

    Args:
        text: The raw value to clean. Anything that is not a ``str``
            (for example NaN) yields an empty string.

    Returns:
        The cleaned text. An empty string is returned when the input is not a
        string or when any cleaning step raises; in the latter case the error
        is printed rather than propagated.
    """
    # Non-string cells (NaN, numbers, None) carry no text to clean
    if not isinstance(text, str):
        return ""

    try:
        # Remove leading and trailing whitespace
        text = text.strip()
        # Remove HTML tags
        text = re.sub(r'<.*?>', '', text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)
        # Replace runs of non-ASCII characters with a space
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
        # Collapse whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove digits
        text = re.sub(r'\d+', '', text)
        # Removing tokens may have exposed new edge whitespace
        text = text.strip()
        # Remove everything that is neither a word character nor whitespace
        text = re.sub(r'[^\w\s]', '', text)
        # Collapse whitespace again
        text = re.sub(r'\s+', ' ', text)
        # Remove single-character words
        text = re.sub(r'\b\w\b', '', text)
        # Convert to lowercase (the stop-word comparison below is against lowercase tokens)
        text = text.lower()
        # Remove remaining punctuation; underscores survive the \w filter above
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Tokenize the text
        tokens = word_tokenize(text)
        # Remove stopwords using the cached set instead of reloading it per call
        stop_words = _get_stop_words()
        tokens = [word for word in tokens if word not in stop_words]
        # Remove purely numeric tokens
        tokens = [word for word in tokens if not word.isdigit()]
        # Re-join with single spaces
        text = ' '.join(tokens)
        # Replace any leftover non-word runs with a space
        text = re.sub(r'\W+', ' ', text)
    except Exception as e:
        # Best effort: report the failing row and continue with an empty statement
        print(f"Error processing text: {text}. Error: {e}")
        text = ""
    return text


In [None]:
# Build a cleaned copy of the data; the raw DataFrame `df` is left untouched
preprocessed_df = df.assign(statement=df['statement'].apply(preprocess_text))
print("Preprocessing complete. Sample preprocessed data:")
print(preprocessed_df['statement'].head())

In [None]:
# Split every cleaned statement into a list of word tokens
statement_tokens = preprocessed_df['statement'].map(word_tokenize)
preprocessed_df['Tokens'] = statement_tokens
print("Tokenization complete. Sample tokens:")
print(preprocessed_df['Tokens'].head())

In [None]:
# Flatten all per-row token lists into one list and report the corpus size
tokens = [
    token
    for row_tokens in preprocessed_df['Tokens']
    for token in row_tokens
]
print("Total tokens in dataset:", len(tokens))

In [None]:
# Sort the distinct tokens so every run assigns the same ID to the same word.
# Iterating a bare set of strings follows hash order, which Python randomizes
# per process, so unsorted IDs would change between runs.
unique_tokens = sorted(set(tokens))
PAD_ID = 0
UNK_ID = 1
# Create a mapping from tokens to IDs
# This mapping will be used to convert tokens to numerical IDs for model input
# The special tokens take the first two IDs; real tokens start at 2
word_to_id = {"<pad>": PAD_ID, "<unk>": UNK_ID} # Special token IDs
next_id = 2

for token in unique_tokens:
    if token not in word_to_id: # Ensure special tokens are not overwritten if they happen to be in the text
        word_to_id[token] = next_id
        next_id += 1

vocab_size = len(word_to_id)
print(f"Vocabulary Size: {vocab_size}")

# id_to_word mapping for debugging/reverse lookup
id_to_word = {v: k for k, v in word_to_id.items()}

In [None]:
# Convert each token list to vocabulary IDs; tokens missing from the vocabulary get the <unk> ID
unk_token_id = word_to_id["<unk>"]
preprocessed_df['Numerical_Tokens'] = preprocessed_df['Tokens'].apply(
    lambda row_tokens: [word_to_id.get(tok, unk_token_id) for tok in row_tokens]
)

# Numericalization of the 'label' column
# Sorted distinct labels give a stable label -> ID assignment
unique_labels = sorted(preprocessed_df['label'].unique().tolist())

# String label -> integer ID (position in the sorted list)
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))
print(label_to_id)
# Integer ID -> string label (the inverse mapping)
id_to_label = dict(enumerate(unique_labels))
num_classes = len(unique_labels)

preprocessed_df['Numerical_Label'] = preprocessed_df['label'].map(label_to_id)



In [None]:
class TextDataset(Dataset):
    """Dataset pairing each sample's token-ID list with its integer class label.

    Both arguments are indexed positionally through ``.iloc``, so they are
    expected to be pandas Series (or another object exposing ``.iloc``).
    """

    def __init__(self, numerical_tokens, numerical_labels):
        self.numerical_tokens, self.numerical_labels = numerical_tokens, numerical_labels

    def __len__(self):
        # One sample per token sequence
        return len(self.numerical_tokens)

    def __getitem__(self, idx):
        # Sequences are returned at their natural length; padding is left to the collate function
        sample_tokens = self.numerical_tokens.iloc[idx]
        sample_label = self.numerical_labels.iloc[idx]
        return (
            torch.tensor(sample_tokens, dtype=torch.long),
            torch.tensor(sample_label, dtype=torch.long),
        )

In [None]:
kernel_sizes = [3, 4, 5]  # Example kernel sizes for CNN
# A sequence must be at least as long as the widest kernel
min_required_seq_len = max(kernel_sizes)


def collate_fn(batch):
    """Turn a list of (token_ids, label) pairs into one padded batch.

    Every sequence is first right-padded with PAD_ID up to
    ``min_required_seq_len``; the whole batch is then right-padded to the
    length of its longest sequence.

    Returns:
        A ``(padded_token_ids, labels)`` pair: a 2-D tensor with one row per
        sample, and the stacked label tensor.
    """
    token_ids_list = []
    labels_list = []
    for token_ids, label in batch:
        shortfall = min_required_seq_len - len(token_ids)
        if shortfall > 0:
            # Top up short sequences so they reach the minimum length
            filler = torch.full((shortfall,), PAD_ID, dtype=token_ids.dtype)
            token_ids = torch.cat([token_ids, filler], dim=0)
        token_ids_list.append(token_ids)
        labels_list.append(label)

    # Equalize lengths across the batch (may exceed min_required_seq_len)
    padded_token_ids = pad_sequence(
        token_ids_list, batch_first=True, padding_value=PAD_ID
    )

    return padded_token_ids, torch.stack(labels_list)

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# repeatable, and stratifying on the original string label keeps the class mix
# the same in both parts.
split_options = dict(
    test_size=0.2,
    random_state=42,
    stratify=preprocessed_df['label'],
)
train_df, test_df = train_test_split(preprocessed_df, **split_options)

In [None]:
# Wrap the token-ID and label-ID columns of each split in a TextDataset
train_dataset, test_dataset = (
    TextDataset(split_df['Numerical_Tokens'], split_df['Numerical_Label'])
    for split_df in (train_df, test_df)
)

In [None]:
# Training loader: reshuffled every epoch, padded per batch by collate_fn
batch_size = 4
train_dataloader = DataLoader(train_dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        collate_fn=collate_fn)

# The previous message announced "Iterating through a few batches" although no
# batches are drawn here; report only what this cell actually does.
print(f"\nDataLoader created with batch_size={batch_size}.")

In [None]:
class TextCNN(nn.Module):
    """Convolutional text classifier.

    Pipeline: token embedding -> one Conv1d branch per kernel size -> ReLU ->
    global max pooling over the sequence -> concatenation -> dropout -> linear
    layer producing one logit per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        num_classes: Number of output logits.
        pad_idx: Token ID used for padding. It is passed to ``nn.Embedding`` as
            ``padding_idx`` and used to right-pad inputs that are shorter than
            the widest kernel.
        kernel_sizes: Widths of the convolution kernels, one branch each.
            Defaults to ``(3, 4, 5)``.
        num_filters: Output channels of every convolution branch. Defaults to 100.
        dropout: Dropout probability applied to the pooled features. Defaults to 0.5.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, pad_idx,
                 kernel_sizes=(3, 4, 5), num_filters=100, dropout=0.5):
        super().__init__()

        # Kept so forward() can pad too-short inputs with the same ID
        self.pad_idx = pad_idx

        #  Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        #  Convolutional Layers
        self.kernel_sizes = list(kernel_sizes)
        self.num_filters = num_filters

        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=self.num_filters,
                      kernel_size=k)
            for k in self.kernel_sizes
        ])

        #  Fully Connected (Dense) Layer for classification
        self.fc = nn.Linear(len(self.kernel_sizes) * self.num_filters, num_classes)

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Compute class logits for a batch of token-ID sequences.

        Args:
            text: Integer tensor of shape (batch_size, sequence_length).

        Returns:
            Tensor of shape (batch_size, num_classes) holding unnormalized logits.
        """
        # A convolution cannot run on a sequence shorter than its kernel, so
        # right-pad short batches up to the widest kernel.
        shortfall = max(self.kernel_sizes) - text.size(1)
        if shortfall > 0:
            fill_value = self.pad_idx if self.pad_idx is not None else 0
            padding = text.new_full((text.size(0), shortfall), fill_value)
            text = torch.cat([text, padding], dim=1)

        # embedded shape: (batch_size, sequence_length, embedding_dim)
        embedded = self.embedding(text)

        # PyTorch Conv1d expects input in (batch_size, channels, sequence_length)
        embedded = embedded.permute(0, 2, 1)

        # Apply convolutions and ReLU activation
        conved = [F.relu(conv(embedded)) for conv in self.convs]

        # Global max pooling over the sequence dimension
        pooled = [F.max_pool1d(c, c.shape[2]).squeeze(2) for c in conved]

        # cat shape: (batch_size, num_filters * len(kernel_sizes))
        cat = self.dropout(torch.cat(pooled, dim=1))

        # output shape: (batch_size, num_classes)
        output = self.fc(cat)

        return output

In [None]:
# Define the model
embedding_dim = 50
# Seed PyTorch's global RNG before any parameters are created so the random
# weight initialization is repeatable when the notebook is re-run top to bottom
# (same seed value as the train/test split).
torch.manual_seed(42)
model = TextCNN(vocab_size, embedding_dim, num_classes, PAD_ID)
# CrossEntropyLoss takes the raw logits returned by the model
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one training pass over ``dataloader``, updating ``model`` after every batch.

    The model is put in training mode. Every 100th batch (starting with the
    first) the batch loss and the number of samples processed so far are printed.
    """
    size = len(dataloader.dataset)
    model.train()
    for step, (inputs, targets) in enumerate(dataloader):
        # Clear stale gradients, then forward / backward / update
        optimizer.zero_grad()
        batch_loss = loss_fn(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        if step % 100 != 0:
            continue
        loss, current = batch_loss.item(), (step + 1) * len(inputs)
        print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

In [None]:
# Train the model: one full pass over the training loader per epoch
num_epochs = 20
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(
        dataloader=train_dataloader,
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
    )
print("Done!")

In [None]:
def evaluate_model(model, dataloader, device):
    """Evaluate ``model`` on every batch of ``dataloader`` without tracking gradients.

    The model is moved to ``device`` and left in evaluation mode.

    Args:
        model: Module mapping a batch of token IDs to class logits.
        dataloader: Iterable yielding ``(token_ids, labels)`` tensor pairs.
        device: Device on which the model and the batches are evaluated.

    Returns:
        A tuple ``(avg_loss, accuracy, report, cm)``:
        - ``avg_loss``: sum of the per-batch cross-entropy losses divided by
          the number of batches;
        - ``accuracy``: fraction of correctly predicted samples;
        - ``report``: ``classification_report`` as a dict, with one entry per
          class in the module-level ``label_to_id`` mapping;
        - ``cm``: confusion matrix with rows and columns in ``label_to_id`` order.
    """
    # The batches are moved to `device`, so the model must be there too
    model.to(device)
    model.eval()

    # Loop-invariant: build the criterion once rather than once per batch
    loss_fn = nn.CrossEntropyLoss()

    # Passing the full class list keeps the report and the confusion matrix at
    # a fixed size even when a class is missing from both targets and
    # predictions. With `target_names` alone, scikit-learn raises ValueError.
    class_names = list(label_to_id.keys())
    class_ids = list(label_to_id.values())

    total_loss = 0
    all_preds = []
    all_labels = []

    # No gradient calculation needed during evaluation
    with torch.no_grad():
        for batch_tokens, batch_labels in dataloader:
            batch_tokens = batch_tokens.to(device)
            batch_labels = batch_labels.to(device)

            # Forward pass
            outputs = model(batch_tokens)
            total_loss += loss_fn(outputs, batch_labels).item()

            # Predicted class = index of the largest logit
            predicted = outputs.argmax(dim=1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_preds)

    # Generate a classification report
    report = classification_report(
        all_labels,
        all_preds,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
    )

    # Generate confusion matrix
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

    return avg_loss, accuracy, report, cm

In [None]:
# Evaluate on the training split. This shows how well the model fits the data
# it was trained on; it is not a measure of generalization.
train_loss, train_accuracy, train_report, train_confusion_matrix = evaluate_model(model, train_dataloader, device=torch.device('cpu'))

# These are training-set metrics, so they are labeled "Train" rather than "Test"
print("\n--- Evaluation Results on Training Dataset ---")
print(f"Train Loss: {train_loss:.4f}")
print(f"Train Accuracy: {train_accuracy:.4f}")
print("\nClassification Report:")

print(pd.DataFrame(train_report).transpose())
print("\nConfusion Matrix:")
print(train_confusion_matrix)

In [None]:
# The following code demonstrates how to evaluate a trained TextCNN model on a test dataset using PyTorch.
# It sets up a DataLoader for batching and padding, runs the evaluation loop, and prints out key metrics
# such as test loss, accuracy, classification report, and confusion matrix. This process helps assess
# the model's performance on unseen data and provides insights into its predictive capabilities.

In [None]:
# Build the test loader: fixed sample order, padded with the same collate function as training
test_batch_size = 8
test_loader_options = dict(
    batch_size=test_batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(test_dataset, **test_loader_options)

print("Test DataLoader ready.")

In [None]:
# Score the trained model on the held-out test split (on CPU)
cpu_device = torch.device('cpu')
test_loss, test_accuracy, test_report, test_confusion_matrix = evaluate_model(
    model, test_dataloader, device=cpu_device
)

print("\n--- Test Results ---")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")

# One row per class / aggregate after transposing the report dict
report_table = pd.DataFrame(test_report).transpose()
print(report_table)
print("\nConfusion Matrix:")
print(test_confusion_matrix)

In [None]:
# Values describing how the model was built, collected for export.
# NOTE(review): TextCNN.__init__ has no `unk_idx` parameter, so
# `TextCNN(**model_args)` would raise TypeError unless the loader drops that
# key first. Confirm how the consumer uses this dict.
model_args = dict(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    num_classes=num_classes,
    pad_idx=PAD_ID,
    unk_idx=UNK_ID,
)

In [None]:
# Write the trained weights (state_dict only, not the model class) to disk
model_file = "textcnn_model.pth"
torch.save(model.state_dict(), model_file)
print(f"Model exported to {model_file}")

In [None]:
# Save the vocabulary, the ID -> label mapping and the model arguments as pickle files.
# NOTE: unpickling can execute arbitrary code, so only load these files back
# from a location you trust.
pickle_artifacts = {
    'word_to_id.pkl': word_to_id,
    'id_to_label.pkl': id_to_label,
    'model_args.pkl': model_args,
}
for artifact_file, artifact in pickle_artifacts.items():
    with open(artifact_file, 'wb') as f:
        pickle.dump(artifact, f)