In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from torchtext.vocab import GloVe
from gensim.models import KeyedVectors
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import GloVe
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

# Pretrained GloVe table: `vectors` is the embedding matrix and `vocab` is the
# token -> row-index mapping (torchtext exposes it as `stoi`).
embedding_dim = 300
glove = GloVe(name='6B', dim=embedding_dim)
vectors = glove.vectors
vocab = glove.stoi


# LIAR train / validation / test splits.
liar_dataset = load_dataset('liar')
train_data, val_data, test_data = liar_dataset['train'], liar_dataset['validation'], liar_dataset['test']

# Byte-level BPE tokenizer fitted on the training statements only.
# NOTE(review): the ids this tokenizer emits index its own BPE vocabulary; they
# are not row numbers of the GloVe table above, so they must not be used to
# index `vectors` directly.
vocab_size = 30000
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=vocab_size,  # was declared but never handed to the trainer
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"],
)
tokenizer.train_from_iterator(train_data['statement'], trainer=trainer)

texts = train_data['statement']
labels = train_data['label']
val_texts = val_data['statement']
val_labels = val_data['label']

# Collapse the six raw label ids into a binary target: ids 2 and 3 become
# class 1, every other id becomes class 0.
# NOTE(review): presumably ids 2/3 are the "mostly-true"/"true" classes —
# verify against the dataset card's label order.
label_mapping = {0: 0, 1: 0, 2: 1, 3: 1, 4: 0, 5:0}
labels = [label_mapping[label] for label in labels]
val_labels = [label_mapping[label] for label in val_labels]

class LiarDataset(Dataset):
    """Statements paired with labels, served as sequences of GloVe vectors.

    Each item is ``(embedded_text, label)`` where ``embedded_text`` has one row
    per word of the statement and one column per embedding dimension.

    Words are looked up through a word -> row-index mapping that belongs to the
    embedding matrix. Ids produced by ``tokenizer`` are deliberately NOT used
    as row indices: they come from a separately trained vocabulary, so id ``k``
    has no relation to row ``k`` of the GloVe table.

    Args:
        texts: indexable collection of statement strings.
        labels: indexable collection of labels, aligned with ``texts``.
        tokenizer: stored on the instance for call-site compatibility; not
            consulted for the embedding lookup (see above).
        glove_embeddings: 2-D tensor of embeddings, one row per vocabulary word.
        word_to_index: mapping from (lower-cased) word to row of
            ``glove_embeddings``. When omitted, the notebook-level ``vocab``
            (``glove.stoi``) is used.
    """

    def __init__(self, texts, labels, tokenizer, glove_embeddings, word_to_index=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.glove_embeddings = glove_embeddings
        # Fall back to the module-level GloVe mapping so existing four-argument
        # call sites keep working unchanged.
        self.word_to_index = vocab if word_to_index is None else word_to_index

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _split_words(text):
        """Lower-case ``text`` and split it into alphanumeric runs and single punctuation marks."""
        # GloVe 6B is an uncased vocabulary, hence the lower-casing.
        spaced = "".join(
            ch if ch.isalnum() or ch.isspace() else f" {ch} "
            for ch in text.lower()
        )
        return spaced.split()

    def __getitem__(self, idx):
        """Return ``(embedded_text, label)`` for the statement at ``idx``.

        Out-of-vocabulary words keep their position but get an all-zero vector;
        an empty statement yields a tensor with zero rows instead of raising.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        words = self._split_words(text)
        embedded_text = torch.zeros(
            (len(words), self.glove_embeddings.shape[1]),
            dtype=self.glove_embeddings.dtype,
        )

        hits = [
            (position, self.word_to_index[word])
            for position, word in enumerate(words)
            if word in self.word_to_index
        ]
        if hits:
            positions, rows = zip(*hits)
            # A single gather replaces the per-token torch.tensor(tensor) copies
            # that produced a UserWarning on every item.
            embedded_text[torch.tensor(positions)] = self.glove_embeddings[torch.tensor(rows)]

        return embedded_text, label

# Wrap the binarised train and validation splits as torch datasets.
dataset = LiarDataset(
    texts=texts,
    labels=labels,
    tokenizer=tokenizer,
    glove_embeddings=vectors,
)
val_dataset = LiarDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    glove_embeddings=vectors,
)

def collate_fn(batch):
    """Merge ``(sequence, label)`` pairs into one padded batch.

    The pairs are ordered longest sequence first, the sequences are
    right-padded with zeros to the longest one (batch dimension first), and
    the labels are returned as a tensor in the same order.
    """
    ordered = list(batch)
    ordered.sort(key=lambda pair: len(pair[0]), reverse=True)

    seqs, targets = zip(*ordered)
    padded_sequences = pad_sequence(seqs, batch_first=True, padding_value=0)
    label_tensor = torch.tensor(targets)

    return padded_sequences, label_tensor

batch_size = 2

# Training batches are drawn in shuffled order; validation keeps dataset order.
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)


# Define the CNN model
class CNNForSentenceClassification(nn.Module):
    """Multi-kernel 1-D CNN over already-embedded token sequences.

    One ``Conv1d`` per kernel size runs over the sequence axis, each feature
    map is ReLU-activated and max-pooled over time, and the pooled features
    are concatenated and fed to a linear classifier.

    Args:
        embedding_dim: size of each input token vector (conv input channels).
        num_classes: number of output logits.
        kernel_sizes: widths of the parallel convolutions.
        num_filters: output channels of every convolution.
    """

    def __init__(self, embedding_dim, num_classes, kernel_sizes=(3, 4, 5), num_filters=100):
        super(CNNForSentenceClassification, self).__init__()

        self.embedding_dim = embedding_dim
        # Built from the notebook-level `vectors` tensor. forward() never calls
        # this layer (its input is expected to be embedded already); it is kept
        # so the module's attributes and state_dict stay as they were.
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vectors), freeze=True)
        self.convolution_layers = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=kernel_size)
            for kernel_size in kernel_sizes
        ])
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: float tensor laid out as (batch, sequence, embedding_dim); it is
                permuted to channels-first before the convolutions.

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # Conv1d raises when the sequence is shorter than its kernel, which
        # happens for a batch made only of very short statements. Right-pad the
        # sequence axis with zeros up to the widest kernel.
        widest_kernel = max(
            (conv.kernel_size[0] for conv in self.convolution_layers), default=0
        )
        missing = widest_kernel - x.size(1)
        if missing > 0:
            # F.pad takes pairs starting from the last dim: (dim, dim, seq, seq).
            x = nn.functional.pad(x, (0, 0, 0, missing))

        channels_first = x.permute(0, 2, 1)
        convolution_outputs = [torch.relu(conv(channels_first)) for conv in self.convolution_layers]
        # Max over time: keep the strongest activation of every filter.
        pooled_outputs = [torch.max(conv_output, dim=2)[0] for conv_output in convolution_outputs]

        # Concatenate the pooled outputs
        x = torch.cat(pooled_outputs, dim=1)
        x = self.fc(x)

        return x

num_classes = 2  # binary target: label_mapping collapses the six raw label ids into 0/1

# Seed before the model is built so weight initialisation and the DataLoader's
# shuffling are repeatable across Restart & Run All.
torch.manual_seed(42)

model = CNNForSentenceClassification(embedding_dim, num_classes)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

epochs = 4
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_correct_train = 0
    total_samples_train = 0
    running_loss_train = 0.0

    for batch_texts, batch_labels in dataloader:
        # Forward pass
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate loss weighted by batch size so the epoch figure is a true
        # per-sample mean even when the last batch is smaller.
        samples_in_batch = batch_labels.size(0)
        running_loss_train += loss.item() * samples_in_batch

        # Calculate training accuracy
        _, predicted_train = torch.max(outputs, 1)
        total_correct_train += (predicted_train == batch_labels).sum().item()
        total_samples_train += samples_in_batch

    accuracy_train = total_correct_train / total_samples_train
    # Report the epoch mean rather than the loss of whichever batch came last,
    # which fluctuates heavily at this batch size.
    mean_loss_train = running_loss_train / total_samples_train

    # ---- validation pass ----
    model.eval()  # Set the model to evaluation mode
    total_correct_val = 0
    total_samples_val = 0

    with torch.no_grad():
        for batch_texts_val, batch_labels_val in val_dataloader:
            outputs_val = model(batch_texts_val)
            _, predicted_val = torch.max(outputs_val, 1)
            total_correct_val += (predicted_val == batch_labels_val).sum().item()
            total_samples_val += batch_labels_val.size(0)

    accuracy_val = total_correct_val / total_samples_val

    print(f"Epoch {epoch + 1}/{epochs}, "
          f"Train Loss: {mean_loss_train}, Train Accuracy: {accuracy_train * 100:.2f}%, "
          f"Validation Accuracy: {accuracy_val * 100:.2f}%")


  embedded_text = torch.stack([torch.tensor(self.glove_embeddings[t]) for t in tokens])


Epoch 1/4, Train Loss: 0.3234739899635315, Train Accuracy: 63.10%, Validation Accuracy: 67.29%
Epoch 2/4, Train Loss: 0.41495776176452637, Train Accuracy: 67.55%, Validation Accuracy: 66.51%
Epoch 3/4, Train Loss: 1.1246583461761475, Train Accuracy: 78.45%, Validation Accuracy: 63.63%
Epoch 4/4, Train Loss: 0.11450965702533722, Train Accuracy: 89.38%, Validation Accuracy: 62.85%


In [None]:
#test
# Binarise the held-out split with the same label_mapping used for training.
test_text = test_data["statement"]
test_labels = [label_mapping[raw_label] for raw_label in test_data["label"]]

t_dataset = LiarDataset(test_text, test_labels, tokenizer, vectors)
test_dataloader = DataLoader(
    t_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

model.eval()  # Set the model to evaluation mode

total_correct = 0
total_samples = 0

# Gradients are not needed for scoring.
with torch.no_grad():
    for batch_texts, batch_labels in test_dataloader:
        outputs = model(batch_texts)
        # Predicted class = index of the largest logit in each row
        predicted = outputs.argmax(dim=1)
        total_correct += int((predicted == batch_labels).sum())
        total_samples += len(batch_labels)

# Fraction of test statements classified correctly
accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy * 100:.2f}%")


  embedded_text = torch.stack([torch.tensor(self.glove_embeddings[t]) for t in tokens])


Test Accuracy: 57.99%
