In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel


In [None]:
# Read cleaned.csv into a DataFrame; the split cell below selects its
# "X", "labels" and "categories" columns.
df = pd.read_csv(filepath_or_buffer="cleaned.csv")


In [None]:
# Hold out 20% of the rows for testing. The text, label and category columns
# go through a single call so the three resulting splits stay row-aligned;
# the fixed random_state makes the shuffle repeatable.
(
    X_train, X_test,
    y_train, y_test,
    category_train, category_test,
) = train_test_split(
    *(df[column] for column in ("X", "labels", "categories")),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenizer for the pretrained "bert-base-uncased" checkpoint; preprocess_text
# reads it as a module-level global.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def preprocess_text(text):
    """Tokenize one text or a batch of texts and move the tensors to ``device``.

    Args:
        text: A single string, or a collection of strings (list, tuple,
            pandas Series, NumPy array, ...).

    Returns:
        A ``(input_ids, attention_mask)`` tuple taken from the tokenizer
        output, both moved to the module-level ``device``.
    """
    # The tokenizer is always handed a plain list of strings: a lone string is
    # wrapped, array-likes (e.g. a pandas Series slice) are unwrapped with
    # tolist(), and any other iterable is materialised.
    if isinstance(text, str):
        text = [text]
    elif hasattr(text, "tolist"):
        text = text.tolist()
    else:
        text = list(text)

    # truncation=True asks the tokenizer to cut over-long sequences down to its
    # configured maximum length instead of emitting them in full.
    encoded_text = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    return encoded_text["input_ids"].to(device), encoded_text["attention_mask"].to(device)


In [None]:
from transformers import BertTokenizer, BertModel

# --- Model configuration ---
num_classes = 3      # number of dark pattern categories to predict
embedding_dim = 768  # dimension of the BERT embeddings
dropout_rate = 0.2

# Run on the first CUDA device when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Pretrained "bert-base-uncased" encoder and its tokenizer.
bert_model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
class ResNetTransformer(nn.Module):
    """BERT encoder followed by two residual feed-forward blocks and a linear classifier.

    Args:
        embedding_dim: Size of the encoder's hidden vectors; also the input and
            output width of each residual block.
        num_classes: Number of logits produced per input sequence.
        dropout_rate: Dropout probability applied at the end of each residual
            block.
        bert: Optional encoder module. It is called as
            ``bert(input_ids, attention_mask=attention_mask)`` and element 0 of
            its result is indexed as ``[:, 0, :]``. When omitted, the
            module-level ``bert_model`` is used.
    """

    def __init__(self, embedding_dim, num_classes, dropout_rate, bert=None):
        super().__init__()
        # Fall back to the notebook-level encoder so existing three-argument
        # calls keep working unchanged.
        self.bert = bert_model if bert is None else bert

        # Attribute names and layer order are kept, so the state_dict keys
        # (layer1.0.weight, ..., fc.bias) are unchanged.
        self.layer1 = self._make_residual_branch(embedding_dim, dropout_rate)
        self.layer2 = self._make_residual_branch(embedding_dim, dropout_rate)

        # Classification layer
        self.fc = nn.Linear(embedding_dim, num_classes)

    @staticmethod
    def _make_residual_branch(embedding_dim, dropout_rate):
        """Build one bottleneck branch: dim -> dim // 2 -> dim, then dropout."""
        return nn.Sequential(
            nn.Linear(embedding_dim, embedding_dim // 2),
            nn.ReLU(),
            nn.Linear(embedding_dim // 2, embedding_dim),
            nn.Dropout(dropout_rate),
        )

    def forward(self, input_ids, attention_mask):
        """Return classification logits with shape ``(batch, num_classes)``.

        Args:
            input_ids: Token ids passed to the encoder as its first argument.
            attention_mask: Mask passed to the encoder as ``attention_mask``.
        """
        bert_output = self.bert(input_ids, attention_mask=attention_mask)

        # Only position 0 (the CLS token) feeds the classifier. Linear, ReLU and
        # Dropout all act on the last dimension, so slicing before the residual
        # blocks yields the same eval-mode logits as slicing afterwards while
        # skipping the work for every other position.
        cls_state = bert_output[0][:, 0, :]

        # Residual connections, written out-of-place so no tensor that autograd
        # may still reference is modified.
        cls_state = cls_state + self.layer1(cls_state)
        cls_state = cls_state + self.layer2(cls_state)

        return self.fc(cls_state)

# Build the classifier from the notebook's hyperparameters, then place it on
# the selected device.
model = ResNetTransformer(
    embedding_dim=embedding_dim,
    num_classes=num_classes,
    dropout_rate=dropout_rate,
)
model = model.to(device)


In [None]:
# Cross-entropy over the class logits is the training objective.
loss_fn = nn.CrossEntropyLoss()

# AdamW updates every parameter exposed by model.parameters().
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)


In [None]:
import torch
from tqdm import tqdm  # for displaying progress bar

num_epochs = 10
batch_size = 1

# Requires model, optimizer, loss_fn, preprocess_text, device, X_train and
# y_train from the earlier cells.

# Start offset of every mini-batch. The length of this range is the true batch
# count, including a final partial batch when len(X_train) is not a multiple
# of batch_size.
batch_starts = range(0, len(X_train), batch_size)
num_batches = len(batch_starts)

model.train()  # Set model to training mode

for epoch in range(num_epochs):
    train_loss = 0.0

    # Use tqdm to display a progress bar
    progress_bar = tqdm(enumerate(batch_starts), total=num_batches)

    for batch_idx, i in progress_bar:
        # Positional slices of the training split. The texts are passed on as
        # a plain list of strings (the actual batch, not a placeholder string),
        # and the labels become a long tensor on the same device as the logits.
        batch_text = X_train.iloc[i:i + batch_size].tolist()
        batch_labels = torch.as_tensor(
            y_train.iloc[i:i + batch_size].to_numpy(),
            dtype=torch.long,
            device=device,
        )

        input_ids, attention_mask = preprocess_text(batch_text)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, batch_labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Update progress bar description
        progress_bar.set_description(f'Epoch {epoch+1}/{num_epochs}, Batch {batch_idx+1}/{num_batches}, Loss: {loss.item():.4f}')

    # Each loss value is already a mean over its batch, so the epoch average
    # divides by the number of batches (guarded against an empty split).
    train_loss /= max(num_batches, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, TrainingLoss: {train_loss:.4f}")

# Save the trained model
torch.save(model.state_dict(), 'trained_model.pth')


In [None]:
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score

def preprocess_text(text):
    """Tokenize one text or a batch of texts and move the tensors to ``device``.

    Args:
        text: A single string, or a collection of strings (list, tuple,
            pandas Series, NumPy array, ...).

    Returns:
        A ``(input_ids, attention_mask)`` tuple taken from the tokenizer
        output, both moved to the module-level ``device``.
    """
    # The tokenizer is always handed a plain list of strings: a lone string is
    # wrapped, array-likes (e.g. a pandas Series slice) are unwrapped with
    # tolist(), and any other iterable is materialised.
    if isinstance(text, str):
        text = [text]
    elif hasattr(text, "tolist"):
        text = text.tolist()
    else:
        text = list(text)

    # truncation=True asks the tokenizer to cut over-long sequences down to its
    # configured maximum length instead of emitting them in full.
    encoded_text = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    return encoded_text["input_ids"].to(device), encoded_text["attention_mask"].to(device)

# Assuming you have a PyTorch model, X_test, y_test, preprocess_text, and tokenizer defined

model.eval()  # Set model to evaluation mode
all_predictions = []
all_labels = []

# Gradients are not needed while scoring the held-out split.
with torch.no_grad():
    for i in tqdm(range(0, len(X_test), batch_size)):
        # Positional slices of the test split. The texts are passed on as a
        # plain list of strings (the actual batch, not a placeholder string).
        batch_text = X_test.iloc[i:i + batch_size].tolist()
        # Labels are only compared by scikit-learn afterwards, so they stay as
        # a NumPy array rather than going through a tensor and back.
        batch_labels = y_test.iloc[i:i + batch_size].to_numpy()

        input_ids, attention_mask = preprocess_text(batch_text)

        # Forward pass
        logits = model(input_ids, attention_mask)
        predictions = torch.argmax(logits, dim=1)

        # Collect predictions and labels for the entire dataset
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch_labels)

# Calculate accuracy using scikit-learn
accuracy = accuracy_score(all_labels, all_predictions)
print("Validation accuracy:", accuracy)
