In [1]:
import pickle

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (
    AdamW,
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

In [2]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
# preprocess_data reads this object as a module-level global.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Read the raw dataset from the working directory. Later cells use its
# 'Pattern String' (text) and 'classification' (label) columns.
df = pd.read_csv("normie.csv")

In [4]:
# Remove leading/trailing whitespace from the column names so the exact
# lookups 'Pattern String' and 'classification' used below succeed.
df.columns = df.columns.str.strip()

In [5]:
# Modify the preprocessing function to handle non-string values and missing values
from torch.nn.utils.rnn import pad_sequence

def preprocess_data(df, max_length=512):
    """Convert the labelled DataFrame into BERT-ready model inputs.

    Rows containing any NaN are dropped. Every ``Pattern String`` value is
    cast to ``str`` and encoded in one batched call to the module-level
    ``tokenizer``. Compared with calling ``tokenize`` and
    ``convert_tokens_to_ids`` by hand, the batched call also adds the
    special tokens the model expects, truncates over-long texts, pads the
    batch to a common width and returns the matching attention mask.

    Args:
        df: DataFrame with a ``Pattern String`` text column and a
            ``classification`` column whose values are ``"dark"`` or
            ``"not dark"``. The frame itself is not modified.
        max_length: Maximum encoded length (special tokens included).
            Defaults to 512, the position limit of ``bert-base-uncased``.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)``:
        ``input_ids`` is a 2-D ``torch.long`` tensor (rows x padded width),
        ``attention_masks`` is a list of lists of floats (1.0 for real
        tokens, 0.0 for padding) and ``labels`` is a list of ints
        (0 = "not dark", 1 = "dark").

    Raises:
        KeyError: If a ``classification`` value is not one of the two
            known labels.
    """
    clean = df.dropna()

    label_dict = {"not dark": 0, "dark": 1}
    labels = [label_dict[label] for label in clean['classification']]

    texts = clean['Pattern String'].astype(str).tolist()
    if not texts:
        # Nothing to encode: return empty containers of the usual types
        # instead of letting the tokenizer fail on an empty batch.
        return torch.empty((0, 0), dtype=torch.long), [], []

    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"]
    # Kept as nested lists of floats so existing callers that wrap the
    # masks in torch.tensor(...) continue to work unchanged.
    attention_masks = encoded["attention_mask"].to(torch.float).tolist()

    return input_ids, attention_masks, labels

# Encode the cleaned dataset into ids, attention masks and integer labels.
input_ids, attention_masks, labels = preprocess_data(df)

# Split ids, masks and labels in ONE call so the three stay row-aligned by
# construction (no dummy target array, no reliance on repeating the seed).
# With the same random_state and test_size this yields the same 90/10 split
# as splitting them separately.
(
    train_inputs, validation_inputs,
    train_masks, validation_masks,
    train_labels, validation_labels,
) = train_test_split(
    input_ids, attention_masks, labels, random_state=42, test_size=0.1
)

# Labels come back as plain Python lists; TensorDataset needs tensors.
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

# Batch size shared by the training and validation loaders.
batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, torch.tensor(train_masks), train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read in their stored order.
validation_data = TensorDataset(validation_inputs, torch.tensor(validation_masks), validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [None]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for binary predictions.

    The predicted class is the argmax over the last axis of
    ``pred.predictions``. These are the model's raw per-class scores
    (logits), not probabilities, so comparing the class-1 score with a
    fixed 0.5 threshold would not reproduce the model's decision.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and
            ``predictions`` (array of per-class scores, or a tuple whose
            first element is that array), such as the evaluation result
            handed to ``compute_metrics`` by ``Trainer``.

    Returns:
        dict with keys ``"Accuracy"``, ``"F1"``, ``"Precision"`` and
        ``"Recall"``; the last three are computed with
        ``average="binary"``.
    """
    labels = pred.label_ids

    scores = pred.predictions
    if isinstance(scores, tuple):
        # When several outputs are returned, the class scores come first.
        scores = scores[0]
    preds = np.argmax(scores, axis=-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {"Accuracy": acc, "F1": f1, "Precision": precision, "Recall": recall}


In [None]:
# Create and train the model.
# Everything the Trainer needs is built in this cell from the split produced
# by the preprocessing cell, so the notebook runs top-to-bottom on a fresh
# kernel instead of depending on names left over from earlier sessions.


class PatternDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding the dict items that Trainer feeds to the model."""

    def __init__(self, input_ids, attention_masks, labels):
        # as_tensor accepts both tensors and nested Python lists.
        self.input_ids = torch.as_tensor(input_ids, dtype=torch.long)
        self.attention_masks = torch.as_tensor(attention_masks, dtype=torch.long)
        self.labels = torch.as_tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Keys match the keyword arguments of the model's forward().
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_masks[idx],
            "labels": self.labels[idx],
        }


train_dataset = PatternDataset(train_inputs, train_masks, train_labels)
val_dataset = PatternDataset(validation_inputs, validation_masks, validation_labels)

# Two output classes: "not dark" (0) and "dark" (1).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# NOTE(review): starting hyperparameters for a first run - tune as needed.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Pass the dataset, not the DataLoader
    eval_dataset=val_dataset,      # Pass the dataset, not the DataLoader
    compute_metrics=compute_metrics,
)

trainer.train()


In [None]:
import pickle
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Define the model and tokenizer paths.
# tokenizer_path was referenced below but never assigned, which raised a
# NameError; the filename here is chosen to pair with the model file.
model_path = "dark_pattern2.pkl"
tokenizer_path = "dark_pattern2_tokenizer.pkl"

# Save the model weights (state_dict only) as a .pkl file. Restoring them
# requires re-creating the model and calling load_state_dict().
torch.save(model.state_dict(), model_path)

# Save the tokenizer as a .pkl file.
# NOTE: only unpickle this file from a trusted source - pickle.load can
# execute arbitrary code.
with open(tokenizer_path, "wb") as f:
    pickle.dump(tokenizer, f)