In [5]:
import torch
import pandas as pd
import string
import nltk
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from nltk import word_tokenize
from nltk.corpus import stopwords

In [7]:
# Cache of stopword sets keyed by language, so the NLTK corpus is read once
# instead of once per token.
_STOPWORD_CACHE: dict[str, frozenset] = {}


def _stopword_set(language: str = "english") -> frozenset:
    """Return the NLTK stopword list for ``language`` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def get_words_only(sentence: str) -> str:
    """Reduce a sentence to its lowercase, purely alphabetic, non-stopword tokens.

    The sentence is split with ``word_tokenize``; every token is lowercased and
    kept only if it consists solely of alphabetic characters and is not an
    English stopword.

    Args:
        sentence: Raw input text.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    english_stopwords = _stopword_set("english")
    lowered_tokens = (token.lower() for token in word_tokenize(sentence))
    # `isalpha()` already rejects punctuation-only and empty tokens, so no
    # separate punctuation filter is needed.
    kept_tokens = [
        token
        for token in lowered_tokens
        if token.isalpha() and token not in english_stopwords
    ]
    return " ".join(kept_tokens)

In [4]:
# Read the tab-separated classifier dataset from disk, then preview its first five rows.
dataset = pd.read_csv(
    "../../data/raw/classifier_dataset.tsv",
    sep="\t",
)
dataset.head(n=5)

Unnamed: 0,sentence,label
0,you're becoming disgusting.,1
1,"monkey, you have to wake up.",1
2,I've got orders to put her down.,0
3,I have orders to kill her.,1
4,they're laughing at us. We'll show you.,0


In [10]:
# Derive a cleaned copy of every sentence with `get_words_only`, store it in a
# new column, then preview the frame.
parsed_sentences = dataset["sentence"].apply(get_words_only)
dataset["parsed_sentence"] = parsed_sentences
dataset.head(n=5)

Unnamed: 0,sentence,label,parsed_sentence
0,you're becoming disgusting.,1,becoming disgusting
1,"monkey, you have to wake up.",1,monkey wake
2,I've got orders to put her down.,0,got orders put
3,I have orders to kill her.,1,orders kill
4,they're laughing at us. We'll show you.,0,laughing us show


In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible. The raw `sentence` text (not `parsed_sentence`) is the model input.
features, targets = dataset['sentence'], dataset['label']
train_data, test_data, train_labels, test_labels = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Load the tokenizer that belongs to the uncased DistilBERT checkpoint.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_name)

In [13]:
def _encode(texts):
    """Tokenize ``texts`` with truncation and padding enabled."""
    return tokenizer(list(texts), truncation=True, padding=True)


# Each split is tokenized separately.
train_encodings = _encode(train_data)
test_encodings = _encode(test_data)

In [15]:
def _as_tensor_dataset(encodings, labels):
    """Bundle input ids, attention masks and labels into one TensorDataset."""
    return torch.utils.data.TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        torch.tensor(labels.values),
    )


# Wrap both splits so they can be fed to a DataLoader.
train_dataset = _as_tensor_dataset(train_encodings, train_labels)
test_dataset = _as_tensor_dataset(test_encodings, test_labels)

In [None]:
# Seed PyTorch so the freshly initialised classification head and the
# DataLoader shuffle order start from the same random state on every run.
torch.manual_seed(42)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated.
# `eps` and `weight_decay` are pinned to the values the transformers
# implementation defaulted to, so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()  # switch the model to training mode

# Shuffled mini-batches of (input_ids, attention_mask, labels).
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(3):  # Adjust the number of training epochs as needed
    epoch_loss = 0.0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report progress so convergence (or divergence) is visible per epoch.
    mean_loss = epoch_loss / max(len(train_loader), 1)
    print(f'Epoch {epoch + 1}: mean training loss {mean_loss:.4f}')


In [None]:
# Measure classification accuracy on the test split.
model.eval()  # switch the model to evaluation mode
eval_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

correct = 0
total = 0

# Gradients are not needed for scoring.
with torch.no_grad():
    for batch in eval_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit in each row.
        predicted_labels = outputs.logits.argmax(dim=1)

        total += labels.shape[0]
        correct += int((predicted_labels == labels).sum())

accuracy = correct / total
print(f'Accuracy: {accuracy}')