# DistilBERT as detector

In [1]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW
import torch
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader

In [2]:
df = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\data_v1.csv')
df = df.sample(frac=1, random_state=78735)
df

Unnamed: 0,text,label
472763,"In this episode of The Sporkful podcast, Rache...",1
12000,Pacific Islands Forum says it is in consultati...,0
320238,"The U.S. and its allies are ""at war"" with the ...",1
90693,"Unless otherwise stated, the content of this p...",0
115026,"This was a major remodel of an existing, chopp...",0
...,...,...
459383,Loretta and the rest of the crew have been pre...,1
83541,As the war of words between the two managers s...,0
140213,UPDATE: PLEDGE NOW AND READ THE FIRST 4 PAGES ...,0
330563,"The story is so bizarre, your jaw will drop......",1


In [3]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [4]:
train_test_list =  list(df.loc[0:16,'text'])

In [5]:
train_encoded = tokenizer(train_test_list, truncation=True, padding=True)

In [6]:
train_labels = list(df.loc[0:16, 'label'])

In [7]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [8]:
dataset_train = Dataset(train_encoded, train_labels)
dataset_train

<__main__.Dataset at 0x12a1f566580>

In [9]:
device = torch.device('cpu')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True, num_workers=4)

optim = AdamW(model.parameters(), lr=5e-5)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

In [10]:
for epoch in range(3):
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        optim.zero_grad()
model.eval()

  0%|          | 0/9977 [00:00<?, ?it/s]

In [18]:
import multiprocessing

num_threads = multiprocessing.cpu_count()

print(f'Number of CPU threads: {num_threads}')

Number of CPU threads: 4


In [None]:
def predict(input_text):
    # Tokenize the input text
    tokens = tokenizer.encode_plus(input_text, max_length=512, truncation=True, padding='max_length', return_tensors='pt')

    # Predict
    model.eval()
    output = model(**tokens)

    # Convert logits to probabilities
    probs = torch.nn.functional.softmax(output.logits, dim=-1)

    # Return the probabilities
    return probs.detach().numpy()