In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [6]:
# Load the post/topic records from the JSON dump (path is relative to the notebook).
posts_path = '../../data/post_topics.json'
df = pd.read_json(posts_path)

def preprocess(text):
    """Return ``text`` converted to lower case.

    This is the single normalisation step applied to posts, both when the
    training frame is prepared and when a new post is scored in ``predict``.
    """
    lowered = text.lower()
    return lowered

# Normalise every post with the same function that is used at inference time.
lowered_posts = df['post'].apply(preprocess)
df['post'] = lowered_posts

# Turn the per-post category collections into a binary indicator matrix;
# `mlb.classes_` keeps the column -> category mapping for decoding later.
# NOTE(review): assumes each `categories` entry is an iterable of labels - confirm.
mlb = MultiLabelBinarizer()
category_sets = df['categories']
y = mlb.fit_transform(category_sets)

In [7]:
# Hold out 20% of the posts for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    df['post'],
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Load the pre-trained BERT tokenizer and a sequence-classification model with
# one output logit per category known to the label binarizer.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(mlb.classes_),
    # State the task explicitly instead of relying on it being inferred from
    # the label dtype at the first forward pass: each category is an
    # independent yes/no decision, so the head must use a per-label
    # sigmoid/BCE loss rather than softmax cross-entropy.
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Both splits are tokenized with identical settings: truncate to at most 128
# tokens and pad within each call.
encode_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(list(X_train), **encode_options)
val_encodings = tokenizer(list(X_val), **encode_options)

In [10]:
class MultiLabelDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with label vectors.

    Args:
        encodings: Mapping of field name to an indexable collection holding
            one entry per example (e.g. the tokenizer output).
        labels: Indexable collection holding one label vector per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label vector is always emitted with float dtype.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

In [11]:
# Wrap the tokenized splits and their label matrices in datasets the Trainer
# can consume. These objects were previously never created, so the Trainer
# call below failed with a NameError on a fresh kernel.
train_dataset = MultiLabelDataset(train_encodings, y_train)
val_dataset = MultiLabelDataset(val_encodings, y_val)

# NOTE: using `Trainer` with PyTorch requires `accelerate>=0.20.1`
# (`pip install transformers[torch]` or `pip install accelerate -U`).
training_args = TrainingArguments(
    output_dir='./ner',              # checkpoints are written here
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",     # evaluate on the validation set after every epoch
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [13]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Place the model's weights on the selected device.
model.to(device)

def predict(post, model, tokenizer, mlb, threshold=0.3):
    """Return the categories predicted for a single post.

    Args:
        post: Raw post text; it is normalised with ``preprocess`` first.
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        mlb: Fitted label binarizer; ``mlb.classes_`` maps logit positions to
            category names.
        threshold: Minimum sigmoid probability for a category to be returned.

    Returns:
        List of entries from ``mlb.classes_`` whose probability is at least
        ``threshold`` (possibly empty).
    """
    post = preprocess(post)
    encoding = tokenizer(post, return_tensors="pt", truncation=True, padding=True, max_length=128)
    encoding = {key: val.to(device) for key, val in encoding.items()}

    # Inference must run with dropout disabled. Switch to eval mode for the
    # forward pass and restore the caller's mode afterwards so predicting in
    # the middle of training does not silently change the model's state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**encoding).logits
    finally:
        if was_training:
            model.train()

    # Apply sigmoid activation to get independent per-label probabilities.
    # squeeze(0) removes only the batch axis; a bare squeeze() would also drop
    # the label axis when there is a single class and break the iteration below.
    probs = torch.sigmoid(logits).squeeze(0).cpu().numpy()

    return [label for label, prob in zip(mlb.classes_, probs) if prob >= threshold]

In [None]:
# Score one example post, using a stricter cut-off than predict()'s default.
new_post = "I am looking for a Project Management Role."
predicted_categories = predict(new_post, model, tokenizer, mlb, threshold=0.5)
print(predicted_categories)