# Tag classifier model

In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
def text_preprocessing(text):
    """Return ``text`` unchanged.

    Placeholder hook for text cleaning: no transformation is applied yet,
    so the input object is handed back exactly as received.
    """
    # TODO: Implement text preprocessing logic
    processed = text
    return processed

## Dataset reading

In [None]:
# Load the dataset from disk.
df = pd.read_csv("synthetic_dataset.csv")

# Each "tags" cell holds a Python list literal stored as a string: parse it,
# then normalise every tag (trim surrounding whitespace, lowercase).
df["tags"] = df["tags"].apply(ast.literal_eval).apply(
    lambda raw_tags: [raw.strip().lower() for raw in raw_tags]
)

# Run the free-text column through the (currently pass-through) preprocessing hook.
df["characteristics"] = df["characteristics"].apply(text_preprocessing)

MultiLabelBinarizer is used because each person can have multiple tags, not just one, so this is a multi-label classification problem.

In [None]:
# Tag vocabulary: every distinct tag in the dataset, in sorted order so the
# column order of the label matrix is stable.
all_tags = sorted({tag for tags in df["tags"] for tag in tags})
num_tags = len(all_tags)

# Multi-hot label matrix: one row per sample, one column per entry of all_tags.
mlb = MultiLabelBinarizer(classes=all_tags)
Y = mlb.fit_transform(df["tags"])

# Model inputs: the description text of each sample as a plain Python list.
descriptions = df["characteristics"].tolist()

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    descriptions,
    Y,
    test_size=0.3,
    random_state=42,
)

Model for embeddings and tokenizer setup

In [None]:
# Pretrained checkpoint; the encoder and its tokenizer are loaded from the same name.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
bert = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Embeddings

In [None]:
def get_bert_embeddings(texts, batch_size=16):
    """Embed texts with the module-level ``bert`` model using masked mean pooling.

    Each text is tokenized (truncated to 256 tokens), encoded by ``bert``, and
    reduced to a single vector by averaging the token embeddings of its
    non-padding tokens.

    Args:
        texts: Iterable of strings to embed. A single string is treated as
            one text rather than being split into characters.
        batch_size: Number of texts encoded per forward pass (must be >= 1).

    Returns:
        A tensor with one row per input text, located on the same device as
        ``bert``. Empty input yields a tensor of shape ``(0, hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # A bare string would otherwise be sliced character-wise by the batching loop.
    texts = [texts] if isinstance(texts, str) else list(texts)

    bert.eval()  # Inference mode for the encoder (e.g. disables dropout)
    model_device = bert.device

    if not texts:
        # torch.cat() raises on an empty list, so return an empty matrix instead.
        return torch.empty(
            (0, bert.config.hidden_size), dtype=bert.dtype, device=model_device
        )

    emb_list = []
    with torch.no_grad():  # No gradients are needed for feature extraction
        # Process texts in batches to bound peak memory usage
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=256,
            )
            # Inputs must live on the same device as the model weights.
            inputs = inputs.to(model_device)
            outputs = bert(**inputs)

            token_embeddings = outputs.last_hidden_state  # One vector per token
            # 1 for real tokens, 0 for padding; broadcast over the embedding axis.
            mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
            summed = torch.sum(token_embeddings * mask, 1)  # Padding contributes zero
            # Clamp the token count so an all-padding row cannot divide by zero.
            counts = torch.clamp(mask.sum(1), min=1e-9)
            emb_list.append(summed / counts)  # Mean over valid tokens

    return torch.cat(emb_list, dim=0)  # Stack the batches back into one matrix

In [None]:
# get_bert_embeddings() already runs under torch.no_grad(), so no extra
# gradient-disabling context is needed around these calls.
X_train_emb = get_bert_embeddings(X_train)
X_test_emb = get_bert_embeddings(X_test)

## Classifier

In [None]:
class TagClassifier(nn.Module):
    """Small feed-forward head that maps an embedding to one raw logit per tag.

    Architecture: Linear(embed_dim -> hidden_dim) -> ReLU -> Linear(hidden_dim -> num_tags).
    No sigmoid is applied here; the output is unnormalised logits.
    """

    def __init__(self, embed_dim, hidden_dim, num_tags):
        """Build the two linear layers and the activation between them.

        Args:
            embed_dim: Size of the input feature vector.
            hidden_dim: Width of the hidden layer.
            num_tags: Number of output logits (one per tag).
        """
        super().__init__()
        layers = [
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_tags),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Return the per-tag logits for the input batch ``x``."""
        logits = self.mlp(x)
        return logits

In [None]:
# All tensors and the classifier are placed on this device.
device = "cpu"

# Classifier whose input width equals the embedding dimension.
model = TagClassifier(
    embed_dim=X_train_emb.shape[1],
    hidden_dim=256,
    num_tags=num_tags,
)
model = model.to(device)

# Multi-label setup: an independent sigmoid/binary cross-entropy per tag.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Training tensors; labels are cast to float32 to match the logits in the loss.
X_train_t = X_train_emb.to(device)
Y_train_t = torch.tensor(Y_train, dtype=torch.float32).to(device)

## Training

In [None]:
EPOCHS = 30

# The mode is never switched inside the loop, so training mode is set once.
model.train()
for epoch in range(EPOCHS):
    # One full-batch gradient step over the entire training set.
    optimizer.zero_grad()
    logits = model(X_train_t)
    loss = loss_fn(logits, Y_train_t)
    loss.backward()
    optimizer.step()

    # Report the loss on every fifth epoch and on the final one.
    if epoch == EPOCHS - 1 or epoch % 5 == 0:
        print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {loss.item():.4f}")

## Evaluation

In [None]:
model.eval()
X_test_t = X_test_emb.to(device)
Y_test_t = torch.tensor(Y_test, dtype=torch.float32).to(device)

# Forward pass without gradient tracking; a tag is predicted when its
# sigmoid probability is above 0.5.
with torch.no_grad():
    logits = model(X_test_t)
    probs = torch.sigmoid(logits)
    preds = (probs > 0.5).cpu().numpy()  # Move to CPU and convert to a NumPy array

y_true = Y_test
y_pred = preds

# Show each test description next to its true and predicted tag names.
print("=== TEST SAMPLES ===\n")
for desc, true_row, pred_row in zip(X_test, y_true, y_pred):
    true_tags = [tag for tag, flag in zip(all_tags, true_row) if flag]
    pred_tags = [tag for tag, flag in zip(all_tags, pred_row) if flag]
    print(f"Text: {desc}")
    print(f"True tags: {true_tags}")
    print(f"Predicted: {pred_tags}\n")

## Predict/inference

In [None]:
def predict_tags(text, threshold=0.5):
    """Predict the tags for a single piece of text.

    The text is passed through ``text_preprocessing``, embedded with
    ``get_bert_embeddings``, and scored by the trained ``model``.

    Args:
        text: Description to classify.
        threshold: A tag is returned when its sigmoid probability is strictly
            greater than this value. Defaults to 0.5.

    Returns:
        List of tag names, in ``all_tags`` order, whose probability exceeds
        ``threshold``. May be empty.
    """
    model.eval()
    with torch.no_grad():
        emb = get_bert_embeddings([text_preprocessing(text)]).to(device)
        logits = model(emb)
        # Single input -> take row 0 of the (1, num_tags) probability matrix.
        probs = torch.sigmoid(logits).cpu().numpy()[0]
    return [tag for tag, p in zip(all_tags, probs) if p > threshold]

In [None]:
# Smoke test: run the full inference path on one hand-written description.
test_text = "Experienced in data science and machine learning"

print("=== EXAMPLE ===")
print(f"Input: {test_text}")
print("Tags:", predict_tags(test_text))