# Predicting labels for new statements

## 0. Setup

### 0.1 Install libraries

In [1]:
! pip install -r requirements.txt

# If you work with GPU-support:
! pip install torch==2.7.1+cu128 -f https://download.pytorch.org/whl/torch/
! pip install torchaudio==2.7.1+cu128 -f https://download.pytorch.org/whl/torchaudio/
! pip install torchvision==0.22.1+cu128 -f https://download.pytorch.org/whl/torchvision/

# If you only work with CPU-support:
# ! pip install torch==2.7.1
# ! pip install torchaudio==2.7.1
# ! pip install torchvision==0.22.1




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in links: https://download.pytorch.org/whl/torch/



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in links: https://download.pytorch.org/whl/torchaudio/



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in links: https://download.pytorch.org/whl/torchvision/



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### 0.2 Import libraries

In [2]:
# Load necessary libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## 1. Configuration

In [None]:
MODEL_DIR = "./best_model"  # directory passed to from_pretrained() for both the model and the tokenizer
# NOTE(review): despite the "CSV" in its name, this path ends in .xlsx — confirm the loading cell
# uses a reader that matches the file format.
CSV_PATH = r"../../data/validation_data/validation_labeled.xlsx"
OUTPUT_PATH = "new_statements_with_preds.csv"  # file the table with predictions is written to
TEXT_COLUMN = "expanded"  # column of the input table that holds the statements to classify
LABEL_COLUMN = "label_pred"  # column that receives the predicted class index
MAX_LENGTH = 350  # tokenizer truncation length; presumably should equal the value used during training — TODO confirm
BATCH_SIZE = 8  # number of statements per inference batch
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when CUDA is available, else the CPU

## 2. Load model, tokenizer and data

In [4]:
# Restore the saved classifier and its tokenizer from MODEL_DIR
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
# .to() returns the module itself, so loading and device placement can be chained
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(DEVICE)
# Inference mode for layers such as dropout; as the cell's last expression this also displays the architecture
model.eval()

Loading model...


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
# Load and preprocess data
# CSV_PATH may point at a CSV file or an Excel workbook. Pick the pandas reader from the
# file extension, because pd.read_csv cannot parse .xlsx/.xls files.
if str(CSV_PATH).lower().endswith((".xlsx", ".xls")):
    df = pd.read_excel(CSV_PATH)  # needs an Excel engine (e.g. openpyxl) installed
else:
    df = pd.read_csv(CSV_PATH)

# Fail early with a readable message instead of a bare KeyError from the column lookup.
if TEXT_COLUMN not in df.columns:
    raise KeyError(
        f"Column {TEXT_COLUMN!r} not found in {CSV_PATH}; available columns: {list(df.columns)}"
    )
if df.empty:
    raise ValueError(f"No rows found in {CSV_PATH}; nothing to predict.")

# The tokenizer only accepts strings. Missing cells become "" and any other value is
# converted with str(). Rows are kept (not dropped) so that the predictions stay aligned
# with df row-for-row when they are written back later.
n_missing = int(df[TEXT_COLUMN].isna().sum())
if n_missing:
    print(f"Warning: {n_missing} missing value(s) in '{TEXT_COLUMN}' are treated as empty strings.")
texts = ["" if pd.isna(value) else str(value) for value in df[TEXT_COLUMN]]

print("Tokenizing...")
# padding=True pads every sequence to the longest one in `texts`; truncation caps it at MAX_LENGTH.
encodings = tokenizer(texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="pt")
input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]

# Sequential sampling keeps the batches in the original row order of df.
dataset = TensorDataset(input_ids, attention_mask)
dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=BATCH_SIZE)

Tokenizing...


## 3. Run predictions

In [17]:
print("Running inference...")
all_preds = []

# Gradients are not needed for prediction, so autograd is switched off for the whole loop.
with torch.no_grad():
    for ids, mask in tqdm(dataloader):
        # Each batch is an (input_ids, attention_mask) pair, in the order given to TensorDataset.
        logits = model(input_ids=ids.to(DEVICE), attention_mask=mask.to(DEVICE)).logits
        # The predicted class is the index of the largest logit in each row.
        preds = logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())

Running inference...


100%|██████████| 2/2 [00:01<00:00,  1.94it/s]


### 3.1 Save predicted labels to data file

In [18]:
# Attach the predictions as a column of df. all_preds was collected batch by batch from a
# DataLoader that uses a SequentialSampler, so its order matches the rows of df.
df[LABEL_COLUMN] = all_preds
# Write the whole table (original columns plus the prediction column) without the index.
df.to_csv(OUTPUT_PATH, index=False)
print(f"Predictions saved to {OUTPUT_PATH}")

Predictions saved to new_statements_with_preds.csv


In [None]:
# The end...