Label all sentences using the fine-tuned transfer-learning model

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from torch.nn.functional import softmax
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Restore the fine-tuned classifier and its tokenizer from the local checkpoint
model_path = "3_model/transfer-learning/deberta-v3-tuned-final"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Switch to evaluation mode, then move the weights to the GPU when one is present
model.eval()
gpu_available = torch.cuda.is_available()
if gpu_available:
    model.cuda()

In [3]:
# Read the labelled sample and the full processed dataset (both decoded as latin1)
labeled_csv = "2_processed_data/sampling_labelled.csv"
full_csv = "2_processed_data/processed_dataset.csv"
df_labeled, df_full = (
    pd.read_csv(csv_path, encoding='latin1') for csv_path in (labeled_csv, full_csv)
)

In [4]:
# Label vocabulary plus the two lookup tables between label strings and class ids
valid_labels = ['C', 'G', 'W/Q', 'OTH']
inv_label_map = dict(enumerate(valid_labels))
label_map = {label: idx for idx, label in inv_label_map.items()}

# Keep only rows carrying a recognised label, and normalise the columns to str
has_valid_label = df_labeled['argument_type'].isin(valid_labels)
df_labeled = df_labeled.loc[has_valid_label].copy()
df_labeled['argument_type'] = df_labeled['argument_type'].astype(str)
df_full['sentence'] = df_full['sentence'].astype(str)

# Build a sentence -> label lookup (the first occurrence of a duplicated sentence
# is the one kept) and apply it to the full dataset; sentences that are absent
# from the labelled sample end up as NaN
sentence_to_label = (
    df_labeled
    .drop_duplicates(subset='sentence')
    .set_index('sentence')['argument_type']
)
df_full['argument_type'] = df_full['sentence'].map(sentence_to_label)

In [5]:
# Rows that still have no label after the lookup are the ones the model must classify
needs_prediction = df_full['argument_type'].isna()
df_to_predict = df_full.loc[needs_prediction].copy()
sentences_to_predict = df_to_predict['sentence'].tolist()

In [6]:
# Classify the unlabelled sentences in fixed-size batches, collecting one
# predicted class id per sentence in input order
batch_size = 16
predicted_labels = []

for start in tqdm(range(0, len(sentences_to_predict), batch_size)):
    chunk = sentences_to_predict[start:start + batch_size]

    # Tokenize the batch: pad to the longest sentence, truncate at 256 tokens
    model_inputs = tokenizer(
        chunk,
        max_length=256,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    if torch.cuda.is_available():
        model_inputs = {name: tensor.cuda() for name, tensor in model_inputs.items()}

    # Gradients are not needed for inference
    with torch.no_grad():
        logits = model(**model_inputs).logits
        class_ids = softmax(logits, dim=1).argmax(dim=1)
        predicted_labels.extend(class_ids.cpu().numpy())

100%|██████████| 6508/6508 [07:07<00:00, 15.21it/s]


In [7]:
# Write the model predictions back into the full dataframe.
# Rows are addressed through df_to_predict's index (the rows the predicted
# sentences were taken from) instead of re-evaluating the NaN mask, so running
# this cell again after the labels have been filled in repeats the same
# assignment rather than pairing the predictions with a different set of rows.
if len(predicted_labels) != len(df_to_predict):
    # Typically means the prediction loop was interrupted or is stale
    raise ValueError(
        f"Got {len(predicted_labels)} predictions for {len(df_to_predict)} sentences; "
        "re-run the prediction cell before updating the dataframe"
    )

# Translate class ids to label strings, keyed by the original row index
predicted_types = pd.Series(
    [inv_label_map[p] for p in predicted_labels],
    index=df_to_predict.index,
    dtype=object,
)
df_full.loc[predicted_types.index, 'argument_type'] = predicted_types

In [8]:
# Persist the fully labelled dataset; the dataframe index is not written out
# NOTE(review): the input CSVs are read with encoding='latin1' while no encoding
# is passed here — confirm the intended output encoding
output_csv = "2_processed_data/labelled_dataset.csv"
df_full.to_csv(output_csv, index=False)