# Labeling the dataset with a transformer model
In this notebook, you are going to use the pre-trained transformer model, which was fine-tuned by the `transfomers_model_finetuner.py` script, to label the unlabelled rows (those whose `class` is `unknown`) of the `labeled.csv` dataset. The first labeling method only labels a random sample of 700 unlabelled data points, while the second method labels the entire unlabelled dataset.
## Pre-setup
### Loading the Model

In [1]:
import pandas as pd
import os

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm.auto import tqdm

import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory of the fine-tuned model; both the tokenizer and the
# sequence-classification model are loaded from this path via `from_pretrained`.
model_dir = "../models/t5_E5_balanced_25_06/hf_transformer_model"

In [3]:
# Load the fine-tuned tokenizer and classification model from `model_dir`.
print(f"\n--- Loading the model from {model_dir} ---")
loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# Pick the best available backend: CUDA GPU first, then Apple-silicon MPS,
# and fall back to the CPU when no accelerator is present.
if torch.cuda.is_available():
    load_device = torch.device("cuda")
elif torch.backends.mps.is_available():
    load_device = torch.device("mps")
else:
    load_device = torch.device("cpu")

loaded_model.to(load_device)
# The model is only used for inference in this notebook, so make evaluation
# mode explicit (disables dropout) instead of relying on the loader's default.
loaded_model.eval()

print("Model and tokenizer loaded successfully!")
print("Model architecture:", loaded_model)


--- Loading the model from ../models/t5_E5_balanced_25_06/hf_transformer_model ---
Model and tokenizer loaded successfully!
Model architecture: T5ForSequenceClassification(
  (transformer): T5Model(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (Den

### Loading the Dataset

In [4]:
# low_memory=False makes pandas infer each column's dtype from the whole file.
# NOTE(review): the previous run emitted a warning pointing at this read_csv
# line, presumably a mixed-dtype (DtypeWarning) notice -- confirm.
dataset = pd.read_csv('../../data/labeled.csv', low_memory=False)
# Keep only the rows whose class is still 'unknown'. `.copy()` detaches the
# subset from the raw frame so adding columns to it later is safe and does not
# trigger SettingWithCopyWarning.
dataset = dataset[dataset['class'] == 'unknown'].copy()
len(dataset)

  dataset = pd.read_csv('../../data/labeled.csv')


72984

In [5]:
# Class id -> class name. Ids are assigned in list order, starting at 1.
Index_to_class = dict(enumerate(
    [
        'animal welfare',
        'blm',
        'climate',
        'culture',
        'discrimination',
        'education',
        'environment',
        'farmers',
        'health care',
        'housing',
        'immigration',
        'labor rights',
        'lgbtq',
        'palestine-israel conflict',
        'pandemic',
        'policies & politics',
        'public services',
        'ukraine-russia war',
        'unjust law enforcement',
        'women rights',
    ],
    start=1,
))

# Reverse lookup: class name -> class id.
class_to_index = {name: idx for idx, name in Index_to_class.items()}

## Labeling a random sample of 700 unlabeled data points

In [6]:
SAMPLE_SIZE = 700  # number of unlabeled rows to draw for the sampled run
RANDOM_SEED = 42   # fixed seed so the same rows are drawn on every run

# Accumulator for the rows produced by the sampled labeling run.
unkown_data = []
# Positional (iloc) indexes into `dataset`. A dedicated Random instance keeps
# the draw reproducible without touching the global RNG state, and min()
# avoids a ValueError when fewer than SAMPLE_SIZE rows are available.
random_indexes = random.Random(RANDOM_SEED).sample(
    range(len(dataset)), min(SAMPLE_SIZE, len(dataset))
)

Change the file path in the last line (the `to_csv` call) if you want to save the results to a different file.

In [7]:
MAX_LEN_HF = 128  # max_length passed to the tokenizer; longer inputs are truncated

# Start from an empty list so that re-running this cell (for example after an
# interrupted run) never appends to rows left over from a previous execution.
unkown_data = []
for i in tqdm(random_indexes, desc="Labeling sample"):
    # `i` is a positional index, hence .iloc rather than label-based access.
    text = dataset['clean_notes'].iloc[i]
    unfiltered_text = dataset['notes'].iloc[i]
    inputs = loaded_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LEN_HF)
    inputs = {key: val.to(load_device) for key, val in inputs.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        # +1 shifts the argmax position onto the 1-based keys of Index_to_class.
        predicted_class = torch.argmax(logits, dim=1).item() + 1

    # Store the original note, the predicted class name and the existing class value.
    unkown_data.append([unfiltered_text, Index_to_class[predicted_class], dataset['class'].iloc[i]])

new_df = pd.DataFrame(unkown_data, columns=['notes', 'class', 'orginal_class'])
new_df.to_csv('../../data/KINBERT_3000.csv', index=False)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


KeyboardInterrupt: 

## Labeling the whole dataset
Change the file path in the last line (the `to_csv` call) if you want to save the results to a different file.

In [None]:
MAX_LEN_HF = 128  # max_length passed to the tokenizer; longer inputs are truncated

# Predictions are collected in positional order and attached as a column in
# one step. Writing them with `dataset.at[i, ...]` would treat the positional
# counter `i` as an index *label*; because `dataset` is a filtered frame that
# keeps its original index, that puts predictions on the wrong rows or
# silently appends new rows.
predicted_labels = []
for i in tqdm(range(len(dataset)), desc="Labeling full dataset"):
    text = dataset['clean_notes'].iloc[i]
    inputs = loaded_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LEN_HF)
    inputs = {key: val.to(load_device) for key, val in inputs.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        # +1 shifts the argmax position onto the 1-based keys of Index_to_class.
        predicted_class = torch.argmax(logits, dim=1).item() + 1

    predicted_labels.append(Index_to_class[predicted_class])

# A plain list is assigned by position, so row k receives prediction k
# regardless of the frame's index labels.
dataset = dataset.assign(predicted_class=predicted_labels)
dataset.to_csv('../../data/filtered_events_class_with_predicted.csv', index=False)
    

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
