In [1]:
%load_ext autoreload
%autoreload 2

from concept_gradient_v2 import ConceptGradients

In [2]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch.nn.functional as F
import torch.nn as nn
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import torch
import os
from datasets import Dataset
from concept_gradient_v2 import ConceptGradients
from torch.utils.data import DataLoader

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

class X2YModel(nn.Module):
    """Input-to-target classifier wrapping a RoBERTa sequence-classification checkpoint.

    Args:
        model_name: Checkpoint path or identifier handed to ``from_pretrained``.
        num_classes: Forwarded to ``from_pretrained`` as ``num_labels``.
    """

    def __init__(self, model_name='./saved_target_model', num_classes=2):
        super().__init__()
        self.model = RobertaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_classes,
        )

    def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None):
        """Run the encoder and return the classification head's output.

        ``inputs_embeds`` takes precedence: when it is not ``None`` the encoder
        is fed the embeddings and ``input_ids`` is ignored; otherwise the
        encoder is fed ``input_ids``.
        """
        # Select exactly one of the two mutually exclusive encoder inputs.
        if inputs_embeds is None:
            encoder_input = {'input_ids': input_ids}
        else:
            encoder_input = {'inputs_embeds': inputs_embeds}

        encoded = self.model.roberta(attention_mask=attention_mask, **encoder_input)
        return self.model.classifier(encoded.last_hidden_state)


class X2CModel(nn.Module):
    """Input-to-concept classifier wrapping a RoBERTa sequence-classification checkpoint.

    Args:
        model_name: Checkpoint path or identifier handed to ``from_pretrained``.
        num_concepts: Forwarded to ``from_pretrained`` as ``num_labels``.
    """

    def __init__(self, model_name='./saved_concept_model', num_concepts=5):
        super(X2CModel, self).__init__()
        # The checkpoint used to be moved to 'cuda' unconditionally, which
        # raises on CPU-only hosts. Keep the GPU placement when CUDA exists and
        # fall back to the CPU otherwise; callers can still re-place the
        # wrapper with `.to(...)`.
        initial_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # NOTE(review): ignore_mismatched_sizes=True lets loading proceed when
        # the checkpoint's head size differs from `num_concepts` -- confirm the
        # saved checkpoint really has `num_concepts` outputs.
        self.model = RobertaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_concepts,
            ignore_mismatched_sizes=True,
        ).to(initial_device)

    def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None):
        """Run the encoder and return the classification head's output.

        ``inputs_embeds`` takes precedence: when it is not ``None`` the encoder
        is fed the embeddings and ``input_ids`` is ignored; otherwise the
        encoder is fed ``input_ids``.
        """
        if inputs_embeds is not None:
            outputs = self.model.roberta(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
        else:
            outputs = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask)

        return self.model.classifier(outputs.last_hidden_state)

# Build each wrapper from its default checkpoint, then place it on `device`.
x2y_model = X2YModel()
x2y_model = x2y_model.to(device)

x2c_model = X2CModel()
x2c_model = x2c_model.to(device)

In [4]:
def forward_func(embeddings, attention_mask):
    """Target-model forward pass that takes embeddings instead of token ids.

    Passes ``embeddings`` to ``x2y_model`` as ``inputs_embeds`` together with
    ``attention_mask`` and returns the model output unchanged.
    """
    return x2y_model(inputs_embeds=embeddings, attention_mask=attention_mask)

def concept_forward_func(embeddings, attention_mask):
    """Concept-model forward pass that takes embeddings instead of token ids.

    Passes ``embeddings`` to ``x2c_model`` as ``inputs_embeds`` together with
    ``attention_mask`` and returns the model output unchanged.
    """
    return x2c_model(inputs_embeds=embeddings, attention_mask=attention_mask)

In [5]:
# Attribution helper wired to both forward functions and both wrapped models.
cg = ConceptGradients(
    forward_func,
    concept_forward_func=concept_forward_func,
    x2y_model=x2y_model,
    x2c_model=x2c_model,
)

def calculate_concept_gradient(
    input_ids,
    attention_mask,
    target_index,
    concept_index,
    mode,
    n_concepts=5,
    target_layer_name='roberta.encoder.layer.11.output.dense',
    concept_layer_name='roberta.encoder.layer.11.output.dense',
):
    """Compute concept-gradient attributions for a batch of token ids.

    The token ids are looked up in ``x2y_model``'s input embedding table and
    the resulting embeddings, together with a float attention mask, are passed
    to the module-level ``cg.attribute``.

    Args:
        input_ids: Token-id tensor; moved to ``device``.
        attention_mask: Attention-mask tensor; moved to ``device`` and cast to float.
        target_index: Passed to ``cg.attribute`` as ``target``.
        concept_index: Passed to ``cg.attribute`` as ``target_concept``.
        mode: Passed to ``cg.attribute`` as ``mode``.
        n_concepts: Number of concepts given to ``cg.attribute``. Defaults to
            the previously hard-coded 5.
        target_layer_name: Layer name given to ``cg.attribute`` for the target
            model. Defaults to the previously hard-coded layer.
        concept_layer_name: Layer name given to ``cg.attribute`` for the concept
            model. Defaults to the previously hard-coded layer.

    Returns:
        Whatever ``cg.attribute`` returns.
    """
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # The lookup itself is excluded from autograd so `embeddings` is a leaf
    # tensor; gradients are then requested with respect to it.
    # NOTE(review): only the target model's embedding table is used here --
    # confirm this is intended for the concept model's forward pass as well.
    with torch.no_grad():
        embeddings = x2y_model.model.get_input_embeddings()(input_ids)
    embeddings.requires_grad_(True)

    # requires_grad_ is only valid on floating-point tensors, hence the cast.
    attention_mask = attention_mask.float()
    attention_mask.requires_grad_(True)

    attr = cg.attribute(
        (embeddings, attention_mask),
        mode=mode,
        target=target_index,
        target_concept=concept_index,
        n_concepts=n_concepts,
        target_layer_name=target_layer_name,
        concept_layer_name=concept_layer_name,
    )

    return attr

In [6]:
# Load the test CSV; the relative path is resolved against the notebook's
# current working directory.
df_test = pd.read_csv('dataset/test.csv')
# Disabled: optional fixed-seed 1500-row subsample for quicker iteration.
# df_test = df_test.sample(1500, random_state=42)

In [7]:
# Concept columns, in the order they are packed into `concept_labels`.
CONCEPT_COLUMNS = ['obscene', 'threat', 'sexual_explicit', 'insult', 'identity_attack']
# Columns removed once the concept labels have been packed.
RAW_LABEL_COLUMNS = ['obscene', 'threat', 'insult', 'severe_toxicity', 'id', 'identity_attack', 'sexual_explicit']

# Guarded so the cell can be re-run: once the raw columns have been dropped, a
# second pass would otherwise raise KeyError when selecting them again.
if 'concept_labels' not in df_test.columns:
    df_test = (
        df_test
        .assign(concept_labels=df_test[CONCEPT_COLUMNS].values.tolist())
        .drop(columns=RAW_LABEL_COLUMNS)
    )

ds_test = Dataset.from_pandas(df_test)
df_test.head()

Unnamed: 0,comment_text,toxicity,concept_labels
0,Mr. Morneau has worked doubly hard to become o...,1,"[0, 0, 0, 1, 1]"
1,"Well, naturally Flowers is against immigration...",1,"[1, 0, 1, 1, 1]"
2,Move to cash Larry then wait for the Trump slu...,0,"[0, 0, 0, 0, 0]"
3,“Our research indicates that even though [elec...,0,"[0, 0, 0, 0, 0]"
4,Trudeau is a dunderhead as the problem is as m...,1,"[0, 0, 0, 1, 0]"


In [8]:
# The tokenizer is loaded from the stock 'roberta-base' identifier, not from
# the saved model directories.
# NOTE(review): presumably both fine-tuned checkpoints were trained with this
# same tokenizer -- confirm.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_function(examples):
    """Tokenize the ``comment_text`` entries of ``examples``.

    Calls the module-level ``tokenizer`` with truncation enabled and
    ``padding="max_length"``, and returns its result unchanged.
    """
    comments = examples["comment_text"]
    encoded = tokenizer(comments, truncation=True, padding="max_length")
    return encoded

# Tokenize the dataset in batched mode.
tokenized_dataset = ds_test.map(
    tokenize_function,
    batched=True,
)

# Expose only the columns the loops below read, formatted as torch tensors.
tokenized_dataset.set_format(
    "torch",
    columns=[
        "input_ids",
        "attention_mask",
        "concept_labels",
        "toxicity",
    ],
)

# shuffle=False keeps the dataset order across passes.
x2y_dl_test = DataLoader(
    tokenized_dataset,
    batch_size=8,
    shuffle=False,
)



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [9]:
# Collect one record per test sample whose predicted class differs from its
# `toxicity` label, with the concept-gradient attribution for the true label.
results = []
x2y_dl_test = DataLoader(tokenized_dataset, batch_size=8, shuffle=False)
output_csv = "analysis_sheets/final_misclassified_samples_with_concept_gradients.csv"

for batch in tqdm(x2y_dl_test, leave=True):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['toxicity'].to(device)
    concept_labels = batch['concept_labels']

    # Prediction only: skip building an autograd graph for the whole batch.
    with torch.no_grad():
        target_logits = x2y_model(input_ids=input_ids, attention_mask=attention_mask)
    target_preds = torch.argmax(target_logits, dim=-1)

    incorrect_indices = (target_preds != labels).nonzero(as_tuple=True)[0]

    # Plain Python ints index both the device tensors and the CPU-side
    # `concept_labels` without relying on cross-device tensor indexing.
    for idx in incorrect_indices.tolist():
        sample_input_ids = input_ids[idx].unsqueeze(0)
        sample_attention_mask = attention_mask[idx].unsqueeze(0)
        sample_label = labels[idx].item()
        sample_concept_label = concept_labels[idx].unsqueeze(0)
        sample_sentence = tokenizer.decode(sample_input_ids.squeeze(), skip_special_tokens=True)

        # Needs autograd, so it must stay outside any no_grad context.
        # The attribution target is the sample's true label.
        concept_gradient = calculate_concept_gradient(
            sample_input_ids,
            sample_attention_mask,
            target_index=sample_label,
            concept_index=None,
            mode='chain_rule_independent',
        )
        concept_gradient = concept_gradient[0].detach().cpu().numpy()

        with torch.no_grad():
            concept_probs = torch.sigmoid(
                x2c_model(input_ids=sample_input_ids, attention_mask=sample_attention_mask)
            ).cpu().numpy()
            target_probs = torch.softmax(
                x2y_model(input_ids=sample_input_ids, attention_mask=sample_attention_mask),
                dim=-1,
            ).cpu().numpy()

        # The "*_logits" keys are kept for compatibility with existing CSVs,
        # but the stored values are probabilities (softmax / sigmoid outputs).
        results.append({
            "sentence": sample_sentence,
            "target_logits": target_probs,
            "concept_logits": concept_probs,
            "concept_gradient": concept_gradient,
            "label": sample_label,
            "concept_label": sample_concept_label.cpu().numpy()
        })

df_results = pd.DataFrame(results)

# to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df_results.to_csv(output_csv, index=False)

# Report the path that was actually written.
print(f"Process complete. Results saved to '{output_csv}'")


  0%|          | 0/1250 [00:00<?, ?it/s]

Process complete. Results saved to 'misclassified_samples_with_concept_gradients.csv'


In [18]:
# Preview only the first records; echoing the full list floods the notebook output.
results[:2]

[{'sentence': "It's an idiotic system offering only one moronic choice. It lacks the integrity to offer a True/False choice, and is essentially meaningless. A lie politely expressed is bogus civility.",
  'target_logits': array([[0.00230334, 0.9976967 ]], dtype=float32),
  'concept_logits': array([[0.9518684 , 0.19160376, 0.50948566, 0.9963252 , 0.17970565]],
        dtype=float32),
  'concept_gradient': array([[0.0516965 , 0.04724283, 0.05041987, 0.08354744, 0.07352459]],
        dtype=float32),
  'label': 1,
  'concept_label': array([[1, 0, 0, 1, 1]])},
 {'sentence': "I WAS RAPED BY AMY DEMBOSKI\n\nLet's talk about it, either validate or disprove it",
  'target_logits': array([[0.04194589, 0.9580541 ]], dtype=float32),
  'concept_logits': array([[0.32105634, 0.49041313, 0.22665249, 0.3756622 , 0.5685829 ]],
        dtype=float32),
  'concept_gradient': array([[0.03498284, 0.03125231, 0.0372651 , 0.04892586, 0.045274  ]],
        dtype=float32),
  'label': 1,
  'concept_label': array(

In [19]:
# Show full cell contents instead of truncated text.
# NOTE: set_option is global, so it also affects every later cell.
pd.set_option('display.max_colwidth', None)
# Fixed seed so the same rows are shown on every Restart & Run All; the size
# is capped so the cell still works when fewer than 5 rows were collected.
df_results.sample(min(5, len(df_results)), random_state=42)

Unnamed: 0,sentence,target_logits,concept_logits,concept_gradient,label,concept_label
176,"All that makes me think he's a horrible, arrogant, sleazy, weasel, crook of a Republican. But like Hilary, he's less worse than Trump.","[[0.0034543686, 0.9965456]]","[[0.8831279, 0.34724164, 0.8705504, 0.9948782, 0.21210144]]","[[0.05465725, 0.037437785, 0.05967429, 0.10609473, 0.06470841]]",1,"[[1, 0, 1, 1, 1]]"
843,"Williams and Holleman, This is the path to social promotion. The mean reading level of Anchorage high school graduates is seventh grade. For mathematics it is sixth grade. The UA system spends an inordinate amount of time and money on remedial education because our high school graduates are not prepared for college work. I am not particularly blaming the teachers, but the system is not working. Most of that failure rests with the parents and there is little the system can do to fix that. If our public school system were a business it would have been forced to shut down decades ago. So instead of tinkering with standardized graduation testing let's improve results for those that are capable and motivated.","[[0.9984145, 0.0015855236]]","[[0.012940977, 0.013660851, 0.032178275, 0.13195023, 0.0193337]]","[[-0.18123941, -0.09661099, -0.11829409, -0.098931596, -0.09335074]]",0,"[[0, 0, 0, 0, 0]]"
1049,Loser.,"[[0.014992549, 0.98500746]]","[[0.62520665, 0.23438062, 0.14596039, 0.93952864, 0.12467685]]","[[0.04952069, 0.04434961, 0.069276474, 0.07471935, 0.06861603]]",1,"[[1, 0, 0, 1, 0]]"
127,So hypocritical. The US is and has been the number one meddler in other countries elections and governments.\nWhat's good for the goose is apparently not good for the gander.,"[[0.012243824, 0.98775613]]","[[0.30464816, 0.030655624, 0.41605493, 0.9697194, 0.04975498]]","[[0.0304129, -0.0025111604, 0.031075269, 0.12211521, 0.0048611513]]",1,"[[0, 0, 0, 1, 0]]"
46,I have yet to see a woman on a roof stripping shingles. Even carrying and nailing shingles. I have never seen a woman troweling concrete or working on a concrete crew. This equal rights foolishness has gone overboard.,"[[0.019861124, 0.9801389]]","[[0.24639589, 0.036675192, 0.5365454, 0.91647136, 0.07402957]]","[[0.028623523, -0.005248331, 0.041953776, 0.09727327, -0.00037343305]]",1,"[[0, 0, 0, 1, 1]]"


In [10]:
# (rows, columns) of df_results; each row is one record appended to `results`.
df_results.shape

(232, 6)

In [10]:
import torch
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Per-concept accuracy of the concept model over the whole test loader.
NUM_CONCEPTS = 5

# One list of predictions and one list of true labels per concept; both are
# indexed by concept position so later cells can reuse them.
concept_predictions = [[] for _ in range(NUM_CONCEPTS)]
concept_true_labels = [[] for _ in range(NUM_CONCEPTS)]

for batch in tqdm(x2y_dl_test, leave=True):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Evaluation only: skip building an autograd graph for every batch.
    with torch.no_grad():
        concept_logits = x2c_model(input_ids, attention_mask=attention_mask)

    # Sigmoid probabilities thresholded at 0.5 give binary predictions.
    concept_preds = (torch.sigmoid(concept_logits) > 0.5).int().cpu().numpy()
    # Labels are only compared on the host, so they never need to visit `device`.
    concept_targets = batch['concept_labels'].cpu().numpy()

    for i in range(NUM_CONCEPTS):
        concept_predictions[i].extend(concept_preds[:, i])
        concept_true_labels[i].extend(concept_targets[:, i])

# Compute accuracy for each concept
concept_accuracies = [
    accuracy_score(concept_true_labels[i], concept_predictions[i])
    for i in range(NUM_CONCEPTS)
]

# Print accuracies for each concept
for i, accuracy in enumerate(concept_accuracies):
    print(f"Concept {i+1} Accuracy: {accuracy:.4f}")

100%|██████████| 1250/1250 [02:29<00:00,  8.36it/s]

Concept 1 Accuracy: 0.8894
Concept 2 Accuracy: 0.8424
Concept 3 Accuracy: 0.7192
Concept 4 Accuracy: 0.9420
Concept 5 Accuracy: 0.7536





In [21]:
from sklearn.metrics import f1_score
# Per-concept F1, computed from the predictions and labels collected earlier.
concept_f1 = [
    f1_score(concept_true_labels[i], concept_predictions[i])
    for i in range(5)
]

# Print the F1 score for each concept
for position, score in enumerate(concept_f1, start=1):
    print(f"Concept {position} F1: {score:.4f}")

Concept 1 F1: 0.8395
Concept 2 F1: 0.3226
Concept 3 F1: 0.3728
Concept 4 F1: 0.9240
Concept 5 F1: 0.3827
