In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
# Root directory of the experiment; model generations are read from
# subdirectories of it further down in the notebook.
BASE_DIR = "/extra/ucinlp1/cbelem/experiments-apr-15"

# Toxicity results are written under BASE_DIR. The directory is created up
# front; exist_ok=True keeps this cell safe to re-run.
OUTPUT_DIR = f"{BASE_DIR}/toxicity_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Candidate target words (religious identities and professions).
# NOTE(review): this list is not referenced in the visible cells -- the word
# actually processed is chosen by the separate TARGET_WORD assignment below.
TARGET_WORDS = [
    "muslim",
    "jewish",
    "buddhist",
    "christian",
    "doctor",
    "nurse",
]

## Load Toxicity model

In [4]:
from toxic_clf_model import ToxicCommentTagger
import torch 

# Index of the CUDA device that hosts the toxicity classifier.
DEVICE = 1
TOXIC_MODEL_DEVICE = f"cuda:{DEVICE}"

# It is a multilabel model (we will consider only the "toxic" label, index 0).
# The six labels are, in order:
#   toxic, severe_toxic, obscene, threat, insult, identity_hate
toxicity_model = ToxicCommentTagger(n_classes=6)

# map_location="cpu": without it torch.load restores every tensor onto the
# device it was saved from, which may be a different GPU than the one selected
# above (or one that does not exist on this machine). Deserialize on the CPU
# first and move the whole model to the target device explicitly afterwards.
state_dict = torch.load('./model.pth', map_location="cpu")
toxicity_model.load_state_dict(state_dict)

# Inference only: switch to eval mode before moving to the device.
toxicity_model.eval()
toxicity_model.to(TOXIC_MODEL_DEVICE)

# sanity check
toxicity_model.predict(["Yo, ma homie! What's up?", "This is just a commmon sequence!"], device=TOXIC_MODEL_DEVICE)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


array([[0.00338174, 0.00256998, 0.00332734, 0.00326782, 0.00316576,
        0.00304676],
       [0.01982095, 0.00048714, 0.00112874, 0.00064673, 0.00271987,
        0.00104581]], dtype=float32)

In [5]:
def add_toxic_prediction(df: pd.DataFrame, sequences, colname: str, model, device, batch_size: int) -> pd.DataFrame:
    """Return a copy of ``df`` extended with six toxicity score columns.

    ``model.predict(sequences, device=..., batch_size=...)`` is called once and
    its result is indexed column-wise (``result[:, i]``) for ``i`` in 0..5.

    Args:
        df: Frame to extend; it is copied, never modified in place.
        sequences: Texts handed to ``model.predict``.
            (assumes one text per row of ``df``, in row order -- the scores
            are assigned positionally.)
        colname: Prefix prepended to every new column name (may be empty).
        model: Object exposing ``predict(sequences, device=, batch_size=)``.
        device: Forwarded unchanged to ``model.predict``.
        batch_size: Forwarded unchanged to ``model.predict``.

    Returns:
        The copied frame with the columns ``{colname}toxicity``,
        ``{colname}severe_toxic``, ``{colname}obscene``, ``{colname}threat``,
        ``{colname}insult`` and ``{colname}identity_hate`` appended, in that
        order.
    """
    # Column suffixes, listed in the order of the score columns they name.
    label_names = (
        "toxicity",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    )

    scores = model.predict(sequences, device=device, batch_size=batch_size)

    scored = df.copy()
    for position, label in enumerate(label_names):
        scored[f"{colname}{label}"] = scores[:, position]
    return scored

### Obtain toxicity for data

### Obtain toxicity for model generations

In [6]:
# Target word processed by this run; it selects the input CSV and is part of
# every output filename written below.
TARGET_WORD = "christian"
print(TARGET_WORD)

christian


In [7]:
# Model whose generations are scored. Defined once so that the input directory
# (generations) and the output directory (toxicity scores) always refer to the
# same model.
MODEL_NAME = "EleutherAI__pythia-70m"
MODEL_DIR = f"{BASE_DIR}/models/{MODEL_NAME}"
MODEL_OUT_DIR = f"{OUTPUT_DIR}/models/{MODEL_NAME}"
os.makedirs(MODEL_OUT_DIR, exist_ok=True)

# One CSV per target word; its first column is used as the frame's index.
BASE_PATH = f"{MODEL_DIR}/{TARGET_WORD}_min_prefix.csv"
BASE_DATA = pd.read_csv(BASE_PATH, index_col=0)
print("Read data from", BASE_PATH, len(BASE_DATA))

Read data from /extra/ucinlp1/cbelem/experiments-apr-15/models/EleutherAI__pythia-70m/christian_min_prefix.csv 2662400


In [8]:
# Rows whose sampling kwargs mention "top_p" get their "sampling" value set to
# "top-p".
# The kwargs column comes from a CSV, so its entries are expected to be strings
# (TODO confirm). A vectorized literal substring test is used: regex=False
# matches "top_p" verbatim and na=False treats missing kwargs as "no match"
# instead of raising TypeError the way `"top_p" in NaN` would.
top_p_mask = BASE_DATA["sampling_kwargs"].str.contains("top_p", regex=False, na=False)

# A scalar is broadcast to every selected row; there is no need to count the
# matches and build a list of identical strings.
BASE_DATA.loc[top_p_mask, "sampling"] = "top-p"

In [9]:
# Sanity check: number of non-null values per column for each sampling label,
# to confirm that the "top-p" label assigned above is present.
BASE_DATA.groupby("sampling").count()

Unnamed: 0_level_0,prefix,sequence,sequence_log_prob,sampling_kwargs
sampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
multinomial,204800,204800,204800,204800
temperature,819200,819200,819200,819200
top-k,819200,819200,819200,819200
top-p,819200,819200,819200,819200


In [10]:
# Let us process the toxicity by parts: collect the distinct sampling labels
# (sorted, so the order is stable across runs) and score them a few at a time.
sampling_types = sorted(BASE_DATA["sampling"].unique())
sampling_types

['multinomial', 'temperature', 'top-k', 'top-p']

In [11]:
# Score only the first two sampling labels in this cell. The "_pt1" suffix in
# the output filename presumably marks this as the first part of the run --
# TODO confirm against the cell that handles the remaining labels.
print(sampling_types[:2])
for sampling in sampling_types[:2]:
    print(sampling, TARGET_WORD)
    # Keep only the rows generated with the current sampling label.
    data = BASE_DATA[BASE_DATA["sampling"] == sampling]
    # dropna() without arguments removes rows with a missing value in ANY
    # column, not just "sequence".
    data = data.dropna()
    sequences = (data["sequence"]).values.tolist()
    # Empty prefix: the score columns are named "toxicity", "severe_toxic", ...
    sequences_preds = add_toxic_prediction(data, sequences, "", toxicity_model, batch_size=16, device=TOXIC_MODEL_DEVICE)
    # One CSV per (target word, sampling label) under the model output directory.
    sequences_preds.to_csv(f"{MODEL_OUT_DIR}/{TARGET_WORD}_{sampling}_pt1.csv")

['multinomial', 'temperature']
multinomial christian
temperature christian
