In [1]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import torch
from transformers import set_seed

# Set random seeds
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
set_seed(seed)

In [6]:
import pandas as pd
from datasets import Dataset

def load_jigsaw_dataset(train_path, test_path, test_labels_path):
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    test_labels_df = pd.read_csv(test_labels_path)

    # Merge the test dataset with its labels
    test_df = test_df.merge(test_labels_df, on='id')

    # Filter out rows with -1 labels (unlabeled samples)
    test_df = test_df[test_df['toxic'] != -1]

    # Load as Hugging Face Dataset
    train_dataset = Dataset.from_pandas(train_df[['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])
    test_dataset = Dataset.from_pandas(test_df[['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])

    return train_dataset, test_dataset

# Load the Jigsaw dataset
train_dataset, test_dataset = load_jigsaw_dataset('train.csv', 'test.csv', 'test_labels.csv')


# Load the Jigsaw dataset


In [None]:
# skip
# from transformers import AutoTokenizer
# import torch

# # Load the pre-trained tokenizer
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# # Tokenize the dataset
# # def tokenize(batch):
# #     return tokenizer(batch['comment_text'], padding="max_length", truncation=True, max_length=512)
# # def tokenize(batch):
# #     labels = [batch[col] for col in ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]
# #     labels = torch.tensor(labels).float()
# #     return tokenizer(batch['comment_text'], padding="max_length", truncation=True, max_length=512, return_tensors="pt").update({"labels": labels})
# def tokenize(batch):
#     labels = [[batch[col][i] for col in ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] for i in range(len(batch['comment_text']))]
#     labels = torch.tensor(labels).float()
#     tokenized_batch = tokenizer(batch['comment_text'], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
#     tokenized_batch["labels"] = labels
#     return tokenized_batch

# # Need to process at small batchsize due to memory limitation in google colab, used batch_size=32 here
# train_dataset = train_dataset.map(tokenize, batched=True, batch_size=32)
# train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
# test_dataset = test_dataset.map(tokenize, batched=True, batch_size=32) 
# test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])




In [7]:
from torch.nn import BCEWithLogitsLoss
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
def tokenize(batch):
    labels = [[batch[col][i] for col in ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] for i in range(len(batch['comment_text']))]
    labels = torch.tensor(labels).float()
    tokenized_batch = tokenizer(batch['comment_text'], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    tokenized_batch["labels"] = labels
    return tokenized_batch

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=32)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

test_dataset = test_dataset.map(tokenize, batched=True, batch_size=32)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])




Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/159571 [00:00<?, ? examples/s]

Map:   0%|          | 0/63978 [00:00<?, ? examples/s]

In [None]:

# Load the pre-trained model
checkpoint_path = "toxicity_detection/checkpoint-1000"
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path, num_labels=6)
# Training arguments
training_args = TrainingArguments(
    output_dir="toxicity_detection",
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=1,
    learning_rate=1e-6,
    logging_dir="logs",
    fp16=False,
    save_steps=1000,
    save_total_limit=2,
    resume_from_checkpoint=checkpoint_path,
    warmup_steps=500,
    lr_scheduler_type="linear",
)

# Custom Trainer class for multi-label classification
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

# Instantiate the custom Trainer
trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,

)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)




Step,Training Loss
500,0.0101
1000,0.0078
1500,0.011


KeyboardInterrupt: ignored

In [None]:
!cp -r "toxicity_detection/checkpoint-5000" backup/

In [9]:
!tar czf checkpoint1000.tar.gz "drive/MyDrive/toxicity_detection/checkpoint-1000"

In [10]:
!cp -r "/content/checkpoint1000.tar.gz" /content/drive/MyDrive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
from torch.nn import BCEWithLogitsLoss
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
saved_checkpoint_path = "drive/MyDrive/toxicity_detection/checkpoint-1000"
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)
model = AutoModelForSequenceClassification.from_pretrained(saved_checkpoint_path, num_labels=6)
# Training arguments
training_args = TrainingArguments(
    output_dir="toxicity_detection",
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=1,
    learning_rate=1e-6,
    logging_dir="logs",
    fp16=False,
    save_steps=1000,
    save_total_limit=2,
    resume_from_checkpoint=saved_checkpoint_path,
    warmup_steps=500,
    lr_scheduler_type="linear",
)

# Custom Trainer class for multi-label classification
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

# Instantiate the custom Trainer
trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,

)


# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

KeyboardInterrupt: ignored

In [None]:
!cp "drive/MyDrive/finalcheckpoint.tar.gz" checkpoin_resume

In [None]:
!mkdir checkpoin_resume

In [None]:
!gunzip -c checkpoin_resume/finalcheckpoint.tar.gz | tar xopf -

In [12]:
df = pd.read_csv('train.csv')

In [16]:
# Assuming you have a pandas dataframe called 'df'
column_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Count the occurrences of each value in the specified columns
occurrences = df[column_names].sum()

# Print the results
print(occurrences)
# Count the occurrences where rows have both "toxic" and another values
for c in ["severe_toxic", "obscene", "threat", "insult", "identity_hate"]:
  overlap_count = (df["toxic"] & df[c]).sum()
  print(f"Number of occurrences where both 'toxic' and '{c}' are present:", overlap_count)


toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64
Number of occurrences where both 'toxic' and 'severe_toxic' are present: 1595
Number of occurrences where both 'toxic' and 'obscene' are present: 7926
Number of occurrences where both 'toxic' and 'threat' are present: 449
Number of occurrences where both 'toxic' and 'insult' are present: 7344
Number of occurrences where both 'toxic' and 'identity_hate' are present: 1302
