In [1]:
!pip install gdown
!pip install -q -U bitsandbytes peft accelerate optimum
!pip install transformers==4.10.0
!pip install flash-attn --no-build-isolation

!pip install torch>=1.7.0,!=1.8.0
!pip install numpy <1.23.0

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
Successfully installed gdown-4.7.1
Collecting transformers==4.10.0
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses (from transformers==4.10.0)
  Obtaining dependency information for sacremoses from https://files.pythonhosted.org/packages/0b/f0/89ee2bc9da434bd78464f288fdb346bc2932f2ee80a90b2a4bbbac262c74/sacremoses-0.1.1-py3-none-any.whl.metadata
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting tokenizers<0.11,>=0.10.1 (from transformers==4.10.0)
  Downloading tokenizers-0.10.3.tar.gz (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.7/212.7 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ done
[?2

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gdown
from kaggle_secrets import UserSecretsClient
import torch
from torch.utils.data import DataLoader
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType

import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Model 
from transformers import DataCollatorForSeq2Seq, get_scheduler, BitsAndBytesConfig
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback, IntervalStrategy
from datasets import load_dataset, Dataset, DatasetDict, load_metric

import os, sys
from tqdm.auto import tqdm

np.object = object

device = "cuda" if torch.cuda.is_available() else "cpu"
if device=="cuda":
    torch.cuda.empty_cache()
    torch.cuda.set_per_process_memory_fraction(0.7)
print("DEVICE:", device)

# set w&b secrets as environment variable
user_secrets = UserSecretsClient()
wandb_key = user_secrets.get_secret("w&b")
huggingface_key = user_secrets.get_secret("huggingface-write")
os.environ["WANDB_API_KEY"] = wandb_key
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



DEVICE: cuda


In [3]:
url = 'https://huggingface.co/datasets/s-nlp/paradetox/raw/main/train.tsv'
destination_path = "/kaggle/temp/"

if os.path.exists(destination_path) != True:
        os.mkdir(destination_path)
        
filename = url.split("/")[-1]
outpath = os.path.join(destination_path, filename)
gdown.download(url=url, output=outpath, quiet=False, fuzzy=True)

Downloading...
From: https://huggingface.co/datasets/s-nlp/paradetox/raw/main/train.tsv
To: /kaggle/temp/train.tsv
100%|██████████| 2.04M/2.04M [00:00<00:00, 8.26MB/s]


'/kaggle/temp/train.tsv'

In [4]:
df = pd.read_csv(outpath, sep='\t')
# rename columns
df = df.rename(columns={"en_toxic_comment":"toxic", "en_neutral_comment":"neutral"})
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19744 entries, 0 to 19743
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   toxic    19744 non-null  object
 1   neutral  19744 non-null  object
dtypes: object(2)
memory usage: 308.6+ KB


Unnamed: 0,toxic,neutral
0,he had steel balls too !,he was brave too!
1,"dude should have been taken to api , he would ...",It would have been good if he went to api. He ...
2,"im not gonna sell the fucking picture , i just...","I'm not gonna sell the picture, i just want to..."
3,the garbage that is being created by cnn and o...,the news that is being created by cnn and othe...
4,the reason they dont exist is because neither ...,The reason they don't exist is because neither...


In [5]:
# to huggingface dataset type
dataset = Dataset.from_pandas(df)

In [6]:
MAX_INLENGTH = 50
MAX_OUTLENGTH = 50
LEARNING_RATE = 5e-5
BATCH_SIZE = 4
EPOCHS = 1
MODEL_NAME = "Ribin/t5-base_detoxParaphraser"

In [7]:
# https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#Flash-Attention-2
# https://huggingface.co/blog/4bit-transformers-bitsandbytes
# load model in 4-bit
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, model_max_length=MAX_INLENGTH)
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 

model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME,
                                                   device_map='auto',
#                                                    quantization_config=quantization_config,
#                                                    attn_implementation="flash_attention_2",
#                                                    torch_dtype=torch.float16,
                                                   return_dict=True)

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.61k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [8]:
def preprocessing(samples):
    toxic = samples['toxic']
    neutral = samples['neutral']
    
    prefix_toxic = "Toxic version: "
    prefix_nontoxic = "Non-toxic version: "
    
    inputs = [prefix_toxic + text for text in toxic]
    outputs = [prefix_nontoxic + text for text in neutral]
    encodings = tokenizer(text=inputs, 
            text_target=outputs,
            padding="max_length",
            max_length=MAX_INLENGTH,
            truncation= True)
    return encodings

In [9]:
tokenized_dataset = dataset.map(preprocessing, batched=True, remove_columns=dataset.column_names)

tokenized_dataset = tokenized_dataset.train_test_split(train_size=0.90, seed=12)
tokenized_dataset.set_format("pt")
tokenized_dataset.column_names

  0%|          | 0/20 [00:00<?, ?ba/s]

{'train': ['input_ids', 'attention_mask', 'labels'],
 'test': ['input_ids', 'attention_mask', 'labels']}

In [10]:
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [11]:
def test(model, tokenizer, toxic_text, device):
    model = model
    batch = tokenizer([toxic_text], return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(input_ids=batch['input_ids'], max_new_tokens=MAX_OUTLENGTH)
    non_toxic_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(non_toxic_text)

In [12]:
toxic_text = "Toxic version: who the fuck are you?"
test(model, tokenizer, toxic_text, "cuda")

['<pad> Non-toxic version: Who are you?']


In [13]:
# Load the BLEU metric
bleu_metric = load_metric("bleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    # Compute BLEU score
    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    return result

Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

In [14]:
output_dir = "/kaggle/working/detox"

training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        learning_rate=LEARNING_RATE,
        num_train_epochs=EPOCHS,
        logging_dir=f"{output_dir}/logs",
        evaluation_strategy="steps",  # Evaluate every specified number of steps
        eval_steps=15,  # Specify the number of steps between evaluations
        logging_strategy="steps",
        logging_steps=15,
        save_strategy="steps",
        save_steps=100,
        warmup_steps=100,
        per_device_train_batch_size=12,
        per_device_eval_batch_size=6,
        gradient_accumulation_steps=12,
        eval_accumulation_steps=12,
        weight_decay=0.01,
        fp16=True, 
#         metric_for_best_model='f1',
#         load_best_model_at_end=True
        )

trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['test'],
#         callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
#         compute_metrics=compute_metrics
    )

In [15]:
trainer.train()
result = trainer.evaluate()
result

[34m[1mwandb[0m: Currently logged in as: [33mribinbaby[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240107_052740-8d8abo5o[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mcerulean-wood-4[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ribinbaby/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ribinbaby/huggingface/runs/8d8abo5o[0m


Step,Training Loss,Validation Loss
15,1.119,0.862605
30,1.1117,0.851181
45,1.0869,0.833959
60,1.0499,0.81432
75,1.006,0.787291
90,0.953,0.763292
105,0.9336,0.734475
120,0.9009,0.723049


{'eval_loss': 0.7226091027259827,
 'eval_runtime': 13.9749,
 'eval_samples_per_second': 141.325,
 'eval_steps_per_second': 23.614,
 'epoch': 1.0}

In [16]:
model_id = MODEL_NAME
print(model_id)
model.push_to_hub(model_id, token=huggingface_key)
tokenizer.push_to_hub(model_id, token=huggingface_key)

Ribin/t5-base_detoxParaphraser


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Ribin/t5-base_detoxParaphraser/commit/e7327b15b49648b482851c3519a042f2e6497226', commit_message='Upload tokenizer', commit_description='', oid='e7327b15b49648b482851c3519a042f2e6497226', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
print("END")

END
