In [1]:
# Mount Google Drive at /content/drive so files stored under MyDrive can be
# read from this runtime. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
# Tab-separated corpus located on the mounted Drive.
data_path = '/content/drive/MyDrive/filtered.tsv'
# Location handed to `trainer.save_model(...)` and later to `from_pretrained(...)`.
# NOTE(review): despite the `.pt` suffix this is presumably used as a directory — confirm.
model_cktp_path = 'pretrained.pt'

# Use the first TSV column as the DataFrame index.
data = pd.read_csv(
    data_path, sep="\t", index_col=0
)
# Last expression of the cell: previews the first rows as the cell output.
data.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import re
import pandas as pd

# Fetch the NLTK resources this notebook relies on (tokenizer models, WordNet,
# the stop-word lists and the Open Multilingual WordNet data).
for _nltk_resource in ('punkt', 'wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Shared text-normalisation helpers: English stop-word set, Porter stemmer
# and WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def remove_symbols(text: str) -> str:
    """Reduce a sentence to lowercase ASCII letters separated by single spaces.

    Every maximal run of characters other than ``a-z`` / ``A-Z`` (whitespace,
    digits, punctuation, apostrophes, non-ASCII symbols, ...) is collapsed
    into one space. The result is stripped of leading/trailing spaces and
    lowercased.

    This single pass produces exactly the same output as the previous chain of
    four substitutions (whitespace, digits, ``.!?``, everything else): each of
    the earlier passes only turned non-letters into spaces, which the final
    pass collapsed anyway.

    Args:
        text: Raw sentence.

    Returns:
        The normalised sentence; an empty string if ``text`` has no letters.
    """
    letters_only = re.sub(r"[^a-zA-Z]+", " ", text)
    return letters_only.strip().lower()

def preprocess_df(data: pd.DataFrame,
                  toxicity_threshold: float = 0.99
                  ) -> pd.DataFrame:
    """Build a cleaned "toxic -> neutral" parallel corpus from the raw pairs.

    Steps:
      1. Orient every pair so that ``reference`` holds the more toxic sentence
         and ``translation`` the less toxic one. The ``ref_tox`` / ``trn_tox``
         scores are swapped together with the texts, so each score keeps
         describing the text column it belongs to and calling this function
         on its own output does not swap the texts back.
      2. Keep only pairs where one side scores above ``toxicity_threshold``
         and the other side below ``1 - toxicity_threshold``.
      3. Normalise both text columns with ``remove_symbols``.

    The input frame is left untouched; all work happens on a copy.

    Args:
        data: Frame with at least the columns ``reference``, ``translation``,
            ``ref_tox`` and ``trn_tox``.
        toxicity_threshold: Minimum score for the toxic side; the other side
            must score below ``1 - toxicity_threshold``.

    Returns:
        A new DataFrame containing the selected, oriented and cleaned rows.
    """
    # Never mutate the caller's frame.
    oriented = data.copy()

    # Rows where the "translation" is the more toxic side need flipping.
    needs_swap = oriented["trn_tox"] > oriented["ref_tox"]
    for left, right in (("reference", "translation"), ("ref_tox", "trn_tox")):
        left_values = oriented.loc[needs_swap, left].copy()
        oriented.loc[needs_swap, left] = oriented.loc[needs_swap, right]
        oriented.loc[needs_swap, right] = left_values

    # The condition is symmetric in the two scores, so the set of selected
    # rows does not depend on whether the scores were swapped above.
    low = 1 - toxicity_threshold
    keep = (
        ((oriented["ref_tox"] > toxicity_threshold) & (oriented["trn_tox"] < low))
        | ((oriented["trn_tox"] > toxicity_threshold) & (oriented["ref_tox"] < low))
    )

    # Clean the text of the surviving rows on an independent copy.
    data_preprocessed = oriented[keep].copy()
    for column in ("reference", "translation"):
        data_preprocessed[column] = data_preprocessed[column].apply(remove_symbols)

    return data_preprocessed






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [4]:
from sklearn.model_selection import train_test_split

# Preprocess a copy and bind the result to a new name: the raw `data` frame
# loaded earlier stays intact, so re-running this cell always starts from the
# same input and yields the same split.
detox_df = preprocess_df(data.copy())

# Hold out 10% of the pairs for validation (fixed seed for reproducibility)
train_df, val_df = train_test_split(detox_df, test_size=0.1, random_state=42)

# Only the two text columns are needed for training and validation
train_df = train_df[['reference', 'translation']]
val_df = val_df[['reference', 'translation']]

# Checking the first few rows of the training data
train_df.head()


Unnamed: 0,reference,translation
221588,goddamn it,oh my god
142200,no fucking around,no more hanging around
514822,klemash where the fucking kid,klemash where s the baby
434931,who the fuck do you think you are,do you think you re christ
413069,i once had a bunch of wads a big jerk who want...,grunts i once had a a buckshot supplier a real...


In [5]:
!pip install transformers[torch]

Collecting transformers[torch]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m102.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m88.8 MB/s

In [6]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.3 MB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m21.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [7]:
import torch
import gc
# Ask PyTorch to release its cached CUDA memory before the model is loaded.
torch.cuda.empty_cache()
# Run a garbage-collection pass; as the last expression of the cell, its
# return value is what the cell displays.
gc.collect()


0

In [8]:
# Checkpoint name used to load both the tokenizer and the seq2seq model, and
# as the prefix of the training output directory.
model_checkpoint = 't5-small'

In [9]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import transformers

# Load the tokenizer and the pretrained seq2seq model for `model_checkpoint`.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoint) # resolves to T5TokenizerFast (transformers.models.t5.tokenization_t5_fast)
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Create a custom dataset class
class TextDetoxDataset(Dataset):
    """Dataset of (reference, translation) sentence pairs, tokenized on access.

    Args:
        tokenizer: Callable accepting ``(text, max_length=..., truncation=...)``
            and returning a mapping with ``input_ids`` and ``attention_mask``.
        data_df: Frame with ``reference`` (model input) and ``translation``
            (target) columns.
        max_length: Maximum token length passed to the tokenizer; longer
            sequences are truncated.
    """

    def __init__(self, tokenizer, data_df, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.input_texts = data_df['reference']
        self.target_texts = data_df['translation']

    def __len__(self):
        return len(self.input_texts)

    def _encode(self, text):
        """Tokenize one sentence, truncated to ``max_length``."""
        return self.tokenizer(text, max_length=self.max_length, truncation=True)

    def __getitem__(self, index):
        # Rows are addressed by position, independent of the frame's index labels.
        source = self._encode(self.input_texts.iloc[index])
        target = self._encode(self.target_texts.iloc[index])

        # The target token ids serve as the labels for the model.
        return {
            'input_ids': source['input_ids'],
            'attention_mask': source['attention_mask'],
            'labels': target['input_ids'],
        }

# Wrap the train/validation splits in datasets; sentences are tokenized
# lazily, one pair at a time, when an item is requested.
train_dataset = TextDetoxDataset(tokenizer, train_df)
val_dataset = TextDetoxDataset(tokenizer, val_df)


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.14.6 dill-0.3.7 multiprocess-0.70.15


In [11]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m61.4/119.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.3.2


In [12]:
from datasets import load_metric
# SacreBLEU metric object used by `compute_metrics` during evaluation.
# NOTE(review): the cell output echoes this line as a warning — presumably the
# `load_metric` deprecation notice; confirm and plan a migration.
metric = load_metric("sacrebleu")
# Number of fine-tuning epochs and the seed passed to the training arguments.
epochs = 10
seed = 42

  metric = load_metric("sacrebleu")


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [13]:
# Hyper-parameters for fine-tuning
batch_size = 32
args = transformers.Seq2SeqTrainingArguments(
    output_dir=f"{model_checkpoint}-finetuned-de-toxification",
    # schedule and optimisation
    num_train_epochs=epochs,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    seed=seed,
    # evaluation: run once per epoch, generating text so BLEU can be computed
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # checkpointing and logging
    save_total_limit=3,
    report_to=['tensorboard'],
)

In [14]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
# NOTE(review): presumably pads inputs and labels per batch — see the
# transformers documentation for `DataCollatorForSeq2Seq`.
data_collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [15]:
# simple postprocessing for text
def postprocess_text(preds: list[str], labels: list[str]):
    """
    Trim surrounding whitespace from decoded predictions and references.

    Each reference is additionally wrapped in a one-element list, i.e. one
    list of references per prediction.

    Args:
        preds (list[str]): Decoded model outputs.
        labels (list[str]): Decoded reference sentences.

    Returns:
        tuple[list[str], list[list[str]]]: Stripped predictions and the
        stripped references, each wrapped in its own list.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

# compute metrics function to pass to trainer
def compute_metrics(eval_preds: tuple):
    """
    Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds (tuple): ``(predictions, labels)`` arrays of token ids.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        dict: ``{"bleu": ..., "gen_len": ...}``, both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 is used as a padding/ignore value and cannot be decoded. It can show
    # up in the generated predictions as well as in the labels, so map it to
    # the pad token in both before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    # Generated length = number of non-padding tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [16]:
# Wire the model, training arguments, datasets, collator and metric callback
# into the seq2seq trainer.
trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [17]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.071,1.880503,26.7206,11.6564
2,1.9874,1.824681,27.1018,11.5461
3,1.9293,1.794965,27.3959,11.5718
4,1.9128,1.77474,27.5532,11.5144
5,1.884,1.761769,27.684,11.5229
6,1.8812,1.751299,27.764,11.5242
7,1.8626,1.74505,27.8609,11.5005
8,1.8435,1.740571,27.9484,11.4692
9,1.8622,1.73707,27.9604,11.4985
10,1.8458,1.736602,27.9731,11.4813




TrainOutput(global_step=48920, training_loss=1.9252171089877288, metrics={'train_runtime': 7470.5079, 'train_samples_per_second': 209.512, 'train_steps_per_second': 6.548, 'total_flos': 1.5760924971368448e+16, 'train_loss': 1.9252171089877288, 'epoch': 10.0})

In [18]:
# Persist the fine-tuned model to `model_cktp_path` so it can be reloaded below.
trainer.save_model(model_cktp_path)

In [19]:
# Reload the fine-tuned model from `model_cktp_path` and put it in evaluation
# mode for inference. `ModelType` is only used as a type annotation.
ModelType = transformers.models.t5.modeling_t5.T5ForConditionalGeneration
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_cktp_path)
model.eval()
# NOTE(review): this switches off the `use_cache` config flag — presumably the
# key/value cache that normally speeds up generation; confirm it is intended.
model.config.use_cache = False

In [20]:
ModelTokenizer = transformers.models.t5.tokenization_t5_fast.T5TokenizerFast
def inference(text_model: ModelType, inference_request: str, tokenizer: ModelTokenizer) -> str:
    """Generate the model's rewrite of a single sentence.

    Args:
        text_model: Fine-tuned seq2seq model used for generation.
        inference_request: Sentence to rewrite.
        tokenizer: Tokenizer matching ``text_model``.

    Returns:
        The decoded first generated sequence, without special tokens.
    """
    encoded = tokenizer(inference_request, return_tensors="pt")
    # Put the inputs on the same device as the model so this also works when
    # the model lives on a GPU.
    input_ids = encoded.input_ids.to(text_model.device)
    outputs = text_model.generate(input_ids=input_ids)
    # `temperature` is a generation setting, not a decoding option, so it is
    # not passed to `decode`.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [21]:
# Smoke-test the fine-tuned model on one toxic sentence; the rewrite is shown
# as the cell output.
inference(model, "Fuck it I'm done with this shit goddamn", tokenizer)



'i m done with this'