# **Models**
## Fine tune `t5-small`
## [Link for the model](https://huggingface.co/t5-small)

### Preparation

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
# Choose device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [3]:
# Read in data from a CSV file
data = pd.read_csv("filtered_paranmt/filtered.tsv", sep="\t", index_col=0)
data.head(7)

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348
5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...,0.703185,0.206522,0.950956,0.035846
6,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.,0.618866,0.230769,0.999492,0.000131


In [6]:
# Class that is used to prepare the data for model
class MyDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_source_length, max_target_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.dataframe)
    
    def priint(self, idx):
        row = self.dataframe.iloc[idx]
        print(row["Toxic"])

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        inputs = self.tokenizer(
            row["Toxic"]
        )
        outputs = self.tokenizer(
            row["Neutral"]
        )
        inputs["input_ids"] = torch.tensor(inputs["input_ids"])
        inputs["attention_mask"] = torch.tensor(inputs["attention_mask"])
        inputs["labels"] = torch.tensor(outputs["input_ids"])
        return inputs

In [58]:
# Load input data 
input_data = pd.read_csv("filtered_for_models.csv", index_col=0)
input_data

Unnamed: 0,Toxic,Neutral,Tox score
0,if alkar floods her with her mental waste it w...,if alkar is flooding her with psychic waste th...,0.981983
1,youre becoming disgusting,now youre getting nasty,0.999039
2,well we can spare your life,well we could spare your life for one,0.985068
3,monkey you have to wake up,ah monkey youve got to snap out of it,0.994215
4,i have orders to kill her,ive got orders to put her down,0.999348
...,...,...,...
557503,you didnt know that estelle stole your fish fr...,you didnt know that estelle had stolen some fi...,0.949143
557504,itil suck the life out of you,youd be sucked out of your life,0.996124
557505,i cant fuckin take that bruv,i really cant take this,0.984538
557506,they called me a fucking hero the truth is i d...,they said i was a hero but i didnt care,0.991945


In [18]:
# Split data into train and validation sets
from sklearn.model_selection import train_test_split

train_set, val_set = train_test_split(input_data[:1000], test_size=0.2, random_state=42)


In [19]:
# %pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [20]:
# %pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.


In [21]:
# %pip install protobuf

Note: you may need to restart the kernel to use updated packages.


### Model fine tuning

In [22]:
# Load model and tokenizer from pretrained
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("t5-small", cache_dir=None)
model = AutoModelWithLMHead.from_pretrained("t5-small", cache_dir=None)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 36.4kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [23]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer)

In [24]:
# Recollect data to fit for the model
train_dataset = MyDataset(train_set, tokenizer, 128, 128)
val_dataset = MyDataset(val_set, tokenizer, 128, 128)

In [25]:
train_dataset.priint(0)

and im not just talking about hitting me for your boyfriend what a girl


In [26]:
# %pip install accelerate -U

In [27]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=124,
    per_device_eval_batch_size=124,
    num_train_epochs=2,
    logging_dir='./logs',
    save_strategy="steps",
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=300,
    logging_steps=50,
    learning_rate=1e-4,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [29]:
# Fine tuning
trainer.train()

  0%|          | 0/14 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 14/14 [10:31<00:00, 45.11s/it]

{'train_runtime': 631.4419, 'train_samples_per_second': 2.534, 'train_steps_per_second': 0.022, 'train_loss': 3.337999071393694, 'epoch': 2.0}





TrainOutput(global_step=14, training_loss=3.337999071393694, metrics={'train_runtime': 631.4419, 'train_samples_per_second': 2.534, 'train_steps_per_second': 0.022, 'train_loss': 3.337999071393694, 'epoch': 2.0})

In [31]:
torch.save(model.state_dict(), "t5small.pt")

### Paraphrasing

In [39]:
def paraphrase(text, max_length=128):

  input_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)

  generated_ids = model.generate(input_ids=input_ids, num_return_sequences=5, num_beams=5, max_length=max_length, no_repeat_ngram_size=2, repetition_penalty=3.5, length_penalty=1.0, early_stopping=True)

  preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

  return preds

In [38]:
test_sample = input_data.sample(n=10)
test_sample

Unnamed: 0,Toxic,Neutral,Tox score
527382,i threw them all out,i had them all fired,0.908439
244993,the background is that our thing is cool and p...,the grounds are our thing is cool and popular ...,0.986022
70263,nothing yet but i want you to know i smell a rat,so far from nothing but i have rats smell,0.957481
206964,could you not appreciate what you fucking had,you couldnt be happy with what you had,0.998181
91515,your destruction is inevitable,your doom is inevitable,0.715417
86573,thats really fucked up,thats messed up man,0.999102
497696,were fucking free,we are home free,0.957807
399940,you were really sick and your mother wanted me...,you had been very ill and your mother wanted u...,0.973515
137356,a nice bowl of soup a little blow job from mél...,a good soup and a little something from melani...,0.991525
220875,shit clayton run,clayton run,0.999473


In [44]:
# Get paraphrased sentences from the model
paraphrased = []
for sent in test_sample['Toxic']:
    paraph = paraphrase(sent)
    paraphrased.append(paraph)

In [45]:
paraphrased

[['y', 'the', 'd', 'just', ''],
 ['Our thing is cool and popular and harvard connection sucks',
  'Our thing is cool and popular and harvard connection sucks.',
  'Our thing is cool and popular and harvard connection sucks!',
  'Our thing is cool and popular and harvard connections sucks',
  'our thing is cool and popular and harvard connection sucks'],
 ['i smell a rat.',
  'i smell a rat',
  'i smell a rat!',
  'i smell a rat but nothing yet.',
  "i smell a rat but it's good."],
 ['Können Sie nicht wissen, was Sie gefuckt haben?',
  'Können Sie nicht wissen, was Sie gefuckt hätten?',
  'Können Sie nicht wissen, was Sie gefuckt haben hätte?',
  'Können Sie nicht wissen, was Sie gefuckt hatten?',
  'Können Sie nicht wissen, was Sie gefuckt hätte haben?'],
 ['your destruction destruction is inevitable',
  'your destruction destruction is inevitable.',
  'ura destruction destruction is inevitable',
  'your destruction destruction is inevitable.',
  'La destruction destruction is inevitab

### Appresiate the model

In [46]:
# Load vocabulary
vocabframe = pd.read_csv("bestvocab.csv", index_col=0)
vocabframe

Unnamed: 0,key,translation
0,kissoon,24170
1,ripton,21371
2,loose-jointed,90238
3,skins,7531
4,seena,37526
...,...,...
113967,ﬁve,113966
113968,ﬂoat,113967
113969,ﬂoor,113969
113970,ﬂunkeys,113970


In [47]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\84907\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [48]:
# Data preporation for metric model
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

PUNCT_TO_REMOVE = string.punctuation
ENGLISH_STOPWORDS = set(stopwords.words("english"))

def text_to_tensor(sent):
    sent = sent.lower()
    sent = sent.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
    sent = " ".join([word for word in str(sent).split() if word not in ENGLISH_STOPWORDS])
    sent = word_tokenize(sent)

    words = []
    for word in sent:
        query = list(vocabframe.query("key == @word")['translation'])
        if len(query) > 0:
            words.append(query[0])
    return torch.tensor(words, dtype=torch.int64)

In [50]:
import torch.nn as nn

class TextRegressionModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.dropout = nn.Dropout(0.4)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.linearOut = nn.Linear(hidden_dim, 1)
        self.out = nn.Sigmoid()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        dout = self.dropout(embedded)
        lstm_out, (ht, ct) = self.lstm(dout)
        out = self.linear1(lstm_out)
        out = self.relu(out)
        out = self.linear1(out)
        out = self.relu(out)
        return self.linearOut(out)

In [51]:
# Load metric model
from torch import nn

metric_model = TextRegressionModel(114506, 300, 200)
cpt = torch.load("best.pt")
metric_model.load_state_dict(cpt)

<All keys matched successfully>

In [52]:
def get_offset(sent):
    offset = [0]
    offset.append(sent.size(0))
    offset = torch.tensor(offset[:-1]).cumsum(dim=0)
    return offset

In [53]:
def predict(
    model,
    sent,
    offset
):
    with torch.no_grad():
        model.eval()

        output = model(sent, offset)
        if output.item() > 1:
            score = 1.0
        else: score = output.item()

    return round(score, 4)

In [54]:
# Score paraphpased sentences
predicted = []
pred_scores = []
for set5 in paraphrased:
    best_score = 1.1
    best = -1
    for i, sent in enumerate(set5):
        tokenized = text_to_tensor(sent)
        offset = get_offset(tokenized)
        score = predict(metric_model, tokenized, offset)
        if score < best_score:
            best_score = score
            best = i
    predicted.append(set5[best])
    pred_scores.append(best_score)

In [55]:
input_score = list(test_sample['Tox score'])
inputs = list(test_sample['Toxic'])

In [56]:
# Look at the scores
scores = pd.DataFrame(list(zip(inputs, input_score, pred_scores, predicted)), index=None, columns=['Toxic style', 'Before', 'After', 'Translation'])
scores

Unnamed: 0,Toxic style,Before,After,Translation
0,i threw them all out,0.908439,0.6947,y
1,the background is that our thing is cool and p...,0.986022,0.1955,Our thing is cool and popular and harvard conn...
2,nothing yet but i want you to know i smell a rat,0.957481,0.1968,i smell a rat.
3,could you not appreciate what you fucking had,0.998181,0.6947,"Können Sie nicht wissen, was Sie gefuckt haben?"
4,your destruction is inevitable,0.715417,0.1395,your destruction destruction is inevitable
5,thats really fucked up,0.999102,0.6947,
6,were fucking free,0.957807,0.0263,free
7,you were really sick and your mother wanted me...,0.973515,0.8552,you was really sick and your mother wanted me ...
8,a nice bowl of soup a little blow job from mél...,0.991525,0.0821,a little blow job from mélanie youll see
9,shit clayton run,0.999473,0.2306,shit clayton run
