In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import pandas as pd
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
nltk.download('punkt')
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yyxxc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
def preprocess(tweet_series):
    stopwords = nltk.corpus.stopwords.words('english')
    other_exclusions = ["#ff", "ff", "rt"]
    stopwords.extend(other_exclusions)
    tweet_series = tweet_series.astype(str)

    tweet_series = tweet_series.str.replace(r'\s+', ' ', regex=True)
    tweet_series = tweet_series.str.replace(r'@[\w\-]+', '', regex=True)
    tweet_series = tweet_series.str.replace(r'http[s]?://\S+', '', regex=True)

    tweet_series = tweet_series.str.replace(r'[^a-zA-Z]', ' ', regex=True)
    tweet_series = tweet_series.str.replace(r'\s+', ' ', regex=True)
    tweet_series = tweet_series.str.replace(r'^\s+|\s+?$', '', regex=True)
    tweet_series = tweet_series.str.replace(r'\d+(\.\d+)?', 'numbr', regex=True)
    tweet_series = tweet_series.str.lower()

    tokenized = tweet_series.apply(lambda x: x.split())
    # tokenized = tokenized.apply(lambda x: [w for w in x if w not in stopwords])
    # tokenized = tokenized.apply(lambda x: [stemmer.stem(w) for w in x])
    cleaned = tokenized.apply(lambda x: ' '.join(x))
    return cleaned

In [23]:
model_path = "./lora-t5-base-hate-neutral"
tokenizer = AutoTokenizer.from_pretrained(model_path)
base_model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
model = PeftModel.from_pretrained(base_model, model_path)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [15]:
dataset = pd.read_csv("data.csv")
dataset["original_comment"] = preprocess(dataset["original_comment"])
dataset["non_offensive_comment"] = preprocess(dataset["non_offensive_comment"])
df_train, df = train_test_split(dataset, test_size=0.2, random_state=42)

texts = df["original_comment"].tolist()
refs = df["non_offensive_comment"].tolist()

texts_train = df_train["original_comment"].tolist()
refs_train = df_train["non_offensive_comment"].tolist()

In [25]:
print(len(texts), len(refs))
print(len(texts_train), len(refs_train))

922 922
3688 3688


In [24]:
def generate_neutralized(texts, max_length=128):
    inputs = [f"neutralize: {t}" for t in texts]
    encodings = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
    outputs = model.generate(**encodings, max_length=max_length)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

preds = generate_neutralized(texts)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


KeyboardInterrupt: 

In [18]:
original_model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
original_model.eval()
def generate_neutralized_original(texts, max_length=128):
    inputs = [f"neutralize: {t}" for t in texts]
    encodings = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(original_model.device)
    outputs = original_model.generate(**encodings, max_length=max_length)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

preds_original = generate_neutralized_original(texts)

In [19]:
smoothie = SmoothingFunction().method4
bleu_scores = [
    sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
    for pred, ref in zip(preds, refs)
]
print("Avg BLEU:", sum(bleu_scores) / len(bleu_scores))

Avg BLEU: 0.3217395472608178


In [20]:
smoothie = SmoothingFunction().method4

bleu_scores_original = [
    sentence_bleu([ref.split()], pred_original.split(), smoothing_function=smoothie)
    for pred_original, ref in zip(preds_original, refs)
]

print("Pretrain BLEU:", sum(bleu_scores_original) / len(bleu_scores_original))

Pretrain BLEU: 0.2129090388525372


In [21]:
P, R, F1 = score(preds_original, refs, lang="en", verbose=True)
print("BERTScore F1 of Pretrained T5 model:", F1.mean().item())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 29/29 [06:15<00:00, 12.94s/it]


computing greedy matching.


100%|██████████| 15/15 [00:00<00:00, 33.36it/s]


done in 375.82 seconds, 2.45 sentences/sec
BERTScore F1 of Pretrained T5 model: 0.8507360816001892


In [22]:
P, R, F1 = score(preds, refs, lang="en", verbose=True)
print("BERTScore F1:", F1.mean().item())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 29/29 [03:15<00:00,  6.74s/it]


computing greedy matching.


100%|██████████| 15/15 [00:00<00:00, 54.81it/s]


done in 195.81 seconds, 4.71 sentences/sec
BERTScore F1: 0.9093093872070312


In [13]:
for i in range(len(preds)):
    print(f"Original: {texts[i]}")
    print(f"Reference: {refs[i]}")
    print(f"Generated: {preds[i]}")
    print("=============")

Original: rt this bitch feet been through years of slavery
Reference: rt this person feet been through years of hardship
Generated: rt this person feet been through years of slavery
Original: son why is our mail lady so ghetto
Reference: son why is our mail lady so unconventional
Generated: son why is our mail lady so ghetto
Original: the hoes finna be like owieee trey august and chris brown can have my baby at the same time lmfaooooooo hoes ain t loyal
Reference: the individual finna be like owieee trey august and chris brown can have my baby at the same time lmfaooooooo individual ain t loyal
Generated: the people finna be like owieee trey august and chris brown can have my baby at the same time lmfaooooooo people ain t loyal
Original: rt you gotta be a really weak bitch to let somebody s tweets offend you especially if you never met the person
Reference: rt you gotta be a really vulnerable person to let somebody s tweets offend you especially if you never met the person
Generated: r