In [1]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import BitsAndBytesConfig
import pandas as pd
from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
from nltk.stem.porter import *
import regex as re
import nltk

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
dataset = pd.read_csv('data.csv')
tweet = dataset['original_comment']
neutralized_tweet = dataset['non_offensive_comment']

In [None]:
def preprocess(tweet_series):
    stopwords = nltk.corpus.stopwords.words('english')
    other_exclusions = ["#ff", "ff", "rt"]
    stopwords.extend(other_exclusions)
    stemmer = PorterStemmer()
    tweet_series = tweet_series.astype(str)

    tweet_series = tweet_series.str.replace(r'\s+', ' ', regex=True)
    tweet_series = tweet_series.str.replace(r'@[\w\-]+', '', regex=True)
    tweet_series = tweet_series.str.replace(r'http[s]?://\S+', '', regex=True)

    tweet_series = tweet_series.str.replace(r'[^a-zA-Z]', ' ', regex=True)
    tweet_series = tweet_series.str.replace(r'\s+', ' ', regex=True)
    tweet_series = tweet_series.str.replace(r'^\s+|\s+?$', '', regex=True)
    tweet_series = tweet_series.str.replace(r'\d+(\.\d+)?', 'numbr', regex=True)
    tweet_series = tweet_series.str.lower()

    tokenized = tweet_series.apply(lambda x: x.split())
    # tokenized = tokenized.apply(lambda x: [w for w in x if w not in stopwords])
    # tokenized = tokenized.apply(lambda x: [stemmer.stem(w) for w in x])
    cleaned = tokenized.apply(lambda x: ' '.join(x))
    return cleaned

neutralized_tweet = preprocess(neutralized_tweet)  
original_tweet = preprocess(tweet)

In [7]:
for i in range(len(neutralized_tweet)):
    print(f"Original: {original_tweet[i]}")
    print(f"Neutralized: {neutralized_tweet[i]}")
    print("==="*10)

Original: rt as a woman you shouldn t complain about cleaning up your house amp as a man you should always take the trash out
Neutralized: rt as a woman you should feel comfortable maintaining a clean home amp as a man contributing by taking the trash out is always appreciated
Original: rt boy dats cold tyga dwn bad for cuffin dat hoe in the st place
Neutralized: rt boy that s unexpected tyga made an interesting choice in the first place
Original: rt dawg rt you ever fuck a bitch and she start to cry you be confused as shit
Neutralized: rt dawg rt you ever be in an emotional moment with someone and they start to cry it can be really confusing
Original: rt she look like a tranny
Neutralized: rt she looks very different in her style
Original: rt the shit you hear about me might be true or it might be faker than the bitch who told it to ya
Neutralized: rt the things you hear about me might be true or they might be completely untrue depending on the source who told it to you
Original: the 

In [40]:
offensive_dataset = dataset[dataset["class"] != 2].sample(n=2000, random_state=42)
offensive_dataset.to_csv("hate_neutral.csv", index=False, columns=["processed_tweets", "class"])

In [41]:
offensive_dataset = dataset[dataset["class"] != 2].sample(n=400)
offensive_dataset.to_csv("hate_neutral_eval.csv", index=False, columns=["processed_tweets", "class"])

In [12]:
model_name = "google-t5/t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# --- Apply LoRA ---
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


In [None]:
# --- Tokenize Dataset ---
def tokenize(example):
    tokens = tokenizer(
        example["input_text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

dataset = Dataset.from_pandas(pd.DataFrame(dataset))
tokenized_dataset = dataset.map(tokenize, batched=True)

In [13]:
training_args = TrainingArguments(
    output_dir="lora-t5-small",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=2,
    logging_dir="logs",
    save_strategy="epoch",
    logging_steps=10,
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

NameError: name 'tokenized_dataset' is not defined

In [None]:
trainer.train()

In [None]:
model.save_pretrained("lora-t5small-hate-neutral")
tokenizer.save_pretrained("lora-t5small-hate-neutral")