In [22]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import BitsAndBytesConfig
import pandas as pd
from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
from nltk.stem.porter import *
import regex as re
import nltk

In [19]:
dataset = pd.read_csv('HateSpeechData.csv')
tweet = dataset['tweet']

In [33]:
stopwords = nltk.corpus.stopwords.words('english')
other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)
stemmer = PorterStemmer()

def preprocess(tweet_series):  
    tweet_series = tweet_series.astype(str)

    tweet_series = tweet_series.str.replace(r'\s+', ' ', regex=True)
    tweet_series = tweet_series.str.replace(r'@[\w\-]+', '', regex=True)
    tweet_series = tweet_series.str.replace(r'http[s]?://\S+', '', regex=True)

    tweet_series = tweet_series.str.replace(r'[^a-zA-Z]', ' ', regex=True)
    tweet_series = tweet_series.str.replace(r'\s+', ' ', regex=True)
    tweet_series = tweet_series.str.replace(r'^\s+|\s+?$', '', regex=True)
    tweet_series = tweet_series.str.replace(r'\d+(\.\d+)?', 'numbr', regex=True)
    tweet_series = tweet_series.str.lower()

    tokenized = tweet_series.apply(lambda x: x.split())
    # tokenized = tokenized.apply(lambda x: [w for w in x if w not in stopwords])
    # tokenized = tokenized.apply(lambda x: [stemmer.stem(w) for w in x])
    cleaned = tokenized.apply(lambda x: ' '.join(x))
    return cleaned

processed_tweets = preprocess(dataset['tweet'])   

dataset['processed_tweets'] = processed_tweets

In [34]:
print(dataset[["processed_tweets"]].head(10))

                                    processed_tweets
0  rt as a woman you shouldn t complain about cle...
1  rt boy dats cold tyga dwn bad for cuffin dat h...
2  rt dawg rt you ever fuck a bitch and she start...
3                          rt she look like a tranny
4  rt the shit you hear about me might be true or...
5  the shit just blows me claim you so faithful a...
6  i can not just sit up and hate on another bitc...
7  cause i m tired of you big bitches coming for ...
8  amp you might not get ya bitch back amp thats ...
9              hobbies include fighting mariam bitch


In [40]:
offensive_dataset = dataset[dataset["class"] != 2].sample(n=2000, random_state=42)
offensive_dataset.to_csv("hate_neutral.csv", index=False, columns=["processed_tweets", "class"])

In [41]:
offensive_dataset = dataset[dataset["class"] != 2].sample(n=400)
offensive_dataset.to_csv("hate_neutral_eval.csv", index=False, columns=["processed_tweets", "class"])

In [12]:
model_name = "google-t5/t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# --- Apply LoRA ---
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


In [None]:
# --- Tokenize Dataset ---
def tokenize(example):
    tokens = tokenizer(
        example["input_text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

dataset = Dataset.from_pandas(pd.DataFrame(dataset))
tokenized_dataset = dataset.map(tokenize, batched=True)

In [13]:
training_args = TrainingArguments(
    output_dir="lora-t5-small",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=2,
    logging_dir="logs",
    save_strategy="epoch",
    logging_steps=10,
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

NameError: name 'tokenized_dataset' is not defined

In [None]:
trainer.train()

In [None]:
model.save_pretrained("lora-t5small-hate-neutral")
tokenizer.save_pretrained("lora-t5small-hate-neutral")