In [27]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# Load data
data = pd.read_csv('HateSpeechDatasetBalanced.csv')

# Randomly sample 2000 rows from the dataset
data = data.sample(n=10000, random_state=42).reset_index(drop=True)
# Function to remove HTML tags
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

# Function to remove special characters and digits
def remove_special_chars_and_digits(text):
    text = re.sub(r'[^A-Za-z\s]', '', text)
    return text

data['Content'] = data['Content'].apply(remove_html_tags)
data['Content'] = data['Content'].apply(remove_special_chars_and_digits)


In [28]:
from symspellpy import SymSpell, Verbosity
import pkg_resources

# Function to correct spellings
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
term_index = 0  # column of the term names in the dictionary text file
count_index = 1   # column of the term frequencies in the dictionary text file
if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
    print("Dictionary file not found")
    
def correct_spellings(text):
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    if suggestions:
        return suggestions[0].term
    else:
        return text
    
data['Content'] = data['Content'].apply(correct_spellings)

#print(data['Content'].sample(5))


In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
for name, param in model.named_parameters():
    param.requires_grad = False

# Unfreeze the last two layers
for name, param in model.named_parameters():
    
    if 'transformer.layer.5' in name or 'classifier' in name:
        param.requires_grad = True

In [31]:
from datasets import Dataset

# Convert to a Hugging Face Dataset
dataset = Dataset.from_pandas(data)

In [32]:
def tokenize_function(examples):
    return tokenizer(examples['Content'], padding="max_length", truncation=True,max_length=512)

# Apply tokenization
dataset = dataset.map(tokenize_function, batched=True)

# Rename columns to match what the model expects
dataset = dataset.rename_column("Label", "labels")


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [33]:
# Split the dataset
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']


In [34]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',  # Set evaluation strategy to 'epoch'
    save_strategy='epoch',  # Set save strategy to 'epoch'
    metric_for_best_model='accuracy',
    prediction_loss_only=False,
    report_to='none',
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    lr_scheduler_type='cosine',
    gradient_accumulation_steps=2
)

In [35]:
from transformers import Trainer

# Accuracy Calculation Function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

# Training
trainer.train()

# Saving the best model
trainer.save_model("best-fine-tuned-distilbert-hate-speech")
tokenizer.save_pretrained("best-fine-tuned-distilbert-hate-speech")

# Evaluation
eval_result = trainer.evaluate()
'''
# Ensure the correct key for accuracy
if 'eval_accuracy' in eval_result:
    print(f"Validation Accuracy: {eval_result['eval_accuracy']}")
elif 'accuracy' in eval_result:
    print(f"Validation Accuracy: {eval_result['accuracy']}")
else:
    print("Validation accuracy not found in evaluation results.")
'''
print(eval_result)



  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6285097002983093, 'eval_accuracy': 1.0, 'eval_runtime': 2.1118, 'eval_samples_per_second': 0.474, 'eval_steps_per_second': 0.474, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6485671997070312, 'eval_accuracy': 1.0, 'eval_runtime': 1.6291, 'eval_samples_per_second': 0.614, 'eval_steps_per_second': 0.614, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6630244851112366, 'eval_accuracy': 1.0, 'eval_runtime': 1.4707, 'eval_samples_per_second': 0.68, 'eval_steps_per_second': 0.68, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6706757545471191, 'eval_accuracy': 1.0, 'eval_runtime': 1.9308, 'eval_samples_per_second': 0.518, 'eval_steps_per_second': 0.518, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6728020310401917, 'eval_accuracy': 1.0, 'eval_runtime': 1.7057, 'eval_samples_per_second': 0.586, 'eval_steps_per_second': 0.586, 'epoch': 5.0}
{'train_runtime': 80.6911, 'train_samples_per_second': 0.248, 'train_steps_per_second': 0.062, 'train_loss': 0.3475176334381104, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6728020310401917, 'eval_accuracy': 1.0, 'eval_runtime': 1.4353, 'eval_samples_per_second': 0.697, 'eval_steps_per_second': 0.697, 'epoch': 5.0}


In [38]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Create a pipeline for text classification with custom labels
# Load the pre-trained model and tokenizer from a saved location

model_path = 'best-fine-tuned-distilbert-hate-speech'

tokenizer_path = 'best-fine-tuned-distilbert-hate-speech'
# Define label mapping
label_map = {0: 'non-hateful', 1: 'hateful'}
model = AutoModelForSequenceClassification.from_pretrained(model_path)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)


classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Test the pipeline

sentence = "Shut up and stop wasting everyone’s time with your pathetic excuses."

def preprocess_sentence(sentence):
    sentence = sentence.apply(remove_html_tags)
    sentence = sentence.apply(remove_special_chars_and_digits)
    sentence = sentence.apply(correct_spellings)
    return sentence
sentence=pd.DataFrame({'Content':[sentence]})
sentence=preprocess_sentence(sentence['Content'])
results = classifier(sentence.tolist())

# Map the predicted label to human-readable label
for result in results:
    label_id = int(result['label'].split('_')[-1])  # Extract the label ID
    human_readable_label = label_map[label_id]
    print(f"Predicted class: {human_readable_label}, Score: {result['score']}")


  return torch.load(checkpoint_file, map_location="cpu")


Predicted class: non-hateful, Score: 0.5174718499183655
