In [14]:
import os
import re
from functools import partial
from bs4 import BeautifulSoup
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
import torch
import torch.nn as nn

In [15]:
class CustomDataset(Dataset):
    """Text-classification dataset read from a ``root_dir/<label>/<file>`` tree.

    Every sub-directory of ``root_dir`` is one class and its name must parse as
    an integer label. Each regular file inside it is read as UTF-8, passed
    through ``text_preprocessing`` and kept in memory as a ``(text, label)``
    pair. Hidden entries (names starting with ``.``) and entries of the wrong
    kind (files next to the label directories, directories next to the text
    files) are ignored so that artefacts such as ``.DS_Store`` or
    ``.ipynb_checkpoints`` do not abort loading.

    Args:
        root_dir: Directory containing one sub-directory per integer label.
        tokenizer: Object exposing ``encode_plus`` (used in ``__getitem__``).
        max_len: Length every encoded sample is padded/truncated to.

    Raises:
        ValueError: If a (non-hidden) label directory name is not an integer.
    """

    def __init__(self, root_dir, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.samples = []

        # os.listdir() returns entries in arbitrary order; sorting keeps sample
        # indices stable between runs and machines.
        for label in sorted(os.listdir(root_dir)):
            label_dir = os.path.join(root_dir, label)
            if label.startswith('.') or not os.path.isdir(label_dir):
                continue
            label_id = int(label)
            for filename in sorted(os.listdir(label_dir)):
                file_path = os.path.join(label_dir, filename)
                if filename.startswith('.') or not os.path.isfile(file_path):
                    continue
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                text = text_preprocessing([text])[0]  # Apply text preprocessing
                self.samples.append((text, label_id))

    def __len__(self):
        """Return the number of loaded ``(text, label)`` samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Encode sample ``idx`` and return it as a dict.

        Returns:
            dict with keys ``'text'`` (the preprocessed string),
            ``'input_ids'`` and ``'attention_mask'`` (flattened tokenizer
            outputs) and ``'label'`` (a ``torch.long`` scalar tensor).
        """
        text, label = self.samples[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [16]:
# Define text preprocessing functions
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The substitutions are applied one after another in the order listed below:
    the two irregular forms first, then the generic suffix rules.
    """
    rewrite_rules = (
        # irregular forms
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # generic suffixes
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = phrase
    for pattern, expansion in rewrite_rules:
        expanded = re.sub(pattern, expansion, expanded)
    return expanded

# Modify this path accordingly
SLANG_PATH = "static/slang.txt"

# Each non-blank line is "<slang>\t<expansion>". A line without a tab maps the
# whole line to the empty string. The file is read as UTF-8 explicitly so the
# table does not depend on the platform's default encoding (the dataset files
# are read as UTF-8 as well).
with open(SLANG_PATH, encoding="utf-8") as file:
    slang_map = dict(map(str.strip, line.partition('\t')[::2])
                     for line in file if line.strip())

# Longest terms first so that, among alternatives, longer slang terms are
# tried before shorter ones.
slang_words = sorted(slang_map, key=len, reverse=True)
if slang_words:
    regex = re.compile(r"\b({})\b".format("|".join(map(re.escape, slang_words))))
else:
    # An empty alternation would match the empty string at every word boundary
    # and the replacement callback would then raise KeyError(''). Fall back to
    # a pattern that never matches so replaceSlang becomes a no-op.
    regex = re.compile(r"(?!)")
# replaceSlang(text) -> text with every whole-word slang term replaced by its expansion
replaceSlang = partial(regex.sub, lambda m: slang_map[m.group(1)])

def text_preprocessing(final):
    """Clean every raw text in ``final`` and return the cleaned strings in order.

    Steps applied to each text:
      1. strip markup with BeautifulSoup and keep only the text content,
      2. replace slang terms via ``replaceSlang``,
      3. expand contractions via ``decontracted``,
      4. remove every whitespace-delimited token that contains a digit,
      5. replace each run of non-ASCII-letter characters with a single space.

    Args:
        final: Iterable of raw strings.

    Returns:
        list[str]: One cleaned, stripped string per input element.
    """
    preprocessed_text = []
    for sentence in final:
        sentence = BeautifulSoup(sentence, 'lxml').get_text()
        sentence = replaceSlang(sentence)
        sentence = decontracted(sentence)
        # Raw strings: "\S" and "\d" are invalid escapes in a normal literal
        # and trigger a warning on current Python versions.
        sentence = re.sub(r"\S*\d\S*", "", sentence).strip()
        sentence = re.sub(r'[^A-Za-z]+', ' ', sentence)
        preprocessed_text.append(sentence.strip())
    return preprocessed_text

# Tokenizer for the 'bert-base-uncased' checkpoint; shared by the datasets and
# by predict_text_hatefulness.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [17]:
# Load the dataset
root_dir = "/home/lenovo/Documents/Computer Vision/streamlit-hatefulmemedection-main/dataset_hate/train/texts"

dataset = CustomDataset(root_dir, tokenizer, max_len=128)

# Split dataset into training and testing sets (80/20).
# A seeded generator makes the split identical after a kernel restart, so the
# reported validation accuracy is comparable between runs.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Set up model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Custom training parameters
batch_size = 16
epochs = 3
learning_rate = 2e-5

# Create dataloaders for training and testing sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
# Use AdamW from torch.optim: the AdamW class exported by transformers is
# deprecated. NOTE(review): torch.optim.AdamW's defaults (e.g. weight_decay)
# are not the same as the transformers implementation's - confirm they suit
# this fine-tuning run.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

  sentence = BeautifulSoup(sentence, 'lxml').get_text()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:

# Training loop
# Feed batches to whatever device the model's weights live on, so the cell
# keeps working if the model has been moved with model.to(...).
device = next(model.parameters()).device

for epoch in range(epochs):
    # from_pretrained() hands the model back in eval mode (dropout disabled);
    # switch to training mode explicitly before optimising.
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # The model returns an output object; the loss needs its logits tensor
        logits = outputs.logits
        loss = criterion(logits, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Loss is the sum of the per-batch losses over the epoch
    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss}')

# Evaluation loop (held-out 20% split)
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted = logits.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# An empty split would otherwise raise ZeroDivisionError
accuracy = correct / total if total else float('nan')
print(f'Accuracy on cross validation set: {accuracy}')


  sentence = BeautifulSoup(sentence, 'lxml').get_text()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: cross_entropy_loss(): argument 'input' (position 1) must be Tensor, not SequenceClassifierOutput

In [3]:
# Serialise the full model object (not just a state_dict); a later cell reads
# it back with torch.load. Requires `model` to exist in the current kernel,
# i.e. the setup/training cells must have been run first.
torch.save(model, "bert_hate_classification_model.pth")

print("Model saved successfully.")

NameError: name 'model' is not defined

In [18]:
# The checkpoint was written with torch.save(model, ...), i.e. it is a pickled
# model object rather than a state_dict. Recent PyTorch releases default
# torch.load to weights_only=True, which refuses such files, so full
# unpickling is requested explicitly.
# SECURITY: unpickling can execute arbitrary code - only load checkpoints that
# you created yourself.
# map_location="cpu" lets the file load on machines without the device it was
# saved from; the loops below feed CPU tensors.
model = torch.load(
    "bert_hate_classification_model.pth",
    map_location="cpu",
    weights_only=False,
)

In [19]:
test_root_dir="/home/lenovo/Documents/Computer Vision/streamlit-hatefulmemedection-main/dataset_hate/test/texts"
dataset_test=CustomDataset(test_root_dir, tokenizer, max_len=128)
# batch_size comes from the training-setup cell
final_test_loader=DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

# Test loop
model.eval()
# Feed batches to the device the model's weights live on
device = next(model.parameters()).device
correct = 0
total = 0
with torch.no_grad():
    for batch in final_test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted = logits.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# An empty test directory would otherwise raise ZeroDivisionError
accuracy = correct / total if total else float('nan')
print(f'Accuracy on test set: {accuracy}')

  sentence = BeautifulSoup(sentence, 'lxml').get_text()


Accuracy on test set: 0.5454545454545454


In [21]:
def predict_text_hatefulness(model, tokenizer, text_file_path, max_len=128):
    """Classify the text stored in ``text_file_path``.

    The file is read as UTF-8, cleaned with ``text_preprocessing``, tokenized
    and passed through ``model``. Inference always runs in eval mode (dropout
    off); the model's previous train/eval mode is restored afterwards.

    Args:
        model: ``torch.nn.Module`` whose output exposes ``.logits``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        text_file_path: Path of the text file to classify.
        max_len: Maximum token length passed to the tokenizer (default 128).

    Returns:
        tuple: ``(predicted_class, confidence)`` - the argmax class index and
        its softmax probability as a Python float.
    """
    # Read text from file
    with open(text_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Preprocess the text
    text = text_preprocessing([text])[0]

    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", max_length=max_len, truncation=True, padding=True)

    # Run on the device the model's weights live on (CPU if it has none)
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Perform inference with dropout disabled, then put the model back into
    # whatever mode the caller had it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probabilities = nn.functional.softmax(logits, dim=1)
            predicted_class = torch.argmax(probabilities, dim=1).item()
            print(probabilities)
    finally:
        model.train(was_training)

    return predicted_class, probabilities[0][predicted_class].item()

# Display names, indexed by the predicted class id
class_names = ['Not Hateful', 'Hateful']

# Sample file to classify
text_file_path = "/home/lenovo/Documents/Computer Vision/streamlit-hatefulmemedection-main/dataset_hate/test/texts/1/26910.txt"

# Classify the sample; the helper returns (class index, softmax probability)
predicted_class, confidence = predict_text_hatefulness(
    model=model,
    tokenizer=tokenizer,
    text_file_path=text_file_path,
)

# Report the verdict
print(f'Text is {class_names[predicted_class]} with confidence {confidence:.2f}')


tensor([[0.8940, 0.1060]])
Text is Not Hateful with confidence 0.89
