In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from datasets import load_dataset

# Load the Multi-Dimensional Gender Bias dataset from Hugging Face
dataset = load_dataset("md_gender_bias")

# Prepare data
data = pd.DataFrame(dataset["train"])
X = data["text"]
y = data["labels"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [5]:


# SVM Classifier
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

svm_classifier = SVC()
svm_classifier.fit(X_train_tfidf, y_train)
svm_predictions = svm_classifier.predict(X_test_tfidf)

print("SVM Classifier:")
print("Accuracy:", accuracy_score(y_test, svm_predictions))
print(classification_report(y_test, svm_predictions))



ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.

In [6]:



# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X_train_tfidf, y_train)
rf_predictions = rf_classifier.predict(X_test_tfidf)

print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print(classification_report(y_test, rf_predictions))



ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:


# ANN with Hugging Face Transformers
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

ann_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
ann_predictions = ann_classifier(X_test.tolist(), truncation=True, padding=True)

ann_predictions = [entry['label'] for entry in ann_predictions]

print("Artificial Neural Network (BERT):")
print("Accuracy:", accuracy_score(y_test, ann_predictions))
print(classification_report(y_test, ann_predictions))

# RNN with PyTorch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn, optim
import torch

# Tokenize and encode text data
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoded_texts = tokenizer(list(X_train), padding=True, truncation=True, return_tensors='pt')
X_train_tensors = encoded_texts['input_ids']
X_train_masks = encoded_texts['attention_mask']

encoded_texts = tokenizer(list(X_test), padding=True, truncation=True, return_tensors='pt')
X_test_tensors = encoded_texts['input_ids']
X_test_masks = encoded_texts['attention_mask']

# Create PyTorch DataLoader
batch_size = 32
train_data = TensorDataset(X_train_tensors, X_train_masks, torch.tensor(y_train.values))
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_data = TensorDataset(X_test_tensors, X_test_masks, torch.tensor(y_test.values))
test_loader = DataLoader(test_data, batch_size=batch_size)

# RNN Model
class RNNClassifier(nn.Module):
    def __init__(self):
        super(RNNClassifier, self).__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = outputs.logits
        return output

rnn_model = RNNClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(rnn_model.parameters(), lr=1e-5)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
rnn_model.to(device)
for epoch in range(3):
    rnn_model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = rnn_model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluation
rnn_model.eval()
rnn_predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, _ = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = rnn_model(input_ids, attention_mask)
        predictions = torch.argmax(outputs, dim=1).cpu().numpy()
        rnn_predictions.extend(predictions)

print("Recurrent Neural Network (BERT):")
print("Accuracy:", accuracy_score(y_test, rnn_predictions))
print(classification_report(y_test, rnn_predictions))
