In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datasets import Dataset
import numpy as np

In [None]:
# Load the raw comment dataset from a local CSV file (absolute, machine-specific path)
# and preview its first rows as the cell output.
data = pd.read_csv("C:/Users/ADMIN PC/Desktop/Comment/Comment_dataset.csv")
data.head()

In [None]:
# Encode the gender label as an integer: 'Male' -> 0, 'Female' -> 1
# (any other value becomes NaN, as .map() does for unmapped keys).
# The dtype guard keeps this cell idempotent: mapping an already-encoded column
# would look up 0/1 in the dictionary and turn every label into NaN.
if not pd.api.types.is_numeric_dtype(data['user_gender']):
    data['user_gender'] = data['user_gender'].map({'Male' : 0, 'Female' : 1})

In [None]:
# Hold out 20% of the rows for validation; random_state pins the shuffle so the split is repeatable.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Wrap both pandas splits as Hugging Face Dataset objects (train first, then validation)
train_dataset, val_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train_data, val_data)
)

In [None]:
# Initialize BERT tokenizer
# Uses the pretrained cased BERT vocabulary ('bert-base-cased'); the progress bars
# printed below this cell come from fetching the tokenizer files.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "comment" field of a batch with the notebook-level tokenizer.

    Every sequence is padded or truncated to exactly 128 tokens.
    """
    comments = examples["comment"]
    return tokenizer(
        comments,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Apply the tokenizer batch-wise to both splits (train first, then validation)
train_dataset, val_dataset = (
    split_dataset.map(tokenize_function, batched=True)
    for split_dataset in (train_dataset, val_dataset)
)

In [None]:
# Set the format for PyTorch
# Only the model inputs and the label column are exposed, returned as torch tensors.
torch_columns = ['input_ids', 'attention_mask', 'user_gender']
for split_dataset in (train_dataset, val_dataset):
    split_dataset.set_format(type='torch', columns=list(torch_columns))

In [None]:
#  DataLoaders
# Both loaders yield batches of 16; only the training loader is asked to shuffle.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [None]:
# Load the BERT model
# Pretrained cased BERT with a sequence-classification head sized for 2 labels.
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
def load_model():
    """Load the saved classifier and its tokenizer from local directories.

    The model is moved to the notebook-level ``device`` and put in eval mode.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    model_dir = "C:/Users/ADMIN PC/Desktop/Comment/BERT/bert"
    tokenizer_dir = "C:/Users/ADMIN PC/Desktop/Comment/BERT/token"

    classifier = BertForSequenceClassification.from_pretrained(model_dir)
    saved_tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

    classifier.to(device)
    classifier.eval()
    return classifier, saved_tokenizer

In [None]:
# Rebind the notebook-level `model` and `tokenizer` to the locally saved versions;
# this replaces the pretrained objects created in the earlier cells.
model, tokenizer = load_model()

In [None]:
# Predict gender function
def predict_male_or_female(comment, model, tokenizer, device):
    """Classify a single comment as "Male" or "Female".

    Args:
        comment: Text to classify. It must encode to a one-row batch, because
            the predicted class is read back with ``.item()``.
        model: Classifier whose output exposes ``.logits``.
        tokenizer: Callable that returns "input_ids" and "attention_mask" tensors.
        device: Torch device the input tensors are moved to.

    Returns:
        str: "Male" when the arg-max class is 0, otherwise "Female".
    """
    model.eval()
    encoded = tokenizer(
        comment,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    ids = encoded["input_ids"].to(device)
    mask = encoded["attention_mask"].to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(ids, mask).logits
        predicted_class = torch.argmax(logits, dim=1).cpu().item()

    if predicted_class == 0:
        return "Male"
    return "Female"

In [None]:
# Predict gender probability function
def predict_gender_prob(text, model, tokenizer, device, max_length=128):
    """Return the softmax class probabilities for the given text.

    Args:
        text: Input handed straight to ``tokenizer``.
        model: Classifier whose output exposes ``.logits``.
        tokenizer: Callable that returns "input_ids" and "attention_mask" tensors.
        device: Torch device the input tensors are moved to.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        numpy.ndarray: Softmax over dimension 1 of the logits, moved to the CPU.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(ids, mask).logits
        class_probs = torch.softmax(logits, dim=1)

    return class_probs.cpu().numpy()

In [None]:
# Predict instance function
def predict_instance(text):
    """Single-argument wrapper around predict_gender_prob.

    Uses the notebook-level ``model``, ``tokenizer`` and ``device``; this is the
    classifier function passed to the LIME explainer.
    """
    return predict_gender_prob(text, model, tokenizer, device)

In [None]:
from lime.lime_text import LimeTextExplainer, IndexedString
import numpy as np
from operator import itemgetter
from tqdm import tqdm
import csv

In [None]:
# LIME explainer
# class_names follows the label encoding used throughout this notebook
# (index 0 = Male, index 1 = Female); random_state is fixed at 42.
class_names = ['Male', 'Female']
explainer = LimeTextExplainer(class_names=class_names, random_state=42)

In [None]:
# Sort tuples array by second item
def sort_tuples_array_by_second_item(tuples):
    """Return a new list of the given tuples in ascending order of their element at index 1.

    The input is not modified, and entries with equal keys keep their relative order.
    """
    ordered = list(tuples)
    ordered.sort(key=lambda entry: entry[1])
    return ordered

In [None]:
# Get max explained words function (and its bookkeeping helper)
def _record_explained_word(entry, label):
    """Store one ``(word, score, feature_id)`` entry under the key ``(word, label)``.

    A key that is already in ``words`` gets the score appended and its position
    overwritten. A new key is created in ``words`` and, only then, one
    ``[word, label, score]`` row is appended to ``wordsForCSV``.
    """
    word, score, feature_id = entry
    key = (word, label)
    if key in words:
        words[key]['lime_score'].append(score)
        words[key]['position'] = feature_id
    else:
        words[key] = {'lime_score': [score], 'position': feature_id}
        wordsForCSV.append([word, label, score])


def get_max_explained_words(txt, explainer_num_samples=100):
    """Record the two words that most support the predicted gender of ``txt``.

    The comment is classified, explained with LIME, and the two strongest words
    pointing towards the predicted class are stored. Nothing is stored when
    fewer than two words point towards the predicted class.

    Relies on notebook-level names that must exist before the call: ``model``,
    ``tokenizer``, ``device``, ``explainer``, ``predict_instance`` and the
    accumulators ``words`` (dict) and ``wordsForCSV`` (list), which are
    mutated in place.

    Args:
        txt: Comment text to explain.
        explainer_num_samples: ``num_samples`` passed to ``explain_instance``.

    Returns:
        tuple: The notebook-level ``(words, wordsForCSV)`` accumulators.
    """
    # load_original_data() turns missing comments into "", and a blank text has
    # no words to attribute a score to, so skip it without touching the model.
    if not txt or not txt.strip():
        return words, wordsForCSV

    prediction = predict_male_or_female(txt, model, tokenizer, device)
    prediction_label = 0 if prediction == "Male" else 1

    exp = explainer.explain_instance(txt, predict_instance, num_samples=explainer_num_samples)
    # Pair each (word, score) from as_list() with the first element of the
    # matching local_exp[1] entry (presumably LIME's feature index - TODO confirm);
    # it is what ends up stored under 'position'.
    exp_list = [
        (word, score, feature_id)
        for (feature_id, _), (word, score) in zip(exp.local_exp[1], exp.as_list())
    ]

    if prediction_label == 0:
        # Features with negative score are for Male class; ascending order puts
        # the most negative (strongest) ones first.
        supporting = sort_tuples_array_by_second_item(
            [entry for entry in exp_list if entry[1] < 0]
        )
    else:
        # Features with positive score are for Female class; reverse the
        # ascending order so the largest (strongest) ones come first.
        supporting = sort_tuples_array_by_second_item(
            [entry for entry in exp_list if entry[1] > 0]
        )[::-1]

    # Only record when at least two supporting words exist; then keep the top two.
    if len(supporting) > 1:
        for entry in supporting[:2]:
            _record_explained_word(entry, prediction_label)

    return words, wordsForCSV

In [None]:
# Load data from CSV
def load_original_data(data_file):
    """Read the comment CSV and return its comments and integer gender labels.

    Missing values in every column are replaced with "" before extraction.

    Args:
        data_file: Path of a CSV file with "comment" and "user_gender" columns.

    Returns:
        tuple: ``(comments, genders)`` as two lists; a gender is 0 for "Male"
        and 1 for any other value (including the "" used for missing entries).
    """
    frame = pd.read_csv(data_file).fillna("")
    comment_list = frame['comment'].tolist()
    gender_codes = [0 if gender == "Male" else 1 for gender in frame['user_gender'].tolist()]
    return comment_list, gender_codes

In [None]:
# Load original data
# Re-reads the same CSV as the first data cell, this time as plain Python lists
# of comments and 0/1 gender labels.
original_comments, original_genders = load_original_data('C:/Users/ADMIN PC/Desktop/Comment/Comment_dataset.csv')

In [None]:
# Split the comments (and their labels) into exactly 7 contiguous chunks, one per
# processing cell below (x[0] .. x[6]).
# Fixed-step slicing with step len // 7 would leave a leftover 8th chunk that no cell
# processes whenever the size is not a multiple of 7, and a zero step for fewer than 7 rows.
num_chunks = 7
n, remainder = divmod(len(original_comments), num_chunks)  # n is the base chunk size
# The first `remainder` chunks take one extra item so that every row lands in a chunk.
chunk_bounds = [i * n + min(i, remainder) for i in range(num_chunks + 1)]
x = [original_comments[lo:hi] for lo, hi in zip(chunk_bounds, chunk_bounds[1:])]
y = [original_genders[lo:hi] for lo, hi in zip(chunk_bounds, chunk_bounds[1:])]

In [None]:
# Initialize variables
# These names are read and mutated as globals by get_max_explained_words,
# so this cell has to run before any of the processing cells.
header = ["word", "label", "limescore"]  # column names of the output CSV
file_path = 'C:/Users/ADMIN PC/Desktop/Comment/BERT/file.csv'  # output CSV location
words = {}  # (word, label) -> {'lime_score': [scores...], 'position': ...}
wordsForCSV = []  # one [word, label, score] row per first-seen (word, label)

In [None]:
# Process each chunk of comments
# Chunk 0: explain every comment, then start a fresh CSV containing the header and the rows so far.
for comment in tqdm(x[0]):
    words, wordsForCSV = get_max_explained_words(comment)

# Write results to CSV
with open(file_path, mode='w', encoding='UTF8', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(header)
    csv_writer.writerows(wordsForCSV)

100%|██████████| 27157/27157 [2:56:54<00:00,  2.56it/s]  


In [None]:
# Chunk 1. wordsForCSV keeps accumulating across chunks, so remember how many rows
# are already in the CSV and append only the rows this chunk adds; appending the
# whole list would write every earlier row a second time.
rows_already_written = len(wordsForCSV)
for comment in tqdm(x[1], total=len(x[1])):
    words, wordsForCSV = get_max_explained_words(comment)

# Append results to the existing CSV file
with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(wordsForCSV[rows_already_written:])

100%|██████████| 27157/27157 [2:55:33<00:00,  2.58it/s]  


In [None]:
# Chunk 2. wordsForCSV keeps accumulating across chunks, so remember how many rows
# are already in the CSV and append only the rows this chunk adds; appending the
# whole list would write every earlier row again.
rows_already_written = len(wordsForCSV)
for comment in tqdm(x[2], total=len(x[2])):
    words, wordsForCSV = get_max_explained_words(comment)

# Append results to the existing CSV file
with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(wordsForCSV[rows_already_written:])

100%|██████████| 27157/27157 [3:04:05<00:00,  2.46it/s]  


In [None]:
# Chunk 3. wordsForCSV keeps accumulating across chunks, so remember how many rows
# are already in the CSV and append only the rows this chunk adds; appending the
# whole list would write every earlier row again.
rows_already_written = len(wordsForCSV)
for comment in tqdm(x[3], total=len(x[3])):
    words, wordsForCSV = get_max_explained_words(comment)

# Append results to the existing CSV file
with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(wordsForCSV[rows_already_written:])

100%|██████████| 27157/27157 [2:58:03<00:00,  2.54it/s]  


In [None]:
# Chunk 4. wordsForCSV keeps accumulating across chunks, so remember how many rows
# are already in the CSV and append only the rows this chunk adds; appending the
# whole list would write every earlier row again.
rows_already_written = len(wordsForCSV)
for comment in tqdm(x[4], total=len(x[4])):
    words, wordsForCSV = get_max_explained_words(comment)

# Append results to the existing CSV file
with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(wordsForCSV[rows_already_written:])

100%|██████████| 27157/27157 [2:57:09<00:00,  2.55it/s]  


In [None]:
# Chunk 5. wordsForCSV keeps accumulating across chunks, so remember how many rows
# are already in the CSV and append only the rows this chunk adds; appending the
# whole list would write every earlier row again.
rows_already_written = len(wordsForCSV)
for comment in tqdm(x[5], total=len(x[5])):
    words, wordsForCSV = get_max_explained_words(comment)

# Append results to the existing CSV file
with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(wordsForCSV[rows_already_written:])

100%|██████████| 27157/27157 [3:00:33<00:00,  2.51it/s]  


In [None]:
# Chunk 6. wordsForCSV keeps accumulating across chunks, so remember how many rows
# are already in the CSV and append only the rows this chunk adds; appending the
# whole list would write every earlier row again.
rows_already_written = len(wordsForCSV)
for comment in tqdm(x[6], total=len(x[6])):
    words, wordsForCSV = get_max_explained_words(comment)

# Append results to the existing CSV file
with open(file_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(wordsForCSV[rows_already_written:])

100%|██████████| 27157/27157 [3:01:31<00:00,  2.49it/s]  
