In [None]:
!pip install transformers
!pip install lime

In [None]:
import torch
# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from lime.lime_text import LimeTextExplainer
import numpy as np
from operator import itemgetter
from tqdm import tqdm

In [None]:
def load_data(data_file):
  """Read the comment CSV and return parallel lists of texts and labels.

  Args:
    data_file: path (or anything ``pd.read_csv`` accepts) of a CSV that has
      a ``comment`` column and a ``user_gender`` column.

  Returns:
    ``(comments, genders)`` where ``comments`` holds the values of the
    ``comment`` column with missing cells replaced by ``""`` and ``genders``
    holds ``0`` for every row whose ``user_gender`` is exactly ``"Male"``
    and ``1`` for every other value (including missing ones).
  """
  # Missing cells in any column become empty strings.
  frame = pd.read_csv(data_file).fillna("")

  texts = frame['comment'].tolist()

  # Anything that is not literally "Male" is encoded as 1.
  labels = [int(value != "Male") for value in frame['user_gender'].tolist()]

  return texts, labels

In [None]:
# Load the full corpus: `comments` are the raw texts, `genders` the 0/1 labels
# produced by load_data (0 = "Male", 1 = anything else).
# NOTE(review): the path points into a mounted Google Drive folder; no cell
# visible here mounts it, so this presumably relies on a manual mount.
comments, genders = load_data('/content/drive/MyDrive/Datasets/dataset.csv')

In [None]:
# Split the corpus into consecutive chunks of n = len(comments) // 12 items so
# that each run of this notebook can process one chunk.  When the corpus size
# is not a multiple of 12 a shorter 13th chunk remains in x[12] / y[12] and is
# not bound to a name below.
n = len(comments) // 12
x = [comments[i:i + n] for i in range(0, len(comments), n)]
y = [genders[i:i + n] for i in range(0, len(genders), n)]

# Unpacking fails loudly if fewer than twelve chunks were produced.
(
    first_half_comments,
    second_half_comments,
    third_half_comments,
    fourth_half_comments,
    fifth_half_comments,
    sixth_half_comments,
    seventh_half_comments,
    eighth_half_comments,
    ninth_half_comments,
    tenth_half_comments,
    eleventh_half_comments,
    twelveth_half_comments,
) = x[:12]

# Every label chunk comes from y (the gender chunks).  Chunks five to twelve
# were previously taken from x, which bound comment texts to the *_genders
# names.
(
    first_half_genders,
    second_half_genders,
    third_half_genders,
    fourth_half_genders,
    fifth_half_genders,
    sixth_half_genders,
    seventh_half_genders,
    eighth_half_genders,
    ninth_half_genders,
    tenth_half_genders,
    eleventh_half_genders,
    twelveth_half_genders,
) = y[:12]

In [None]:
# Sanity check: print the size of every comment chunk, one per line ...
print(
    *map(len, (
        first_half_comments,
        second_half_comments,
        third_half_comments,
        fourth_half_comments,
        fifth_half_comments,
        sixth_half_comments,
        seventh_half_comments,
        eighth_half_comments,
        ninth_half_comments,
        tenth_half_comments,
        eleventh_half_comments,
        twelveth_half_comments,
    )),
    sep="\n",
)

print("=====================")

# ... followed by the size of every gender chunk in the same order.
print(
    *map(len, (
        first_half_genders,
        second_half_genders,
        third_half_genders,
        fourth_half_genders,
        fifth_half_genders,
        sixth_half_genders,
        seventh_half_genders,
        eighth_half_genders,
        ninth_half_genders,
        tenth_half_genders,
        eleventh_half_genders,
        twelveth_half_genders,
    )),
    sep="\n",
)

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: indexable collection of input texts.
        labels: indexable collection of labels, parallel to ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=...,
            padding='max_length', truncation=True)``; the result must expose
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The dataset size is driven by the texts alone.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``/``attention_mask`` and the ``label`` tensor."""
        raw_text, raw_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            raw_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # flatten() collapses each encoded tensor to one dimension.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(raw_label)
        return sample

In [None]:
class DeepGenderClassifier(nn.Module):
    """BERT encoder followed by a dropout/linear classification head.

    The head is ``Dropout(0.3) -> Linear(hidden, 128) -> Dropout(0.2) ->
    Linear(128, 64) -> Dropout(0.3) -> Linear(64, num_classes)``; there is no
    non-linearity between the linear layers.

    Args:
        bert_model_name: name passed to ``BertModel.from_pretrained``.
        num_classes: size of the final linear layer's output.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        # Attribute names are part of the state-dict keys; keep them stable.
        self.bert = BertModel.from_pretrained(bert_model_name)
        encoder_width = self.bert.config.hidden_size
        self.drop = nn.Dropout(p=0.3)
        self.layer1 = nn.Linear(encoder_width, 128)
        self.drop2 = nn.Dropout(p=0.2)
        self.layer2 = nn.Linear(128, 64)
        self.drop3 = nn.Dropout(p=0.3)
        self.out = nn.Linear(64, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw (un-normalized) class scores for a batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden = self.drop(encoded.pooler_output)
        hidden = self.drop2(self.layer1(hidden))
        hidden = self.drop3(self.layer2(hidden))
        return self.out(hidden)

In [None]:
# Set up parameters
# Checkpoint name handed to both BertTokenizer.from_pretrained and
# DeepGenderClassifier.
bert_model_name = 'bert-base-cased'
# Output width of the classifier head.
num_classes = 2
# Token length used by the train/test datasets.
# NOTE(review): predicts() and predict_male_or_female() default to
# max_length=128 instead of this value - confirm the mismatch is intended.
max_length = 64
# Batch size of both DataLoaders.
batch_size = 32
# num_epochs and learning_rate are not referenced by any cell visible in this
# notebook (the model is loaded from a checkpoint, not trained here).
num_epochs = 6
learning_rate = 2e-5

In [None]:
# Hold out 20% of the full corpus for testing; the fixed random_state keeps the
# split identical between runs.
train_comments, test_comments, train_genders, test_genders = train_test_split(comments, genders, test_size=0.2, random_state=42)

In [None]:
# Initialize tokenizer, dataset, and data loader
# The tokenizer is loaded for the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_comments, train_genders, tokenizer, max_length)
test_dataset = TextClassificationDataset(test_comments, test_genders, tokenizer, max_length)
# Only the training loader shuffles; the test loader keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:
# Rebuild the classifier and restore the fine-tuned weights from Drive.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = DeepGenderClassifier(bert_model_name, num_classes)
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU can still be loaded when the runtime falls back
# to the CPU.
loaded_model.load_state_dict(
    torch.load(
        "/content/drive/MyDrive/Datasets/Bert Modal On One Comment Full Data/bert_classifier_on_one_comment_cleaned_data_epoch_3.pth",
        map_location=device,
    )
)
loaded_model.to(device)

In [None]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of class scores per sample.
        data_loader: iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` over all samples.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)
            scores = model(input_ids=ids, attention_mask=mask)
            # Index of the highest score in each row is the predicted class.
            winners = torch.max(scores, dim=1).indices
            predicted += winners.cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [None]:
# Score the restored model on the training split (80% of the corpus).
accuracy, report = evaluate(loaded_model, train_dataloader, device)
print(f"Train Accuracy: {accuracy:.4f}")
print(report)

In [None]:
# Score the restored model on the held-out split; this rebinds `accuracy` and
# `report` from the training evaluation.
accuracy, report = evaluate(loaded_model, test_dataloader, device)
print(f"Test Accuracy: {accuracy:.4f}")
print(report)

In [None]:
def predicts(text, model, tokenizer, device, max_length=128):
    """Return softmax class probabilities for ``text`` as a NumPy array.

    Args:
        text: input forwarded unchanged to ``tokenizer``.
        model: module called as ``model(input_ids, attention_mask)`` that
            returns one row of class scores per encoded sample.
        tokenizer: callable producing ``'input_ids'`` and ``'attention_mask'``
            tensors.
        device: device the encoded tensors are moved to.
        max_length: padding/truncation length given to the tokenizer.

    Returns:
        ``np.ndarray`` with the row-wise softmax of the model scores.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        scores = model(ids, mask)
        # Normalize on the CPU so the result converts straight to NumPy.
        probabilities = torch.softmax(scores.cpu(), dim=1)
        return np.array(probabilities.numpy())

In [None]:
# Smoke test: class probabilities for one sample comment (the cell output is
# the array returned by predicts).
predicts("I've had the 50watter since Oct of last year and I'm still impressed and blown away every time I play it. It's unreal. Killer choice", loaded_model, tokenizer, device)

In [None]:
# Seed handed to LimeTextExplainer as random_state.
RANDOM_SEED = 42
# Class names for LIME, listed in label order: index 0 is the label load_data
# assigns to "Male", index 1 the label for everything else.
class_names = ['Male', 'Female']

In [None]:
def predict_instance(text):
  """Classifier function handed to LIME.

  Forwards ``text`` to ``predicts`` together with the notebook-level
  ``loaded_model``, ``tokenizer`` and ``device`` and returns its result.
  """
  return predicts(text, loaded_model, tokenizer, device)

In [None]:
def predict_male_or_female(text, model, tokenizer, device, max_length=128):
    """Return the predicted class index for every encoded sample.

    Args:
        text: input forwarded unchanged to ``tokenizer``.
        model: module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of class scores per sample.
        tokenizer: callable producing ``'input_ids'`` and ``'attention_mask'``
            tensors.
        device: device the encoded tensors are moved to.
        max_length: padding/truncation length given to the tokenizer.

    Returns:
        A Python list with the index of the highest score in each row
        (in this notebook 0 is the "Male" label and 1 the "Female" label).
    """
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # The unreachable trailing `return np.array(output)` (which referenced an
    # undefined name) has been removed.
    _, preds = torch.max(outputs, dim=1)
    return preds.cpu().tolist()

In [None]:
def sort_tuples_array_by_second_item(tuples):
    """Return a new list with ``tuples`` in ascending order of their second item.

    The sort is stable, so entries with equal second items keep their
    relative order; the input is not modified.
    """
    ordered = list(tuples)
    ordered.sort(key=lambda entry: entry[1])
    return ordered

In [None]:
def _record_explained_word(feature, label):
  """Add one (word, lime_score, feature_id) triple to `words` and `wordsForCSV`."""
  word, score, feature_id = feature
  key = (word, label)
  if key in words:
    # Seen before for this label: accumulate the score, keep the latest id.
    words[key]['lime_score'].append(score)
    words[key]['position'] = feature_id
  else:
    words[key] = {'lime_score': [score], 'position': feature_id}
  # A CSV row is recorded for every occurrence, which is what the exported
  # file has contained so far.
  wordsForCSV.append([word, label, score])


def get_max_explained_words(txt, explainer_num_samples=100):
  """Record the two words LIME rates as strongest for the predicted class.

  The comment is classified with ``predict_male_or_female`` and explained with
  the notebook-level ``explainer``.  Negative LIME weights count as evidence
  for label 0 ("Male"), positive weights as evidence for label 1 ("Female").
  For the predicted label the two most extreme words are stored in the
  notebook-level ``words`` dict (keyed by ``(word, label)``) and appended to
  ``wordsForCSV`` as ``[word, label, score]``.  Comments with fewer than two
  words on the predicted side are skipped entirely.

  Fixes relative to the previous version:
    * the "already seen" test now looks up ``(word, label)``; it used to look
      up ``(feature_tuple, label)``, which never matched, so every occurrence
      overwrote the stored entry instead of accumulating scores;
    * scores are added with ``append`` (``extend`` on a float would raise);
    * the unused ``min``/``max`` candidates and four copies of the recording
      code were removed.

  Args:
    txt: the comment text to explain.
    explainer_num_samples: ``num_samples`` passed to ``explain_instance``.

  Returns:
    The (mutated) module-level ``words`` dict and ``wordsForCSV`` list.
  """
  prediction = predict_male_or_female(txt, loaded_model, tokenizer, device)[0]

  exp = explainer.explain_instance(txt, predict_instance, num_samples=explainer_num_samples)

  # Pair each (word, weight) from as_list() with the feature id found at the
  # same index of local_exp[1].
  exp_list = [
      (word, score, feature_id)
      for (feature_id, _), (word, score) in zip(exp.local_exp[1], exp.as_list())
  ]

  if prediction == 0:
    # Ascending order puts the most negative (strongest "Male") weights first.
    label = 0
    ranked = sort_tuples_array_by_second_item([f for f in exp_list if f[1] < 0])
    strongest = ranked[:2]
  else:
    # Ascending order puts the largest (strongest "Female") weights last.
    label = 1
    ranked = sort_tuples_array_by_second_item([f for f in exp_list if f[1] > 0])
    strongest = [ranked[-1], ranked[-2]] if len(ranked) > 1 else []

  # A lone supporting word is deliberately not recorded.
  if len(ranked) > 1:
    for feature in strongest:
      _record_explained_word(feature, label)

  return words, wordsForCSV

In [None]:
# Shared LIME text explainer used by get_max_explained_words and the demo cell;
# seeded with RANDOM_SEED.
explainer = LimeTextExplainer(class_names=class_names, random_state=RANDOM_SEED)

In [None]:
# Demo: explain a single comment and show the two views that
# get_max_explained_words zips together.
txt = "I've had the 50watter since Oct of last year and I'm still impressed and blown away every time I play it. It's unreal. Killer choice"
# predict_instance(txt)

exp = explainer.explain_instance(txt, predict_instance, num_samples=100)

# as_list() yields the (word, weight) pairs ...
print("exp")
print(exp.as_list())
# ... and local_exp[1] the entries whose first item is used as the feature id.
print("exp.local_exp[1]")
print(exp.local_exp[1])

In [None]:
# Module-level accumulators mutated by get_max_explained_words.  Re-running
# this cell resets them.
# words: (word, label) -> {'lime_score': [...], 'position': ...}
words = {}
# wordsForCSV: rows of [word, label, lime_score] written to disk further down.
wordsForCSV = []
# words, wordsForCSV = get_max_explained_words(txt)

In [None]:
# Explain every comment of the second chunk; the chunk name is hard-coded here.
# NOTE(review): the leading "2_" in the CSV file name written further down
# presumably matches this chunk - keep the two in sync when switching chunks.
for comment in tqdm(second_half_comments, total = len(second_half_comments)):
  words, wordsForCSV = get_max_explained_words(comment)

In [None]:
import csv

# Column names matching the [word, label, lime_score] rows in wordsForCSV.
header=["word", "label", "limescore"]

# Opening with 'w' overwrites any existing file at this Drive path.
with open('/content/drive/MyDrive/Datasets/Bert Modal On One Comment Full Data/2_extracted_strong_words_by_bert_base_cased_on_one_comment_cleaned_data.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write multiple rows
    writer.writerows(wordsForCSV)


In [None]:
# print(words)
# print("------------------")
# Number of rows collected for the CSV export.
print(len(wordsForCSV))
# print(wordsForCSV)