In [None]:
# Mount Google Drive (Colab only) so the dataset paths under /content/drive
# used by the later cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install lime

In [None]:
# Importing libraries
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Flatten
from keras.models import Model
from keras.models import Sequential
from keras.initializers import Constant

from lime.lime_text import LimeTextExplainer

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from operator import itemgetter
from tqdm import tqdm

In [None]:
def load_data(data_file, text_column='comments', label_column='labels'):
  """Read a labelled comment CSV and return its texts and binary gender labels.

  Args:
    data_file: path or file-like object accepted by ``pd.read_csv``.
    text_column: name of the column holding the comment text.
    label_column: name of the column holding the gender label.

  Returns:
    ``(comments, genders)``: two lists with one entry per CSV row. Missing
    comments are returned as ``""``. ``genders`` holds 0 where the label is
    exactly ``"Male"`` and 1 for every other value (a missing label included).
  """
  df = pd.read_csv(data_file)

  # Only the text column needs its NaNs replaced: a missing label is not
  # equal to "Male" and therefore maps to 1 with or without the fill.
  comments = df[text_column].fillna("").tolist()
  genders = [0 if gender == "Male" else 1 for gender in df[label_column].tolist()]

  return comments, genders

In [None]:
# Load the cleaned comments with their 0/1 gender labels and keep both as
# NumPy arrays for the tokenising / splitting cells that follow.
comments, genders = load_data('/content/drive/MyDrive/Datasets/Bert Modal On One Comment Full Data/cleaned_dataset.csv')
comments, genders = np.array(comments), np.array(genders)

In [None]:
# Length every token sequence is padded/truncated to by pad_sequences.
MAX_SEQUENCE_LENGTH = 300
# Vocabulary cap handed to the Tokenizer; also reused as the Embedding input size.
MAX_NUM_WORDS = 50000

In [None]:
# Fit the tokenizer on every comment and convert the comments to integer
# sequences. `tokenizer` is reused later by predict_proba and
# predict_male_or_female to encode new text the same way.
# NOTE(review): the vocabulary is fitted before the train/test split, so it
# also sees the comments that end up in the test set.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Bring every token sequence to MAX_SEQUENCE_LENGTH and one-hot encode the
# 0/1 gender labels so they match the 2-unit softmax output of the model.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(genders)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
# 80/20 split, stratified on the integer gender labels, with a fixed seed so
# the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    stratify=genders,
    random_state=42,
)
# Sizes of the four partitions, printed on one line.
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Size of each learned word vector.
EMBEDDING_DIM = 60
# The embedding table has one row per token id the tokenizer may emit.
num_words = MAX_NUM_WORDS
# Trainable embedding learned from scratch together with the classifier.
embedding_layer = Embedding(num_words,EMBEDDING_DIM,input_length=MAX_SEQUENCE_LENGTH,trainable=True)

In [None]:
# Model Building
# Embedding -> two Conv1D(64, kernel 3) layers -> max-pool -> flatten ->
# Dense(100) -> 2-way softmax over the one-hot gender labels.

# Input is a batch of padded token-id sequences.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# `x` is rebound at every layer to the output of the layer just applied.
x = Conv1D(64, 3, activation='relu')(embedded_sequences)
x = Conv1D(64, 3, activation='relu')(x)
x = MaxPooling1D(2)(x)
x=Flatten()(x)
x = Dense(100, activation='relu')(x)
# Two output units: index 0 / index 1 follow the 0/1 label encoding.
preds = Dense(2, activation='softmax')(x)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
model.summary()


In [None]:
# Model training
# NOTE(review): validation_data is the same X_test/y_test split that the
# evaluation cell scores afterwards, so there is no separate validation set.

model.fit(X_train, y_train,batch_size=50, epochs=30, validation_data=(X_test, y_test))

In [None]:
# evaluate the model
# `scores` follows model.metrics_names; index 1 is the metric that comes
# after the loss (the 'acc' metric passed to compile).
scores = model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


In [None]:
def predict_proba(arr):
  """Return class probabilities for a batch of raw texts (LIME classifier function).

  Args:
    arr: sequence of raw comment strings; they are encoded with the fitted
      global ``tokenizer`` and padded to ``MAX_SEQUENCE_LENGTH``.

  Returns:
    2-D array with one row per text. Column 0 is the model's output for
    class 0 (the label assigned to "Male" comments) and column 1 is its
    complement ``1 - column 0``.
  """
  encoded = tokenizer.texts_to_sequences(arr)
  padded = pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH)
  yprob = np.asarray(model.predict(padded, verbose=None))

  # Build both columns in one vectorised step: LIME calls this function with
  # thousands of perturbed samples per explanation, so a per-row Python loop
  # is needlessly slow.
  p_class0 = yprob[:, 0]
  return np.stack([p_class0, 1 - p_class0], axis=1)

In [None]:
def predict_male_or_female(txt):
    """Classify a single comment.

    The text is encoded with the fitted global ``tokenizer``, padded to
    ``MAX_SEQUENCE_LENGTH`` and passed through ``model``.

    Returns:
        Index of the highest-scoring output unit (0 or 1, following the
        0 = "Male" / 1 = other label encoding used for training).
    """
    encoded = tokenizer.texts_to_sequences(np.array([txt]))
    padded = pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH)
    probabilities = model.predict(padded, verbose=None)
    # Single-row batch: take the arg-max of that one row.
    return probabilities.argmax(axis=-1)[0]

In [None]:
# from sklearn.metrics import accuracy_score, classification_report

# predictions = predict_proba(comments)

# accuracy = accuracy_score(genders, predictions)
# print(f"Validation Accuracy: {accuracy:.4f}")


# report = classification_report(genders, predictions)
# print(report)

In [None]:
# Smoke test: classify one sample comment (prints 0 or 1, the predicted class index).
txt = "I've had the 50watter since Oct of last year and I'm still impressed and blown away every time I play it. It's unreal. Killer choice"

print(predict_male_or_female(txt))

In [None]:
# Order matters: index 0 / index 1 must line up with the columns returned by
# predict_proba (column 0 = class 0, which load_data assigns to "Male").
class_names=['Male','Female']
explainer= LimeTextExplainer(class_names=class_names)

In [None]:
# txt = "I've had the 50watter since Oct of last year and I'm still impressed and blown away every time I play it. It's unreal. Killer choice"
# txt = "This is a great picture of u!!!! Beautiful"

# explainer.explain_instance(txt,predict_proba).show_in_notebook(text=True)

In [None]:
def sort_tuples_array_by_second_item(tuples):
    """Return a new list holding ``tuples`` in ascending order of element 1.

    The input is left untouched and items with equal second elements keep
    their original relative order.
    """
    ordered = list(tuples)
    ordered.sort(key=lambda item: item[1])
    return ordered

In [None]:
def _record_explained_word(entry, label, word_store, csv_rows):
  """Merge one ``(word, score, feature_id)`` LIME entry into the accumulators.

  A word seen for the first time under ``label`` gets a new ``word_store``
  record and one ``csv_rows`` line; a repeat only appends its score to the
  existing record and refreshes the stored position.
  """
  word, score, position = entry
  key = (word, label)
  if key in word_store:
    word_store[key]['lime_score'].append(score)
    word_store[key]['position'] = position
  else:
    word_store[key] = {'lime_score': [score], 'position': position}
    csv_rows.append([word, label, score])


def get_max_explained_words(txt, top_k=2):
  """Record the strongest LIME words supporting the predicted class of ``txt``.

  The comment is classified, explained with LIME, and the ``top_k`` words
  that push hardest towards the predicted class are merged into the global
  accumulators ``words`` and ``wordsForCSV`` (both must exist before the
  first call).

  Args:
    txt: raw comment string.
    top_k: maximum number of words recorded per comment (default 2, the
      number the notebook has always used). Expected to be positive.

  Returns:
    ``(words, wordsForCSV)`` - the same global objects, updated in place:
      * ``words`` maps ``(word, label)`` to a dict with ``'lime_score'``
        (list of scores, one per comment in which the word was selected)
        and ``'position'`` (LIME feature id from the latest comment).
      * ``wordsForCSV`` holds one ``[word, label, score]`` row per distinct
        ``(word, label)``, written the first time it is selected.
  """
  # LIME's sampler cannot perturb a text without any word characters and
  # raises, so such comments are skipped instead of aborting a long run.
  if not any(ch.isalnum() or ch == '_' for ch in txt):
    return words, wordsForCSV

  prediction = predict_male_or_female(txt)
  exp = explainer.explain_instance(txt, predict_proba)

  # Pair each (feature_id, weight) of class 1 with its (word, weight) entry
  # and keep (word, weight, feature_id).
  exp_list = [
      (word, score, feature_id)
      for (feature_id, _), (word, score) in zip(exp.local_exp[1], exp.as_list())
  ]

  if prediction == 0:
    # Weights are relative to class 1 (Female): negative weights support
    # class 0 (Male). Ascending order puts the most negative first.
    label = 0
    ranked = sort_tuples_array_by_second_item(
        [entry for entry in exp_list if entry[1] < 0])
  else:
    # Positive weights support class 1 (Female); reverse the ascending sort
    # so the largest weight comes first.
    label = 1
    ranked = sort_tuples_array_by_second_item(
        [entry for entry in exp_list if entry[1] > 0])[::-1]

  for entry in ranked[:top_k]:
    _record_explained_word(entry, label, words, wordsForCSV)

  return words, wordsForCSV

In [None]:
def load_original_data(data_file):
  """Load the raw dataset CSV.

  Reads the ``comment`` and ``user_gender`` columns after replacing every
  missing value in the file with an empty string.

  Returns:
    ``(comments, genders)``: the comment texts, and for each row 0 when
    ``user_gender`` is exactly "Male" and 1 otherwise.
  """
  frame = pd.read_csv(data_file).fillna("")

  texts = frame['comment'].tolist()
  encoded = []
  for gender in frame['user_gender'].tolist():
    if gender == "Male":
      encoded.append(0)
    else:
      encoded.append(1)

  return texts, encoded

In [None]:
# Raw (uncleaned) comments and their 0/1 gender labels; these are the texts
# that get explained with LIME further down.
original_comments, original_genders = load_original_data('/content/drive/MyDrive/Datasets/dataset.csv')

In [None]:
# Split the original dataset into NUM_CHUNKS consecutive slices so the slow
# LIME extraction can be run one slice at a time.
NUM_CHUNKS = 14
n = len(original_comments) // NUM_CHUNKS


def _split_into_chunks(items, chunk_size, num_chunks):
  """Return exactly ``num_chunks`` consecutive slices of ``items``.

  The first ``num_chunks - 1`` slices hold ``chunk_size`` items each; the
  last slice takes everything that is left, so no trailing rows are lost
  when ``len(items)`` is not a multiple of ``num_chunks``.
  """
  head = [items[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks - 1)]
  return head + [items[(num_chunks - 1) * chunk_size:]]


x = _split_into_chunks(original_comments, n, NUM_CHUNKS)
y = _split_into_chunks(original_genders, n, NUM_CHUNKS)

# The "*_half_*" names are historical: each one is one of the 14 slices.
(first_half_comments, second_half_comments, third_half_comments,
 fourth_half_comments, fifth_half_comments, sixth_half_comments,
 seventh_half_comments, eighth_half_comments, ninth_half_comments,
 tenth_half_comments, eleventh_half_comments, twelveth_half_comments,
 thirteenth_half_comments, fourteenth_half_comments) = x

# Gender slices come from `y` (labels) and stay aligned with the comment
# slices above.
(first_half_genders, second_half_genders, third_half_genders,
 fourth_half_genders, fifth_half_genders, sixth_half_genders,
 seventh_half_genders, eighth_half_genders, ninth_half_genders,
 tenth_half_genders, eleventh_half_genders, twelveth_half_genders,
 thirteenth_half_genders, fourteenth_half_genders) = y

In [None]:
# Sanity check: one line per slice with its length, comment slices first,
# then a separator, then the gender slices in the same order.
print(*map(len, (
    first_half_comments, second_half_comments, third_half_comments,
    fourth_half_comments, fifth_half_comments, sixth_half_comments,
    seventh_half_comments, eighth_half_comments, ninth_half_comments,
    tenth_half_comments, eleventh_half_comments, twelveth_half_comments,
    thirteenth_half_comments, fourteenth_half_comments,
)), sep="\n")

print("=====================")
print(*map(len, (
    first_half_genders, second_half_genders, third_half_genders,
    fourth_half_genders, fifth_half_genders, sixth_half_genders,
    seventh_half_genders, eighth_half_genders, ninth_half_genders,
    tenth_half_genders, eleventh_half_genders, twelveth_half_genders,
    thirteenth_half_genders, fourteenth_half_genders,
)), sep="\n")

In [None]:
# Global accumulators that get_max_explained_words reads, updates and returns:
#   words       - dict keyed by (word, label)
#   wordsForCSV - list of [word, label, score] rows for the CSV export
# Re-running this cell resets both before processing.
words = {}
wordsForCSV = []

# Only the first slice is processed here; swap in another *_half_comments
# list to process a different slice.
for comment in tqdm(first_half_comments, total = len(first_half_comments)):
    words, wordsForCSV = get_max_explained_words(comment)

In [None]:
import csv

# Column names matching the [word, label, score] rows collected in wordsForCSV.
header=["word", "label", "limescore"]

# Opening with 'w' overwrites any previous export at this path.
# NOTE(review): the file name says "rnn" while the folder is "CNN" and the
# model built above is convolutional - confirm the intended name.
with open('/content/drive/MyDrive/Datasets/CNN/1_extracted_strong_words_by_rnn.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write multiple rows
    writer.writerows(wordsForCSV)