In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset CSVs
# referenced by the later cells (under /content/drive/MyDrive/...) can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install sentence_transformers
!pip install lime

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import re
from bs4 import BeautifulSoup
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers
from tqdm import tqdm
from operator import itemgetter

In [None]:
def load_data(data_file, comment_col='comments', label_col='labels'):
  """Read a comments CSV and return its texts and binary gender labels.

  Args:
    data_file: path (or any other input accepted by ``pd.read_csv``) of the CSV.
    comment_col: name of the column holding the comment text.
    label_col: name of the column holding the gender label.

  Returns:
    A ``(comments, genders)`` pair of parallel lists. Missing comments are
    returned as "" (empty string). ``genders`` holds 0 where the label is
    exactly "Male" and 1 for every other value, including a missing label.
  """
  df = pd.read_csv(data_file)

  # Fill NaN only in the two columns that are actually returned, without
  # mutating the frame in place; unrelated columns keep their dtype.
  comments = df[comment_col].fillna("").tolist()
  labels = df[label_col].fillna("").tolist()

  genders = [0 if label == "Male" else 1 for label in labels]

  return comments, genders

In [None]:
# Load the cleaned dataset from Drive and convert both returned lists to
# NumPy arrays (comment texts and their 0/1 gender labels).
comments, genders = (
    np.array(column)
    for column in load_data('/content/drive/MyDrive/Datasets/Bert Modal On One Comment Full Data/cleaned_dataset.csv')
)

In [None]:
# Hold out 20% of the comments for evaluation. The split is stratified on the
# gender label and seeded so that it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    comments,
    genders,
    test_size=0.2,
    stratify=genders,
    random_state=42,
)
# Sizes of the four pieces, printed on one line.
print(*map(len, (X_train, X_test, y_train, y_test)))

In [None]:
# Model hyper-parameters

top_words = 10000            # vocabulary size limit shared by tokenizer and Embedding
max_comment_length = 300     # every comment is padded/truncated to this many tokens
embedding_vecor_length = 64  # width of each learned word vector

# num_words caps the token ids produced by texts_to_sequences below top_words.
# Without it the ids grow with the corpus vocabulary and can point past the
# Embedding layer, which is built with only top_words + 1 rows.
tokenizer = Tokenizer(num_words=top_words)
tokenizer.fit_on_texts(X_train)
list_tokenized_train = tokenizer.texts_to_sequences(X_train)

In [None]:
# Pad/truncate every tokenized training comment to max_comment_length ids.
# NOTE: X_train is rebound here from the raw texts to the padded id matrix.
X_train = pad_sequences(list_tokenized_train, maxlen=max_comment_length)

In [None]:
# Binary classifier: word embedding -> single LSTM layer -> sigmoid unit.
# The sigmoid output is the score for label 1; labels are 0 = Male, 1 = other.
model = Sequential([
    Embedding(top_words+1, embedding_vecor_length, input_length=max_comment_length),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train on the padded training sequences for 2 epochs with mini-batches of 64.
model.fit(X_train,y_train, epochs=2, batch_size=64)

In [None]:
# Evaluate on the held-out comments, using the tokenizer fitted on the
# training texts and the same padding length as for training.
# NOTE: X_test is rebound here from the raw texts to the padded id matrix.
list_tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(list_tokenized_test, maxlen=max_comment_length)
prediction = model.predict(X_test)

# Threshold the sigmoid scores at 0.5 to get boolean class predictions.
y_pred = prediction > 0.5

# scikit-learn metrics take (y_true, y_pred), matching the confusion matrix call.
print("Accuracy of the model : ", accuracy_score(y_test, y_pred))
print('F1-score: ', f1_score(y_test, y_pred))
print('Confusion matrix:')
confusion_matrix(y_test, y_pred)


In [None]:
def predict_proba(arr):
  """Return per-class scores for a batch of raw texts.

  The texts in ``arr`` are tokenized with the module-level ``tokenizer``,
  padded to ``max_comment_length`` and scored by ``model``. This function is
  passed to LIME's ``explain_instance`` as the classifier.

  Returns:
    A NumPy array with one ``[1 - p, p]`` row per input text, where ``p`` is
    the first model output for that text.
  """
  sequences = tokenizer.texts_to_sequences(arr)
  padded = pad_sequences(sequences, maxlen=max_comment_length)
  scores = model.predict(padded, verbose=None)
  # Each model output row is expanded to the two-column [1 - p, p] form.
  return np.array([np.array([1 - row[0], row[0]]) for row in scores])


In [None]:
def predict_male_or_female(txt):
    """Classify a single comment.

    ``txt`` is tokenized and padded like the training data, scored by
    ``model``, and the score is thresholded at 0.5.

    Returns:
        1 when the model score is above 0.5, otherwise 0
        (the dataset loaders encode "Male" as 0 and everything else as 1).
    """
    sequences = tokenizer.texts_to_sequences(np.array([txt]))
    padded = pad_sequences(sequences, maxlen=max_comment_length)
    score = model.predict(padded, verbose=None)[0][0]
    return 1 if score > 0.5 else 0

In [None]:
# Smoke test: classify one sample comment and print the predicted label
# (predict_male_or_female returns 0 or 1).
txt = "I've had the 50watter since Oct of last year and I'm still impressed and blown away every time I play it. It's unreal. Killer choice"

print(predict_male_or_female(txt))

In [None]:
from lime.lime_text import LimeTextExplainer

# Class order must match the columns returned by predict_proba:
# column 0 -> 'Male' (label 0), column 1 -> 'Female' (label 1).
class_names=['Male','Female']

# LIME explains a prediction by sampling random perturbations of the text.
# Seeding the explainer makes the extracted words reproducible across runs
# (same seed as the train/test split).
explainer= LimeTextExplainer(class_names=class_names, random_state=42)


In [None]:
# Explain one sample comment with LIME and render the explanation inline,
# including the highlighted text.
demo_explanation = explainer.explain_instance(
    "I've had the 50watter since Oct of last year and I'm still impressed and blown away every time I play it. It's unreal. Killer choice",
    predict_proba,
)
demo_explanation.show_in_notebook(text=True)

In [None]:
def sort_tuples_array_by_second_item(tuples):
    """Return a new list with the tuples in ascending order of their second item.

    The input is left untouched, and items with equal second elements keep
    their relative order.
    """
    ordered = list(tuples)
    ordered.sort(key=lambda item: item[1])
    return ordered

In [None]:
def _record_strong_word(feature, label):
  """Store one LIME feature in the module-level ``words`` / ``wordsForCSV``.

  Args:
    feature: ``(word, lime_score, feature_id)`` tuple.
    label: 0 (Male) or 1 (Female), the class the word supports.
  """
  word, score, feature_id = feature
  key = (word, label)
  if key in words:
    # Word already seen for this class: accumulate its score and keep the
    # most recent feature id.
    words[key]['lime_score'].append(score)
    words[key]['position'] = feature_id
  else:
    # First sighting: create the entry and emit exactly one CSV row.
    words[key] = {'lime_score': [score], 'position': feature_id}
    wordsForCSV.append([word, label, score])


def get_max_explained_words(txt):
  """Record the (up to) two words that most strongly support the prediction.

  The comment is classified with ``predict_male_or_female`` and explained with
  the module-level LIME ``explainer`` (using ``predict_proba``). LIME weights
  are taken for label 1: a negative weight pushes towards Male (0), a positive
  weight towards Female (1). Only words that support the predicted class are
  considered, and the two with the largest magnitude are recorded.

  Side effects:
    Updates the module-level ``words`` dict, keyed by ``(word, label)`` with
    values ``{'lime_score': [scores...], 'position': feature_id}``, and the
    module-level ``wordsForCSV`` list of ``[word, label, score]`` rows. Both
    must exist before this function is called. A CSV row is added only the
    first time a ``(word, label)`` pair is seen; later sightings append their
    score to ``words[...]['lime_score']``.

  Args:
    txt: the raw comment text.

  Returns:
    The ``(words, wordsForCSV)`` module-level collections.
  """
  prediction = predict_male_or_female(txt)

  exp = explainer.explain_instance(txt, predict_proba)

  # Pair LIME's (feature_id, weight) entries for label 1 with the matching
  # (word, weight) entries, giving (word, weight, feature_id) tuples.
  exp_list = [
      (word, weight, feature_id)
      for (feature_id, _), (word, weight) in zip(exp.local_exp[1], exp.as_list())
  ]

  if prediction == 0:
    # Male: ascending sort puts the most negative weights first.
    label = 0
    supporting = sort_tuples_array_by_second_item(
        [feature for feature in exp_list if feature[1] < 0])
    strongest = supporting[:2]
  else:
    # Female: ascending sort puts the largest positive weights last, so the
    # list is walked from the end.
    label = 1
    supporting = sort_tuples_array_by_second_item(
        [feature for feature in exp_list if feature[1] > 0])
    strongest = supporting[::-1][:2]

  # `strongest` holds zero, one or two features depending on how many words
  # support the predicted class.
  for feature in strongest:
    _record_strong_word(feature, label)

  return words, wordsForCSV

In [None]:
# txt = "This is a great picture of u!!!! Beautiful"

# words, wordsForCSV = get_max_explained_words(txt)

In [None]:
def load_original_data(data_file):
  """Read the original (uncleaned) dataset CSV.

  Comment texts come from the 'comment' column and gender labels from the
  'user_gender' column. Missing values are replaced by "" before conversion.

  Returns:
    A ``(comments, genders)`` pair of parallel lists, where ``genders`` holds
    0 for the label "Male" and 1 for every other value.
  """
  # Missing cells become empty strings so every comment is a usable value.
  frame = pd.read_csv(data_file).fillna("")

  comments = list(frame['comment'])
  genders = [int(label != "Male") for label in frame['user_gender']]

  return comments, genders

In [None]:
# Load the original dataset (columns 'comment' / 'user_gender') whose comments
# are explained with LIME in the cells below.
original_comments, original_genders = load_original_data('/content/drive/MyDrive/Datasets/dataset.csv')

In [None]:
# The original dataset is processed in 14 consecutive chunks so that each
# LIME extraction run stays manageable.
N_CHUNKS = 14


def _split_into_chunks(items, n_chunks):
    """Split ``items`` into exactly ``n_chunks`` consecutive slices.

    The first ``n_chunks - 1`` slices each hold ``len(items) // n_chunks``
    elements; the last slice also takes the remainder, so no element is lost.
    Some slices are empty when there are fewer items than chunks.
    """
    size = len(items) // n_chunks
    chunks = [items[i * size:(i + 1) * size] for i in range(n_chunks - 1)]
    chunks.append(items[(n_chunks - 1) * size:])
    return chunks


n = len(original_comments) // N_CHUNKS
x = _split_into_chunks(original_comments, N_CHUNKS)
y = _split_into_chunks(original_genders, N_CHUNKS)

(first_half_comments, second_half_comments, third_half_comments,
 fourth_half_comments, fifth_half_comments, sixth_half_comments,
 seventh_half_comments, eighth_half_comments, ninth_half_comments,
 tenth_half_comments, eleventh_half_comments, twelveth_half_comments,
 thirteenth_half_comments, fourteenth_half_comments) = x

# Every gender chunk comes from y (the labels), aligned index-for-index with
# the comment chunk of the same ordinal.
(first_half_genders, second_half_genders, third_half_genders,
 fourth_half_genders, fifth_half_genders, sixth_half_genders,
 seventh_half_genders, eighth_half_genders, ninth_half_genders,
 tenth_half_genders, eleventh_half_genders, twelveth_half_genders,
 thirteenth_half_genders, fourteenth_half_genders) = y

In [None]:
# Sanity check: print the size of every comment chunk, a separator line, then
# the size of every gender chunk (one number per line).
print(*map(len, (
    first_half_comments, second_half_comments, third_half_comments,
    fourth_half_comments, fifth_half_comments, sixth_half_comments,
    seventh_half_comments, eighth_half_comments, ninth_half_comments,
    tenth_half_comments, eleventh_half_comments, twelveth_half_comments,
    thirteenth_half_comments, fourteenth_half_comments,
)), sep="\n")

print("=====================")
print(*map(len, (
    first_half_genders, second_half_genders, third_half_genders,
    fourth_half_genders, fifth_half_genders, sixth_half_genders,
    seventh_half_genders, eighth_half_genders, ninth_half_genders,
    tenth_half_genders, eleventh_half_genders, twelveth_half_genders,
    thirteenth_half_genders, fourteenth_half_genders,
)), sep="\n")

In [None]:
# Accumulators filled by get_max_explained_words: `words` maps (word, label)
# to its LIME details, `wordsForCSV` holds the rows written to the CSV below.
words, wordsForCSV = {}, []

# Explain every comment of the first chunk; tqdm shows a progress bar.
for comment in tqdm(first_half_comments):
    words, wordsForCSV = get_max_explained_words(comment)

In [None]:
# print(wordsForCSV)

In [None]:
import csv

# Column names for the rows collected in wordsForCSV: [word, label, score].
header=["word", "label", "limescore"]

# Write the header followed by all extracted words to the results file on Drive.
with open('/content/drive/MyDrive/Datasets/RNN/1_extracted_strong_words_by_rnn.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([header] + wordsForCSV)