In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense , Input , LSTM , Embedding
from tensorflow.keras.models import Model, Sequential

In [11]:
# Vocabulary cap handed to the Keras Tokenizer (num_words).
top_words = 10000
# Length (in tokens) each comment sequence is padded to before prediction (maxlen).
max_comment_length = 300
# NOTE(review): not referenced in any visible cell; presumably the embedding width
# used when the model was trained — confirm. (The name contains a typo: "vecor".)
embedding_vecor_length = 768

In [13]:
# NOTE(review): no fit_on_texts() call or saved-tokenizer load is visible in this
# notebook. An unfitted Tokenizer has an empty vocabulary, so texts_to_sequences()
# would produce empty sequences — confirm the training-time tokenizer is restored
# before any prediction cell runs.
tokenizer = Tokenizer(num_words=top_words)

In [27]:
# Load the trained classifier used by the prediction helpers below.
# NOTE(review): this imports from `keras` while the first cell imports from
# `tensorflow.keras`, and the model path is an absolute local path — confirm both.
from keras.models import load_model
model = load_model("C:/Users/user/Downloads/Project/Comment/RNN/RNN.h5")



In [None]:
def predict_male_or_female(comment):
    """Classify a single comment string as "Male" or "Female".

    The comment is converted to a sequence with the notebook-level ``tokenizer``,
    padded to ``max_comment_length`` and scored by ``model``.  A score of 0.5 or
    more maps to "Female"; anything lower maps to "Male".
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences([comment]),
        maxlen=max_comment_length,
    )
    # The batch holds exactly one comment, so take the first value of the first row.
    score = model.predict(padded)[0][0]
    if score >= 0.5:
        return "Female"
    return "Male"

In [31]:
def predict_probability(comments):
    """Return class probabilities for one comment or a list of comments.

    A bare string is treated as a one-item batch.  The model output is used as
    the Female probability and its complement as the Male probability; the two
    are stacked as columns in the order [Male_prob, Female_prob].
    """
    batch = [comments] if isinstance(comments, str) else comments
    padded = pad_sequences(tokenizer.texts_to_sequences(batch), maxlen=max_comment_length)

    female_prob = model.predict(padded)
    male_prob = 1 - female_prob
    return np.column_stack([male_prob, female_prob])  # [Male_prob, Female_prob]

In [33]:
# Smoke test: classify one sample comment and print the predicted label.
txt = "I've had the 50watter since Oct of last year and I'm still impressed and blown away every time I play it. It's unreal. Killer choice"
print(predict_male_or_female(txt))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 354ms/step
Male


In [35]:
# Smoke test: classify a second sample comment and print the predicted label.
txt = "This is a great picture of u!!!! Beautiful"
print(predict_male_or_female(txt))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step
Female


In [None]:
!pip install lime

In [37]:
from lime.lime_text import LimeTextExplainer
from operator import itemgetter
from tqdm import tqdm
import csv

In [39]:
# Order matches the columns returned by predict_probability: [Male_prob, Female_prob].
class_names = ['Male', 'Female']
# LIME explains a prediction from randomly perturbed copies of the text; fixing
# random_state makes the explanations (and the words extracted from them) repeatable
# across runs.
explainer = LimeTextExplainer(class_names=class_names, random_state=42)

In [None]:
# Example: explain one prediction with the LIME text explainer and render the
# result inline, including the highlighted text.
txt = "I've had the 50watter since Oct of last year and I'm still impressed and blown away every time I play it. It's unreal. Killer choice"
explainer.explain_instance(txt, predict_probability).show_in_notebook(text=True)

In [None]:
# Example: explain the prediction for a second sample comment and render it inline.
txt = "This is a great picture of u!!!! Beautiful"
explainer.explain_instance(txt, predict_probability).show_in_notebook(text=True)

In [41]:
# Order a list of tuples by their second element, smallest first
def sort_tuples_array_by_second_item(tuples):
    """Return a new list with ``tuples`` sorted ascending by each item's index-1 value.

    The input is left untouched and items with equal keys keep their relative order.
    """
    ordered = list(tuples)
    ordered.sort(key=lambda item: item[1])
    return ordered

In [43]:
# Add one LIME-ranked word to the notebook-level `words` / `wordsForCSV` accumulators
def _record_explained_word(entry, label):
    """Store one ``(word, lime_score, feature_id)`` entry under the key ``(word, label)``.

    Every score seen for a key is kept in ``words[key]['lime_score']`` and
    ``words[key]['position']`` is overwritten with the latest feature id.  A CSV
    row ``[word, label, score]`` is added only the first time a key is seen.
    """
    word, score, feature_id = entry
    key = (word, label)
    if key in words:
        words[key]['lime_score'].append(score)
    else:
        words[key] = {'lime_score': [score]}
        wordsForCSV.append([word, label, score])
    words[key]['position'] = feature_id


# Function to get max explained words
def get_max_explained_words(txt, explainer_num_samples=100):
    """Record the two words LIME ranks as strongest for the predicted class of ``txt``.

    The comment is classified with ``predict_male_or_female`` and explained with the
    notebook-level ``explainer``.  For a "Male" prediction (label 0) the two most
    negative-scoring words are recorded; otherwise (label 1) the two most positive.
    Nothing is recorded unless at least two words lean towards the predicted class.

    Args:
        txt: The comment text to explain.
        explainer_num_samples: Number of perturbed samples LIME generates.

    Returns:
        The notebook-level ``(words, wordsForCSV)`` accumulators, which this
        function updates in place.
    """
    import re  # only needed for the guard below

    # LIME perturbs the word tokens of the text (split on \W+ by default), so a
    # comment without a single word character gives it nothing to sample and
    # explain_instance is expected to raise.  load_original_data turns missing
    # comments into "", so such inputs do occur: skip them, accumulators untouched.
    if isinstance(txt, str) and re.search(r"\w", txt) is None:
        return words, wordsForCSV

    prediction_label = 0 if predict_male_or_female(txt) == "Male" else 1

    exp = explainer.explain_instance(txt, predict_probability, num_samples=explainer_num_samples)
    # Pair each (feature_id, weight) from local_exp with its (word, weight) from
    # as_list() and keep (word, weight, feature_id).
    exp_list = [
        (word, score, feature_id)
        for (feature_id, _), (word, score) in zip(exp.local_exp[1], exp.as_list())
    ]

    # Features with negative score are for Male class, positive for Female class
    if prediction_label == 0:
        leaning = [entry for entry in exp_list if entry[1] < 0]
    else:
        leaning = [entry for entry in exp_list if entry[1] > 0]
    ranked = sort_tuples_array_by_second_item(leaning)

    # Only comments with at least two words for the predicted class contribute.
    if len(ranked) > 1:
        if prediction_label == 0:
            strongest = ranked[:2]                # most negative first
        else:
            strongest = [ranked[-1], ranked[-2]]  # most positive first
        for entry in strongest:
            _record_explained_word(entry, prediction_label)

    return words, wordsForCSV

In [None]:
def load_original_data(data_file):
    """Read the comment dataset and return ``(comments, genders)`` as two lists.

    Missing values in every column are replaced with an empty string.  Comments
    come from the 'comment' column.  Genders come from 'user_gender', encoded as
    0 for the exact value "Male" and 1 for anything else (blank values included).
    """
    frame = pd.read_csv(data_file).fillna("")

    comments = frame['comment'].tolist()
    genders = [1 if value != "Male" else 0 for value in frame['user_gender'].tolist()]
    return comments, genders

In [47]:
# Load every comment and its 0/1 gender label from the dataset CSV.
# NOTE(review): absolute local path — adjust for your environment.
original_comments, original_genders = load_original_data('C:/Users/user/Downloads/Project/Comment/Comment_dataset.csv')

In [49]:
# Split the data into exactly 14 consecutive chunks, one per processing cell below
# (x[0] .. x[13]).  Fixed-size slices of len // 14 items would leave a 15th remainder
# chunk that no cell processes whenever the length is not a multiple of 14, and would
# fail with a zero slice step for fewer than 14 comments.
num_chunks = 14
n, extra = divmod(len(original_comments), num_chunks)
# Chunk i starts at i * n + min(i, extra): the first `extra` chunks hold one more item.
chunk_bounds = [i * n + min(i, extra) for i in range(num_chunks + 1)]
x = [original_comments[lo:hi] for lo, hi in zip(chunk_bounds, chunk_bounds[1:])]
y = [original_genders[lo:hi] for lo, hi in zip(chunk_bounds, chunk_bounds[1:])]

In [51]:
# Initialize variables
# Column names written as the first row of the output CSV.
header = ["word", "label", "limescore"]
# Filled by get_max_explained_words: (word, label) -> {'lime_score': [...], 'position': ...}
words = {}
# Filled by get_max_explained_words: one [word, label, limescore] row per first-seen (word, label).
wordsForCSV = []
# NOTE(review): absolute local path — adjust for your environment.
file_path = 'C:/Users/user/Downloads/Project/Comment/RNN/extracted_strong_words_by_rnn.csv'

In [None]:
# Explain every comment in chunk 0; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[0], total=len(x[0])):
    words, wordsForCSV = get_max_explained_words(comment)

# Write the CSV from scratch: header row followed by every row collected so far.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 1; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[1], total=len(x[1])):
    words, wordsForCSV = get_max_explained_words(comment)

# wordsForCSV accumulates across chunks, so rewrite the whole file (header included)
# instead of appending, which would duplicate the rows already written.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 2; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[2], total=len(x[2])):
    words, wordsForCSV = get_max_explained_words(comment)

# wordsForCSV accumulates across chunks, so rewrite the whole file (header included)
# instead of appending, which would duplicate the rows already written.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 3; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[3], total=len(x[3])):
    words, wordsForCSV = get_max_explained_words(comment)

# Write the CSV from scratch: header row followed by every row collected so far.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 4; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[4], total=len(x[4])):
    words, wordsForCSV = get_max_explained_words(comment)

# Mode 'w' replaces the file and wordsForCSV holds every row collected so far, so
# the header must be written again or the CSV ends up without one.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 5; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[5], total=len(x[5])):
    words, wordsForCSV = get_max_explained_words(comment)

# Write the CSV from scratch: header row followed by every row collected so far.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 6; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[6], total=len(x[6])):
    words, wordsForCSV = get_max_explained_words(comment)

# Mode 'w' replaces the file and wordsForCSV holds every row collected so far, so
# the header must be written again or the CSV ends up without one.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 7; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[7], total=len(x[7])):
    words, wordsForCSV = get_max_explained_words(comment)

# Mode 'w' replaces the file and wordsForCSV holds every row collected so far, so
# the header must be written again or the CSV ends up without one.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 8; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[8], total=len(x[8])):
    words, wordsForCSV = get_max_explained_words(comment)

# Mode 'w' replaces the file and wordsForCSV holds every row collected so far, so
# the header must be written again or the CSV ends up without one.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 9; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[9], total=len(x[9])):
    words, wordsForCSV = get_max_explained_words(comment)

# Mode 'w' replaces the file and wordsForCSV holds every row collected so far, so
# the header must be written again or the CSV ends up without one.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 10; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[10], total=len(x[10])):
    words, wordsForCSV = get_max_explained_words(comment)

# Mode 'w' replaces the file and wordsForCSV holds every row collected so far, so
# the header must be written again or the CSV ends up without one.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 11; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[11], total=len(x[11])):
    words, wordsForCSV = get_max_explained_words(comment)

# Mode 'w' replaces the file and wordsForCSV holds every row collected so far, so
# the header must be written again or the CSV ends up without one.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 12; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[12], total=len(x[12])):
    words, wordsForCSV = get_max_explained_words(comment)

# Mode 'w' replaces the file and wordsForCSV holds every row collected so far, so
# the header must be written again or the CSV ends up without one.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)

In [None]:
# Explain every comment in chunk 13; each call adds to the shared words / wordsForCSV.
for comment in tqdm(x[13], total=len(x[13])):
    words, wordsForCSV = get_max_explained_words(comment)

# Mode 'w' replaces the file and wordsForCSV holds every row collected so far, so
# the header must be written again or the final CSV ends up without one.
with open(file_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(wordsForCSV)