In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs loaded later in this
# notebook are read from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install lime

In [None]:
# Importing libraries
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Flatten
from keras.models import Model
from keras.models import Sequential
from keras.initializers import Constant

from lime.lime_text import LimeTextExplainer

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from operator import itemgetter
from tqdm import tqdm

In [None]:
def load_data(data_file, comment_col='comments', label_col='labels'):
  """Load comment texts and binary gender labels from a CSV file.

  Args:
    data_file: Path (or file-like object) accepted by ``pd.read_csv``.
    comment_col: Name of the column holding the comment text.
    label_col: Name of the column holding the gender label.

  Returns:
    A ``(comments, genders)`` tuple of two equal-length lists. Missing
    comments are returned as ``""``. A label equal to ``"Male"`` is encoded
    as 0; every other value, including a missing label, is encoded as 1.

  Raises:
    KeyError: If ``comment_col`` or ``label_col`` is not a column of the file.
  """
  # read csv file
  df = pd.read_csv(data_file)

  # replace nan(no value) comment with ""(empty string); done on the column
  # only, so the frame is not mutated in place
  comments = df[comment_col].fillna("").tolist()

  # a missing label never equals "Male", so it is encoded as 1 without
  # needing to be filled first
  genders = [0 if gender == "Male" else 1 for gender in df[label_col].tolist()]

  return comments, genders

In [None]:
# Read the cleaned corpus from Drive and keep both columns as NumPy arrays
# for the tokenisation and train/test split performed in the cells below.
comments, genders = (
    np.array(column)
    for column in load_data('/content/drive/MyDrive/Datasets/Bert Modal On One Comment Full Data/cleaned_dataset.csv')
)

In [None]:
# Length every comment is padded/truncated to (pad_sequences maxlen) and the
# length declared for the model's Input and Embedding layers.
MAX_SEQUENCE_LENGTH = 300
# Vocabulary cap handed to Tokenizer(num_words=...); also reused as the
# Embedding layer's vocabulary size.
MAX_NUM_WORDS = 50000

In [None]:
# Fit a word-level tokenizer on the comments and convert each comment into a
# list of integer word indices.
# NOTE(review): the tokenizer is fitted on the full corpus, before the
# train/test split done further down, so test-set vocabulary is seen here.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
# word -> index mapping learned by the tokenizer; its size is reported below
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Bring every index sequence to MAX_SEQUENCE_LENGTH so they stack into one
# 2-D array, and one-hot encode the 0/1 gender labels for the softmax output.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(genders)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# integer gender labels and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    stratify=genders,
    random_state=42,
)
# Sizes of the four resulting arrays, in the order they were returned.
print(*map(len, (X_train, X_test, y_train, y_test)))

In [None]:
# Size of each learned word vector.
EMBEDDING_DIM = 60
# Vocabulary size of the embedding table; matches the tokenizer's num_words cap.
num_words = MAX_NUM_WORDS
# Embedding trained from scratch together with the rest of the model
# (trainable=True, no pre-trained weights are supplied).
embedding_layer = Embedding(num_words,EMBEDDING_DIM,input_length=MAX_SEQUENCE_LENGTH,trainable=True)

In [None]:
# Model Building
# 1-D CNN text classifier: embedding -> two Conv1D layers -> max pooling ->
# flatten -> dense -> 2-way softmax.

# Input is one padded sequence of integer word indices per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# Two stacked convolutions, each with 64 filters of width 3 and ReLU.
x = Conv1D(64, 3, activation='relu')(embedded_sequences)
x = Conv1D(64, 3, activation='relu')(x)
# Pool with window size 2, then flatten for the dense layers.
x = MaxPooling1D(2)(x)
x=Flatten()(x)
x = Dense(100, activation='relu')(x)
# Two output units to match the one-hot labels produced by to_categorical.
preds = Dense(2, activation='softmax')(x)
model = Model(sequence_input, preds)
# Categorical cross-entropy over the one-hot labels; 'acc' is tracked as the
# reported metric.
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
model.summary()


In [None]:
# Model training
# Trains for 30 epochs with mini-batches of 50 samples.
# NOTE(review): the test split is passed as validation_data here and is also
# the split scored by model.evaluate afterwards, so it is not an unseen set.

model.fit(X_train, y_train,batch_size=50, epochs=30, validation_data=(X_test, y_test))

In [None]:
# Score the trained model on the held-out split and print the second reported
# metric (the 'acc' metric configured at compile time) as a percentage.
scores = model.evaluate(X_test, y_test, verbose=0)
metric_name = model.metrics_names[1]
metric_percent = scores[1] * 100
print("%s: %.2f%%" % (metric_name, metric_percent))

In [None]:
def load_data_for_replaced_dataset(data_file):
  """Read the replaced-comments CSV and return its texts and encoded genders.

  The file must provide a ``comment`` and a ``user_gender`` column. Missing
  cells are turned into empty strings before the columns are extracted.

  Args:
    data_file: Path (or file-like object) accepted by ``pd.read_csv``.

  Returns:
    A ``(comments, genders)`` tuple of lists. ``genders`` holds 0 where
    ``user_gender`` equals "Male" and 1 for any other value.
  """
  frame = pd.read_csv(data_file).fillna("")

  texts = frame['comment'].tolist()
  # int(True) == 1 for every value that is not exactly "Male"
  encoded = [int(value != "Male") for value in frame['user_gender'].tolist()]

  return texts, encoded

In [None]:
# Load the replaced-comments dataset from Drive and hold both columns as
# NumPy arrays.
replaced_comments, replaced_genders = (
    np.array(column)
    for column in load_data_for_replaced_dataset('/content/drive/MyDrive/Datasets/CNN/combined_replaced_dataset_one_comment_cnn_with_changed.csv')
)

In [None]:
# Encode the replaced comments with the tokenizer that was fitted on the
# training corpus. The model's embedding rows are tied to that tokenizer's
# word -> index mapping; fitting a fresh Tokenizer on this dataset would assign
# different indices to the same words and invalidate the evaluation below.
sequences = tokenizer.texts_to_sequences(replaced_comments)
replaced_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the labels of the replaced dataset itself (not the training
# `genders`). num_classes is pinned to 2 so the label width always matches the
# model's two-unit softmax, even if only one class occurs in this dataset.
replaced_labels = to_categorical(replaced_genders, num_classes=2)

In [None]:
# Score the trained model on the replaced-comments dataset and print the
# second reported metric as a percentage.
scores = model.evaluate(replaced_data, replaced_labels, verbose=0)
metric_name = model.metrics_names[1]
metric_percent = scores[1] * 100
print("%s: %.2f%%" % (metric_name, metric_percent))