In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense , Input , LSTM , Embedding
from tensorflow.keras.models import Model, Sequential

In [2]:
# Location of the comment dataset CSV that load_data() reads.
# NOTE(review): absolute, machine-specific path — the notebook will not run on
# another machine without editing this line.
file_path = "C:/Users/ADMIN PC/Desktop/Comment/Comment_dataset.csv"

In [3]:
def load_data(data_file):
  """Read the comment dataset and return parallel lists of texts and labels.

  Args:
    data_file: Path (or file-like object) of a CSV file that has a 'comment'
      column and a 'user_gender' column.

  Returns:
    A ``(comments, genders)`` tuple of equal-length lists. ``comments`` holds
    the values of the 'comment' column, with missing comments replaced by "".
    ``genders`` holds 0 where the label is "Male" and 1 for any other label.
    Rows that have no gender label at all are left out of both lists.
  """
  df = pd.read_csv(data_file)

  # A row without a gender has no usable label. A frame-wide fillna("") would
  # turn it into "" and the rule below would then count it as class 1, so such
  # rows are dropped instead of being silently mislabelled.
  df = df.dropna(subset=['user_gender'])

  # Only the text column gets the empty-string default for missing values.
  comments = df['comment'].fillna("").tolist()

  genders = [0 if gender == "Male" else 1 for gender in df['user_gender']]

  return comments, genders

In [4]:
# Load the dataset and convert both parallel lists into NumPy arrays.
comments, genders = (np.array(column) for column in load_data(file_path))

In [5]:
# Hold out 20% of the samples as the test split. stratify=genders keeps the
# label proportions the same in both splits, and the fixed random_state makes
# the shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(comments, genders,
                                                    test_size=0.2,
                                                    stratify=genders,
                                                    random_state=42)

In [6]:
# Vocabulary cap handed to the tokenizer; the embedding table is sized from it.
top_words = 10000
# Length that every encoded comment is padded/truncated to.
max_comment_length = 300
# Size of each learned word vector (the Embedding layer's output_dim).
embedding_vector_length = 768
# Backward-compatible alias: the original, misspelled name is still what other
# cells reference.
embedding_vecor_length = embedding_vector_length

In [7]:
# Word-level tokenizer; num_words caps the vocabulary it uses (by word
# frequency) when texts are later converted to integer sequences.
tokenizer = Tokenizer(num_words=top_words)

In [8]:
# Build the word index from the training split only; the test split is not
# shown to the tokenizer here.
tokenizer.fit_on_texts(X_train)

In [9]:
# Encode each split as lists of integer word indices with the fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

In [10]:
# Pad/truncate every encoded comment to max_comment_length entries.
X_train_pad, X_test_pad = [
    pad_sequences(encoded, maxlen=max_comment_length)
    for encoded in (X_train_seq, X_test_seq)
]

In [11]:
# Binary classifier: learned word embeddings -> LSTM -> single sigmoid unit.
# `input_length` is intentionally not passed to Embedding: Keras 3 deprecates
# that argument, and the sequence length is already fixed by model.build().
model = Sequential([
    Embedding(input_dim=top_words + 1, output_dim=embedding_vecor_length),
    LSTM(100, return_sequences=False),
    Dense(1, activation='sigmoid')
])
# Build with an explicit (batch, max_comment_length) input shape so that the
# layers have weights and model.summary() can report shapes before training.
model.build(input_shape=(None, max_comment_length))



In [12]:
# Print the layer-by-layer architecture of the model.
model.summary()

In [13]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked as the reported metric and Adam is the optimizer.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
# Train for 3 epochs in batches of 64; the returned History object is kept in
# `history`.
# NOTE(review): the held-out test split doubles as validation data here, so the
# val_* metrics are not an independent estimate if they are used for tuning.
history = model.fit(X_train_pad, y_train,
                    validation_data=(X_test_pad, y_test),
                    epochs=3,
                    batch_size=64,
                    verbose=1)

Epoch 1/3
2377/2377 ━━━━━━━━━━━━━━━━━━━━ 790s 332ms/step - accuracy: 0.6848 - loss: 0.5780 - val_accuracy: 0.7418 - val_loss: 0.5000
Epoch 2/3
2377/2377 ━━━━━━━━━━━━━━━━━━━━ 690s 290ms/step - accuracy: 0.7768 - loss: 0.4418 - val_accuracy: 0.7516 - val_loss: 0.4882
Epoch 3/3
2377/2377 ━━━━━━━━━━━━━━━━━━━━ 691s 291ms/step - accuracy: 0.8117 - loss: 0.3752 - val_accuracy: 0.7615 - val_loss: 0.4948


In [15]:
# Persist the trained model to disk (an HDF5 file, going by the .h5 extension).
# NOTE(review): absolute, machine-specific path.
model.save("C:/Users/ADMIN PC/Desktop/Comment/RNN/RNN.h5")



In [16]:
# Reload the saved model from disk. The loader is taken from `tf.keras`
# (TensorFlow is imported as `tf` in the first cell) so it comes from the same
# Keras package that built and saved the model, and no mid-notebook import of
# the standalone `keras` package is needed.
model = tf.keras.models.load_model("C:/Users/ADMIN PC/Desktop/Comment/RNN/RNN.h5")



In [17]:
# Evaluate the model on the held-out test split.
# X_test_pad (built with the same tokenizer and maxlen) is reused instead of
# re-encoding X_test and assigning the result back to X_test: overwriting the
# raw texts with padded integers meant this cell could not be run twice.
prediction = model.predict(X_test_pad)
# Flatten the (n, 1) sigmoid outputs into (n,) class labels. A score >= 0.5 is
# class 1, the same rule predict_male_or_female applies.
y_pred = (prediction >= 0.5).astype(int).ravel()
# scikit-learn's argument order is accuracy_score(y_true, y_pred).
print("Accuracy of the model : ", accuracy_score(y_test, y_pred))

[1m1189/1189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 36ms/step
Accuracy of the model :  0.7615265248152336


In [18]:
def predict_male_or_female(comment):
    """Classify a single comment string as "Male" or "Female".

    The comment is encoded with the notebook-level ``tokenizer``, padded to
    ``max_comment_length`` and scored by ``model``. A score of 0.5 or more
    maps to "Female"; anything lower maps to "Male".
    """
    encoded = tokenizer.texts_to_sequences([comment])
    padded = pad_sequences(encoded, maxlen=max_comment_length)
    # One comment in, one sigmoid output out: take the scalar from row 0, col 0.
    score = model.predict(padded)[0][0]

    if score >= 0.5:
        return "Female"
    return "Male"

In [19]:
def predict_probability(comments):
    """Return class probabilities for one comment or a batch of comments.

    Args:
        comments: A single string, or an iterable (list, tuple, array, ...) of
            strings.

    Returns:
        A float array of shape ``(n, 2)`` with one row per comment. Column 0 is
        the "Male" probability and column 1 the "Female" probability. An empty
        input yields an array of shape ``(0, 2)``.
    """
    if isinstance(comments, str):
        comments = [comments]
    else:
        # Materialise once so the emptiness check below works for any iterable.
        comments = list(comments)

    # Nothing to score: return an empty result rather than handing the model a
    # zero-row batch.
    if len(comments) == 0:
        return np.empty((0, 2))

    comment_seq = tokenizer.texts_to_sequences(comments)
    comment_pad = pad_sequences(comment_seq, maxlen=max_comment_length)
    # Force an (n, 1) column so column_stack always produces (n, 2).
    probs = np.asarray(model.predict(comment_pad)).reshape(-1, 1)
    probs = np.column_stack([1 - probs, probs])  # [Male_prob, Female_prob]

    return probs

In [20]:
# Try the classifier on a sample comment and print the predicted label.
txt = "I've had the 50watter since Oct of last year and I'm still impressed and blown away every time I play it. It's unreal. Killer choice"
print(predict_male_or_female(txt))

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step
Male


In [21]:
# Try the classifier on a second sample comment and print the predicted label.
txt = "This is a great picture of u!!!! Beautiful"
print(predict_male_or_female(txt))

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 31ms/step
Female
