In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import torch




In [2]:
def load_data(data_file):
  # read csv file
  df = pd.read_csv(data_file)
  df.fillna("", inplace=True)
  comments = df['comment'].tolist()
  genders = df['user_gender'].tolist()
  genders = [0 if gender == "Male" else 1 for gender in genders]

  return comments, genders

In [3]:
# Load dataset
comments, genders = load_data('C:/Users/ADMIN PC/Desktop/Comment/Comment_dataset.csv')

In [4]:
comments = np.array(comments)
genders = np.array(genders)

In [5]:
genders

array([1, 1, 1, ..., 0, 0, 0])

In [6]:
MAX_SEQUENCE_LENGTH = 300
MAX_NUM_WORDS = 50000

In [7]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

In [8]:
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)

In [9]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(genders)

In [10]:
data.shape, labels.shape

((190104, 300), (190104, 2))

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data, labels,
                                                    test_size=0.2,
                                                    stratify=genders,
                                                    random_state=42)

In [12]:
EMBEDDING_DIM = 768
num_words = MAX_NUM_WORDS
embedding_layer = Embedding(num_words,EMBEDDING_DIM,input_length=MAX_SEQUENCE_LENGTH,trainable=True)



In [13]:
sequence_input = Input(shape=(300, ))
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(64, kernel_size=3, activation='relu')(embedded_sequences)
x = Conv1D(64, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Flatten()(x)
x = Dense(100, activation='relu')(x)
preds = Dense(2, activation='sigmoid')(x)

model = Model(sequence_input, preds)

In [14]:
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])

In [15]:
model.summary()

In [16]:
model.fit(X_train, y_train,batch_size=50, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m671s[0m 220ms/step - acc: 0.6454 - loss: 0.6185 - val_acc: 0.7269 - val_loss: 0.5359
Epoch 2/10
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m705s[0m 232ms/step - acc: 0.7697 - loss: 0.4719 - val_acc: 0.7573 - val_loss: 0.4880
Epoch 3/10
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m714s[0m 235ms/step - acc: 0.8259 - loss: 0.3762 - val_acc: 0.7697 - val_loss: 0.4910
Epoch 4/10
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m699s[0m 230ms/step - acc: 0.8639 - loss: 0.3036 - val_acc: 0.7695 - val_loss: 0.5106
Epoch 5/10
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m684s[0m 225ms/step - acc: 0.8896 - loss: 0.2505 - val_acc: 0.7730 - val_loss: 0.6024
Epoch 6/10
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m677s[0m 223ms/step - acc: 0.9069 - loss: 0.2137 - val_acc: 0.7705 - val_loss: 0.5991
Epoch 7/10
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x21d9ecd3c50>

In [17]:
model.save("C:/Users/ADMIN PC/Desktop/Comment/CNN/CNN.h5")



In [18]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

compile_metrics: 77.26%


In [17]:
def predict_proba(arr):
  sequences_new = tokenizer.texts_to_sequences(arr)
  data = pad_sequences(sequences_new, maxlen=MAX_SEQUENCE_LENGTH)
  yprob = model.predict(data, verbose=None)
  # yclasses=yprob.argmax(axis=-1)

  returnable=[]
  for i in yprob:
    temp=i[0]
    returnable.append(np.array([temp,1- temp]))
  return np.array(returnable)

In [18]:
def predict_male_or_female(txt):
    arr = np.array([txt])
    sequences_new = tokenizer.texts_to_sequences(arr)
    data = pad_sequences(sequences_new, maxlen=MAX_SEQUENCE_LENGTH)
    yprob = model.predict(data)[0]
    gender = "Female" if yprob[1] >= 0.5 else "Male"

    return gender

In [19]:
txt = "I've had the 50watter since Oct of last year and I'm still impressed and blown away every time I play it. It's unreal. Killer choice"
predict_male_or_female(txt)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step


'Male'

In [20]:
txt = "You l've got a good man there hun. Take care of each other and it'll last a long time.\nFor sore throats my dad used to take 2 tablespoons of apple cider vinegar and the same amount of honey, mix it in at least 8oz of hot water. Drink it while ot's still hot, but not burning. Worked every time for me...still does."
predict_male_or_female(txt)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


'Female'