In [1]:
#Load file from drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#!pip install pandas numpy scikit-learn tensorflow keras imbalanced-learn

# Import the necessary libraries

In [3]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Precision


# Load and process initial data

In [4]:
# Load the dataset
file_path = '/content/drive/My Drive/Thesis/data.xlsx'
data = pd.read_excel(file_path)

# Preprocess the data by dropping unnecessary columns
data = data[['name', 'gender']]

# Convert the names to strings if not already
data['name'] = data['name'].astype(str)

# Gender label encoding

In [5]:
label_encoder = LabelEncoder()
data['gender_encoded'] = label_encoder.fit_transform(data['gender'])

# Save label encoder
with open('/content/drive/My Drive/Thesis/model/label_encoder.pickle', 'wb') as file:
    pickle.dump(label_encoder, file, protocol=pickle.HIGHEST_PROTOCOL)

# Tokenize names (char-level) and handle length

In [6]:
# Tokenize the names at the character level
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(data['name'])
sequences = tokenizer.texts_to_sequences(data['name'])
max_length = max([len(seq) for seq in sequences])
vocab_size = len(tokenizer.word_index) + 1
data_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Save tokenizer
with open('/content/drive/My Drive/Thesis/model/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Split train/test data and handle imbalance using SMOTE

In [7]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(data_sequences, data['gender_encoded'],
                                                    test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# BLSTM model definition and traning model

In [8]:
def create_blstm_model(input_dim, output_dim, input_length):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', Precision()])
    return model

In [9]:
print("\nTraining and evaluating BLSTM model...")

# Create and compile the BLSTM model
model = create_blstm_model(vocab_size, 50, max_length)

# One-hot encode labels
y_train_encoded = to_categorical(y_train_smote)
y_test_encoded = to_categorical(y_test)

# Train the model
model.fit(X_train_smote, y_train_encoded,
          batch_size=32, epochs=10,
          validation_data=(X_test, y_test_encoded))


Training and evaluating BLSTM model...




Epoch 1/10
[1m3015/3015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 24ms/step - accuracy: 0.7570 - loss: 0.4972 - precision: 0.7570 - val_accuracy: 0.8191 - val_loss: 0.4099 - val_precision: 0.8191
Epoch 2/10
[1m3015/3015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 22ms/step - accuracy: 0.8404 - loss: 0.3748 - precision: 0.8404 - val_accuracy: 0.8289 - val_loss: 0.3855 - val_precision: 0.8289
Epoch 3/10
[1m3015/3015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 22ms/step - accuracy: 0.8579 - loss: 0.3372 - precision: 0.8579 - val_accuracy: 0.8648 - val_loss: 0.3145 - val_precision: 0.8648
Epoch 4/10
[1m3015/3015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 22ms/step - accuracy: 0.8713 - loss: 0.3082 - precision: 0.8713 - val_accuracy: 0.8728 - val_loss: 0.3026 - val_precision: 0.8728
Epoch 5/10
[1m3015/3015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 23ms/step - accuracy: 0.8831 - loss: 0.2823 - precision: 0.8831 - val_accuracy: 

<keras.src.callbacks.history.History at 0x7ac13c84e490>

In [10]:
# Evaluate the model on the test set
loss, accuracy, precision = model.evaluate(X_test, y_test_encoded, verbose=0)

# Forecast
y_pred = model.predict(X_test, verbose=0)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test_encoded, axis=1)

# Calculate evaluation indexes
precision = precision_score(y_test_labels, y_pred_labels, average='weighted')
recall = recall_score(y_test_labels, y_pred_labels, average='weighted')
f1 = f1_score(y_test_labels, y_pred_labels, average='weighted')

# Print the results
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision * 100:.2f}%')
print(f'Recall: {recall * 100:.2f}%')
print(f'F1 Score: {f1 * 100:.2f}%')

Accuracy: 88.83%
Precision: 89.58%
Recall: 88.83%
F1 Score: 88.96%


In [11]:
# Save model
model.save('/content/drive/My Drive/Thesis/model/blstm_gender_model.h5')

