In [1]:
import pandas as pd
import numpy as np
import re
import random

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Dense, Embedding, Input, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [2]:
df = pd.read_csv('name_gender.csv')

In [3]:
df.head()

Unnamed: 0,name,gender,probability
0,Aaban,M,1.0
1,Aabha,F,1.0
2,Aabid,M,1.0
3,Aabriella,F,1.0
4,Aada,F,1.0


## Checking if there is any rows with null values and removing them

In [4]:
df[df.isna().any(axis=1)]

Unnamed: 0,name,gender,probability
95025,undefined,F,


In [5]:
df = df.dropna()

## Preprocessing

In [6]:
def clean_name(name):
    return re.sub(r'[^a-zA-Z]', '', name.lower())

In [7]:
df['name'] = df['name'].apply(clean_name)

In [8]:
df.head()

Unnamed: 0,name,gender,probability
0,aaban,M,1.0
1,aabha,F,1.0
2,aabid,M,1.0
3,aabriella,F,1.0
4,aada,F,1.0


## Character mapping

In [9]:
all_chars = sorted(list(set(''.join(df['name'].values))))
char2idx = {c: i + 1 for i, c in enumerate(all_chars)}  # 0 will be used for padding
vocab_size = len(char2idx) + 1  # +1 for padding

## Encoding names

In [10]:
max_len = max(df['name'].apply(len))
min_len = min(df['name'].apply(len))

In [11]:
def encode_name(name):
    return [char2idx[c] for c in name]

In [12]:
X = df['name'].apply(encode_name).values
X = pad_sequences(X, maxlen=max_len, padding='pre')

X_oh = np.array([to_categorical(x, num_classes=vocab_size) for x in X]) #one hot endcoding the array X

## Encoding Labels

In [13]:
le = LabelEncoder()
y = le.fit_transform(df['gender'])

## Building model

In [14]:
def build_model(rnn_type='SimpleRNN'):
    model = Sequential()
    model.add(Input(shape=(max_len, vocab_size)))
    
    if rnn_type == 'SimpleRNN':
        model.add(SimpleRNN(64))
    elif rnn_type == 'LSTM':
        model.add(LSTM(64))
    elif rnn_type == 'GRU':
        model.add(GRU(64))
    
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

## Function to evaluate the model

In [15]:
def evaluate_model(model, X_test, y_test, label_encoder):
    # Predict
    y_pred = (model.predict(X_test) > 0.5).astype('int32').flatten()

    # Overall accuracy
    overall_acc = accuracy_score(y_test, y_pred) * 100

    # Class-wise accuracy
    male_idx = np.where(y_test == 0)[0]
    female_idx = np.where(y_test == 1)[0]

    male_acc = accuracy_score(y_test[male_idx], y_pred[male_idx]) * 100
    female_acc = accuracy_score(y_test[female_idx], y_pred[female_idx]) * 100
    
    return overall_acc, male_acc, female_acc

## Training model

In [16]:
data_used = [0.25, 0.5, 0.75, 1]
models = {}
results = {}
for percent in data_used:
    sample_size = int(percent * len(X_oh))
    selected_indices = np.random.choice(len(X_oh), size=sample_size, replace=False)
    X_ = X_oh[selected_indices]
    y_ = y[selected_indices]

    X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=42)

    for rnn_type in ['SimpleRNN', 'LSTM', 'GRU']:
        print(f'\nTraining {rnn_type} model...')
        model = build_model(rnn_type)
        model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=0)
        loss, acc = model.evaluate(X_test, y_test)
        print(f'{rnn_type} Test Accuracy: {acc:.4f}')
        model_name = rnn_type + "_" + str(percent)
        models[model_name] = [rnn_type, percent, model]
    

    for item in models:
        overall, male, female = evaluate_model(models[item][2], X_test, y_test, le)
        results[item] = {
            "Model Name": models[item][0],
            "Data Used": models[item][1],
            "Overall Accuracy": overall,
            "Male Accuracy": male,
            "Female Accuracy": female
        }


Training SimpleRNN model...
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 570us/step - accuracy: 0.8271 - loss: 0.3920
SimpleRNN Test Accuracy: 0.8262

Training LSTM model...
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 971us/step - accuracy: 0.8516 - loss: 0.3499
LSTM Test Accuracy: 0.8523

Training GRU model...
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 852us/step - accuracy: 0.8440 - loss: 0.3486
GRU Test Accuracy: 0.8483
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 767us/step
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step

Training SimpleRNN model...
[1m297/297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 545us/step - accuracy: 0.8363 - loss: 0.3692
SimpleRNN Test Accuracy: 0.8382

Training LSTM model...
[1m297/297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 911us/step - accu

In [17]:
result_df = pd.DataFrame(results).T
result_df.sort_values(by='Overall Accuracy', ascending = False)

Unnamed: 0,Model Name,Data Used,Overall Accuracy,Male Accuracy,Female Accuracy
GRU_0.75,GRU,0.75,88.518811,91.376589,83.58209
LSTM_1,LSTM,1.0,88.382005,89.40766,86.610218
LSTM_0.75,LSTM,0.75,88.061037,92.091053,81.099311
GRU_1,GRU,1.0,88.029466,91.974745,81.214122
GRU_0.5,GRU,0.5,87.303341,90.986126,80.941447
LSTM_0.5,LSTM,0.5,86.792949,91.99136,77.812859
GRU_0.25,GRU,0.25,85.303867,89.191659,78.58783
LSTM_0.25,LSTM,0.25,85.272297,88.38581,79.8938
SimpleRNN_1,SimpleRNN,1.0,85.267035,88.27781,80.066016
SimpleRNN_0.75,SimpleRNN,0.75,84.504078,86.740882,80.640069


## Problem Statement #2: Train a Language model using these names

In [18]:
male_names = list(df[df['gender'] == 'M']['name'])
female_names = list(df[df['gender'] == 'F']['name'])

In [19]:
names = list(df['name'])
names[:5]

['aaban', 'aabha', 'aabid', 'aabriella', 'aada']

In [20]:
char2idx = {char: idx+1 for idx, char in enumerate(all_chars)} # since padding takes 0
idx2char = {idx: char for char, idx in char2idx.items()}

In [21]:
vocab_size

27

In [22]:
def create_sequences(names):
    input_seqs, target_seqs = [], []
    for name in names:
        seq = encode_name(name)
        for i in range(1, len(seq)):
            input_seqs.append(seq[:i])
            target_seqs.append(seq[i])
    return input_seqs, target_seqs

### Creating a model for male names

In [23]:
input_seqs, target_seqs = create_sequences(male_names)

max_seq_len = max(len(seq) for seq in input_seqs)
X = pad_sequences(input_seqs, maxlen=max_seq_len, padding='pre')
y = np.array(target_seqs)

model_male = Sequential([
    Embedding(vocab_size, 128, input_length=max_len),
    GRU(128, return_sequences=False),
    Dropout(0.2),
    Dense(vocab_size, activation='softmax')
])

model_male.compile(loss=SparseCategoricalCrossentropy(from_logits=False), optimizer='adam', metrics=['accuracy'])

model_male.fit(X, y, epochs=5, batch_size=64)

Epoch 1/5




[1m2870/2870[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 15ms/step - accuracy: 0.2584 - loss: 2.4335
Epoch 2/5
[1m2870/2870[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 13ms/step - accuracy: 0.3230 - loss: 2.1655
Epoch 3/5
[1m2870/2870[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 15ms/step - accuracy: 0.3451 - loss: 2.0916
Epoch 4/5
[1m2870/2870[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 12ms/step - accuracy: 0.3588 - loss: 2.0373
Epoch 5/5
[1m2870/2870[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 15ms/step - accuracy: 0.3635 - loss: 2.0160


<keras.src.callbacks.history.History at 0x15d9b0cb0>

### Creating a model for Female names

In [24]:
input_seqs, target_seqs = create_sequences(female_names)

max_seq_len = max(len(seq) for seq in input_seqs)
X = pad_sequences(input_seqs, maxlen=max_seq_len, padding='pre')
y = np.array(target_seqs)

model_female = Sequential([
    Embedding(vocab_size, 128, input_length=max_len),
    GRU(128, return_sequences=False),
    Dropout(0.2),
    Dense(vocab_size, activation='softmax')
])

model_female.compile(loss=SparseCategoricalCrossentropy(from_logits=False), optimizer='adam', metrics=['accuracy'])

model_female.fit(X, y, epochs=5, batch_size=64)

Epoch 1/5
[1m5348/5348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 13ms/step - accuracy: 0.3097 - loss: 2.1805
Epoch 2/5
[1m5348/5348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 12ms/step - accuracy: 0.3665 - loss: 1.9439
Epoch 3/5
[1m5348/5348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 12ms/step - accuracy: 0.3816 - loss: 1.8958
Epoch 4/5
[1m5348/5348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 11ms/step - accuracy: 0.3935 - loss: 1.8624
Epoch 5/5
[1m5348/5348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 11ms/step - accuracy: 0.3973 - loss: 1.8472


<keras.src.callbacks.history.History at 0x167dfdbe0>

### Generating names

In [25]:
def generate_name(seed_char, gender):
    name = seed_char
    total_length = random.randint(min_len, max_len)
    
    for _ in range(total_length):
        input_seq = encode_name(name)
        padded = pad_sequences([input_seq], maxlen=max_seq_len, padding='pre')
        
        if gender == 'male':
            pred = model_male.predict(padded, verbose=0)
        else:
            pred = model_female.predict(padded, verbose=0)
        
        # Keep sampling until valid (non-zero) index is chosen, to avoid generating 0 which is reserved for padding
        while True:
            next_idx = tf.random.categorical(tf.math.log(pred), num_samples=1).numpy()[0][0]
            if next_idx > 0:
                break
                
        next_char = idx2char[next_idx]
        name += next_char

    return name

In [26]:
generate_name('l', 'female') #seeing a sample output

'lillythianna'

In [27]:
male_names_100 = [generate_name(random.choice(all_chars), 'male') for _ in range(100)]
female_names_100 = [generate_name(random.choice(all_chars), 'female') for _ in range(100)]

names = male_names_100 + female_names_100

In [28]:
gender_male = ["M" for _ in range(100)]
gender_female = ["F" for _ in range(100)]

gender = gender_male + gender_female

In [29]:
len(gender)

200

### Measuring the accuracy using the best model from problem #1

In [30]:
X = df['name'].apply(encode_name).values
X = pad_sequences(X, maxlen=max_len, padding='pre')

X_oh = np.array([to_categorical(x, num_classes=vocab_size) for x in X]) #one hot endcoding the array X

le = LabelEncoder()
y = le.fit_transform(df['gender'])

In [32]:
# the GRU model performed best, hence using it
X_train, X_test, y_train, y_test = train_test_split(X_oh, y, test_size=0.2, random_state=42)
model_3 = build_model('GRU')
model_3.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/10
[1m1901/1901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.7798 - loss: 0.4498 - val_accuracy: 0.8435 - val_loss: 0.3566
Epoch 2/10
[1m1901/1901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8428 - loss: 0.3629 - val_accuracy: 0.8556 - val_loss: 0.3362
Epoch 3/10
[1m1901/1901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8519 - loss: 0.3405 - val_accuracy: 0.8638 - val_loss: 0.3225
Epoch 4/10
[1m1901/1901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8600 - loss: 0.3249 - val_accuracy: 0.8656 - val_loss: 0.3146
Epoch 5/10
[1m1901/1901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8681 - loss: 0.3103 - val_accuracy: 0.8687 - val_loss: 0.3058
Epoch 6/10
[1m1901/1901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8748 - loss: 0.2979 - val_accuracy: 0.8775 - val_loss: 0.2922
Epoch 7/10
[1m1

<keras.src.callbacks.history.History at 0x15c760140>

In [33]:
names = [encode_name(name) for name in names]
names = pad_sequences(names, maxlen=max_len, padding='pre')
names = np.array([to_categorical(name, num_classes=vocab_size) for name in names])

gender = le.fit_transform(gender)

In [34]:
overall, male, female = evaluate_model(model_3, names, gender, le)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


In [35]:
print(f"Overall Accuracy: {overall}")
print(f"Male Accuracy: {male}")
print(f"Female Accuracy: {female}")

Overall Accuracy: 67.5
Male Accuracy: 70.0
Female Accuracy: 65.0


## Train a language model using names starting with A, M, and Z

In [36]:
filtered_names = df[df['name'].str.startswith(('a', 'm', 'z'))].copy()

In [37]:
filtered_names = list(filtered_names['name'])

In [38]:
input_seqs, target_seqs = create_sequences(filtered_names)

max_seq_len = max(len(seq) for seq in input_seqs)
X = pad_sequences(input_seqs, maxlen=max_seq_len, padding='pre')
y = np.array(target_seqs)

model_4 = Sequential([
    Embedding(vocab_size, 128, input_length=max_len),
    GRU(128, return_sequences=False),
    Dropout(0.2),
    Dense(vocab_size, activation='softmax')
])

model_4.compile(loss=SparseCategoricalCrossentropy(from_logits=False), optimizer='adam', metrics=['accuracy'])

model_4.fit(X, y, epochs=5, batch_size=64)

Epoch 1/5




[1m1660/1660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.2710 - loss: 2.4152
Epoch 2/5
[1m1660/1660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 11ms/step - accuracy: 0.3366 - loss: 2.1331
Epoch 3/5
[1m1660/1660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 11ms/step - accuracy: 0.3521 - loss: 2.0649
Epoch 4/5
[1m1660/1660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - accuracy: 0.3622 - loss: 2.0340
Epoch 5/5
[1m1660/1660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - accuracy: 0.3717 - loss: 1.9934


<keras.src.callbacks.history.History at 0x167888830>

In [39]:
def generate_name_new():
    name = random.choice(['a', 'm', 'z'])
    total_length = random.randint(min_len, max_len)
    
    for _ in range(total_length):
        input_seq = encode_name(name)
        padded = pad_sequences([input_seq], maxlen=max_seq_len, padding='pre')
        
        pred = model_4.predict(padded, verbose=0)
        
        # Keep sampling until valid (non-zero) index is chosen, to avoid generating 0 which is reserved for padding
        while True:
            next_idx = tf.random.categorical(tf.math.log(pred), num_samples=1).numpy()[0][0]
            if next_idx > 0:
                break
                
        next_char = idx2char[next_idx]
        name += next_char

    return name

In [40]:
names_50 = [generate_name_new() for _ in range(50)]

In [41]:
def compute_perplexity(name, model, max_seq_len, char2idx):
    name = name.lower()
    input_seq = [char2idx[c] for c in name[:-1] if c in char2idx]
    target_seq = [char2idx[c] for c in name[1:] if c in char2idx]
    
    padded_input = pad_sequences([input_seq], maxlen=max_seq_len, padding='pre')
    preds = model.predict(padded_input, verbose=0)[0]
    
    log_probs = []
    for i, target_idx in enumerate(target_seq):
        if i >= len(input_seq):
            break
        prob = preds[target_idx]
        log_probs.append(np.log(prob + 1e-10))  # to avoid log(0), adding 1e-10
    
    if not log_probs:
        return float('inf')
    
    avg_neg_log_likelihood = -np.mean(log_probs)
    return np.exp(avg_neg_log_likelihood)

In [42]:
perplexities = [compute_perplexity(name, model_4, max_seq_len, char2idx) for name in names_50]

In [43]:
df = pd.DataFrame({'names':names_50, 'perplexity_score':perplexities})

In [44]:
df.sort_values(by='perplexity_score')

Unnamed: 0,names,perplexity_score
28,meri,6.997201
14,malayieh,14.976866
25,zor,15.991076
0,muhsadelee,18.01729
10,maurianahahm,18.367105
27,mayreynnethel,19.10187
44,zyzarea,20.142406
37,almeyriashieshal,23.383713
21,ziahiahanas,24.007229
47,merriceale,29.627455


In [45]:
# Lower the perplexity score, the realistic the name is. As per the model's training on real names.