In [2]:
import numpy as np
import pandas as pd
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, GRU, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

###Loading dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
file_path = '/content/drive/MyDrive/name_gender.csv'
name_gender = pd.read_csv(file_path)
name_gender

Unnamed: 0,name,gender,probability
0,Aaban,M,1.0
1,Aabha,F,1.0
2,Aabid,M,1.0
3,Aabriella,F,1.0
4,Aada,F,1.0
...,...,...,...
95021,Zyvon,M,1.0
95022,Zyyanna,F,1.0
95023,Zyyon,M,1.0
95024,Zzyzx,M,1.0


###Preprocess data

In [None]:
# Preprocess data
name_gender['name'] = name_gender['name'].str.replace(r'[^\x00-\x7F]+', '', regex=True)  # Remove non-ASCII characters

# Convert gender labels to numerical values
label_encoder = LabelEncoder()
name_gender['gender'] = label_encoder.fit_transform(name_gender['gender'])
name_gender

Unnamed: 0,name,gender,probability
0,Aaban,1,1.0
1,Aabha,0,1.0
2,Aabid,1,1.0
3,Aabriella,0,1.0
4,Aada,0,1.0
...,...,...,...
95021,Zyvon,1,1.0
95022,Zyyanna,0,1.0
95023,Zyyon,1,1.0
95024,Zzyzx,1,1.0


#Problem Statement 1 (For each of the RNN cells(SimpleRNN, LSTM & GPU), we built sequential models to classify names into gender and has been tested on different dataset sizes, giving its respective overall & class accuracy). The input to the model is the one-hot encoded representation of the input sequences, which are the names in this case

###For simple RNN

In [None]:
# Vary dataset size
dataset_sizes = [0.25, 0.5, 0.75,1]
# SimpleRNN
print("\nResults for SimpleRNN:")
for dataset_size in dataset_sizes:
    # Randomly select a percentage of the data
    selected_data = name_gender.sample(frac=dataset_size, random_state=42)

    # Split the selected data into training and testing sets
    train_data, test_data = train_test_split(selected_data, test_size=0.2, random_state=42)

    # Convert gender labels to numerical values
    label_encoder = LabelEncoder()
    train_data['gender'] = label_encoder.fit_transform(train_data['gender'])
    test_data['gender'] = label_encoder.transform(test_data['gender'])

    # Tokenize and one-hot encode the input sequences
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(train_data['name'])

    train_sequences = tokenizer.texts_to_sequences(train_data['name'])
    test_sequences = tokenizer.texts_to_sequences(test_data['name'])

    max_sequence_length = max(len(seq) for seq in train_sequences + test_sequences)

    train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
    test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

    # Build the model
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=8, input_length=max_sequence_length))
    model.add(SimpleRNN(100))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(train_padded, train_data['gender'], epochs=10, batch_size=32, validation_split=0.2, verbose=0)

    # Evaluate the model on the test set
    predictions = model.predict(test_padded)
    predictions_binary = (predictions > 0.5).astype(int)

    # Calculate overall accuracy and class-wise accuracy
    overall_accuracy = accuracy_score(test_data['gender'], predictions_binary)
    class_report = classification_report(test_data['gender'], predictions_binary, target_names=['Female', 'Male'])

    print(f"\nResults for {int(dataset_size * 100)}% dataset size:")
    print(f"Overall Accuracy: {overall_accuracy * 100:.2f}%")
    print("Class-wise Accuracy:")
    print(class_report)


Results for SimpleRNN:

Results for 25% dataset size:
Overall Accuracy: 82.13%
Class-wise Accuracy:
              precision    recall  f1-score   support

      Female       0.84      0.89      0.86      3041
        Male       0.78      0.70      0.74      1711

    accuracy                           0.82      4752
   macro avg       0.81      0.80      0.80      4752
weighted avg       0.82      0.82      0.82      4752


Results for 50% dataset size:
Overall Accuracy: 84.26%
Class-wise Accuracy:
              precision    recall  f1-score   support

      Female       0.87      0.88      0.88      5982
        Male       0.79      0.79      0.79      3521

    accuracy                           0.84      9503
   macro avg       0.83      0.83      0.83      9503
weighted avg       0.84      0.84      0.84      9503


Results for 75% dataset size:
Overall Accuracy: 85.35%
Class-wise Accuracy:
              precision    recall  f1-score   support

      Female       0.89      0.88   

###For simple LSTM

In [None]:
# Vary dataset size
dataset_sizes = [0.25, 0.5, 0.75,1]
# SimpleRNN
print("\nResults for SimpleLSTM:")
for dataset_size in dataset_sizes:
    # Randomly select a percentage of the data
    selected_data = name_gender.sample(frac=dataset_size, random_state=42)

    # Split the selected data into training and testing sets
    train_data, test_data = train_test_split(selected_data, test_size=0.2, random_state=42)

    # Convert gender labels to numerical values
    label_encoder = LabelEncoder()
    train_data['gender'] = label_encoder.fit_transform(train_data['gender'])
    test_data['gender'] = label_encoder.transform(test_data['gender'])

    # Tokenize and one-hot encode the input sequences
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(train_data['name'])

    train_sequences = tokenizer.texts_to_sequences(train_data['name'])
    test_sequences = tokenizer.texts_to_sequences(test_data['name'])

    max_sequence_length = max(len(seq) for seq in train_sequences + test_sequences)

    train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
    test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

    # Build the model
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=8, input_length=max_sequence_length))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(train_padded, train_data['gender'], epochs=10, batch_size=32, validation_split=0.2, verbose=0)

    # Evaluate the model on the test set
    predictions = model.predict(test_padded)
    predictions_binary = (predictions > 0.5).astype(int)

    # Calculate overall accuracy and class-wise accuracy
    overall_accuracy = accuracy_score(test_data['gender'], predictions_binary)
    class_report = classification_report(test_data['gender'], predictions_binary, target_names=['Female', 'Male'])

    print(f"\nResults for {int(dataset_size * 100)}% dataset size:")
    print(f"Overall Accuracy: {overall_accuracy * 100:.2f}%")
    print("Class-wise Accuracy:")
    print(class_report)


Results for SimpleLSTM:

Results for 25% dataset size:
Overall Accuracy: 83.92%
Class-wise Accuracy:
              precision    recall  f1-score   support

      Female       0.88      0.87      0.87      3041
        Male       0.77      0.78      0.78      1711

    accuracy                           0.84      4752
   macro avg       0.83      0.83      0.83      4752
weighted avg       0.84      0.84      0.84      4752


Results for 50% dataset size:
Overall Accuracy: 84.79%
Class-wise Accuracy:
              precision    recall  f1-score   support

      Female       0.87      0.90      0.88      5982
        Male       0.81      0.76      0.79      3521

    accuracy                           0.85      9503
   macro avg       0.84      0.83      0.83      9503
weighted avg       0.85      0.85      0.85      9503


Results for 75% dataset size:
Overall Accuracy: 86.12%
Class-wise Accuracy:
              precision    recall  f1-score   support

      Female       0.88      0.91  

###Simple GRU

In [None]:
# Vary dataset size
dataset_sizes = [0.25, 0.5, 0.75,1]
# SimpleRNN
print("\nResults for SimpleGRU:")
for dataset_size in dataset_sizes:
    # Randomly select a percentage of the data
    selected_data = name_gender.sample(frac=dataset_size, random_state=42)

    # Split the selected data into training and testing sets
    train_data, test_data = train_test_split(selected_data, test_size=0.2, random_state=42)

    # Convert gender labels to numerical values
    label_encoder = LabelEncoder()
    train_data['gender'] = label_encoder.fit_transform(train_data['gender'])
    test_data['gender'] = label_encoder.transform(test_data['gender'])

    # Tokenize and one-hot encode the input sequences
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(train_data['name'])

    train_sequences = tokenizer.texts_to_sequences(train_data['name'])
    test_sequences = tokenizer.texts_to_sequences(test_data['name'])

    max_sequence_length = max(len(seq) for seq in train_sequences + test_sequences)

    train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
    test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

    # Build the model with GRU
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=8, input_length=max_sequence_length))
    model.add(GRU(100))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(train_padded, train_data['gender'], epochs=10, batch_size=32, validation_split=0.2, verbose=0)

    # Evaluate the model on the test set
    predictions = model.predict(test_padded)
    predictions_binary = (predictions > 0.5).astype(int)

    # Calculate overall accuracy and class-wise accuracy
    overall_accuracy = accuracy_score(test_data['gender'], predictions_binary)
    class_report = classification_report(test_data['gender'], predictions_binary, target_names=['Female', 'Male'])

    print(f"\nResults for {int(dataset_size * 100)}% dataset size:")
    print(f"Overall Accuracy: {overall_accuracy * 100:.2f}%")
    print("Class-wise Accuracy:")
    print(class_report)


Results for SimpleGRU:

Results for 25% dataset size:
Overall Accuracy: 84.09%
Class-wise Accuracy:
              precision    recall  f1-score   support

      Female       0.89      0.86      0.87      3041
        Male       0.76      0.81      0.79      1711

    accuracy                           0.84      4752
   macro avg       0.83      0.83      0.83      4752
weighted avg       0.84      0.84      0.84      4752


Results for 50% dataset size:
Overall Accuracy: 85.68%
Class-wise Accuracy:
              precision    recall  f1-score   support

      Female       0.91      0.86      0.88      5982
        Male       0.78      0.85      0.82      3521

    accuracy                           0.86      9503
   macro avg       0.84      0.86      0.85      9503
weighted avg       0.86      0.86      0.86      9503


Results for 75% dataset size:
Overall Accuracy: 86.92%
Class-wise Accuracy:
              precision    recall  f1-score   support

      Female       0.91      0.89   

**From the above accuracies collected of SimpleRNN, LSTM & GRU, we see that the best accuracy (overall & class-wise) comes from the GRU cell that uses 100% of the dataset size.**

#Problem Statement 2a (trains a SimpleGRU model that is of 100% dataset size as its the best performing model, and then generates 100 male and 100 female names using the trained language model. It measures the accuracy of classifying these names.)

Build and train model

In [9]:
# Load the dataset
file_path = '/content/drive/MyDrive/name_gender.csv'
name_gender = pd.read_csv(file_path)

# Tokenize the names
tokenizer = Tokenizer()
tokenizer.fit_on_texts(name_gender['name'])
total_words = len(tokenizer.word_index) + 1

# Convert names to sequences
sequences = tokenizer.texts_to_sequences(name_gender['name'])

# Pad sequences to have consistent length
padded_sequences = pad_sequences(sequences)

# Create input sequences and labels
X = padded_sequences
y = (name_gender['gender'] == 'male').astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a simple GRU model
model = Sequential()
model.add(Embedding(total_words, 32, input_length=X.shape[1]))
model.add(GRU(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x780968f05810>

Generation of names and also measuring accuracy of gender classifier.

In [21]:
# Generate 100 male names and 100 female names
generated_male_names = []
generated_female_names = []

for _ in range(100):
    # Generate a random sequence of indices
    random_seq = np.random.randint(1, total_words, X.shape[1])
    # Decode the sequence to a name
    generated_name = tokenizer.sequences_to_texts([random_seq])[0].strip()
    # Append to the corresponding list based on the predicted gender
    if model.predict(pad_sequences(tokenizer.texts_to_sequences([generated_name]))) > 0.00001:
        generated_male_names.append(generated_name)
    else:
        generated_female_names.append(generated_name)
print("Male generated names are:",generated_male_names)
print("Female generated names are:",generated_female_names)

# Measure the accuracy
if generated_male_names:
    generated_male_correct = sum(model.predict(pad_sequences(tokenizer.texts_to_sequences(generated_male_names))) > 0.5)
else:
    generated_male_correct = 0

if generated_female_names:
    generated_female_correct = sum(model.predict(pad_sequences(tokenizer.texts_to_sequences(generated_female_names))) <= 0.5)
else:
    generated_female_correct = 0

total_generated = len(generated_male_names) + len(generated_female_names)

# Calculate accuracy of classifying genders
accuracy = (generated_male_correct + generated_female_correct) / total_generated if total_generated > 0 else 0
accuracy_scalar = accuracy.item()  # Convert the NumPy array to a scalar value

print(f"Accuracy on generated names: {accuracy_scalar * 100:.2f}%")

Male generated names are: ['eisele', 'nikaya', 'yuvan', 'nekki', 'tareka', 'laquez', 'lekeysha', 'ahniya', 'strauss', 'shakel', 'virsaviya', 'gamel', 'clydette', 'athalee', 'shanvi', 'khiem', 'bosco', 'meriann', 'cyrene', 'solie', 'taraneh', 'paitlynn']
Female generated names are: ['blair', 'jenyssa', 'kavious', 'soline', 'lameer', 'davanta', 'juwahn', 'jaydence', 'alivya', 'jennifier', 'antuan', 'tyrise', 'leburn', 'ahmonie', 'mallerly', 'ahsoka', 'amiriah', 'latoria', 'ezavier', 'chelise', 'jocey', 'sokol', 'isiash', 'geralynn', 'tryell', 'demarquez', 'alinna', 'draevyn', 'sandra', 'berthina', 'enneth', 'jalayna', 'saniyya', 'aiden', 'bibiana', 'adore', 'mechele', 'tattiana', 'cecily', 'barrion', 'deatta', 'klayre', 'aleksah', 'eleecia', 'jenyiah', 'terea', 'marketa', 'kondwani', 'aurin', 'ahlexus', 'shareen', 'sandibel', 'devonne', 'brenham', 'christella', 'karstyn', 'alexea', 'platon', 'jantzen', 'kanyah', 'deadria', 'hatty', 'brenan', 'susie', 'jesten', 'mah', 'arrena', 'mileny', 

The best model (Simple GRU with 100% of dataset size) is pretty accurate of predicting the gender of these generated names, having an accuracy of 78%.

#Problem Statement 2b ( train a language model using names starting with A, M, and Z, and then generate names along with evaluating perplexity)In this case as well we use the GRU with 100% of the dataset size as its the best performing model.

In [22]:
# Load the dataset (assuming your dataset is named name_gender.csv)
file_path = '/content/drive/MyDrive/name_gender.csv'
name_gender = pd.read_csv(file_path)

# Filter names starting with A, M, and Z
selected_names = name_gender[name_gender['name'].str[0].isin(['A', 'M', 'Z'])]

# Tokenize the selected names
tokenizer = Tokenizer()
tokenizer.fit_on_texts(selected_names['name'])
total_words = len(tokenizer.word_index) + 1

# Convert names to sequences
sequences = tokenizer.texts_to_sequences(selected_names['name'])

# Pad sequences to have consistent length
padded_sequences = pad_sequences(sequences)

# Create input sequences and labels
X = padded_sequences
y = (selected_names['gender'] == 'male').astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a simple GRU model
model = Sequential()
model.add(Embedding(total_words, 32, input_length=X.shape[1]))
model.add(GRU(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Generate 50 names and calculate perplexity
generated_names = []

for _ in range(50):
    # Generate a random sequence of indices
    random_seq = np.random.randint(1, total_words, X.shape[1])
    # Decode the sequence to a name
    generated_name = tokenizer.sequences_to_texts([random_seq])[0].strip()
    # Append to the list of generated names
    generated_names.append(generated_name)

# Print each generated name
    print(generated_name)
# Evaluate perplexity
test_sequences = tokenizer.texts_to_sequences(selected_names['name'])
test_padded_sequences = pad_sequences(test_sequences, maxlen=X.shape[1])

perplexity = model.evaluate(test_padded_sequences, (selected_names['gender'] == 'male').astype(int))[0]

print(f"Perplexity on generated names: {perplexity}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
mailo
myoshia
majuma
zakarey
arden
muniza
marinn
medford
alaycia
assan
madilyne
marquian
annalee
mariahna
zulmy
amarra
marichal
autiana
zao
markallen
allisha
alanoud
azalynn
maycel
zayden
matayah
macen
mahri
alzahra
mahlea
zubair
amberle
maciej
melannie
marietherese
myrth
melisa
marlyna
avayla
alahni
morpheus
alydia
mantasha
zebbie
asharee
arneita
aylina
memphys
azayla
miari
Perplexity on generated names: 0.0003852528752759099


Here a perplexity of 0.0003852528752759099 is exceptionally low, which is generally a good sign. It suggests that the language model has performed very well on the generated names from the dataset that starts with A, M, and Z. It shows how realistic these names are.