<a href="https://colab.research.google.com/github/Natural-Language-Processing-YU/M3_Assignment/blob/main/scripts/m3_assignment_part_III.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part III
Using the previous two tutorials, please answer the following using an encorder-decoder approach and an LSTM compared approach.

Please create a transformer-based classifier for English name classification into male or female.

There are several datasets for name for male or female classification. In subseuqent iterations, this could be expanded to included more classifications.

Below is the source from NLTK, which only has male and female available but could be used for the purposes of this assignment.

```
names = nltk.corpus.names
names.fileids()
['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]
['Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian', 'Adrien', 'Ajay', 'Alex', 'Alexis',
'Alfie', 'Ali', 'Alix', 'Allie', 'Allyn', 'Andie', 'Andrea', 'Andy', 'Angel',
'Angie', 'Ariel', 'Ashley', 'Aubrey', 'Augustine', 'Austin', 'Averil', ...]
```

# 1 . Encode-Decoder Based Approach

## 1.1 Data Preparation and Model Setup

In [40]:
import numpy as np
import tensorflow as tf
import nltk
from tensorflow.keras.layers import TextVectorization, Embedding, MultiHeadAttention, LayerNormalization, Dense, GlobalAveragePooling1D, Input
from tensorflow.keras.models import Model

# Download and prepare dataset
nltk.download('names')
from nltk.corpus import names

# Load names
male_names = names.words('male.txt')
female_names = names.words('female.txt')

# Create labels
male_labels = [0] * len(male_names)  # 0 for male
female_labels = [1] * len(female_names)  # 1 for female

# Combine datasets
all_names = np.array(male_names + female_names)
all_labels = np.array(male_labels + female_labels)

# Shuffle dataset
indices = np.arange(all_labels.shape[0])
np.random.shuffle(indices)
all_names = all_names[indices]
all_labels = all_labels[indices]

# Text Vectorization
vectorizer = TextVectorization(max_tokens=10000, output_sequence_length=10)
vectorizer.adapt(all_names)

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorizer(text), label

# Prepare the final datasets
dataset = tf.data.Dataset.from_tensor_slices((all_names, all_labels))
dataset = dataset.map(vectorize_text)
dataset = dataset.batch(32).prefetch(tf.data.AUTOTUNE)


[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


## 1.2 Encoder, Decoder, Model Building, and Training

In [43]:
import numpy as np
import tensorflow as tf
import nltk
from tensorflow.keras.layers import TextVectorization, Embedding, MultiHeadAttention, LayerNormalization, Dense, GlobalAveragePooling1D, Input
from tensorflow.keras.models import Model

# Download and prepare dataset
nltk.download('names')
from nltk.corpus import names

# Load names
male_names = names.words('male.txt')
female_names = names.words('female.txt')

# Create labels
male_labels = [0] * len(male_names)  # 0 for male
female_labels = [1] * len(female_names)  # 1 for female

# Combine datasets
all_names = np.array(male_names + female_names)
all_labels = np.array(male_labels + female_labels)

# Shuffle dataset
indices = np.arange(all_labels.shape[0])
np.random.shuffle(indices)
all_names = all_names[indices]
all_labels = all_labels[indices]

# Text Vectorization
vectorizer = TextVectorization(max_tokens=10000, output_sequence_length=10)
vectorizer.adapt(all_names)

def vectorize_text(text, label):
    text = vectorizer(text)  # Remove tf.expand_dims
    return text, label

# Prepare the final datasets
dataset = tf.data.Dataset.from_tensor_slices((all_names, all_labels))
dataset = dataset.map(vectorize_text)
dataset = dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Define the Encoder
def encoder(inputs, num_heads, ff_dim):
    # Multi-head self-attention
    attention = MultiHeadAttention(num_heads=num_heads, key_dim=ff_dim)(inputs, inputs)
    # Skip connection and layer normalization
    x = LayerNormalization(epsilon=1e-6)(inputs + attention)
    # Feed-forward network
    ff_output = Dense(ff_dim, activation="relu")(x)
    ff_output = Dense(inputs.shape[-1])(ff_output)
    # Second skip connection and layer normalization
    encoded_seq = LayerNormalization(epsilon=1e-6)(x + ff_output)
    return encoded_seq

# Define the Decoder
def decoder(encoded_seq):
    x = GlobalAveragePooling1D()(encoded_seq)
    x = Dense(20, activation="relu")(x)
    outputs = Dense(1, activation="sigmoid")(x)
    return outputs

# Building the Model
inputs = Input(shape=(None,))
x = Embedding(input_dim=10000, output_dim=64)(inputs)
encoded_seq = encoder(x, num_heads=2, ff_dim=64)
outputs = decoder(encoded_seq)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Model Summary
model.summary()

# Train the Model
model.fit(dataset, epochs=10)

# Save the model weights
model.save_weights('name_gender_classifier_weights.h5')


[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, None, 64)             640000    ['input_3[0][0]']             
                                                                                                  
 multi_head_attention_2 (Mu  (None, None, 64)             33216     ['embedding_2[0][0]',         
 ltiHeadAttention)                                                   'embedding_2[0][0]']         
                                                                                                  
 tf.__operators__.add_4 (TF  (None, None, 64)             0         ['embedding_2[0][0]',   

## 1.3 Model Evaluation and Saving Model Weights


In [44]:
# Evaluate the model on the dataset itself (for demonstration purposes)
loss, accuracy = model.evaluate(dataset)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

# Save the model weights
#model.save_weights('name_gender_classifier_weights.h5')

# Demonstrate loading the model weights (if needed in the future)
model.load_weights('name_gender_classifier_weights.h5')

# Predicting with a few sample names
sample_names = np.array(["Alice", "Bob", "Clarissa", "David"])
sample_labels = np.array([1, 0, 1, 0])  # Just as placeholders

# Prepare sample data
sample_dataset = tf.data.Dataset.from_tensor_slices((sample_names, sample_labels))
sample_dataset = sample_dataset.map(vectorize_text)
sample_dataset = sample_dataset.batch(32)

# Making predictions
for batch in sample_dataset:
    predictions = model.predict(batch[0])
    for name, prediction in zip(sample_names, predictions):
        gender = "Female" if prediction > 0.5 else "Male"
        print(f"Name: {name}, Predicted Gender: {gender}")


Loss:  1.5667062997817993
Accuracy:  0.5002517700195312
Name: Alice, Predicted Gender: Male
Name: Bob, Predicted Gender: Male
Name: Clarissa, Predicted Gender: Male
Name: David, Predicted Gender: Male


# 2. LSTM Based Approach

## 2.1 Data Preparation

In [48]:
import numpy as np
import tensorflow as tf
import nltk
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Download and prepare dataset
nltk.download('names')
from nltk.corpus import names

# Load names
male_names = names.words('male.txt')
female_names = names.words('female.txt')

# Create labels
male_labels = [0] * len(male_names)  # 0 for male
female_labels = [1] * len(female_names)  # 1 for female

# Combine datasets
all_names = np.array(male_names + female_names)
all_labels = np.array(male_labels + female_labels)

# Shuffle dataset
indices = np.arange(all_labels.shape[0])
np.random.shuffle(indices)
all_names = all_names[indices]
all_labels = all_labels[indices]

# Text Vectorization
vectorizer = TextVectorization(max_tokens=10000, output_sequence_length=10)
vectorizer.adapt(all_names)

def vectorize_text(text, label):
    return vectorizer(text), label

# Prepare the final datasets
dataset = tf.data.Dataset.from_tensor_slices((all_names, all_labels))
dataset = dataset.map(vectorize_text)
dataset = dataset.batch(32).prefetch(tf.data.AUTOTUNE)

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


## 2.2 Model Setup

In [49]:
# Define the LSTM Model
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=10),
    LSTM(64),  # LSTM expects [batch, sequence, features]
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model again with corrections
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary to confirm architecture
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 10, 64)            640000    
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense_14 (Dense)            (None, 64)                4160      
                                                                 
 dense_15 (Dense)            (None, 1)                 65        
                                                                 
Total params: 677249 (2.58 MB)
Trainable params: 677249 (2.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## 2.3 Training, Evaluation, and Saving the Model

In [50]:
# Train the Model
history = model.fit(dataset, epochs=10)

# Evaluate the model on the same dataset
# Note: Ideally, you should evaluate on a separate test set.
loss, accuracy = model.evaluate(dataset)
print(f"Model Loss: {loss}, Model Accuracy: {accuracy}")

# Save the entire model for future reference
model.save('name_gender_classifier_lstm.h5')

# Load the model (demonstrating loading, not necessary now since it's already in memory)
loaded_model = tf.keras.models.load_model('name_gender_classifier_lstm.h5')

# Making predictions with new data
sample_names = np.array(["Alice", "Bob", "Clarissa", "David"])
sample_labels = np.array([1, 0, 1, 0])  # Just placeholders for labels

# Vectorize the sample names using the established vectorizer
sample_dataset = tf.data.Dataset.from_tensor_slices((sample_names, sample_labels))
sample_dataset = sample_dataset.map(vectorize_text)
sample_dataset = sample_dataset.batch(32)

# Predict using the loaded model
predictions = loaded_model.predict(sample_dataset)
for name, prediction in zip(sample_names, predictions):
    gender = "Female" if prediction > 0.5 else "Male"
    print(f"Name: {name}, Predicted Gender: {gender}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model Loss: 0.12037023901939392, Model Accuracy: 0.9536757469177246


  saving_api.save_model(


Name: Alice, Predicted Gender: Female
Name: Bob, Predicted Gender: Male
Name: Clarissa, Predicted Gender: Female
Name: David, Predicted Gender: Male


# References
1. https://arxiv.org/pdf/2102.03692.pdf
2. https://alvinntnu.github.io/NTNU_ENC2045_LECTURES/exercise/13-attention.html
3. https://towardsdatascience.com/deep-learning-gender-from-name-lstm-recurrent-neural-networks-448d64553044
4. https://www.nltk.org/book/ch02.html#sec-lexical-resources