<a href="https://colab.research.google.com/github/Natural-Language-Processing-YU/M3_Assignment/blob/main/scripts/m3_assignment_part_III.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part III
Using the previous two tutorials, please answer the following using an encoder-decoder approach and, for comparison, an LSTM approach.

Please create a transformer-based classifier for English name classification into male or female.

There are several datasets of names labelled as male or female. In subsequent iterations, this could be expanded to include more classes.

Below is the source from NLTK, which only has male and female available but could be used for the purposes of this assignment.

```
names = nltk.corpus.names
names.fileids()
['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]
['Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian', 'Adrien', 'Ajay', 'Alex', 'Alexis',
'Alfie', 'Ali', 'Alix', 'Allie', 'Allyn', 'Andie', 'Andrea', 'Andy', 'Angel',
'Angie', 'Ariel', 'Ashley', 'Aubrey', 'Augustine', 'Austin', 'Averil', ...]
```

In [1]:
!pip install nltk



In [12]:
import nltk
from nltk.corpus import names

# The `names` corpus is not bundled with the nltk package; fetch it so that
# this cell also works on a fresh kernel / fresh Colab runtime.
nltk.download('names', quiet=True)

# List the files that make up the corpus (one file per gender).
names.fileids()

['female.txt', 'male.txt']

In [13]:
# Read both name lists from the NLTK `names` corpus, male file first.
male_names, female_names = (
    names.words(fileid) for fileid in ('male.txt', 'female.txt')
)

# **Encoder-Decoder**

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

# Assuming male_names and female_names are already prepared lists

# Step 1: Data Preparation
# Build (name, label) pairs. The list is deliberately NOT called `names`:
# that identifier is the NLTK corpus reader imported earlier, and rebinding it
# would break the corpus-loading cells if they are re-run.
labeled_names = [(name, 'male') for name in male_names] + [(name, 'female') for name in female_names]

# 80/20 train/validation split with a fixed seed so the split is repeatable.
names_train, names_val = train_test_split(labeled_names, test_size=0.2, random_state=42)

# Step 2: Tokenization
# WordPiece tokenizer matching the pretrained BERT checkpoint used by the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class NameDataset(Dataset):
    """Torch dataset that tokenizes ``(name, gender)`` pairs on access.

    Args:
        names: Indexable collection of ``(name, label)`` pairs, where ``label``
            is the string ``'female'`` or anything else (treated as male).
        text_tokenizer: Callable used to encode a name. It is invoked as
            ``text_tokenizer(name, return_tensors='pt', padding='max_length',
            truncation=True, max_length=...)`` and must return a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors whose first
            dimension is a batch dimension of size 1. When omitted, the
            notebook-level ``tokenizer`` is captured at construction time, so
            the dataset keeps working even if that global is rebound later.
        max_seq_length: Length every encoded name is padded/truncated to.
    """

    def __init__(self, names, text_tokenizer=None, max_seq_length=20):
        self.names = names
        # Resolve the tokenizer once, here, instead of on every __getitem__.
        self.tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Return the number of (name, label) pairs."""
        return len(self.names)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for the pair at ``idx``."""
        name, label = self.names[idx]
        encoding = self.tokenizer(
            name,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_seq_length,
        )
        # squeeze(0) drops only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when max_seq_length == 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        label = 1 if label == 'female' else 0  # Encoding labels as 0 for male, 1 for female
        return input_ids, attention_mask, torch.tensor(label, dtype=torch.long)

# Wrap each split in a NameDataset (training split first, then validation).
train_dataset, val_dataset = (
    NameDataset(split) for split in (names_train, names_val)
)

# Step 3: Model Architecture
class NameClassifier(nn.Module):
    """Pretrained BERT encoder with a dropout + linear classification head.

    Args:
        pretrained_model_name: Checkpoint name passed to
            ``BertModel.from_pretrained``.
        num_classes: Number of output logits produced by the linear head.
    """

    def __init__(self, pretrained_model_name='bert-base-uncased', num_classes=2):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(0.1)
        # The head maps the encoder's hidden size to one logit per class.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for the given token batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Classify from the encoder's pooled output rather than per-token states.
        regularised = self.dropout(encoded.pooler_output)
        return self.fc(regularised)


# Step 4: Training
# Seed PyTorch before the model is built: the classification head's initial
# weights, dropout masks and DataLoader shuffling all draw from this RNG, so
# without a seed every run reports different losses/accuracy.
# (GPU kernels may still introduce small non-deterministic differences.)
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = NameClassifier()
model.to(device)

# Small learning rate, since the whole pretrained encoder is being fine-tuned.
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for input_ids, attention_mask, labels in train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample average even if the last batch is short.
        running_loss += loss.item() * input_ids.size(0)

    epoch_loss = running_loss / len(train_dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

# Step 5: Evaluation
# Score the fine-tuned classifier on the held-out split; gradients are not
# needed, and eval() switches dropout off.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Index of the largest logit per row is the predicted class.
        predicted = torch.max(outputs, 1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Validation Accuracy: {accuracy:.4f}')

# Step 6: Inference
def predict_gender(name, classifier=None, text_tokenizer=None, max_length=20):
    """Predict ``'male'`` or ``'female'`` for a single name.

    Args:
        name: The name to classify.
        classifier: Model called as ``classifier(input_ids=..., attention_mask=...)``
            and returning one row of class logits (index 1 = female). Defaults
            to the notebook-level ``model``.
        text_tokenizer: Tokenizer called like a Hugging Face tokenizer and
            returning ``'input_ids'`` / ``'attention_mask'`` tensors. Defaults
            to the notebook-level ``tokenizer``.
        max_length: Padding/truncation length; should match the length used
            when the classifier was trained.

    Returns:
        ``'female'`` if the highest-scoring class index is 1, else ``'male'``.

    Note:
        The defaults are looked up when the function is *called*. If ``model``
        or ``tokenizer`` have since been rebound to other objects, pass the
        BERT classifier and tokenizer explicitly.
    """
    clf = model if classifier is None else classifier
    tok = tokenizer if text_tokenizer is None else text_tokenizer

    clf.eval()
    # Send inputs to wherever the classifier's weights live instead of relying
    # on a separate global `device` staying in sync with the model.
    target_device = next(clf.parameters()).device

    encoding = tok(name, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)
    with torch.no_grad():
        logits = clf(input_ids=input_ids, attention_mask=attention_mask)
    predicted = logits.argmax(dim=1)
    return 'female' if predicted.item() == 1 else 'male'

# Example usage: classify one name and report the result.
name = "John"
predicted_gender = predict_gender(name)
message = f'The predicted gender for the name "{name}" is: {predicted_gender}'
print(message)


Epoch 1/5, Loss: 0.4022
Epoch 2/5, Loss: 0.2737
Epoch 3/5, Loss: 0.2334
Epoch 4/5, Loss: 0.1917
Epoch 5/5, Loss: 0.1673
Validation Accuracy: 0.8628
The predicted gender for the name "John" is: male


# **LSTM**

In [42]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Define input sequence length
max_seq_length = 20

# Character-level tokenizer for the LSTM. It is named `char_tokenizer` so that
# it does not rebind `tokenizer`, which the BERT dataset and `predict_gender`
# above still rely on.
char_tokenizer = Tokenizer(char_level=True)
char_tokenizer.fit_on_texts(male_names + female_names)

# Convert names to sequences of character indices
male_sequences = char_tokenizer.texts_to_sequences(male_names)
female_sequences = char_tokenizer.texts_to_sequences(female_names)

# Pad sequences to ensure fixed length (zeros appended after the name)
male_padded_sequences = pad_sequences(male_sequences, maxlen=max_seq_length, padding='post')
female_padded_sequences = pad_sequences(female_sequences, maxlen=max_seq_length, padding='post')

# Create labels using the same convention as the BERT section:
# 0 = male, 1 = female.
male_labels = np.zeros(len(male_padded_sequences))
female_labels = np.ones(len(female_padded_sequences))

# Combine male and female data
X_data = np.concatenate((male_padded_sequences, female_padded_sequences), axis=0)
y_data = np.concatenate((male_labels, female_labels), axis=0)

# Shuffle data with a seeded generator so the train/validation split is the
# same on every run.
shuffle_rng = np.random.default_rng(42)
indices = shuffle_rng.permutation(len(X_data))
X_data = X_data[indices]
y_data = y_data[indices]

# Split into train and validation sets (70% / 30%)
train_size = int(0.7 * len(X_data))

X_train = X_data[:train_size]
y_train = y_data[:train_size]

X_val = X_data[train_size:]
y_val = y_data[train_size:]

# Reshape input for LSTM model: (samples, timesteps, 1 feature per timestep)
X_train = X_train.reshape(-1, max_seq_length, 1)
X_val = X_val.reshape(-1, max_seq_length, 1)


In [49]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Define the model. It is bound to `lstm_model` rather than `model` so the
# fine-tuned BERT classifier (used by `predict_gender`) is not overwritten.
# NOTE(review): each timestep is fed to the LSTM as a single raw character
# index; an Embedding layer in front of the LSTM is the usual alternative --
# consider it if this baseline underperforms.
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(max_seq_length, 1)))
lstm_model.add(Dense(1, activation='sigmoid'))  # single probability for the positive class

# Compile the model
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7cfe38107700>

# References
1. https://arxiv.org/pdf/2102.03692.pdf
2. https://alvinntnu.github.io/NTNU_ENC2045_LECTURES/exercise/13-attention.html
3. https://towardsdatascience.com/deep-learning-gender-from-name-lstm-recurrent-neural-networks-448d64553044
4. https://www.nltk.org/book/ch02.html#sec-lexical-resources