<a href="https://colab.research.google.com/github/Veerabhadra-YU/Natural-Language-Processing-NLP-Course/blob/main/Assignment_4_Transformer_part_III_Veerabhadra_Rao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part III
Using the previous two tutorials, please answer the following using an encoder-decoder approach and, for comparison, an LSTM approach.

Please create a transformer-based classifier for English name classification into male or female.

There are several datasets of names for male/female classification. In subsequent iterations, this could be expanded to include more classes.

Below is the source from NLTK, which only has male and female available but could be used for the purposes of this assignment.

```
names = nltk.corpus.names
names.fileids()
['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]
['Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian', 'Adrien', 'Ajay', 'Alex', 'Alexis',
'Alfie', 'Ali', 'Alix', 'Allie', 'Allyn', 'Andie', 'Andrea', 'Andy', 'Angel',
'Angie', 'Ariel', 'Ashley', 'Aubrey', 'Augustine', 'Austin', 'Averil', ...]
```

In [None]:
### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
None
### END CODE HERE ###

In [None]:
!pip install tensorflow_datasets
!pip install -U tensorflow-text



In [None]:
pip install tensorflow



# **Encoder-Decoder Approach (Transformer-based)**

In [None]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

# Hard-coded sample of names used in place of the full NLTK `names` corpus.
# NOTE(review): 'Andrea' and 'Angel' occur in BOTH lists below, so they are
# labelled male and female at once. The `male_names` sample also matches the
# "listed in both male.txt and female.txt" output quoted in the introduction,
# i.e. it looks like a list of ambiguous names rather than typical male names.
# Confirm the intended data before trusting any accuracy figure.
male_names = [
    'Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian', 'Adrien', 'Ajay', 'Alex',
    'Alexis', 'Alfie', 'Ali', 'Alix', 'Allie', 'Allyn', 'Andie', 'Andrea',
    'Andy', 'Angel', 'Angie', 'Ariel', 'Ashley', 'Aubrey', 'Augustine',
    'Austin', 'Averil',
]
female_names = [
    'Alice', 'Alicia', 'Alina', 'Alison', 'Alissa', 'Allyson', 'Alma',
    'Althea', 'Alva', 'Alyson', 'Alyssa', 'Amber', 'Amelia', 'Amie', 'Amy',
    'Ana', 'Anastasia', 'Andrea', 'Angel', 'Angela', 'Angelia', 'Angelica',
    'Angelina', 'Angeline',
]

# Male names first, then female names; `labels` is aligned index-by-index
# with `all_names` (0 = male, 1 = female).
all_names = [*male_names, *female_names]
labels = [0 for _ in male_names] + [1 for _ in female_names]

class NamesDataset(Dataset):
    """Dataset of (name, label) pairs that tokenizes each name on access.

    Args:
        names: Indexable collection of name strings.
        labels: Indexable collection of integer class ids, aligned
            index-by-index with ``names``.
        tokenizer: Callable invoked as ``tokenizer(name, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``. Its result
            is indexed with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer's ``max_length`` argument.
    """

    def __init__(self, names, labels, tokenizer, max_length):
        self.names, self.labels = names, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The dataset size is driven by the number of names.
        return len(self.names)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' for item ``idx``."""
        name, label = self.names[idx], self.labels[idx]
        encoded = self.tokenizer(
            name,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Flatten each tokenizer tensor to 1-D so every item is a flat vector.
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Hold out 20% of the (name, label) pairs for validation; `random_state` is
# fixed so the split is the same on every run.
train_names, val_names, train_labels, val_labels = train_test_split(
    all_names,
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenizer from the same 'bert-base-uncased' checkpoint that NameClassifier loads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# One dataset per split; each name is padded/truncated to 10 tokens.
train_dataset = NamesDataset(
    names=train_names,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=10,
)
val_dataset = NamesDataset(
    names=val_names,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=10,
)

# Only the training batches are reshuffled; validation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

# Define model
class NameClassifier(nn.Module):
    """Pretrained 'bert-base-uncased' encoder with a dropout + linear head.

    Args:
        hidden_size: Input width of the linear head. It has to match the width
            of the encoder's pooled output, because the head is applied
            directly to it.
        num_classes: Number of logits produced per example.
    """

    def __init__(self, hidden_size, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the head's raw logits for a batch of tokenized names."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Dropout is applied to the pooled encoder output before the head.
        return self.fc(self.dropout(encoder_out.pooler_output))

# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=45):
    """Train ``model`` and print train/validation metrics after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class scores of shape (batch, num_classes).
        train_loader: Iterable of batches; each batch is a mapping with the
            keys 'input_ids', 'attention_mask' and 'labels'.
        val_loader: Same batch format; passed to ``evaluate_model`` each epoch.
        criterion: Loss callable ``criterion(outputs, labels)``. Assumed to
            return the MEAN loss over the batch (the default for
            ``nn.CrossEntropyLoss``).
        optimizer: Optimizer holding the model's parameters.
        num_epochs: Number of passes over ``train_loader``.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    for epoch in range(num_epochs):
        model.train()
        loss_sum = 0.0
        correct_preds = 0
        total_preds = 0

        for batch in train_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight the batch-mean loss by the batch size so that a smaller
            # final batch does not count as much as a full one.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size

            _, predicted = torch.max(outputs, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += batch_size

        if total_preds == 0:
            raise ValueError('train_loader yielded no samples')

        # Per-sample averages over the whole epoch.
        train_loss = loss_sum / total_preds
        train_acc = correct_preds / total_preds

        val_loss, val_acc = evaluate_model(model, val_loader, criterion)

        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

def evaluate_model(model, val_loader, criterion):
    """Compute the per-sample mean loss and accuracy of ``model`` on a loader.

    The model is switched to eval mode and gradients are disabled. The loss is
    averaged over samples, not over batches, so the result does not depend on
    how the data is split into batches.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class scores of shape (batch, num_classes).
        val_loader: Iterable of batches; each batch is a mapping with the keys
            'input_ids', 'attention_mask' and 'labels'.
        criterion: Loss callable ``criterion(outputs, labels)``. Assumed to
            return the MEAN loss over the batch (the default for
            ``nn.CrossEntropyLoss``).

    Returns:
        Tuple ``(loss, accuracy)`` of Python floats.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Turn the batch mean back into a batch sum so batches of
            # different sizes are weighted correctly.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size

            _, predicted = torch.max(outputs, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += batch_size

    if total_preds == 0:
        raise ValueError('val_loader yielded no samples')

    val_loss = loss_sum / total_preds
    val_acc = correct_preds / total_preds

    return val_loss, val_acc

# Two-class classifier; 768 is the head's input width and has to match the
# width of the BERT pooled output it is applied to.
model = NameClassifier(768, 2)

# Cross-entropy on the raw logits. AdamW receives every model parameter, so the
# BERT encoder is fine-tuned together with the linear head.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=2e-5)

# Train for the default number of epochs defined by train_model.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
)


Epoch 1/45:
Train Loss: 0.6932, Train Acc: 0.5385, Val Loss: 0.7733, Val Acc: 0.3000
Epoch 2/45:
Train Loss: 0.7034, Train Acc: 0.5641, Val Loss: 0.7077, Val Acc: 0.3000
Epoch 3/45:
Train Loss: 0.6698, Train Acc: 0.6923, Val Loss: 0.7169, Val Acc: 0.3000
Epoch 4/45:
Train Loss: 0.6104, Train Acc: 0.6410, Val Loss: 0.7224, Val Acc: 0.3000
Epoch 5/45:
Train Loss: 0.5726, Train Acc: 0.7436, Val Loss: 0.7524, Val Acc: 0.2000
Epoch 6/45:
Train Loss: 0.5195, Train Acc: 0.7949, Val Loss: 0.8112, Val Acc: 0.3000
Epoch 7/45:
Train Loss: 0.4686, Train Acc: 0.8974, Val Loss: 0.7106, Val Acc: 0.5000
Epoch 8/45:
Train Loss: 0.4451, Train Acc: 0.8718, Val Loss: 0.7138, Val Acc: 0.6000
Epoch 9/45:
Train Loss: 0.3465, Train Acc: 0.9487, Val Loss: 0.7145, Val Acc: 0.6000
Epoch 10/45:
Train Loss: 0.2724, Train Acc: 0.9487, Val Loss: 0.7048, Val Acc: 0.5000
Epoch 11/45:
Train Loss: 0.2028, Train Acc: 0.9744, Val Loss: 0.7565, Val Acc: 0.7000
Epoch 12/45:
Train Loss: 0.2274, Train Acc: 0.9487, Val Loss: 0

In [None]:
def predict_names(model, tokenizer, names, max_length=10):
    """Predict a class id for every name, one name at a time.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class scores of shape (1, num_classes). It is left in
            eval mode when the function returns.
        tokenizer: Callable invoked as ``tokenizer(name, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``. Its result
            is indexed with 'input_ids' and 'attention_mask'.
        names: Iterable of name strings. A single string is treated as one
            name rather than being iterated character by character.
        max_length: Padded/truncated token length. It should equal the value
            the model was trained with; the default of 10 matches the
            previously hard-coded value.

    Returns:
        List of predicted class ids (ints), in the order of ``names``.
    """
    if isinstance(names, str):
        names = [names]

    model.eval()
    predictions = []

    # A single no_grad context covers the whole loop.
    with torch.no_grad():
        for name in names:
            encoding = tokenizer(name, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
            # Normalise to shape (1, seq_len): a batch holding one example.
            input_ids = encoding['input_ids'].flatten().unsqueeze(0)
            attention_mask = encoding['attention_mask'].flatten().unsqueeze(0)

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, 1)
            predictions.append(predicted.item())

    return predictions

# Score the trained model one example per batch.
# NOTE: there is no separate test split here; the "test" loader wraps
# `val_dataset`, the same data used for validation during training.
test_loader = DataLoader(dataset=val_dataset, batch_size=1, shuffle=False)
test_loss, test_acc = evaluate_model(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

# Predict a class id for each held-out name.
test_names = val_dataset.names
predicted_labels = predict_names(model, tokenizer, test_names)

# Position in `class_names` is the label id: 0 -> 'male', 1 -> 'female'.
class_names = ['male', 'female']
predicted_classes = list(map(class_names.__getitem__, predicted_labels))

# Report one "<name>: <predicted class>" line per held-out name.
for name, predicted_class in zip(test_names, predicted_classes):
    print(f'{name}: {predicted_class}')


Test Loss: 1.5881, Test Acc: 0.7000
Allyn: female
Angelia: female
Angelina: female
Angela: female
Angel: female
Alina: female
Alicia: female
Alice: male
Alma: female
Ariel: male


In [None]:
%%shell
jupyter nbconvert --to html /content/Assignment_4_Transformer_part_III_Veerabhadra_Rao.ipynb

[NbConvertApp] Converting notebook /content/Assignment_4_Transformer_part_III_Veerabhadra_Rao.ipynb to html
[NbConvertApp] Writing 634670 bytes to /content/Assignment_4_Transformer_part_III_Veerabhadra_Rao.html




# References
1. https://arxiv.org/pdf/2102.03692.pdf
2. https://alvinntnu.github.io/NTNU_ENC2045_LECTURES/exercise/13-attention.html
3. https://towardsdatascience.com/deep-learning-gender-from-name-lstm-recurrent-neural-networks-448d64553044
4. https://www.nltk.org/book/ch02.html#sec-lexical-resources