In [347]:
import random
import nltk
from nltk.corpus import names
from nltk import DecisionTreeClassifier, NaiveBayesClassifier, classify
from nltk.classify import MaxentClassifier
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    accuracy_score, confusion_matrix
)
import pandas as pd
nltk.download("names", quiet=True)
from IPython.display import display_html
import re
random.seed(46)


## 1. Introduction

The goal of this experiment is to build a supervised classifier that can predict the gender of a given first name.  
The task uses the *Names Corpus* from the **Natural Language Toolkit (NLTK)**, which contains approximately **7,900 English names**, each labeled as either *male* or *female*.  

This exercise follows the workflow described in *Chapter 6* of *Natural Language Processing with Python*, where the objective is to:
1. Train a model using labeled data.  
2. Make incremental feature improvements guided by a development test (dev-test) set.  
3. Evaluate the final model’s performance on an unseen test set.  

Three classifiers are explored as described in the book:
- **Naive Bayes Classifier**
- **Decision Tree Classifier**
- **Maximum Entropy (MaxEnt) Classifier**

## 2. Data Preparation

The *Names Corpus* is stored within NLTK under two files: `male.txt` and `female.txt`, each containing a list of names associated with the respective gender label.  

To avoid ordering bias, the dataset was **randomly shuffled** using Python’s random module with a fixed seed for reproducibility.  
The shuffled names were divided into three subsets:
- **Training set (6,900 names)** — used to train the model.  
- **Dev-test set (500 names)** — used during model development to test incremental improvements.  
- **Test set (500 names)** — reserved for final evaluation after model tuning.

To streamline the process, I created **helper functions** that handle both the random splitting of the data and the feature extraction.  
Each helper function accepts a **feature function** as an argument, allowing the experiment to easily test different feature engineering strategies across multiple runs while maintaining consistent train/dev/test splits.


In [348]:
def load_and_split():
    data = [(n, "male") for n in names.words("male.txt")] + \
           [(n, "female") for n in names.words("female.txt")]
    random.shuffle(data)
    # 6900 train, 500 dev-test, 500 test
    train, dev_test, test = data[:6900], data[6900:7400], data[7400:7900]
    return train, dev_test, test



In [349]:


def make_sets(data,gender_features):
    return [(gender_features(n), g) for (n, g) in data]

def train_models(train_set):
    models = {
        "NaiveBayes": NaiveBayesClassifier.train(train_set),
        "DecisionTree": DecisionTreeClassifier.train(train_set),
        "MaxEnt": MaxentClassifier.train(train_set, max_iter=50,trace=0)
    }
    return models





def evaluate_model(model, dataset):
    y_true = [label for (_, label) in dataset]
    y_pred = [model.classify(features) for (features, _) in dataset]

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label="female")
    rec = recall_score(y_true, y_pred, pos_label="female")
    f1 = f1_score(y_true, y_pred, pos_label="female")

    cm = confusion_matrix(y_true, y_pred, labels=["female", "male"])
    TP, FN, FP, TN = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
    sensitivity = TP / (TP + FN) if (TP + FN) else 0
    specificity = TN / (TN + FP) if (TN + FP) else 0

    return {
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "Sensitivity": sensitivity,
        "Specificity": specificity
    }

def generate_report(models, dataset, round_digits=3):
    results = []
    for name, model in models.items():
        metrics = evaluate_model(model, dataset)
        metrics["Model"] = name
        results.append(metrics)
    df = pd.DataFrame(results).set_index("Model").round(round_digits)
    return df


def display_side_by_side(dfs, titles=None):
    html_str = "<div style='display:flex;flex-flow:row nowrap;column-gap:10px'>"
    for df, title in zip(dfs, titles):
        html_str += f"""
        <div style="margin:10px">
            <h4 style="text-align:center">{title}</h4>
            {df.to_html()}
        </div>"""
    html_str += "</div>"

    display_html(html_str, raw=True)

## 3. Model Training


Three supervised classifiers were trained using NLTK’s implementations:  
**Naive Bayes**, **Decision Tree**, and **Maximum Entropy (MaxEnt)**.  
All models were trained on the **training set (6,900 names)**, validated on the **dev-test set (500 names)** for incremental improvements, and later evaluated on the **test set (500 names)** for final performance.

Each experiment uses a helper function, `train_models()`, to train all classifiers consistently.  
Model performance is evaluated using the following metrics:

| Metric | Description |
|---------|--------------|
| **Accuracy** | Overall proportion of correct predictions |
| **Precision** | Proportion of positive predictions that were correct |
| **Recall** | Proportion of actual positives correctly identified |
| **F1** | Harmonic mean of Precision and Recall |
| **Sensitivity** | True positive rate for the positive class (female) |
| **Specificity** | True negative rate for the negative class (male) |

All metrics are reported per experiment in tabular form for the **training**, **dev-test**, and **test** sets.


### 3.1 Experiement 1

In this experiment, I use four simple linguistic features derived from each name: the **first letter**, **last letter**, **name length**, and whether the name **ends with a vowel**. These features capture common gender patterns in names. For instance, many female names tend to end with vowels, while male names often end with consonants. 

In [350]:
def gender_features(name):
    name = name.lower()
    vowels = set("aeiou")

    return {
        "last_letter": name[-1],
        "first_letter": name[0],
        "length": len(name),
        "ends_with_vowel": name[-1] in vowels
    }

In [351]:
random.seed(45)

train, dev_test, test = load_and_split()
train_set, dev_test_set, test_set = map( lambda data : make_sets(data,gender_features), [train, dev_test, test])

models = train_models(train_set)
# Run each separately if you want:
train_report = generate_report(models, train_set)
dev_report = generate_report(models, dev_test_set)
test_report = generate_report(models, test_set)

datasets_r1 = [train_report,test_report,dev_report]
r1_labels = ["Training Set (Exp 1)","Dev Set (Exp 1)", "Test Set (Exp 1)"]
display_side_by_side(datasets_r1,r1_labels)

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,Sensitivity,Specificity
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NaiveBayes,0.751,0.863,0.716,0.783,0.716,0.809
DecisionTree,0.841,0.845,0.914,0.878,0.914,0.717
MaxEnt,0.787,0.813,0.857,0.834,0.857,0.667

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,Sensitivity,Specificity
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NaiveBayes,0.744,0.852,0.745,0.795,0.745,0.743
DecisionTree,0.728,0.764,0.856,0.807,0.856,0.473
MaxEnt,0.736,0.782,0.838,0.809,0.838,0.533

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,Sensitivity,Specificity
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NaiveBayes,0.732,0.847,0.709,0.772,0.709,0.772
DecisionTree,0.778,0.806,0.859,0.832,0.859,0.633
MaxEnt,0.76,0.791,0.85,0.819,0.85,0.6



The **Naive Bayes** model had accuracy around **0.75** on the training set, **0.74** on the dev set, and **0.73** on the test set. It predicts female names correctly most of the time, but its recall shows it still misses some. The difference between its training and test results is small, which means it’s stable but not the most accurate.  

The **Decision Tree** model reached the highest training accuracy at **0.84**, but that dropped to **0.73** on the dev set and **0.78** on the test set. It finds most female names correctly but also mistakes many male names as female. The gap between its training and evaluation results shows it learned patterns too specific to the training data.  

The **Maximum Entropy** model stayed balanced and consistent across all three sets. Its accuracy was about **0.79** on training, **0.74** on the dev set, and **0.76** on the test set. Both its recall and precision remained strong, meaning it recognizes female names accurately without making many false predictions.  

Among the three models, the **Maximum Entropy** classifier performed the best overall. It achieved a good balance between accuracy, recall, and precision, while remaining stable across all datasets.  

As the next step, additional experiments will be performed to test new feature combinations and see if the overall metrics can be improved further.


### 3.2 Experiment 2

In this experiment, we keep the features from Experiment One and add new ones to better describe the sound and structure of names. The **last two letters** help detect common endings that may relate to gender, such as “-na” or “-us.” The **presence of “y”** (`has_y`) captures names that often differ by gender, like “Mary” or “Tony.” The **double letter** feature (`has_double`) looks for repeated letters (like “ll” in “Allison”), which can be more common in certain names. Lastly, the **vowel** and **consonant counts** show how many of each type of letter a name has, helping the model notice sound patterns that may differ between male and female names.


In [352]:
def gender_features2(name):
    name = name.lower()
    vowels = set("aeiou")
    vcount = sum(c in vowels for c in name)
    ccount = sum(c.isalpha() and c not in vowels for c in name)

    return {
        "last_letter": name[-1],
        "last_two": name[-2:],
        "first_letter": name[0],
        "length": len(name),
        "ends_with_vowel": name[-1] in vowels,
                
        'has_y': 'y' in name,
        'has_double': any(a == b for a, b in zip(name, name[1:])),
        "vowel_count": vcount,
        "consonant_count": ccount,
    }

In [355]:
# ----- Experiment Two -----
random.seed(455)
exp2_train, exp2_dev_test, exp2_test = load_and_split()
exp2_train_set, exp2_dev_test_set, exp2_test_set = map(lambda data : make_sets(data,gender_features2), [exp2_train, exp2_dev_test, exp2_test])

exp2_models = train_models(exp2_train_set)

exp2_train_report = generate_report(exp2_models, exp2_train_set)
exp2_dev_report = generate_report(exp2_models, exp2_dev_test_set)
exp2_test_report = generate_report(exp2_models, exp2_test_set)

exp2_datasets = [exp2_train_report, exp2_dev_report, exp2_test_report]
exp2_labels = ["Training Set (Exp 2)", "Dev-test Set (Exp 2)", "Test Set (Exp 2)"]

display_side_by_side(exp2_datasets, exp2_labels)


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,Sensitivity,Specificity
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NaiveBayes,0.765,0.866,0.741,0.799,0.741,0.806
DecisionTree,0.919,0.919,0.955,0.937,0.955,0.857
MaxEnt,0.827,0.846,0.887,0.866,0.887,0.726

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,Sensitivity,Specificity
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NaiveBayes,0.746,0.848,0.712,0.774,0.712,0.799
DecisionTree,0.756,0.786,0.827,0.806,0.827,0.644
MaxEnt,0.772,0.798,0.84,0.818,0.84,0.665

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,Sensitivity,Specificity
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NaiveBayes,0.774,0.887,0.748,0.812,0.748,0.822
DecisionTree,0.772,0.821,0.831,0.826,0.831,0.661
MaxEnt,0.818,0.847,0.88,0.863,0.88,0.701


In Experiment Two, adding new features improved performance on both the training and test sets. On the **training set**, the **Decision Tree** reached the best results with an **accuracy of 0.919**, **recall of 0.955**, and **F1-score of 0.937**, showing that it learned the training data very well. However, this near-perfect performance also suggests some overfitting. The **MaxEnt model** achieved a good balance with an **accuracy of 0.827**, **precision of 0.846**, and **recall of 0.887**, showing it can make accurate predictions without memorizing the data. The **Naive Bayes** model improved slightly, reaching an **accuracy of 0.765**, but its **recall (0.741)** was still lower than the other models.  

On the **test set**, the **MaxEnt model** gave the strongest results overall, with an **accuracy of 0.818**, **recall of 0.880**, and **F1-score of 0.863**. This shows it generalized well to unseen data. The **Decision Tree**, while slightly less accurate (**0.772**), still performed strongly but showed signs of overfitting because its recall dropped from 0.955 to 0.831. The **Naive Bayes** model stayed consistent, improving slightly to **accuracy 0.774**, but it continued to lag behind in recall and sensitivity.  

In short, the new features helped all models learn better patterns. The **Decision Tree** learned the training data best but did not generalize as well. The **MaxEnt model** showed the best overall balance, performing well on both training and test sets. The **Naive Bayes** model remained stable but less powerful. The **dev-test results** confirmed these trends, showing that MaxEnt remained the most reliable model.


### Experiment 3

In this experiment, we keep all the features from Experiment Two and add new ones to capture the rhythm and sound structure of names in more detail. The **vowel ratio** measures how many vowels appear compared to the total length of the name. This helps identify smoother or softer-sounding names, which are sometimes linked to gender patterns. The **syllables** feature estimates how many vowel groups the name contains, giving an idea of how complex or long the name sounds when spoken.  

We also include three new features that look at how the name ends: **ends_liquid**, **ends_nasal**, and **ends_sibilant**. These features check whether the last letter is part of certain sound types—like “l” or “r” for liquids, “m” or “n” for nasals, and “s” or “z” for sibilants. These endings often carry subtle sound patterns that can differ across male and female names. 

In [None]:
def gender_features3(name):
    name = name.lower()

    vowels = set("aeiou")
    vcount = sum(c in vowels for c in name)
    ccount = sum(c.isalpha() and c not in vowels for c in name)
    v = set("aeiou")
    liquid = set("lr")
    nasal  = set("mn")
    sibil  = set("szcxj")  # crude

    return {
        "last_letter": name[-1],
        "last_two": name[-2:],
        "first_letter": name[0],
        "length": len(name),
        "ends_with_vowel": name[-1] in "aeiou",
                
        'has_y': 'y' in name,
        'has_double': any(a == b for a, b in zip(name, name[1:])),
        "vowel_count": vcount,
        "consonant_count": ccount,
        
        
        "vowel_ratio": vcount / max(1, len(name)),
        "syllables":len(re.findall(r"[aeiouy]+", name)) ,
        "ends_liquid": name[-1] in liquid,
        "ends_nasal": name[-1] in nasal,
        "ends_sibilant": name[-1] in sibil

    }

In [None]:
# ----- Experiment Three -----
random.seed(455)
exp3_train, exp3_dev_test, exp3_test = load_and_split()
exp3_train_set, exp3_dev_test_set, exp3_test_set = map(
    lambda data: make_sets(data, gender_features3),
    [exp3_train, exp3_dev_test, exp3_test]
)

exp3_models = train_models(exp3_train_set)

exp3_train_report = generate_report(exp3_models, exp3_train_set)
exp3_dev_report = generate_report(exp3_models, exp3_dev_test_set)
exp3_test_report = generate_report(exp3_models, exp3_test_set)

exp3_datasets = [exp3_train_report, exp3_dev_report, exp3_test_report]
exp3_labels = ["Training Set (Exp 3)", "Dev-test Set (Exp 3)", "Test Set (Exp 3)"]

display_side_by_side(exp3_datasets, exp3_labels)


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,Sensitivity,Specificity
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NaiveBayes,0.762,0.865,0.74,0.798,0.74,0.8
DecisionTree,0.923,0.932,0.949,0.94,0.949,0.879
MaxEnt,0.825,0.846,0.886,0.865,0.886,0.719

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,Sensitivity,Specificity
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NaiveBayes,0.726,0.826,0.706,0.761,0.706,0.759
DecisionTree,0.73,0.767,0.809,0.787,0.809,0.602
MaxEnt,0.762,0.776,0.864,0.818,0.864,0.597

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,Sensitivity,Specificity
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NaiveBayes,0.766,0.821,0.755,0.787,0.755,0.78
DecisionTree,0.77,0.779,0.836,0.806,0.836,0.682
MaxEnt,0.784,0.785,0.857,0.819,0.857,0.687
