In [35]:
import nltk
from nltk.corpus import names
from nltk.classify import SklearnClassifier
import random
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

In [36]:
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\ilkeb\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [37]:
# get male and female name lists from the names corpus
name_corpus = nltk.corpus.names
male_names = name_corpus.words('male.txt')
female_names = name_corpus.words('female.txt')

In [38]:
# create a (name, label) list
labeled_names = ([(name, 'male') for name in male_names] +
                 [(name, 'female') for name in female_names])

In [39]:
# shuffle the list
random.shuffle(labeled_names)

len(labeled_names), labeled_names[:10]

(7944,
 [('Woodman', 'male'),
  ('Sean', 'female'),
  ('Charin', 'female'),
  ('Shep', 'male'),
  ('Margarette', 'female'),
  ('Heinz', 'male'),
  ('Obadiah', 'male'),
  ('Greta', 'female'),
  ('Bird', 'female'),
  ('Christian', 'female')])

In [40]:
def gender_features_1(word):
    return {'last_letter': word[-1].lower()}


In [41]:
def gender_features_2(name):
    features = {}
    name = name.lower()
    features["firstletter"] = name[0]
    features["lastletter"] = name[-1]
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.count(letter)
        features["has(%s)" % letter] = (letter in name)
    return features


In [42]:
print(gender_features_1("Shrek"))
print(gender_features_2("John"))

{'last_letter': 'k'}
{'firstletter': 'j', 'lastletter': 'n', 'count(a)': 0, 'has(a)': False, 'count(b)': 0, 'has(b)': False, 'count(c)': 0, 'has(c)': False, 'count(d)': 0, 'has(d)': False, 'count(e)': 0, 'has(e)': False, 'count(f)': 0, 'has(f)': False, 'count(g)': 0, 'has(g)': False, 'count(h)': 1, 'has(h)': True, 'count(i)': 0, 'has(i)': False, 'count(j)': 1, 'has(j)': True, 'count(k)': 0, 'has(k)': False, 'count(l)': 0, 'has(l)': False, 'count(m)': 0, 'has(m)': False, 'count(n)': 1, 'has(n)': True, 'count(o)': 1, 'has(o)': True, 'count(p)': 0, 'has(p)': False, 'count(q)': 0, 'has(q)': False, 'count(r)': 0, 'has(r)': False, 'count(s)': 0, 'has(s)': False, 'count(t)': 0, 'has(t)': False, 'count(u)': 0, 'has(u)': False, 'count(v)': 0, 'has(v)': False, 'count(w)': 0, 'has(w)': False, 'count(x)': 0, 'has(x)': False, 'count(y)': 0, 'has(y)': False, 'count(z)': 0, 'has(z)': False}


In [43]:
train_names   = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names    = labeled_names[:500]

len(train_names), len(devtest_names), len(test_names)


(6444, 1000, 500)

In [44]:
def make_featuresets(feature_fn, name_label_list):
    """
    name_label_list: [(name, label), ...]
    feature_fn: gender_features_1 veya gender_features_2
    """
    return [(feature_fn(name), label) for (name, label) in name_label_list]


In [45]:
def print_error_samples(errors, max_samples=20):
    """Print a sample of misclassified names."""
    print("\nNumber of dev errors:", len(errors))
    print(f"\nSample errors (first {max_samples}):")
    for (true_label, guess, name) in sorted(errors)[:max_samples]:
        print("correct=%-8s guess=%-8s name=%-30s" %
              (true_label, guess, name))


In [46]:
def train_and_evaluate(clf, clf_name, feature_fn, feature_name):
    print("=" * 80)
    print(f"{clf_name}  |  Features: {feature_name}")
    print("-" * 80)
    
    # prepare feature sets
    train_set   = make_featuresets(feature_fn, train_names)
    devtest_set = make_featuresets(feature_fn, devtest_names)
    test_set    = make_featuresets(feature_fn, test_names)
    
    # train the classifier
    classifier = clf.train(train_set)
    
    # compute accuracy on dev and test sets
    dev_acc  = nltk.classify.accuracy(classifier, devtest_set)
    test_acc = nltk.classify.accuracy(classifier, test_set)
    print(f"Dev accuracy : {dev_acc:.3f}")
    print(f"Test accuracy: {test_acc:.3f}")
    
    # build error list on the dev set
    errors = []
    for (name, true_label) in devtest_names:
        guess = classifier.classify(feature_fn(name))
        if guess != true_label:
            errors.append((true_label, guess, name))
    
    # print a subset of errors
    print_error_samples(errors, max_samples=20)
    
    return classifier, errors


In [47]:
# NLTK wrapper with Logistic Regression
log_reg_clf = SklearnClassifier(
    LogisticRegression(max_iter=1000)
)

In [48]:
# --- Feature set 1: last_letter ---
logreg_f1, errors_logreg_f1 = train_and_evaluate(
    log_reg_clf, 
    clf_name="Classifier #1: Logistic Regression",
    feature_fn=gender_features_1,
    feature_name="Feature Set 1 (last_letter)"
)

Classifier #1: Logistic Regression  |  Features: Feature Set 1 (last_letter)
--------------------------------------------------------------------------------
Dev accuracy : 0.738
Test accuracy: 0.760

Number of dev errors: 262

Sample errors (first 20):
correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Aidan                         
correct=female   guess=male     name=Aileen                        
correct=female   guess=male     name=Amargo                        
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Anett                         
correct=female   guess=male     name=Annabel                       
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Ariel                         
correct=female   guess=male     name=Arlyn                        

In [49]:
# --- Feature set 2: advanced features2 ---
log_reg_clf2 = SklearnClassifier(
    LogisticRegression(max_iter=1000)
)

In [50]:
logreg_f2, errors_logreg_f2 = train_and_evaluate(
    log_reg_clf2, 
    clf_name="Classifier #1: Logistic Regression",
    feature_fn=gender_features_2,
    feature_name="Feature Set 2 (first/last/counters)"
)

Classifier #1: Logistic Regression  |  Features: Feature Set 2 (first/last/counters)
--------------------------------------------------------------------------------
Dev accuracy : 0.790
Test accuracy: 0.810

Number of dev errors: 210

Sample errors (first 20):
correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Amargo                        
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Anett                         
correct=female   guess=male     name=Ardath                        
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Ashlen                        
correct=female   guess=male     name=Aubry                         
correct=female   guess=male     name=Auguste                       
correct=female   guess=male     name=Avis                 

In [51]:
svm_clf = SklearnClassifier(
    LinearSVC()
)

In [52]:
# --- Feature set 1 ---
svm_f1, errors_svm_f1 = train_and_evaluate(
    svm_clf,
    clf_name="Classifier #2: Linear SVM (LinearSVC)",
    feature_fn=gender_features_1,
    feature_name="Feature Set 1 (last_letter)"
)

Classifier #2: Linear SVM (LinearSVC)  |  Features: Feature Set 1 (last_letter)
--------------------------------------------------------------------------------
Dev accuracy : 0.739
Test accuracy: 0.756

Number of dev errors: 261

Sample errors (first 20):
correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Aidan                         
correct=female   guess=male     name=Aileen                        
correct=female   guess=male     name=Amargo                        
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Anett                         
correct=female   guess=male     name=Annabel                       
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Ariel                         
correct=female   guess=male     name=Arlyn                     

In [53]:
# --- Feature set 2 ---
svm_clf2 = SklearnClassifier(
    LinearSVC()
)

svm_f2, errors_svm_f2 = train_and_evaluate(
    svm_clf2,
    clf_name="Classifier #2: Linear SVM (LinearSVC)",
    feature_fn=gender_features_2,
    feature_name="Feature Set 2 (first/last/counters)"
)

Classifier #2: Linear SVM (LinearSVC)  |  Features: Feature Set 2 (first/last/counters)
--------------------------------------------------------------------------------
Dev accuracy : 0.789
Test accuracy: 0.812

Number of dev errors: 211

Sample errors (first 20):
correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Aidan                         
correct=female   guess=male     name=Amargo                        
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Anett                         
correct=female   guess=male     name=Ardath                        
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Ashlen                        
correct=female   guess=male     name=Aubry                         
correct=female   guess=male     name=Auguste           

In [54]:
knn_clf = SklearnClassifier(
    KNeighborsClassifier(n_neighbors=5)
)

In [55]:
# --- Feature set 1 ---
knn_f1, errors_knn_f1 = train_and_evaluate(
    knn_clf,
    clf_name="Classifier #3: KNN (k=5)",
    feature_fn=gender_features_1,
    feature_name="Feature Set 1 (last_letter)"
)

Classifier #3: KNN (k=5)  |  Features: Feature Set 1 (last_letter)
--------------------------------------------------------------------------------
Dev accuracy : 0.735
Test accuracy: 0.724

Number of dev errors: 265

Sample errors (first 20):
correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Aidan                         
correct=female   guess=male     name=Aileen                        
correct=female   guess=male     name=Amargo                        
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Andy                          
correct=female   guess=male     name=Anett                         
correct=female   guess=male     name=Angy                          
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Arlyn                         
correct=

In [56]:
# --- Feature set 2 ---
knn_clf2 = SklearnClassifier(
    KNeighborsClassifier(n_neighbors=5)
)

knn_f2, errors_knn_f2 = train_and_evaluate(
    knn_clf2,
    clf_name="Classifier #3: KNN (k=5)",
    feature_fn=gender_features_2,
    feature_name="Feature Set 2 (first/last/counters)"
)

Classifier #3: KNN (k=5)  |  Features: Feature Set 2 (first/last/counters)
--------------------------------------------------------------------------------
Dev accuracy : 0.776
Test accuracy: 0.816

Number of dev errors: 224

Sample errors (first 20):
correct=female   guess=male     name=Ambur                         
correct=female   guess=male     name=Antonina                      
correct=female   guess=male     name=Ashlen                        
correct=female   guess=male     name=Aubry                         
correct=female   guess=male     name=Auguste                       
correct=female   guess=male     name=Barry                         
correct=female   guess=male     name=Bride                         
correct=female   guess=male     name=Darline                       
correct=female   guess=male     name=Darryl                        
correct=female   guess=male     name=Daryl                         
correct=female   guess=male     name=Devan                         
