#  Using NLTK for Text Classifications

### import Dependencies

In [1]:
import nltk
from nltk.corpus import names
import random

### Loading Data

In [2]:
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\kanja\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [3]:
name_list = names

In [4]:
name_list

<WordListCorpusReader in 'C:\\Users\\kanja\\AppData\\Roaming\\nltk_data\\corpora\\names'>

In [5]:
name_list.fileids()

['female.txt', 'male.txt']

In [6]:
male_names = name_list.words('male.txt')
female_names = name_list.words('female.txt')

In [7]:
labeled_names = ([(name, 'male') for name in male_names] +
                 [(name, 'female') for name in female_names])

In [8]:
len(labeled_names)

7944

In [9]:
random.shuffle(labeled_names)

In [10]:
train_names   = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names    = labeled_names[:500]
len(train_names), len(devtest_names), len(test_names)

(6444, 1000, 500)

### Features Extractors

In [11]:
def feature_extractor1(name):
    name = name.lower()
    return {
        "lastletter" : name[-1],
        "name_length" : len(name),
        "firstletter": name[0],
        "last_two_letter" : name[-2:],
        "last_letter_vowel": name[-1] in "aeiou"
    }

In [12]:
def feature_extractor2(name):
    features = {}
    name = name.lower()
    features["firstletter"] = name[0]
    features["lastletter"] = name[-1]
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features[f"count ({letter})"] = name.count(letter)
        features[f"has({letter})"] = (letter in name)
    return features 

In [13]:
print(feature_extractor1("Shrek"))
print()
print(feature_extractor2("John"))
print()
print(feature_extractor1("Elizabeth"))
print()
print(feature_extractor2("Jane"))

{'lastletter': 'k', 'name_length': 5, 'firstletter': 's', 'last_two_letter': 'ek', 'last_letter_vowel': False}

{'firstletter': 'j', 'lastletter': 'n', 'count (a)': 0, 'has(a)': False, 'count (b)': 0, 'has(b)': False, 'count (c)': 0, 'has(c)': False, 'count (d)': 0, 'has(d)': False, 'count (e)': 0, 'has(e)': False, 'count (f)': 0, 'has(f)': False, 'count (g)': 0, 'has(g)': False, 'count (h)': 1, 'has(h)': True, 'count (i)': 0, 'has(i)': False, 'count (j)': 1, 'has(j)': True, 'count (k)': 0, 'has(k)': False, 'count (l)': 0, 'has(l)': False, 'count (m)': 0, 'has(m)': False, 'count (n)': 1, 'has(n)': True, 'count (o)': 1, 'has(o)': True, 'count (p)': 0, 'has(p)': False, 'count (q)': 0, 'has(q)': False, 'count (r)': 0, 'has(r)': False, 'count (s)': 0, 'has(s)': False, 'count (t)': 0, 'has(t)': False, 'count (u)': 0, 'has(u)': False, 'count (v)': 0, 'has(v)': False, 'count (w)': 0, 'has(w)': False, 'count (x)': 0, 'has(x)': False, 'count (y)': 0, 'has(y)': False, 'count (z)': 0, 'has(z)': F

In [14]:
# function to apply the feature extractor
def apply_feature(feature_fn, label_list):
    return [(feature_fn(n), g) for (n,g) in label_list]

### Model Classifiers

In [15]:
def print_error_samples(errors, max_samples=20):
    """Print a sample of misclassified names."""
    print("\nNumber of dev errors:", len(errors))
    print(f"\nSample errors (first {max_samples}):")
    for (true_label, guess, name) in sorted(errors)[:max_samples]:
        print(f"correct={true_label} guess={guess} name={name}")


In [16]:
# check if class need a
import inspect

def requires_arguments(cls):
    sig = inspect.signature(cls.__init__)
    args = sig.parameters
    return len(args) > 1

In [17]:
def train_and_evaluate(clf, clf_name, feature_fn, feature_name):
    print("=" * 80)
    print(f"{clf_name}  |  Features: {feature_name}")
    print("-" * 80)
    
    # prepare feature sets
    # train_set   = apply_feature(feature_fn, train_names)
    devtest_set = apply_feature(feature_fn, devtest_names)
    test_set    = apply_feature(feature_fn, test_names)
    
    # train the classifier

    if requires_arguments(clf):
        train_set   = apply_feature(feature_fn, train_names)
        classifier = clf.train(train_set)
    else:
        classifier = clf
    
    # compute accuracy on dev and test sets
    dev_acc  = nltk.classify.accuracy(classifier, devtest_set)
    test_acc = nltk.classify.accuracy(classifier, test_set)
    print(f"Dev accuracy : {dev_acc:.3f}")
    print(f"Test accuracy: {test_acc:.3f}")
    
    # build error list on the dev set
    errors = []
    for (name, true_label) in devtest_names:
        guess = classifier.classify(feature_fn(name))
        if guess != true_label:
            errors.append((true_label, guess, name))
    
    # print a subset of errors
    print_error_samples(errors, max_samples=20)

### Model One

In [18]:
train_set   = apply_feature(feature_extractor1, train_names)
classifier = nltk.DecisionTreeClassifier.train(train_set)

train_and_evaluate(classifier, "Nltk DecisionTreeClassifier", feature_extractor1, "Feature Set 1 (last_letter/name length/firstletter/end_vowel)")

Nltk DecisionTreeClassifier  |  Features: Feature Set 1 (last_letter/name length/firstletter/end_vowel)
--------------------------------------------------------------------------------
Dev accuracy : 0.776
Test accuracy: 0.716

Number of dev errors: 224

Sample errors (first 20):
correct=female guess=male name=Abbe
correct=female guess=male name=Adore
correct=female guess=male name=Ali
correct=female guess=male name=Allie
correct=female guess=male name=Amargo
correct=female guess=male name=Andromache
correct=female guess=male name=Anne-Mar
correct=female guess=male name=Anni
correct=female guess=male name=Ariel
correct=female guess=male name=Audre
correct=female guess=male name=Aurie
correct=female guess=male name=Bebe
correct=female guess=male name=Becky
correct=female guess=male name=Belle
correct=female guess=male name=Bevvy
correct=female guess=male name=Bobine
correct=female guess=male name=Britt
correct=female guess=male name=Caril
correct=female guess=male name=Carmon
correct=fe

In [19]:
train_set   = apply_feature(feature_extractor2, train_names)
classifier = nltk.DecisionTreeClassifier.train(train_set)

train_and_evaluate(classifier, "Nltk DecisionTreeClassifier", feature_extractor2, "Feature Set 2 (first/last/counters)")

Nltk DecisionTreeClassifier  |  Features: Feature Set 2 (first/last/counters)
--------------------------------------------------------------------------------
Dev accuracy : 0.812
Test accuracy: 0.758

Number of dev errors: 188

Sample errors (first 20):
correct=female guess=male name=Ali
correct=female guess=male name=Amargo
correct=female guess=male name=Anne-Mar
correct=female guess=male name=Aphrodite
correct=female guess=male name=Aprilette
correct=female guess=male name=Ashlen
correct=female guess=male name=Ashli
correct=female guess=male name=Becky
correct=female guess=male name=Blinny
correct=female guess=male name=Charlott
correct=female guess=male name=Cindelyn
correct=female guess=male name=Darb
correct=female guess=male name=Deb
correct=female guess=male name=Denys
correct=female guess=male name=Doloritas
correct=female guess=male name=Dorian
correct=female guess=male name=Easter
correct=female guess=male name=Em
correct=female guess=male name=Flo
correct=female guess=male 

### Model Two

In [20]:
train_set   = apply_feature(feature_extractor1, train_names)
classifier = nltk.ConditionalExponentialClassifier.train(train_set)

train_and_evaluate(classifier, "Nltk ConditionalExponentialClassifier", feature_extractor1,  "Feature Set 1 (last_letter/name length/firstletter/end_vowel)")

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.370
             2          -0.42935        0.791
             3          -0.37452        0.805
             4          -0.34920        0.807
             5          -0.33464        0.810
             6          -0.32513        0.813
             7          -0.31840        0.816
             8          -0.31338        0.816
             9          -0.30949        0.819
            10          -0.30639        0.820
            11          -0.30386        0.821
            12          -0.30175        0.822
            13          -0.29998        0.823
            14          -0.29846        0.822
            15          -0.29715        0.822
            16          -0.29601        0.822
            17          -0.29501        0.821
            18          -0.29412        0.821
            19          -0.29333        0.821
 

In [21]:
train_set   = apply_feature(feature_extractor1, train_names)
classifier = nltk.ConditionalExponentialClassifier.train(train_set)

train_and_evaluate(classifier, "Nltk ConditionalExponentialClassifier", feature_extractor1, "Feature Set 2 (first/last/counters)")

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.370
             2          -0.42935        0.791
             3          -0.37452        0.805
             4          -0.34920        0.807
             5          -0.33464        0.810
             6          -0.32513        0.813
             7          -0.31840        0.816
             8          -0.31338        0.816
             9          -0.30949        0.819
            10          -0.30639        0.820
            11          -0.30386        0.821
            12          -0.30175        0.822
            13          -0.29998        0.823
            14          -0.29846        0.822
            15          -0.29715        0.822
            16          -0.29601        0.822
            17          -0.29501        0.821
            18          -0.29412        0.821
            19          -0.29333        0.821
 

### Model Three

In [22]:
from sklearn.svm import LinearSVC
classifier = nltk.SklearnClassifier(LinearSVC())
train_and_evaluate(classifier, "Nltk SklearnClassifier", feature_extractor2,  "Feature Set 1 (last_letter/name length/firstletter/end_vowel)")

Nltk SklearnClassifier  |  Features: Feature Set 1 (last_letter/name length/firstletter/end_vowel)
--------------------------------------------------------------------------------
Dev accuracy : 0.824
Test accuracy: 0.778

Number of dev errors: 176

Sample errors (first 20):
correct=female guess=male name=Amargo
correct=female guess=male name=Andromache
correct=female guess=male name=Anne-Mar
correct=female guess=male name=Aphrodite
correct=female guess=male name=Ardath
correct=female guess=male name=Ashlen
correct=female guess=male name=Audre
correct=female guess=male name=Aurore
correct=female guess=male name=Babs
correct=female guess=male name=Becky
correct=female guess=male name=Blondell
correct=female guess=male name=Britt
correct=female guess=male name=Carmon
correct=female guess=male name=Charis
correct=female guess=male name=Charlott
correct=female guess=male name=Chicky
correct=female guess=male name=Darb
correct=female guess=male name=Deb
correct=female guess=male name=Dolori

In [23]:
classifier = nltk.SklearnClassifier(LinearSVC())
train_and_evaluate(classifier, "Nltk SklearnClassifier", feature_extractor2, "Feature Set 2 (first/last/count)")

Nltk SklearnClassifier  |  Features: Feature Set 2 (first/last/count)
--------------------------------------------------------------------------------
Dev accuracy : 0.824
Test accuracy: 0.778

Number of dev errors: 176

Sample errors (first 20):
correct=female guess=male name=Amargo
correct=female guess=male name=Andromache
correct=female guess=male name=Anne-Mar
correct=female guess=male name=Aphrodite
correct=female guess=male name=Ardath
correct=female guess=male name=Ashlen
correct=female guess=male name=Audre
correct=female guess=male name=Aurore
correct=female guess=male name=Babs
correct=female guess=male name=Becky
correct=female guess=male name=Blondell
correct=female guess=male name=Britt
correct=female guess=male name=Carmon
correct=female guess=male name=Charis
correct=female guess=male name=Charlott
correct=female guess=male name=Chicky
correct=female guess=male name=Darb
correct=female guess=male name=Deb
correct=female guess=male name=Doloritas
correct=female guess=male