# Text Classifications Using NLTK
### Dependencies import

In [1]:
import nltk

In [2]:
from nltk.corpus import names
import random

### Data Extracts from the nltk corpus library

In [3]:
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\300389976\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [4]:
names = names
names

<WordListCorpusReader in 'C:\\Users\\300389976\\AppData\\Roaming\\nltk_data\\corpora\\names'>

In [5]:
names.fileids()

['female.txt', 'male.txt']

In [6]:
male_names = names.words('male.txt')

In [7]:
female_names = names.words('female.txt')

In [8]:
names_list = [(name, 'males') for name in male_names] + [(name, 'female') for name in female_names]

In [9]:
random.shuffle(names_list)

### Feature Functions

In [10]:
# function to get the last character in a name

def feature_extractor1(name):
    return {'last_letter': name[-1]}

In [11]:
# function that counts the number of vowels in a name

def feature_extractor2(name):
    vowels = 'aeiou'
    count = sum(1 for ch in name.lower() if ch in vowels)
    return {"vowel_count" : count}

In [12]:
# function that count return first, last , count and presence boolean

def feature_extractor3(name):
    features = {}
    name = name.lower()
    features["firstletter"] = name[0]
    features["lastletter"] = name[-1]
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features[f"count ({letter})"] = name.count(letter)
        features[f"has({letter})"] = (letter in name)
    return features

In [13]:
print(feature_extractor1("Elizabeth"))
print(feature_extractor1("Escobar"))

{'last_letter': 'h'}
{'last_letter': 'r'}


In [14]:
print(feature_extractor2("Elizabeth"))
print(feature_extractor2("Escobar"))

{'vowel_count': 4}
{'vowel_count': 3}


In [15]:
print(feature_extractor3("Elizabeth"))
print()
print(feature_extractor3("Escobar"))

{'firstletter': 'e', 'lastletter': 'h', 'count (a)': 1, 'has(a)': True, 'count (b)': 1, 'has(b)': True, 'count (c)': 0, 'has(c)': False, 'count (d)': 0, 'has(d)': False, 'count (e)': 2, 'has(e)': True, 'count (f)': 0, 'has(f)': False, 'count (g)': 0, 'has(g)': False, 'count (h)': 1, 'has(h)': True, 'count (i)': 1, 'has(i)': True, 'count (j)': 0, 'has(j)': False, 'count (k)': 0, 'has(k)': False, 'count (l)': 1, 'has(l)': True, 'count (m)': 0, 'has(m)': False, 'count (n)': 0, 'has(n)': False, 'count (o)': 0, 'has(o)': False, 'count (p)': 0, 'has(p)': False, 'count (q)': 0, 'has(q)': False, 'count (r)': 0, 'has(r)': False, 'count (s)': 0, 'has(s)': False, 'count (t)': 1, 'has(t)': True, 'count (u)': 0, 'has(u)': False, 'count (v)': 0, 'has(v)': False, 'count (w)': 0, 'has(w)': False, 'count (x)': 0, 'has(x)': False, 'count (y)': 0, 'has(y)': False, 'count (z)': 1, 'has(z)': True}

{'firstletter': 'e', 'lastletter': 'r', 'count (a)': 1, 'has(a)': True, 'count (b)': 1, 'has(b)': True, 'coun

In [16]:
# function to apply the feature extractor

def apply_feature(feature_fn, label_list):

    return [(feature_fn(n), g) for (n,g) in label_list]

### Data Split

In [17]:
len(names_list)

7944

In [18]:
train_names = names_list[1500:]
devtest_names = names_list[500: 1500]
test_names = names_list[:500]

In [19]:
len(train_names), len(devtest_names) , len(test_names)

(6444, 1000, 500)

### Modeling section

In [20]:
def print_error_samples(errors, max_samples=20):
    """Print a sample of misclassified names."""
    print("\nNumber of dev errors:", len(errors))
    print(f"\nSample errors (first {max_samples}):")
    for (true_label, guess, name) in sorted(errors)[:max_samples]:
        print(f" correct={true_label:<8} guess={guess:<8} name={name}")

In [21]:
# check if class need a
import inspect

def requires_arguments(cls):
    sig = inspect.signature(cls.__init__)
    args = sig.parameters
    return len(args) > 1

In [22]:


def train_and_evaluate(clf, clf_name, feature_fn, feature_name):
    print("=" * 80)
    print(f"{clf_name}  |  Features: {feature_name}")
    print("-" * 80)
    
    # prepare feature sets
    # train_set   = apply_feature(feature_fn, train_names)
    devtest_set = apply_feature(feature_fn, devtest_names)
    test_set    = apply_feature(feature_fn, test_names)
    
    # train the classifier

    if requires_arguments(clf):
        train_set   = apply_feature(feature_fn, train_names)
        classifier = clf.train(train_set)
    else:
        classifier = clf
    
    # compute accuracy on dev and test sets
    dev_acc  = nltk.classify.accuracy(classifier, devtest_set)
    test_acc = nltk.classify.accuracy(classifier, test_set)
    print(f"Dev accuracy : {dev_acc:.3f}")
    print(f"Test accuracy: {test_acc:.3f}")
    
    # build error list on the dev set
    errors = []
    for (name, true_label) in devtest_names:
        guess = classifier.classify(feature_fn(name))
        if guess != true_label:
            errors.append((true_label, guess, name))
    
    # print a subset of errors
    print_error_samples(errors, max_samples=20)
    
    

### Model one

In [23]:
train_set   = apply_feature(feature_extractor1, train_names)
classifier = nltk.DecisionTreeClassifier.train(train_set)

train_and_evaluate(classifier, "Nltk DecisionTreeClassifier", feature_extractor1, "Using the last Letter")

Nltk DecisionTreeClassifier  |  Features: Using the last Letter
--------------------------------------------------------------------------------
Dev accuracy : 0.751
Test accuracy: 0.752

Number of dev errors: 249

Sample errors (first 20):
 correct=female   guess=males    name=Abagael
 correct=female   guess=males    name=Aimil
 correct=female   guess=males    name=Alis
 correct=female   guess=males    name=Alix
 correct=female   guess=males    name=Alleen
 correct=female   guess=males    name=Allyson
 correct=female   guess=males    name=Alyss
 correct=female   guess=males    name=Angel
 correct=female   guess=males    name=Ann
 correct=female   guess=males    name=Ardeen
 correct=female   guess=males    name=Averil
 correct=female   guess=males    name=Ayn
 correct=female   guess=males    name=Beatrix
 correct=female   guess=males    name=Bette-Ann
 correct=female   guess=males    name=Bird
 correct=female   guess=males    name=Brenn
 correct=female   guess=males    name=Brett
 corr

In [24]:
train_set   = apply_feature(feature_extractor2, train_names)
classifier = nltk.DecisionTreeClassifier.train(train_set)

train_and_evaluate(classifier, "Nltk DecisionTreeClassifier", feature_extractor2, "Using vowel Count")

Nltk DecisionTreeClassifier  |  Features: Using vowel Count
--------------------------------------------------------------------------------
Dev accuracy : 0.627
Test accuracy: 0.642

Number of dev errors: 373

Sample errors (first 20):
 correct=males    guess=female   name=Abdul
 correct=males    guess=female   name=Abraham
 correct=males    guess=female   name=Addie
 correct=males    guess=female   name=Adolphe
 correct=males    guess=female   name=Aguinaldo
 correct=males    guess=female   name=Aharon
 correct=males    guess=female   name=Alden
 correct=males    guess=female   name=Alessandro
 correct=males    guess=female   name=Alexander
 correct=males    guess=female   name=Alfonzo
 correct=males    guess=female   name=Algernon
 correct=males    guess=female   name=Alic
 correct=males    guess=female   name=Alister
 correct=males    guess=female   name=Alley
 correct=males    guess=female   name=Allyn
 correct=males    guess=female   name=Alston
 correct=males    guess=female   n

In [25]:
train_set   = apply_feature(feature_extractor3, train_names)
classifier = nltk.DecisionTreeClassifier.train(train_set)

train_and_evaluate(classifier, "Nltk DecisionTreeClassifier", feature_extractor3, "Using first, last and count letter")

Nltk DecisionTreeClassifier  |  Features: Using first, last and count letter
--------------------------------------------------------------------------------
Dev accuracy : 0.781
Test accuracy: 0.796

Number of dev errors: 219

Sample errors (first 20):
 correct=female   guess=males    name=Ajay
 correct=female   guess=males    name=Alix
 correct=female   guess=males    name=Alleen
 correct=female   guess=males    name=Andromache
 correct=female   guess=males    name=Angel
 correct=female   guess=males    name=Ann
 correct=female   guess=males    name=Ardeen
 correct=female   guess=males    name=Averil
 correct=female   guess=males    name=Ayn
 correct=female   guess=males    name=Bird
 correct=female   guess=males    name=Blake
 correct=female   guess=males    name=Brenn
 correct=female   guess=males    name=Brett
 correct=female   guess=males    name=Brooke
 correct=female   guess=males    name=Cat
 correct=female   guess=males    name=Chandal
 correct=female   guess=males    name=Da

### Model Two

In [26]:
train_set   = apply_feature(feature_extractor1, train_names)
classifier = nltk.ConditionalExponentialClassifier.train(train_set)

train_and_evaluate(classifier, "Nltk ConditionalExponentialClassifier", feature_extractor1, "Using the last Letter")

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.371
             2          -0.37512        0.766
             3          -0.37482        0.766
             4          -0.37464        0.766
             5          -0.37452        0.766
             6          -0.37443        0.766
             7          -0.37437        0.766
             8          -0.37431        0.766
             9          -0.37427        0.766
            10          -0.37424        0.766
            11          -0.37421        0.766
            12          -0.37419        0.766
            13          -0.37417        0.766
            14          -0.37415        0.766
            15          -0.37414        0.766
            16          -0.37413        0.766
            17          -0.37411        0.766
            18          -0.37410        0.766
            19          -0.37409        0.766
 

In [27]:
import sys
import os 
sys.stdout = open(os.devnull, 'w')
train_set   = apply_feature(feature_extractor2, train_names)

classifier = nltk.ConditionalExponentialClassifier.train(train_set)
sys.stdout = sys.__stdout__
train_and_evaluate(classifier, "Nltk ConditionalExponentialClassifier", feature_extractor2, "Using vowel Count")

In [28]:
train_set   = apply_feature(feature_extractor3, train_names)
classifier = nltk.ConditionalExponentialClassifier.train(train_set)

train_and_evaluate(classifier, "Nltk ConditionalExponentialClassifier", feature_extractor3, "Using first, last and count letter")

### Model Three

In [29]:
from sklearn.svm import LinearSVC
classifier = nltk.SklearnClassifier(LinearSVC())
train_and_evaluate(classifier, "Nltk SklearnClassifier", feature_extractor1, "Using the last Letter")

In [30]:
classifier = nltk.SklearnClassifier(LinearSVC())
train_and_evaluate(classifier, "Nltk SklearnClassifier", feature_extractor2, "Feature Set 2 (vowel count)")

In [31]:
classifier = nltk.SklearnClassifier(LinearSVC())
train_and_evaluate(classifier, "Nltk SklearnClassifier", feature_extractor3, "Feature Set 2 (first/last/count)")