In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
import random

In [2]:
#------------------------------------------------ STEP 1 (COMPLETE)
print("1. Loading Austen and Melville sentences...")
a_sents_all = nltk.corpus.gutenberg.sents('austen-emma.txt')
m_sents_all = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')

1. Loading Austen and Melville sentences...


In [3]:
#------------------------------------------------ STEP 2
print("2. Discarding short sentences and labeling...")
a_sents = [(s, 'austen') for s in a_sents_all if len(s)>2]
m_sents = [(s, 'melville') for s in m_sents_all if len(s)>2]

2. Discarding short sentences and labeling...


In [4]:
#------------------------------------------------ STEP 3
print("3. Joining the two author sentence lists...")
sents = a_sents + m_sents

3. Joining the two author sentence lists...


In [5]:
#------------------------------------------------ STEP 4
print("4. Sentence stats:")
print(" # of total sentences:", len(sents))
print(" # of Austen sentences:", len(a_sents))
print(" # of Melville sentences:", len(m_sents))

4. Sentence stats:
 # of total sentences: 17152
 # of Austen sentences: 7563
 # of Melville sentences: 9589


In [6]:
#------------------------------------------------ STEP 5
print("5. Shuffling...")
random.Random(42).shuffle(sents)

5. Shuffling...


In [7]:
#------------------------------------------------ STEP 6
print("6. Partitioning...") 
test_sents = sents[:1000]     
devtest_sents = sents[1000:2000]  
train_sents = sents[2000:]    

print(" # of test sentences:", len(test_sents))
print(" # of devtest sentences:", len(devtest_sents))
print(" # of training sentences:", len(train_sents))

6. Partitioning...
 # of test sentences: 1000
 # of devtest sentences: 1000
 # of training sentences: 15152


In [8]:
#------------------------------------------------ STEP 7
print("7. Defining a feature-generator function...")
mainchars = {'Emma', 'Harriet', 'Ahab', 'Weston', 'Knightley', 'Elton',
             'Woodhouse', 'Jane', 'Stubb', 'Queequeg', 'Fairfax', 'Churchill',
             'Frank', 'Starbuck', 'Pequod', 'Hartfield', 'Bates', 'Highbury',
             'Perry', 'Bildad', 'Peleg', 'Pip', 'Cole', 'Goddard',
             'Campbell', 'Donwell', 'Dixon', 'Taylor', 'Tashtego'}

noCharNames = False    # For [PART B] Q3
if noCharNames :
    print('NOTE: Top 35 proper nouns have been neutralized.') 

def gen_feats(sent):
    featdict = {}
    for w in sent:
        if noCharNames == True:
            if w in mainchars: w = 'MontyPython'
        featdict['contains-'+w.lower()] = 1
    return featdict

7. Defining a feature-generator function...


In [9]:
#------------------------------------------------ [PART B] Q3

noCharNames2 = True    # For [PART B] Q3
if noCharNames :
    print('NOTE: Top 35 proper nouns have been neutralized.') 

def gen_feats2(sent):
    featdict = {}
    for w in sent:
        if noCharNames2 == True:
            if w in mainchars: w = 'MontyPython'
        featdict['contains-'+w.lower()] = 1
    return featdict

In [10]:
#------------------------------------------------ STEP 8
print("8. Generating feature sets...")
test_feats = [(gen_feats(n), author) for (n,author) in test_sents]
devtest_feats = [(gen_feats(n), author) for (n,author) in devtest_sents]  
train_feats = [(gen_feats(n), author) for (n,author) in train_sents]

## For [PART B] Q3
test_feats2 = [(gen_feats2(n), author) for (n,author) in test_sents]
devtest_feats2 = [(gen_feats2(n), author) for (n,author) in devtest_sents]  
train_feats2 = [(gen_feats2(n), author) for (n,author) in train_sents]

8. Generating feature sets...


In [11]:
#------------------------------------------------ STEP 9
print("9. Training...")
whosaid = nltk.NaiveBayesClassifier.train(train_feats)

## For [PART B] Q3
whosaid2 = nltk.NaiveBayesClassifier.train(train_feats2)

9. Training...


In [12]:
#------------------------------------------------ STEP 10
print("10. Testing...")
accuracy = nltk.classify.accuracy(whosaid, test_feats) 
print(" Accuracy score:", accuracy)

## For [PART B] Q3
accuracy2 = nltk.classify.accuracy(whosaid2, test_feats2) 
print(" Accuracy score for PARTB Q3:", accuracy2)

10. Testing...
 Accuracy score: 0.942
 Accuracy score for PARTB Q3: 0.931


In [13]:
#------------------------------------------------ STEP 11
print("11. Sub-dividing development testing set...")

# aa: real author Austen, guessed Austen
# mm: real author Melville, guessed Melville
# am: real author Austen, guessed Melville
# ma: real author Melville, guessed Austen

aa, mm, am, ma = [], [], [], []
for (sent, auth) in devtest_sents:
    guess = whosaid.classify(gen_feats(sent))
    if auth == 'austen' and guess == 'austen':
        aa.append( (auth, guess, sent) )
    elif auth == 'melville' and guess == 'melville':
        mm.append( (auth, guess, sent) )
    elif auth == 'austen' and guess == 'melville':
        am.append( (auth, guess, sent) )
    else:
        ma.append( (auth, guess, sent) )

11. Sub-dividing development testing set...


In [14]:
#------------------------------------------------ STEP 12
print("12. Sample CORRECT and INCORRECT predictions from dev-test set:")
print("-------")
for x in (aa, mm ,am, ma):
    auth, guess, sent = random.choice(x)
    print('REAL=%-8s GUESS=%-8s' % (auth, guess))  # string formatting
    print(' '.join(sent))
    print("-------")
print()

12. Sample CORRECT and INCORRECT predictions from dev-test set:
-------
REAL=austen   GUESS=austen  
By birth she belonged to Highbury : and when at three years old , on losing her mother , she became the property , the charge , the consolation , the fondling of her grandmother and aunt , there had seemed every probability of her being permanently fixed there ; of her being taught only what very limited means could command , and growing up with no advantages of connexion or improvement , to be engrafted on what nature had given her in a pleasing person , good understanding , and warm - hearted , well - meaning relations .
-------
REAL=melville GUESS=melville
Come , then , to my cabin .
-------
REAL=austen   GUESS=melville
They are ripening fast ."
-------
REAL=melville GUESS=austen  
Oh , my sweet cardinals !
-------



In [15]:
#------------------------------------------------ STEP 13
print("13. Looking up 40 most informative features...")
whosaid.show_most_informative_features(40)

13. Looking up 40 most informative features...
Most Informative Features
           contains-miss = 1              austen : melvil =    385.9 : 1.0
          contains-frank = 1              austen : melvil =    157.2 : 1.0
           contains-thou = 1              melvil : austen =    103.1 : 1.0
        contains-captain = 1              melvil : austen =     86.7 : 1.0
            contains-mrs = 1              austen : melvil =     67.2 : 1.0
          contains-smith = 1              austen : melvil =     64.0 : 1.0
             contains-ye = 1              melvil : austen =     59.4 : 1.0
         contains-martin = 1              austen : melvil =     58.0 : 1.0
           contains-feet = 1              melvil : austen =     50.6 : 1.0
       contains-isabella = 1              austen : melvil =     49.6 : 1.0
           contains-fish = 1              melvil : austen =     42.3 : 1.0
      contains-agreeable = 1              austen : melvil =     41.1 : 1.0
           contains-dear = 

## 1. Classifier Accuracy
#### What is the system's accuracy? Is it lower or higher than you expected?

In [16]:
accuracy = nltk.classify.accuracy(whosaid, test_feats) 
print("Accuracy score on test data:", accuracy)

Accuracy score on test data: 0.942


The accuracy score on test data is 94.2% which is surprising high considering how models deployed in companies or real world applications are considered to be functional if  the accuracy score is over 60-70%.

## 2. Features
#### Examine the gen_feats() function. What sort of features is used in this classifier model? Examine the list of the most informative features and make observations. Do you notice any patterns? Any surprising entries?

In [17]:
print("Looking up 40 most informative features...")
whosaid.show_most_informative_features(40)

Looking up 40 most informative features...
Most Informative Features
           contains-miss = 1              austen : melvil =    385.9 : 1.0
          contains-frank = 1              austen : melvil =    157.2 : 1.0
           contains-thou = 1              melvil : austen =    103.1 : 1.0
        contains-captain = 1              melvil : austen =     86.7 : 1.0
            contains-mrs = 1              austen : melvil =     67.2 : 1.0
          contains-smith = 1              austen : melvil =     64.0 : 1.0
             contains-ye = 1              melvil : austen =     59.4 : 1.0
         contains-martin = 1              austen : melvil =     58.0 : 1.0
           contains-feet = 1              melvil : austen =     50.6 : 1.0
       contains-isabella = 1              austen : melvil =     49.6 : 1.0
           contains-fish = 1              melvil : austen =     42.3 : 1.0
      contains-agreeable = 1              austen : melvil =     41.1 : 1.0
           contains-dear = 1   

gen_feats( ) function has a list of main character names and whenever that name appears in a sentence, it assigns a value of 1 to that character name key in the feature dictionary. Each of the feature represents a dummy variable indicating whether it contains a certain main character's name (=1) or not (=0). When we look at the top40 most informative features, we see that words thats are associated with females such as "miss", "mrs" and "dear" and words that represent emotions such as "gratitude" and "delighted" are (depending on the word) 22 to 380 times more likely be said by Austen than Melville. On the other hand, male associated words like "captain" and words that refer to specific entities (e.g. animals, nature, objects) including "fish", "green" and "sea" are more likely to be said by Melville than Austen.

## 3. Main character names
#### Some of you are probably thinking: the classifier must be getting a lot of help from the main character names such as Emma, Ahab and Queequeg. Let's see how well it does without them. The script already contains a switch that you can turn on to "neutralize" the top 35 most common character names and place names in the two novels by turning them all into 'MontyPython'. Edit the file and set the value of noCharNames to True. Re-run the script. How is the new classifier's performance? Did it degrade as much as you expected? Why do you think that is? How is the top feature list affected? When you're done, set noCharNames back to False and re-build your classifier by running the script again. For the rest of this homework, USE THIS ORIGINAL SETTING.

In [18]:
print("Accuracy score on test data after neutralizing top 35 most common character names:", nltk.classify.accuracy(whosaid, test_feats2))

Accuracy score on test data after neutralizing top 35 most common character names: 0.934


In [19]:
print("Looking up 40 most informative features after neutralizing top 35 most common character names")
whosaid2.show_most_informative_features(40)

Looking up 40 most informative features after neutralizing top 35 most common character names
Most Informative Features
           contains-miss = 1              austen : melvil =    385.9 : 1.0
           contains-thou = 1              melvil : austen =    103.1 : 1.0
        contains-captain = 1              melvil : austen =     86.7 : 1.0
            contains-mrs = 1              austen : melvil =     67.2 : 1.0
          contains-smith = 1              austen : melvil =     64.0 : 1.0
             contains-ye = 1              melvil : austen =     59.4 : 1.0
         contains-martin = 1              austen : melvil =     58.0 : 1.0
           contains-feet = 1              melvil : austen =     50.6 : 1.0
       contains-isabella = 1              austen : melvil =     49.6 : 1.0
           contains-fish = 1              melvil : austen =     42.3 : 1.0
      contains-agreeable = 1              austen : melvil =     41.1 : 1.0
           contains-dear = 1              austen : melv

The original accuracy score was 0.942 but it dropped to 0.934 after neutralizing the top 35 most common character names. In my opinion, this drop in accuracy score is understandable because whether certain main character names appear in the text or not is pretty influential in the classification process. For example, "Perry" was one of the top40 significant features for the first model where we set noCharNames as False. In that model, "Perry" was 24 times more likely to appear in Austin's speech than that of Melville. After setting noCharNames as True, the "Perry" feature was no longer was included as one of the features in the model which led to a slightly decrease in accuracy score. However, the top feature list hasn't been affected as much and the same pattern mentioned in Question 2 still holds.

## 4. Trying out sentences
#### Test the classifier on the two sentences below. Sent1 is actually by Jane Austen, taken from Persuasion. Sent2 is from Alice's Adventures in Wonderland by Lewis Caroll.
- (Sent1) Anne was to leave them on the morrow, an event which they all dreaded.
- (Sent2) So Alice began telling them her adventures from the time when she first saw the White Rabbit.

In [20]:
sent1 = "Anne was to leave them on the morrow, an event which they all dreaded."
sent2 = "So Alice began telling them her adventures from the time when she first saw the White Rabbit."

In [21]:
def gen_sent_feat(sentence):
    features = {}
    for word in nltk.word_tokenize(sentence):
        features['contains-{}'.format(word.lower())] = 1
    return features

In [22]:
sent1_feat = gen_sent_feat(sent1)
sent2_feat = gen_sent_feat(sent2)

In [23]:
whosaid.classify(sent1_feat)

'austen'

In [24]:
whosaid.classify(sent2_feat)

'melville'

sent1 which we know is from Austen was classified as Austen, so the classifier seems to be working fine. sent2 was classified as "melville" which makes sense because sent2 includes words like "White", "Rabbit" which is related to the pattern we found earlier that sentences that include words representing solid entities (e.g. animals, nature stuff, objects etc.) are more likely to appear in melville's text.

## 5. Label probabilities for a sentence

#### Labeling judgments aside, how likely does your model thinks that Sent1 is Austen? That is essentially P(austen|Sent1). To find out, we need to use the .prob_classify method instead of the usual .classify. Below demonstrates how to find the probability estimates assigned to eithe****r label for the sentence 'Hello, world'. whosaid thinks it's 72% Melville and 28% Austen:

** a. Try it with Sent1. What is P(austen|Sent1)? That is, given Sent1, how likely is it to be Austen? What is P(melville|Sent1)?**

In [25]:
print(whosaid.prob_classify(sent1_feat).prob('austen'), whosaid.prob_classify(sent1_feat).prob('melville'))

0.9567679967331554 0.04323200326684133


P(austen|Sent1) = 95.6% and P(melville|Sent1) = 4.3%

**b. How about Sent2 -- P(austen|Sent2) and P(melville|Sent2)?**

In [26]:
print(whosaid.prob_classify(sent2_feat).prob('austen'), whosaid.prob_classify(sent2_feat).prob('melville'))

0.4385996907223784 0.5614003092776179


P(austen|Sent2) = 43.9% and P(melville|Sent2) = 56.1%

**c. From a. and b., how "confident" is your classifier about Sent1 being Austen? Is it equally confident on Sent2 being Melville?**

No! The classifier is very confident about sent1 being Austen as we can see from the very high conditional probability of 95.6%. On the other hand, the classifier is less confident about sent1 being Melville than sent1 being Austen as we can see from the conditional probability of 56.1% although this is still higher than the conditional probability of sent2 being Austen.

## 6. Trying out made-up sentences
Now, test the classifier on the following made-up sentences:

- (Sent3) He knows the truth
- (Sent4) She knows the truth
- (Sent5) blahblahblah blahblah

In [27]:
sent3 = 'He knows the truth'
sent4 = 'She knows the truth'
sent5 = 'blahblahblah blahblah'

**a. What labels did the classifier give to Sent3 and Sent4, and with what probabilities? Any thoughts?**

In [68]:
sent3_feat = gen_sent_feat(sent3)
sent4_feat = gen_sent_feat(sent4)

print("Classified label of sent3: ", whosaid.classify(sent3_feat))
print("Classified label of sent4: ", whosaid.classify(sent4_feat))

print("Conditional Probs for sent3 (austen|sent3 & melville|sent3): ",
      whosaid.prob_classify(sent3_feat).prob('austen'), whosaid.prob_classify(sent3_feat).prob('melville'))
print("Conditional Probs for sent4 (austen|sent4 & melville|sent4): ",
      whosaid.prob_classify(sent4_feat).prob('austen'), whosaid.prob_classify(sent4_feat).prob('melville'))

Classified label of sent3:  melville
Classified label of sent4:  austen
Conditional Probs for sent3 (austen|sent3 & melville|sent3):  0.4910301810185188 0.5089698189814804
Conditional Probs for sent4 (austen|sent4 & melville|sent4):  0.9383515139571844 0.061648486042814823


This makes sense because we have previously found out from the top40 features that melville is closely associated with masculine terms such as captain/him/his while austen is more closely associated with feminine terms such as "miss/she/her". We also observe that the conditional probability of sent4 being austen is overwhelmingly high (93.8%) and this illustrates that feminine words such as "she" has an immense influence in classifying whether a sentence is Austen or not.

**b. What about Sent5? Given that neither "word" appeared in the training data, why do you think the classifier made the prediction it did?**

In [29]:
sent5_feat = gen_sent_feat(sent5)

print("Classified label of sent5: ", whosaid.classify(sent5_feat))

print("Conditioanl Probs for sent5 (austen|sent5 & melville|sent5): ",
      whosaid.prob_classify(sent5_feat).prob('austen'), whosaid.prob_classify(sent5_feat).prob('melville'))

Classified label of sent5:  melville
Conditioanl Probs for sent5 (austen|sent5 & melville|sent5):  0.44034184649904307 0.5596581535009568


I am assuming that we had more sentences coming from melville in the corpus which made the probability be more heavily weighted towards melville.

## 7. Base probabilities (= priors)
Not knowing anything about a sentence, is it more likely to be Austen or Melville? We can answer this question by establishing the base probabilities, i.e. priors. In your training data (i.e., train_sents or train_feats)

**a. How many sentences are there?**

In [30]:
print("There are {} sentences.".format(len(train_sents)))

There are 15152 sentences.


**b. How many of them are Austen?**

In [31]:
austen = []
melville = []
for s in train_sents:
    if s[1]=='austen':
        austen.append(s)
    elif s[1] == 'melville':
        melville.append(s)

In [32]:
print("There are {} sentences from Austen.".format(len(austen)))

There are 6672 sentences from Austen.


**c. How many of them are Melville?**

In [33]:
print("There are {} sentences from Austen.".format(len(melville)))

There are 8480 sentences from Austen.


**d. From the above, what are P(austen) and P(melville)?**

In [34]:
print("P(austen): ", len(austen) / len(train_sents))
print("P(melville): ", len(melville) / len(train_sents))

P(austen):  0.440337909186906
P(melville):  0.5596620908130939


**e. How is your answer to d. related to the classifier's prediction on Sent5 above?**

The prior probabilities exactly match with the conditional probabilities of P(austen|sent5) and P(melville|sent5) and this shows that prior probabilities which are weighted depending on how many sentences from melville or austen appear in the corpus have been used for sentences like sent5 where neither of the two words is in the training data.

## 8. Calculating odds ratio
Would the word 'very' be more indicative of Austen or Melville, and how strongly so? Let's answer this by calculating its odds ratio. Find out the following, again in the training data:

**a. How many Austen sentences contain 'very'? Make sure to count 'Very' as well.**

In [35]:
austen_very_count = 0
for s in austen:
    if 'very' in s[0]:
        austen_very_count += 1
    elif 'Very' in s[0]:
        austen_very_count += 1

In [36]:
print("{} Austen sentences contain the word 'very'.".format(austen_very_count))

934 Austen sentences contain the word 'very'.


**b.How about Melville sentences?**

In [37]:
melville_very_count = 0
for s in melville:
    if 'very' in s[0]:
        melville_very_count += 1
    elif 'Very' in s[0]:
        melville_very_count += 1

In [38]:
print("{} Melville sentences contain the word 'very'.".format(melville_very_count))

271 Melville sentences contain the word 'very'.


**c. What is P(very|austen)? That is, given an Austen sentence, how likely is it to contain 'very'?**

In [39]:
print("P(very|austen): ", austen_very_count / len(austen))

P(very|austen):  0.13998800959232613


**d. What is P(very|melville)? That is, given a Melville sentence, how likely is it to contain 'very'?**

In [40]:
print("P(very|melville): ", melville_very_count / len(melville))

P(very|melville):  0.03195754716981132


**e. What is Austen-to-Melville odds ratio of 'very'?**

In [41]:
print("Austen to Melville odds ratio is: ",(austen_very_count / len(austen)) / (melville_very_count / len(melville)))

Austen to Melville odds ratio is:  4.380436610121497


## 9. Feature weights in model
P(very|austen) and P(very|melville) are indeed the 'weights' your model assigns to the feature 'contains-very':1. Let's confirm this by probing your model. Use the .feature_weights() method:

**a. What are the weights of 'very'?**

In [42]:
whosaid.feature_weights('contains-very', 1)

{'austen': 0.14004196013786901, 'melville': 0.0320127343473647}

**b. Do they match up with what you calculated in 8.c and 8.d above? (They should. The small differences are effects of smoothing, which may be more pronounced in other cases.)**

We got 0.13998800959232613 and 0.03195754716981132 for 8.c and 8.d and they almost match with the values we got from the feature_weights (0.14, 0.032) with some minor differences due to smoothing

## 10. Zero counts and feature weights
In order to accommodate features and feature-value pairs never encountered in the training data, a machine learning algorithm will adopt a couple of strategies, including smoothing.

**a. Look up the feature weights of 'contains-whale' and also 'contains-ahab'. What do you notice?**

In [43]:
whosaid.feature_weights('contains-whale', 1)

{'austen': 7.49288176232579e-05, 'melville': 0.11266360099044924}

In [44]:
whosaid.feature_weights('contains-ahab', 1)

{'austen': 7.49288176232579e-05, 'melville': 0.05017097040443344}

I notice that the weight of the two words for austen is exactly the same while the weight of "whale" for melville is higher than that of "ahab".

**b. This time, look up the feature weights for words 'marriage' and 'Emma'. Anything noticeable?**

In [45]:
whosaid.feature_weights('contains-marriage', 1)

{'austen': 0.004870373145511764, 'melville': 0.00029477655936799903}

In [46]:
whosaid.feature_weights('contains-emma', 1)

{'austen': 0.10962086018282631, 'melville': 5.895531187359981e-05}

I notice that the feature weight of "marriage" is more than 10 times higher for austen than for melville. On the flip side, I notice that the feature weight of "emma" for austen is more ethan 1859 times higher than for melville. The words "marriage" and "emma" are very highly important in determining whether a sentence is classifed as Austen.

**c. Find a word that occurs in Austen's work only, and another that occurs only in Melville, and then look up their feature weights. You should have a theory by now -- sum up what is going on with these words and their feature weights.**

In [47]:
austen_Jane_count = 0
for s in austen:
    if 'Jane' in s[0]:
        austen_Jane_count+=1

In [48]:
# Jane feature weight for Austen
austen_Jane_count / len(austen)

0.03821942446043165

In [49]:
melville_captain_count = 0
for s in melville:
    if 'captain' in s[0]:
         melville_captain_count += 1

In [50]:
# captain feature weight for Melville
melville_captain_count / len(melville)

0.01179245283018868

"Jane" has a pretty high feature weight for Austen of 0.038 considering the low feature weights can be something as low as some integer times 10 to the power of -5. Similarly, the feature weight of "captain" for Melville is 0.0117 which is also pretty high. These high values of feature weight shows that "Jane" and "captain" are key words that can be used to determine whether a sentence is classified as Austen or Melville respectively.

**d. As a comparison point, the word 'cautiously' occurs exactly once in the Austen training sentences, and likewise in the Melville training sentences. How do its feature weights compare against the ones you saw above?**

In [51]:
whosaid.feature_weights('contains-cautiously', 1)

{'austen': 0.0002247864528697737, 'melville': 0.00017686593562079943}

The feature weights of "cautiously" for Austen and Melville are both about 10 times lower than "Jane" and "captain" respectively. This means the likelihood of the word "cautiously" appearing in Austen and Melville text is lower than "Jane" and "captain" respectively. This makes sense considering how "Jane" and "captain" are unique words that only appear in one text or the other while "cautiously" can appear in both texts (although in varying degrees of frequency).

**e. Now, try 'contains-internet'. What happens this time?**

In [52]:
import logging
logger = logging.getLogger('ftpuploader')

try:
    whosaid.feature_weights('contains-internet', 1)
    
except Exception as e:
    logger.error(e)

('melville', 'contains-internet')


We see that none of the sentences from Melville contains the word "internet".

In [53]:
austen_internet_count = 0
for s in austen:
    if 'internet' in s[0]:
        austen_internet_count+=1

In [54]:
# internet feature weight for Austen
austen_internet_count / len(austen)

0.0

oops! We see that none of the sentences from Austen contains the word "internet" as well.

**f. Then, using the .prob_classify method, find out the likelihood of 'She hates the internet' being an Austen sentence. Then try 'She hates the'. What can you conclude about the classifier's handling of features it never encountered in the training data?**

In [55]:
sent7 = 'She hates the internet'
sent8 = 'She hates the'
sent7_feat = gen_sent_feat(sent7)
sent8_feat = gen_sent_feat(sent8)

In [56]:
print(whosaid.prob_classify(sent7_feat).prob('austen'), whosaid.prob_classify(sent8_feat).prob('austen'))

0.8958225408422634 0.8958225408422634


We see that the probabilities of sent7 and sent8 being classifed as Austen are the same because the word "internet" is not in the training data. The classifier ignores words like "internet" that are not part of the training data and only uses other words that appear in the training data to calculate conditional probabilities.

## 11. Combining feature weights

**a. You already calculated the Austen prior: P(austen) in 7 d. What is it?**

P(austen) is 0.440337909186906.

**b. P(he|austen) can be found through whosaid.feature_weights('contains-he', 1). Likewise with the rest of the words.**

In [57]:
whosaid.feature_weights('contains-he', 1)

{'austen': 0.16776562265847444, 'melville': 0.15334276618323311}

In [58]:
whosaid.feature_weights('contains-knows', 1)

{'austen': 0.004270942604525701, 'melville': 0.0028888102818063906}

In [59]:
whosaid.feature_weights('contains-the', 1)

{'austen': 0.3799640341675408, 'melville': 0.6001061195613725}

In [60]:
whosaid.feature_weights('contains-truth', 1)

{'austen': 0.004870373145511764, 'melville': 0.004067916519278387}

**c. From a. and b., calculate P(Sent3, austen)**

In [64]:
p_sent3_aust = 0.440337909186906 * 0.16776562265847444 * 0.004270942604525701 * 0.3799640341675408 * 0.004870373145511764

In [61]:
print("P(sent3, austen) = P(austen) * P(austen|he) * P(austen|knows) * P(austen|the) * P(austen|truth) = {}".\
      format(0.440337909186906 * 0.16776562265847444 * 0.004270942604525701 * 0.3799640341675408 * 0.004870373145511764))

P(sent3, austen) = P(austen) * P(austen|he) * P(austen|knows) * P(austen|the) * P(austen|truth) = 5.838718138066814e-07


**d. Similarly, calculate P(Sent3, melville).**

In [63]:
p_sent3_melv = 0.5596620908130939 * 0.15334276618323311 * 0.0028888102818063906 * 0.6001061195613725 * 0.004067916519278387

In [62]:
print("P(sent3, melville) = P(melville) * P(melville|he) * P(melville|knows) * P(melville|the) * P(melville|truth) = {}".\
      format(0.5596620908130939 * 0.15334276618323311 * 0.0028888102818063906 * 0.6001061195613725 * 0.004067916519278387))

P(sent3, melville) = P(melville) * P(melville|he) * P(melville|knows) * P(melville|the) * P(melville|truth) = 6.052130617577441e-07


**e. Now, calculate P(Sent3) as P(Sent3, austen) + P(Sent3, melville).**

In [65]:
print("P(sent3) = P(sent3, austen) + P(sent3, melville) = {}".format(p_sent3_aust + p_sent3_melv))

P(sent3) = P(sent3, austen) + P(sent3, melville) = 1.1890848755644255e-06


**f. Ultimately, the probability question we like to answer is: "Given the sentence He knows the truth, how likely is it to be Austen"? That is, what is P(austen|Sent3)? Use formula ① above to calculate this.**

In [66]:
print("P(austen|sent3) = {}".format(p_sent3_aust / (p_sent3_aust + p_sent3_melv)))

P(austen|sent3) = 0.4910261881260021


**g. Does the figure match up with the classifier's estimation from 6.a above? (It should.)**

P(austen|sent3) calculated in 6.a was 0.4910301810185188 which matches with the P(austen|sent3) calculated in (f) with a minor difference in the later parts of the decimals which can be considered as an acceptable difference within the margion of error.

## 12. Performance on the development-test data
Work with the four lists aa, mm, am, ma to answer the following questions.

**a. Of the 1,000 development-test set, how many of them did the classifier correctly label?**

In [71]:
print("{} out of 1000 development test set were correctly labeled".format(len(aa) + len(mm)))

957 out of 1000 development test set were correctly labeled


**b. What is whosaid's accuracy on the development test data? Ideally, it should be close to its performance on the test data -- is it?**

whosaid's accuracy on the development-test data is 957 / 1000 = 0.957 while the the accuracy score on the test data was 0.942 as we have seen in step10 of Part(A). The accuracy score on the development test data isn't necessarily close to that on the test data especially when the model overfitted and hence is not good at classifying new unseen test data. If the accuracy on the development-test data gets closer to the performance on the test data, this means the model is moving towards a direction where variance is being reduced so that its predictive power for unseen data increases.

**c. What % of the sentences did the classifier label as 'austen'? How about 'melville'? Why do you think it is not 50-50?**

In [72]:
(len(ma) + len(aa)) * 100 / 1000, (len(am) + len(mm)) * 100 / 1000

(45.7, 54.3)

45.7% of sentences were labelled as austen and 54.3% of the sentences were labelled as melville. The classifier might not have (and often time do not have) the same predictive power for each of the labels (in this case, austen and melville). For example, if the classifier does a poor job at correctly classifying austen sentences, it might classify more sentences as melville and this will cause the number of sentences labelled as melville to blow up. This is why it is very rare to achieve 50-50 in terms of the % of the sentences the classifier label as austen and melville.

**d. What % of the classifier's 'austen' rulings is correct? That is, when the classifier labels a sentence as 'Austen', what is the likelihood of this prediction to be correct?**

In [74]:
len(aa) / (len(ma) + len(aa))

0.9562363238512035

When the classifier labels a sentence as Austen, the likelihood of this prediction to be correct is about 95.6%.

**e. Likewise, when the classifier labels a sentence as 'melville', what is the likelihood of this prediction being correct?**

In [75]:
len(mm) / (len(mm) + len(am))

0.9576427255985267

When the classifier labels a sentence as melville, the likelihood of this prediction to be correct is about 95.76%.

## 13. Error analysis
The list am contains all sentences from the dev-test set that are in fact Austen but were mis-labeled as Melville by whosaid. Let's take a look at these errors.

**a. Print out all mis-classified Austen sentences by: for x in am: print(' '.join(x[2]))**

**What do you think of these sentences? Do they sound Melville-like to you?**

In [76]:
for x in am: 
    print(' '.join(x[2]))

" Dating from three o ' clock yesterday .
or a mermaid ?
It was badly done , indeed !
" Well -- as you please ; only don ' t have a great set out .
He came four times a day for a week .
The heat overcame me ."
They are not far off .
But fetch them both .
The gentlemen spoke of his horse .
Lord of the earth and sea , he bends a slave , And woman , lovely woman , reigns alone .
It might be safely viewed with all its appendages of prosperity and beauty , its rich pastures , spreading flocks , orchard in blossom , and light column of smoke ascending .-- She joined them at the wall , and found them more engaged in talking than in looking around .
" Good God !"
They are ripening fast ."
In a moment he went on --
What an air of probability sometimes runs through a dream !
" Good God !"
that ' s all .
He had gone beyond the sweep -- some way along the Highbury road -- the snow was nowhere above half an inch deep -- in many places hardly enough to whiten the ground ; a very few flakes were fall

A lot of the sentences sound like Melville to me because of words like "ground", "sea", "earth", "He" and "mermaid" that represent tangible entities and masculinity. Earlier we noticed that such words are more likely to appear in Melville's text.

**b. When a classifier mislabels, it is hoped that it at least did so with low confidence (say, 55%) than high (98%). Pick some sentence from the list and see what likelihood whosaid assigned to them for being Melville. Of all sentences you tried, which was judged Melville with the lowest confidence? Which one was it most sure about being Melville?**

In [94]:
for s in am:
    print(s, whosaid.prob_classify(gen_feats(str(s[2]))).prob('melville'))

('austen', 'melville', ['"', 'Dating', 'from', 'three', 'o', "'", 'clock', 'yesterday', '.']) 0.9997479147589855
('austen', 'melville', ['or', 'a', 'mermaid', '?']) 0.999221365737171
('austen', 'melville', ['It', 'was', ('austen', 'melville', ['"', 'Dating', 'from', 'three', 'o', "'", 'clock', 'yesterday', '.']), 'done', ',', 'indeed', '!']) 0.999981457560398
('austen', 'melville', ['"', 'Well', '--', 'as', 'you', 'please', ';', 'only', 'don', "'", 't', 'have', 'a', 'great', 'set', 'out', '.']) 0.999999866164424
('austen', 'melville', ['He', 'came', 'four', 'times', 'a', 'day', 'for', 'a', 'week', '.']) 0.9940807645151838
('austen', 'melville', ['The', 'heat', 'overcame', 'me', '."']) 0.9999191922846851
('austen', 'melville', ['They', 'are', 'not', 'far', 'off', '.']) 0.9994441003426138
('austen', 'melville', ['But', 'fetch', 'them', 'both', '.']) 0.9969821026446707
('austen', 'melville', ['The', 'gentlemen', 'spoke', 'of', 'his', 'horse', '.']) 0.9997983060281489
('austen', 'melville'

- The sentence **['It', 'might', 'be', 'safely', 'viewed', 'with', 'all', 'its', 'appendages', 'of', 'prosperity', 'and', 'beauty', ',', 'its', 'rich', 'pastures', ',', 'spreading', 'flocks', ',', 'orchard', 'in', 'blossom', ',', 'and', 'light', 'column', 'of', 'smoke', 'ascending', '.--', 'She', 'joined', 'them', 'at', 'the', 'wall', ',', 'and', 'found', 'them', 'more', 'engaged', 'in', 'talking', 'than', 'in', 'looking', 'around', '.']** had a confidence level of 0.999998808915152 and thus it was what the classifier was most sure about being melville (wrongly classified with the most confidence)

- **['"', 'What', '!']** had a confidence level of 0.9396440965916052 and thus it was what the classifier was least sure about being melville