### POS Tagging: Agenda
* <a href="#section1">What are parts of speech? Why are they useful?</a>
* <a href="#section2">How do you use them with SpaCy?</a>
* <a href="#section3">How do we infer them?</a>
* <a href="#section4"> How do we learn them with SpaCy?</a>


In [14]:
def from_scratch():
    !pip install spacy >> ~/spacy.log
    !python -m spacy download en >> ~/spacy.log
    !jupyter nbextension enable --py --sys-prefix widgetsnbextension

    
from_scratch()

In [18]:
from functools import wraps

def as_list(f):
    @wraps(f)
    def wrapper(*args, **kwargs):
        return list(f(*args, **kwargs))
    
    return wrapper



In [20]:
map_(lambda x: x + 1, range(10))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [22]:
import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy.attrs import POS
from spacy.en import English
import matplotlib.pyplot as plt
from functools import partial
import nltk
from operator import itemgetter
from itertools import groupby
from nltk.corpus import brown
from collections import defaultdict, Counter
import numpy as np
from spacy.tokens import Doc
from IPython.display import HTML
import warnings
import pandas as pd
from functools import wraps

def as_list(f):
    @wraps(f)
    def wrapper(*args, **kwargs):
        return list(f(*args, **kwargs))
    
    return wrapper
    


warnings.filterwarnings('ignore')
%matplotlib inline

map_ = as_list(map)
filter_ = as_list(filter)
zip_ = as_list(zip)

def rep_sentences(texts):
    html = []
    for text in texts:
        html.append(rep_sentence(text))
    return HTML("".join(html))

def rep_sentence(text, display_pos = True):
    html_colors = ['SkyBlue'
               ,'red'
               ,'YellowGreen'
               ,'yellow'
               ,'orange'
               ,'pink'
               ,'brown'
               ,'purple'
               , 'CadetBlue'
                ,'DarkKhaki'
                ,'DarkSalmon'
                ,'Gold'    
              ]
    doc = nlp(text)
    n_words = len(doc)
    unique_pos = list(set(map(lambda x: x.pos_, doc)))
    pos_to_color = {i:html_colors[unique_pos.index(i)] for i in unique_pos}
    css = ["<style>.word{font-weight:bold;}</style>"]
    for pos in unique_pos:
        css.append('<style>.{}{{background-color:{};}}</style>'.format(*[pos, pos_to_color[pos]]))
    css = "".join(css)

    html = ["<table width=100%>"]
    html.append(css)
    html.append("<tr>")            
    for i in range(n_words):
        word_string= doc[i].orth_
        html.append("<td><span class='word'>{0}</span></td>".format(word_string))
    html.append("</tr>")
    if display_pos:
        html.append("<tr>")            
        for i in range(n_words):
            pos = doc[i].pos_
            color = pos_to_color[pos]
            html.append("<td><span class='{0}'>{0}</span></td>".format(pos))
        html.append("</tr>")
    html = "".join(html)
    return html



def custom_tag_table(list_of_word_tag_tuples):
    html_colors = ['SkyBlue'
               ,'red'
               ,'YellowGreen'
               ,'yellow'
               ,'orange'
               ,'pink'
               ,'brown'
               ,'MediumPurple'
               , 'CadetBlue'
                ,'DarkKhaki'
                ,'DarkSalmon'
                ,'Gold'    
              ]
    
    n_words = len(list_of_word_tag_tuples)
    words, pos_list = zip(*list_of_word_tag_tuples)
    unique_pos = list(set([pos for pair in pos_list for pos in pair]))
    pos_to_color = {i:html_colors[unique_pos.index(i)] for i in unique_pos}
    css = ["<style>.word{font-weight:bold;}</style>"]
    for pos in unique_pos:
        css.append('<style>.{}{{background-color:{};}}</style>'.format(*[pos, pos_to_color[pos]]))
    css = "".join(css)

    html = ["<table width=100%>"]
    html.append(css)
    for i in range(n_words):
        html.append("<tr>")            
        word_string= words[i]
        html.append("<td><span class='word'>{0}</span></td>".format(word_string))
        row = []
        pos_sublist = pos_list[i]
        for pos in pos_sublist:
            entry = "<span class='{0}'>{0}</span> ".format(pos)
            #print entry
            row.append(entry)
        row = "".join(row)
        html.append("<td>{}</td>".format(row))
        html.append("</tr>")
    return "".join(html)
        
    

def nltk_corpus(corpus_name):
    corpus = getattr(nltk.corpus, corpus_name)
    try:
        corpus.ensure_loaded()
    except:
        nltk.download(corpus_name)
    return corpus

#read nltk corpora
def nltk_reader(corpus_name, limit = None):
    corpus = nltk_corpus(corpus_name)
    fileids = corpus.fileids()
    
    if limit:
        doc_iter = (" ".join([" ".join(j) for j in corpus.sents(fileid)]) for fileid in fileids[:limit])
    else:
        doc_iter = (" ".join([" ".join(j) for j in corpus.sents(fileid)]) for fileid in fileids)
    return doc_iter

universal_tags = [
     ['Open Class Words','ADJ','adjective']
    ,['Open Class Words','ADV','adverb']
    ,['Open Class Words','INTJ','interjection']
    ,['Open Class Words','NOUN','noun']
    ,['Open Class Words','PROPN','proper noun']
    ,['Open Class Words','VERB','verb']
    ,['Closed Class Words','ADP','adposition']
    ,['Closed Class Words','AUX','auxiliary']
    ,['Closed Class Words','CCONJ','coordination conjunction']
    ,['Closed Class Words','DET','determiner']
    ,['Closed Class Words','NUM','numeral']
    ,['Closed Class Words','PART','particle']
    ,['Closed Class Words','PRON','pronoun']
    ,['Closed Class Words','SCONJ','subordinating conjection']
    ,['Other','PUNCT','punctuation']
    ,['Other','SYM','symbol']
    ,['Other','X','other']
]
tag_table = pd.DataFrame(universal_tags, columns = ['Category','Abbrev','Part of Speech'])
tag_table = tag_table.set_index(['Category','Abbrev'])

nltk.download('tagsets')
nltk.download('universal_tagset')
nlp = spacy.load('en')

[nltk_data] Downloading package tagsets to /home/jupyter/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


<a name="section1"></a>

### What are Parts of Speech?

In [5]:
tag_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Part of Speech
Category,Abbrev,Unnamed: 2_level_1
Open Class Words,ADJ,adjective
Open Class Words,ADV,adverb
Open Class Words,INTJ,interjection
Open Class Words,NOUN,noun
Open Class Words,PROPN,proper noun
Open Class Words,VERB,verb
Closed Class Words,ADP,adposition
Closed Class Words,AUX,auxiliary
Closed Class Words,CCONJ,coordination conjunction
Closed Class Words,DET,determiner


In [6]:
sentence1 = 'I get a discount on newspapers.'
sentence2 = 'I discount that newspaper.'

rep_sentences([sentence1, sentence2])

0,1,2,3,4,5,6
I,get,a,discount,on,newspapers,.
PRON,VERB,DET,NOUN,ADP,NOUN,PUNCT

0,1,2,3,4
I,discount,that,newspaper,.
PRON,VERB,ADP,NOUN,PUNCT


<a name='applications'></a>
### Applications
* Rule based systems:
    * <a href="#qacode">Example of rule based question answering component</a>
* Feature engineering for statistical models
    * <a href="#wordsense">Feature for word disambiguation</a>

<a name="section2"></a>
### Parts of Speech with SpaCy

In [7]:
### Accessing
doc = nlp('I get a discount on newspapers')
tags = {}

for word in doc:
    tags[word.orth_] = {'lemma': word.lemma_, 
                        'pos (coarse)': word.pos_, 
                        'pos (fine)':word.tag_}
pd.DataFrame(tags).T

Unnamed: 0,lemma,pos (coarse),pos (fine)
I,-PRON-,PRON,PRP
a,a,DET,DT
discount,discount,NOUN,NN
get,get,VERB,VBP
newspapers,newspaper,NOUN,NNS
on,on,ADP,IN


### Exercise: Building word vectors that are Part of Speech specific
Steps:
* get documents
* tokenize the documents, and append the part of speech to each token, e.g. dog|NOUN
* train a word2vec model with gensim
* compare the most similar words of 'back||||VERB' vs 'back||||NOUN' (or other combo)

* Hints:
    * model.wv.vocab contains the vocabulary.
    * using a completely unique join character will make it easier to split later.

In [191]:
from gensim.models import Word2Vec

def return_documents():
    from sklearn.datasets import fetch_20newsgroups
    dataset = fetch_20newsgroups()
    corpus = dataset.data
    return corpus

def tokenize_and_tag_documents(documents, nlp, sep_char="|"):
    pass

def build_model(tokenized_docs):
    pass

def compare_most_similar_words_across_pos(word):
    pass

documents = return_documents()
tokenized_and_tagged_documents = tokenize_and_tag_documents(documents, nlp)
model = build_model(tokenized_and_tagged_documents)
compare_most_similar_words_across_pos('back')

<a name="section3"></a>
### How do we infer parts of speech?

In [13]:
from IPython.display import clear_output, display
from ipywidgets import Button
class reveal(object):
    def __init__(self):
        self.text = 'I was loble to find the effix by klepping the Dongle search engine.'
        self.toggle = Button(description='Toggle POS', )
        self.toggle.on_click(self.toggle_pos)
        self.state = False
        display(self.toggle)
        self.display()
        
    def toggle_pos(self, b):
        self.state = not self.state
        self.display()
        
    def display(self):
        clear_output()
        display(HTML(rep_sentence(self.text, display_pos = self.state)))
        
r = reveal()

0,1,2,3,4,5,6,7,8,9,10,11,12,13
I,was,loble,to,find,the,effix,by,klepping,the,Dongle,search,engine,.


### Determinants of Part of Speech:
* Word: some words can only be used in a single way; we can memorize these.
* Word shape: if the first letter is capitalized, its likely a proper noun.
* Neighboring part of speech: there are common patterns, such as noun phrases commonly following a determiner. to the beach



| Feature | Notes | Example|
|------|------|------|
|   Word Identity  | Some words can only be used in a single way; we can memorize these.| "the" -> determiner| 
| Word Shape|Capitalization, dashes,  |"I stayed at the Park Hotel."|
|Neighboring parts of speech|There are common patterns what tags can neighbor others|"to the beach" (noun following determiner)|
|Morphological Structures|Word prefixes and suffixes can rule out certain tag types|"-ly" -> adverb|
|Syntactic Dependencies|Syntax may establish expectations that only certain tags can logically fill|"I was told __" -> adpositional phrase or object entity|
|?|?|?|


<a name="section4"></a>
### Training your own tagger

**Steps** : 
* load <a href="#load_data">**training data**</a>, where each observation is represented as (list_of_words, list_of_tags)
* Pick a <a href="#model_dir">**model directory**</a>. Using the existing English model will allow us to leverage lexeme information, including Brown clusters, which is an excellent feature for tagging.
* Look at your dataset. Build a <a href="#tagmap">**tag map**</a> mapping from the part of speech tags to the <a href="#universaltagset">universal tagset</a>.
* Decide which <a href="#featureextractors">**features**</a> to use
* create a <a href="#vocab">**vocabulary object, a statistical model, and a tagger**</a>
* <a href="#training">**Train the model**</a>
* <a href="#save">**Save the model**</a>

<a name="load_data"></a>

In [23]:
#load conll2000 corpus
from sklearn.model_selection import train_test_split
import nltk
def nltk_corpus(corpus_name):
    '''returns nltk corpus by name. if not loaded, download.'''
    corpus = getattr(nltk.corpus, corpus_name)
    try:
        corpus.ensure_loaded()
    except:
        nltk.download(corpus_name)
    return corpus


def clean(x):
    if x == '-LCB-':
        return '{'
    elif x=='-RCB-':
        return '}'
    elif x == '-RRB-':
        return ")"
    elif x == '-LRB-':
        return "("
    else:
        return x
    
def conll_to_data():
    corpus = nltk_corpus('conll2000')
    all_data= map_(lambda x: [(clean(i[0]), i[1]) for i in x], corpus.iob_sents())
    all_data = [zip_(*i) for i in all_data]
    return all_data

c2000 = conll_to_data()
training_data, testing_data = train_test_split(c2000, test_size = .1)

<a name="model_dir"></a>

In [174]:
from pathlib import PosixPath
modelpath = PosixPath('/home/jupyter/mymodel')
if not modelpath.exists():
    modelpath.mkdir()
    
nlp.save_to_directory(modelpath)

<a href="tagmap"></a>

In [180]:
from spacy.language_data import TAG_MAP
from spacy.attrs import POS
from spacy.symbols import PUNCT

def adjust_tagmap(tagmap):
    tagmap['('] = tagmap['-LRB-']
    tagmap[')'] = tagmap['-RRB-']
    tagmap['{'] = tagmap['-LRB-']
    tagmap['}'] = tagmap['-RRB-']
    tagmap['$'] = {POS: PUNCT}
    return tagmap

tagmap = adjust_tagmap(TAG_MAP)

<a name="featureextractors"></a>
### Features

* Example features: pos of previous word, identity of current word, etc...
* spacy.tagger.N1_cluster, spacy.tagger.N1_pos, etc...
* which word?
    * N1: Next
    * N0: Current
    * P1: Previous
    * etc...
* which attribute:
    * prefix
    * tag
    * cluster
    * etc...

In [181]:
from spacy.tagger import *


features = [
    #current word attributes
    (W_orth,),(W_shape,),(W_cluster,),(W_flags,),(W_suffix,),(W_prefix,),

    #-1 word attributes    
    (P1_pos,),(P1_cluster,),(P1_flags,),(P1_suffix,),

    #-2 word attributes     
    (P2_pos,),(P2_cluster,),(P2_flags,),

    #+1 word attributes    
    (N1_orth,),(N1_suffix,),(N1_cluster,),(N1_flags,),    

    #+2 word attributes    
    (N2_orth,),(N2_cluster,),(N2_flags,),

    #combination attributes
    (P1_lemma, P1_pos),(P2_lemma, P2_pos), (P1_pos, P2_pos),(P1_pos, W_orth)
]

<a name="vocab"></a>
### Vocabulary, Tagger, and Statistical Model
* The **Vocab** object will receive all the lexeme data (Brown clusters, word vectors, etc) from the English model.
* The **Statistical Model**  will consume the features we defined, using them to make predictions.
* The **Tagger** will consume our vocabulary object, and our statistical model.

In [188]:

def get_lemmatizer():
    return None

def make_tagger(vocab, templates):
    model = spacy.tagger.TaggerModel(templates)
    return spacy.tagger.Tagger(vocab,model)

def get_feature_extractors(nlp):
    #return nlp.vocab.lex_attr_getters
    return nlp.Defaults.lex_attr_getters

#get requirements

feature_extractors = get_feature_extractors(nlp)
lemmatizer = get_lemmatizer()
vocab = Vocab.load(modelpath, feature_extractors, lemmatizer, tagmap)
statistical_model = spacy.tagger.TaggerModel(features)
tagger = spacy.tagger.Tagger(vocab, statistical_model)

In [183]:
# The untrained tagger is awful...
words = ['You','can','always','learn','more','by','reading']
doc = Doc(vocab, words = words)
tagger(doc)
map_(lambda x: (x.orth_, x.pos_), doc)

[('You', 'PUNCT'),
 ('can', 'PUNCT'),
 ('always', 'PUNCT'),
 ('learn', 'PUNCT'),
 ('more', 'PUNCT'),
 ('by', 'PUNCT'),
 ('reading', 'PUNCT')]

<a name="training"></a>
### Training the Model

##### Neuron Prediction:

**Inputs:** 

$<x_1, x_2, ..., x_n>$

**Each Neuron j:**

$prediction_j  = \bigg[\sum_{d=1}^D w_dx_{dj} \bigg] + b > 0$

##### Perceptron Learning:
```
For each document, label:
    prediction = weights * document + bias
    
    if sign(label) != sign(prediction):
        weights = weights + (label*features)
        bias = bias + label
```

### Exercise: 
Implement a perceptron algorithm:

In [287]:
from sklearn.utils import shuffle


In [316]:
from sklearn.utils import shuffle
class PerceptronClassifier(object):
    def __init__(self):
        self.weights = None
        self.bias = None
        self.historical_weights = {}
        self.iter = 0

    def fit(self, X, y, epochs=100):
        """Fits self.weights, self.biases """
        self.weights = np.zeros(X.shape[1])
        self.bias = 0

        for _ in range(epochs):
            for x, label in zip(X, y):
                
                prediction = self.predict(x)
                if np.sign(prediction) != np.sign(label):
                    self.update(label, x)
                    self.historical_weights[self.iter] = self.weights
                self.iter += 1
                
        
            X, y = shuffle(X, y)
                    
    def update(self, label, row):
        """Updates weights and biases based on the ground truth label
        and the row"""
        self.weights = self.weights + (label * row)
        self.bias += self.bias + label
        
    def predict(self, x):
        """Generates 1s and 0s by doing a linear transformation 
        of "x". Uses self.weights and self.bias"""
        pred = ((np.dot(x, self.weights)) + self.bias > 0).astype(float)
        return pred
    

In [317]:
#generate some data
X = np.random.normal(0, 10, size=(1000, 3))
true_b = np.array([1.2, -1.2, 0])
labels = (np.dot(X, true_b) > 5.0).astype(float)

b = PerceptronClassifier()
b.fit(X, labels, epochs=1000)

acc_rate = (b.predict(X) == labels).mean()
print("Training Accuracy Rate: {}".format(acc_rate))

Training Accuracy Rate: 0.391


In [300]:
pd.DataFrame(np.array(b.historical_weights))

Unnamed: 0,0,1,2
0,32.461313,-32.860281,6.467768
1,32.461313,-32.860281,6.467768
2,32.461313,-32.860281,6.467768
3,32.461313,-32.860281,6.467768
4,32.461313,-32.860281,6.467768
5,32.461313,-32.860281,6.467768
6,32.461313,-32.860281,6.467768
7,32.461313,-32.860281,6.467768
8,32.461313,-32.860281,6.467768
9,32.461313,-32.860281,6.467768


In [195]:
from IPython.display import YouTubeVideo
YouTubeVideo('JD_NtVl7o8c', width=560, height=315)

In [189]:
#train
from spacy.scorer import Scorer
from spacy.gold import GoldParse


def predict_cycle(vocab, tagger, data, train=True):
    """For each document in data, creates a document and tags
    it. Creates a goldparse object to hold the ground truth label.
    If train=True, the tagger's statistical model is updated with the 
    result."""
    
    scorer = Scorer()
    
    for words, tags in data:
        #create a document, passing in words to become tokens
        doc = Doc(vocab, words = words)
        tagger(doc)
        gold = GoldParse(doc, tags=tags)   

        scorer.score(doc, gold)              
        
        if train:
            #train the model        
            tagger.update(doc, gold)
    return tagger, scorer

def train(vocab, tagger, training_data, testing_data, epochs = 10):
    """
    Parameters
    ----------
    tagger (spacy.tagger.Tagger): 
        The tagger to train.
        
    training_data (list):
        Training data containing words and annotated tags. 
        Should have form: [(word1, word2,...),(tag1, tag2, .....)]
        
    epochs (int):
        number of training iterations
        
    verbose (Bool):
        whether to track and print training accuracy
    """
    
    tagger, pre_train_scorer = predict_cycle(vocab, tagger, training_data, train=False)
    tagger, pre_test_scorer = predict_cycle(vocab, tagger, testing_data, train=False)
    
    print("Pre Training Accuracy: {0} Pre Test Accuracy {1}".format(pre_train_scorer.tags_acc, 
                                                                    pre_test_scorer.tags_acc))
    
    for train_cycle in range(epochs):
            
        tagger, _ = predict_cycle(vocab, tagger, training_data, train=True)
        
        
        tagger, train_scorer = predict_cycle(vocab, tagger, training_data, train=False)
        tagger, test_scorer = predict_cycle(vocab, tagger, testing_data, train=False)

        print("Iteration {0} Training Accuracy: {1} Test Accuracy {2}".format(train_cycle, 
                                                                              train_scorer.tags_acc, 
                                                                              test_scorer.tags_acc))
        #shuffle data    
        np.random.shuffle(training_data)
    
    tagger.model.end_training()
        
    return tagger

tagger = train(vocab, tagger, training_data, testing_data)

Pre Training Accuracy: 0.0 Pre Test Accuracy 0.0
Iteration 0 Training Accuracy: 94.6284744833259 Test Accuracy 93.4171597633136
Iteration 1 Training Accuracy: 96.83783459574322 Test Accuracy 95.48816568047337
Iteration 2 Training Accuracy: 98.09396099667546 Test Accuracy 96.5587044534413
Iteration 3 Training Accuracy: 98.58107413373548 Test Accuracy 96.7650264715042
Iteration 4 Training Accuracy: 99.12859444082667 Test Accuracy 97.38788539395827
Iteration 5 Training Accuracy: 99.1024608424444 Test Accuracy 97.12706322018063
Iteration 6 Training Accuracy: 99.62856016725503 Test Accuracy 97.71099345998131
Iteration 7 Training Accuracy: 99.6088528635569 Test Accuracy 97.59810028028652
Iteration 8 Training Accuracy: 99.43791342495801 Test Accuracy 97.38009965742759
Iteration 9 Training Accuracy: 99.70096308736333 Test Accuracy 97.67595764559327


<a name="save"></a>
### Saving the Model

In [88]:
def save_tagger(tagger, model_dir):
    if model_dir is not None:
        tagger.model.dump(str(model_dir / 'pos' / 'model'))
        with (model_dir / 'vocab' / 'strings.json').open('w') as file_:
            tagger.vocab.strings.dump(file_)
            
save_tagger(tagger, modelpath)            

### Appendix

<a name='qacode'></a>
### Example Rule Based QA Component Code

In [None]:
def get_answer_requirements(token):
    if token.tag_ == 'WRB':
        if token.lower_ == 'where':
            #Where was Star Wars Filmed
            return ['LOCATION']
        elif token.lower_ == 'when':
            #When was Star Wars Filmed
            return ['DATE']
        elif token.lower_ == 'how':
            #How much did Star Wars make?
            if token.nbor().lower_ in ('much', 'many'):
                return ['QUANTITY']

            #How old is star wars?
            elif token.nbor().lower_ in ('long', 'old'):
                return ['DURATION']
            else:
                return False
        elif token.lower() == 'whom':
            #Whom did you see?
            return ['PERSON','ORG']      
        else:
            return False
    elif token.tag_ == 'WP':
        #Asking for Identity
        if token.lower_ in ('who', 'whose'):
            #Who directed Star Wars?
            return ['PERSON','ORG']
        if token.lower_ in ('which','what'):
            #What is Star Wars
            return False 
        else: 
            return False
    elif token.tag_ == 'WDT':
        #asking for a choice among options
        if token.lower_ in ('which','what'):
            #which Star Wars did you like best?
            return [token.nbor().lower_] #return neighbor
        else:
            return False
    else:
        return False

<a href='#applications'>back</a>
<a name="wordsense"></a>
##### Word sense disambiguation

In [131]:
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
for syn in wn.synsets('shower'):
    print(syn, syn.definition())

[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Synset('shower.n.01') a plumbing fixture that sprays water over you
Synset('shower.n.02') washing yourself by standing upright under water sprayed from a nozzle
Synset('shower.n.03') a brief period of precipitation
Synset('shower.n.04') a sudden downpour (as of tears or sparks etc) likened to a rain shower
Synset('exhibitor.n.01') someone who organizes an exhibit for others to see
Synset('shower.n.06') a party of friends assembled to present gifts (usually of a specified kind) to a person
Synset('lavish.v.01') expend profusely; also used with abstract nouns
Synset('shower.v.02') spray or sprinkle with
Synset('shower.v.03') take a shower; wash one's body in the shower
Synset('shower.v.04') rain abundantly
Synset('shower.v.05') provide abundantly with


<a href='#applications'>back</a>