In [8]:
import glob
import os

data_home = "data"

In [17]:
def open_files(path):
    with open(path, 'r', encoding='utf-8') as f:
        text = f.readlines()
    text_strip = list([i.strip() for i in text if i != '\n'])
    text_join = ' '.join(text_strip)
    
    return text_join

contents = open_files(data)

In [16]:
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

def sents(paragraph):
    for sentence in sent_tokenize(paragraph):
        yield sentence

def tokenize(paragraph):
    for sentence in sents(paragraph):
        yield pos_tag(wordpunct_tokenize(sentence))

In [122]:
list(tokenize(contents))

['\ufeffthe',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'alic',
 'in',
 'wonderland',
 'by',
 'lewi',
 'carrol',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyon',
 'anywher',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrict',
 'whatsoev',
 'you',
 'may',
 'copi',
 'it',
 'give',
 'it',
 'away',
 'or',
 're-us',
 'it',
 'under',
 'the',
 'term',
 'of',
 'the',
 'project',
 'gutenberg',
 'licens',
 'includ',
 'with',
 'this',
 'ebook',
 'or',
 'onlin',
 'at',
 'www.gutenberg.org',
 'titl',
 'alic',
 'in',
 'wonderland',
 'author',
 'lewi',
 'carrol',
 'illustr',
 'gordon',
 'robinson',
 'releas',
 'date',
 'august',
 '12',
 '2006',
 'ebook',
 '19033',
 'languag',
 'english',
 'start',
 'of',
 'this',
 'project',
 'gutenberg',
 'ebook',
 'alic',
 'in',
 'wonderland',
 'produc',
 'by',
 'jason',
 'isbel',
 'irma',
 'spehar',
 'and',
 'the',
 'onlin',
 'distribut',
 'proofread',
 'team',
 'at',
 'http',
 '//www.pgdp.net',
 'illustr',
 'alic',
 'in',
 '

In [84]:
import nltk 
import string

def tokenize(text):
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        if token in string.punctuation: continue
        yield stem.stem(token)


# The corpus object
corpus = [
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]

def nltk_frequency_vectorize(corpus):

    # The NLTK frequency vectorize method
    from collections import defaultdict

    def vectorize(doc):
        features = defaultdict(int)

        for token in tokenize(doc):
            features[token] += 1

        return features

    return map(vectorize, corpus)


def sklearn_frequency_vectorize(corpus):
    # The Scikit-Learn frequency vectorize method
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(corpus)


def gensim_frequency_vectorize(corpus):
    # The Gensim frequency vectorize method
    import gensim
    
    tokenized_corpus = [list(tokenize(doc)) for doc in corpus]
    id2word = gensim.corpora.Dictionary(tokenized_corpus)
    return [id2word.doc2bow(doc) for doc in tokenized_corpus]


def nltk_one_hot_vectorize(corpus):
    # The NLTK one hot vectorize method
    def vectorize(doc):
        return {
            token: True
            for token in tokenize(doc)
        }

    return map(vectorize, corpus)


def sklearn_one_hot_vectorize(corpus):
    # The Sklearn one hot vectorize method

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer

    freq    = CountVectorizer()
    vectors = freq.fit_transform(corpus)

    print(len(vectors.toarray()[0]))

    onehot  = Binarizer()
    vectors = onehot.fit_transform(vectors.toarray())

    print(len(vectors[0]))


def gensim_one_hot_vectorize(corpus):
    # The Gensim one hot vectorize method
    import gensim
    import numpy as np

    corpus  = [list(tokenize(doc)) for doc in corpus]
    id2word = gensim.corpora.Dictionary(corpus)

    corpus  = np.array([
        [(token[0], 1) for token in id2word.doc2bow(doc)]
        for doc in corpus
    ])

    return corpus


def nltk_tfidf_vectorize(corpus):

    from nltk.text import TextCollection

    corpus = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)

    for doc in corpus:
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }


def sklearn_tfidf_vectorize(corpus):
    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf = TfidfVectorizer()
    return tfidf.fit_transform(corpus)


def gensim_tfidf_vectorize(corpus):
    import gensim

    corpus  = [list(tokenize(doc)) for doc in corpus]
    lexicon = gensim.corpora.Dictionary(corpus)

    tfidf   = gensim.models.TfidfModel(dictionary=lexicon, normalize=True)
    vectors = [tfidf[lexicon.doc2bow(vector)] for vector in corpus]

    lexicon.save_as_text('test.txt')
    tfidf.save('tfidf.pkl')

    return vectors


def gensim_doc2vec_vectorize(corpus):
    from gensim.models.doc2vec import TaggedDocument, Doc2Vec

    corpus = [list(tokenize(doc)) for doc in corpus]
    docs   = [
        TaggedDocument(words, ['d{}'.format(idx)])
        for idx, words in enumerate(corpus)
    ]
    model = Doc2Vec(docs, size=5, min_count=0)
    return model.docvecs


print(gensim_doc2vec_vectorize(corpus)[0])

[-0.06424516 -0.05657114  0.09078488  0.02659524  0.09938674]


In [82]:
[''gensim_doc2vec_vectorize(corpus)[1]

array([ 0.0326486 ,  0.05295769, -0.05507323,  0.00833519,  0.01148714],
      dtype=float32)

In [40]:
from nltk import ne_chunk
from nltk.chunk import tree2conlltags

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Binarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

GOODLABELS = frozenset(['PERSON', 'ORGANIZATION', 'FACILITY', 'GPE', 'GSP'])
# GPE is Geo-Political Entity, GSP is Geo-Socio-Political group


def identity(words):
    return words


class EntityExtractor(BaseEstimator, TransformerMixin):
    """
    Perform entity extraction
    Output is saved
    """
    def __init__(self, labels=GOODLABELS, **kwargs):
        self.labels = labels

    def get_entities(self, document):
        """
        Extract entities from a single document using the
        nltk.tree.ne_chunk method
        This method is called multiple times by the tranform method
        :param document: a list of lists of tuples
        :return entities: a list of comma-separated strings
        """
        entities = []
        for paragraph in document:
            for sentence in paragraph:
                # classifier chunk the sentences, adds category labels, e.g. PERSON
                trees = ne_chunk(sentence)
                # select only trees with the kinds of entities we want
                for tree in trees:
                    if hasattr(tree, 'label'):
                        if tree.label() in self.labels:
                            # entities is a list, each entry is a list of entities
                            # for a document
                            entities.append(
                                ' '.join([child[0].lower() for child in tree])
                                )
        return entities

    def fit(self, documents, labels=None):
        return self

    def transform(self, documents):
        """
        Create a representation of the documents as a list of their entities
        """
        for document in documents:
            yield self.get_entities(document[0])

def create_pipeline(estimator):
    steps = [
        ('extract', EntityExtractor()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        )),
        ('classifier', estimator)
    ]

    return Pipeline(steps)

def score_models(models, loader):
    for model in models:

        name = model.named_steps['classifier'].__class__.__name__
        scores = {
            'model': str(model),
            'name': name,
            'accuracy': [],
            'precision': [],
            'recall': [],
            'f1': [],
            'time': [],
        }

        for X_train, X_test, y_train, y_test in loader:
            start = time.time()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            scores['time'].append(time.time() - start)
            scores['accuracy'].append(accuracy_score(y_test, y_pred))
            scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
            scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
            scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

        yield scores


In [76]:
extractor = EntityExtractor()

In [77]:
extractor.fit_transform(tagged_contents)

<generator object EntityExtractor.transform at 0x0000024A1C01D270>

In [79]:
estimator = MultinomialNB()

In [80]:
create_pipeline(estimator)

Pipeline(steps=[('extract', EntityExtractor()),
                ('vectorize',
                 TfidfVectorizer(lowercase=False,
                                 tokenizer=<function identity at 0x0000024A10D065E0>)),
                ('classifier', MultinomialNB())])

In [92]:
from nltk import ngrams

list(ngrams(["I", "love", "eating", "pizza."], 2, ))

[('I', 'love'), ('love', 'eating'), ('eating', 'pizza.')]

In [126]:
from nltk.collocations import QuadgramCollocationFinder
from nltk.metrics.association import QuadgramAssocMeasures

ngrams = QuadgramCollocationFinder.from_words(word)

socred = ngrams.score_ngrams(QuadgramAssocMeasures.likelihood_ratio)
socred

[(('project', 'gutenberg-tm', 'project', 'gutenberg-tm'), 2514.103333075824),
 (("''", 'said', 'alic', '``'), 2150.45967446),
 (('project', 'gutenberg-tm', 'electron', 'work'), 2140.2341849888867),
 (("''", 'said', 'the', 'king'), 2020.9065678373418),
 (("''", 'said', 'the', 'queen'), 1993.5537846154057),
 (('pardon', "''", 'said', 'alic'), 1984.9205240551523),
 (('majesti', "''", 'said', 'alic'), 1976.642579495851),
 (("''", 'said', 'alic', 'look'), 1969.6401193363708),
 (("''", 'said', 'alic', 'hastili'), 1969.3104535038979),
 (('me', "''", 'said', 'alic'), 1966.8284156889695),
 (("''", 'said', 'alic', 'indign'), 1966.358862124659),
 (('serpent', "''", 'said', 'alic'), 1961.9075429600302),
 (("''", 'said', 'alic', 'in'), 1959.5878005046616),
 (('nonsens', "''", 'said', 'alic'), 1959.4010807828827),
 (('here', "''", 'said', 'alic'), 1957.2857849575162),
 (("''", 'said', 'alic', 'loud'), 1955.5660621962938),
 (("''", 'said', 'alic', 'a'), 1955.5277249318665),
 (('inde', "''", 'said', '

In [123]:
word = list(tokenize(contents))

In [128]:
from sklearn.base import BaseEstimator, TransformerMixin 

class SignificantCollocations(BaseEstimator, TransformerMixin):
    
    def __init__(self, ngram_class=QuadgramCollocationFinder,
                metric=QuadgramAssocMeasures.pmi):
        self.ngram_class = ngram_class 
        self.metric = metric 
        
    def fit(self, docs, target):
        ngrams = self.ngram_class.from_documents(docs)
        self.scored_ = dict(ngrams.score_ngrams(self.metric))
        
    def transform(self, docs):
        for doc in docs:
            ngrams = self.ngram_class.from_words(docs)
            yield {
                ngram: self.scored_.get(ngram, 0.0)
                for ngram in ngrams.nbest(QuadgramAssocMeasures.raw_freq, 50)
            }

In [139]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer 

estimator = [('entity', EntityExtractor()), ('collocations', SignificantCollocations())]
estimator

[('entity', EntityExtractor()), ('collocations', SignificantCollocations())]

In [143]:
combined = FeatureUnion(estimator)
combined

FeatureUnion(transformer_list=[('entity', EntityExtractor()),
                               ('collocations', SignificantCollocations())])

In [149]:
Pipeline[('combined', combined)]

array([<generator object EntityExtractor.transform at 0x0000024A1F5A52E0>,
       <generator object SignificantCollocations.transform at 0x0000024A1F5A5040>],
      dtype=object)