In [1]:
import string
import numpy as np

from nltk.util import skipgrams, ngrams
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Show the punctuation characters that the token filters below test against.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
test_sent = 'I am a test sentence.  How cool is that, Mr. Joe?'
test_sent = word_tokenize(test_sent.lower())
test_sent = list(filter(lambda c: c not in string.punctuation, test_sent))
print(test_sent)
%timeit list(filter(lambda c: c not in string.punctuation, test_sent))

['i', 'am', 'a', 'test', 'sentence', 'how', 'cool', 'is', 'that', 'mr.', 'joe']
1.93 µs ± 11.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [10]:
%timeit ngrams(test_sent, 2)
list(ngrams(test_sent, 2))

203 ns ± 2.15 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


[('i', 'am'),
 ('am', 'a'),
 ('a', 'test'),
 ('test', 'sentence'),
 ('sentence', 'how'),
 ('how', 'cool'),
 ('cool', 'is'),
 ('is', 'that'),
 ('that', 'mr.'),
 ('mr.', 'joe')]

In [11]:
%timeit ngrams(test_sent, 3, 3)
list(skipgrams(test_sent, 3, 3))

207 ns ± 1.64 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


[('i', 'am', 'a'),
 ('i', 'am', 'test'),
 ('i', 'am', 'sentence'),
 ('i', 'am', 'how'),
 ('i', 'a', 'test'),
 ('i', 'a', 'sentence'),
 ('i', 'a', 'how'),
 ('i', 'test', 'sentence'),
 ('i', 'test', 'how'),
 ('i', 'sentence', 'how'),
 ('am', 'a', 'test'),
 ('am', 'a', 'sentence'),
 ('am', 'a', 'how'),
 ('am', 'a', 'cool'),
 ('am', 'test', 'sentence'),
 ('am', 'test', 'how'),
 ('am', 'test', 'cool'),
 ('am', 'sentence', 'how'),
 ('am', 'sentence', 'cool'),
 ('am', 'how', 'cool'),
 ('a', 'test', 'sentence'),
 ('a', 'test', 'how'),
 ('a', 'test', 'cool'),
 ('a', 'test', 'is'),
 ('a', 'sentence', 'how'),
 ('a', 'sentence', 'cool'),
 ('a', 'sentence', 'is'),
 ('a', 'how', 'cool'),
 ('a', 'how', 'is'),
 ('a', 'cool', 'is'),
 ('test', 'sentence', 'how'),
 ('test', 'sentence', 'cool'),
 ('test', 'sentence', 'is'),
 ('test', 'sentence', 'that'),
 ('test', 'how', 'cool'),
 ('test', 'how', 'is'),
 ('test', 'how', 'that'),
 ('test', 'cool', 'is'),
 ('test', 'cool', 'that'),
 ('test', 'is', 'that'),


In [14]:
def skipgram_analyzer(s, skipgram_list=((1, 0),)):
    '''
    Split a string into a list of token tuples (n-grams and skipgrams).

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    consist solely of punctuation characters are discarded.  For every
    ``(n, k)`` pair in ``skipgram_list`` the corresponding grams are appended
    to the result, in the order the pairs are given.

    Parameters
    ----------
    s : str
        The text to analyze.
    skipgram_list : iterable of (int, int)
        ``(n, k)`` pairs, where ``n`` is the gram length and ``k`` is the
        second argument forwarded to ``skipgrams``.  Pairs with ``k == 0``
        are produced with ``ngrams`` instead.  Defaults to unigrams only.

    Returns
    -------
    list of tuple
        Every generated gram, each one a tuple of tokens.
    '''
    punct = set(string.punctuation)
    # Drop a token only when *every* character is punctuation.  Testing
    # ``tok in string.punctuation`` is a substring check, so it would let
    # tokens such as '...' or '--' through; 'mr.' is kept either way.
    tokens = [tok for tok in word_tokenize(s.lower())
              if not all(ch in punct for ch in tok)]

    ret = []
    for n, k in skipgram_list:
        grams = ngrams(tokens, n) if k == 0 else skipgrams(tokens, n, k)
        ret.extend(grams)
    return ret

In [15]:
# Run the analyzer on the sample sentence with three (n, k) settings.
test_sent = 'I am a test sentence.  How cool is that, Mr. Joe?'
feat = skipgram_analyzer(
    test_sent,
    skipgram_list=[(1, 0), (2, 3), (3, 2)],
)
# Number of grams first, then the grams themselves, one per output line.
print(len(feat), feat, sep='\n')

91
[('i',), ('am',), ('a',), ('test',), ('sentence',), ('how',), ('cool',), ('is',), ('that',), ('mr.',), ('joe',), ('i', 'am'), ('i', 'a'), ('i', 'test'), ('i', 'sentence'), ('am', 'a'), ('am', 'test'), ('am', 'sentence'), ('am', 'how'), ('a', 'test'), ('a', 'sentence'), ('a', 'how'), ('a', 'cool'), ('test', 'sentence'), ('test', 'how'), ('test', 'cool'), ('test', 'is'), ('sentence', 'how'), ('sentence', 'cool'), ('sentence', 'is'), ('sentence', 'that'), ('how', 'cool'), ('how', 'is'), ('how', 'that'), ('how', 'mr.'), ('cool', 'is'), ('cool', 'that'), ('cool', 'mr.'), ('cool', 'joe'), ('is', 'that'), ('is', 'mr.'), ('is', 'joe'), ('that', 'mr.'), ('that', 'joe'), ('mr.', 'joe'), ('i', 'am', 'a'), ('i', 'am', 'test'), ('i', 'am', 'sentence'), ('i', 'a', 'test'), ('i', 'a', 'sentence'), ('i', 'test', 'sentence'), ('am', 'a', 'test'), ('am', 'a', 'sentence'), ('am', 'a', 'how'), ('am', 'test', 'sentence'), ('am', 'test', 'how'), ('am', 'sentence', 'how'), ('a', 'test', 'sentence'), ('a',

In [16]:
# No stop_words argument here: scikit-learn applies stop words only when
# analyzer == 'word', so alongside a callable analyzer the setting is ignored
# (the fitted vocabulary still contains grams such as ('a',) and ('am',)).
t = TfidfVectorizer(max_features=10000,
                    analyzer=lambda s: skipgram_analyzer(s, skipgram_list=[(1, 0), (2, 3), (3, 2)]))

In [29]:
# Fit the vectorizer on a two-document toy corpus and convert the result
# to a dense array so its shape can be inspected directly.
corpus = [
    'I am a test sentence.  How cool is that, Mr. Joe?',
    'Greetings, human.  I am another test sentence.',
]
T = t.fit_transform(corpus).toarray()
print(T.shape)

(2, 124)


In [30]:
# Inspect the fitted vocabulary: a dict mapping each gram (a tuple of tokens)
# to an integer index.
t.vocabulary_

{('a',): 0,
 ('a', 'cool'): 1,
 ('a', 'how'): 2,
 ('a', 'how', 'cool'): 3,
 ('a', 'sentence'): 4,
 ('a', 'sentence', 'cool'): 5,
 ('a', 'sentence', 'how'): 6,
 ('a', 'test'): 7,
 ('a', 'test', 'cool'): 8,
 ('a', 'test', 'how'): 9,
 ('a', 'test', 'sentence'): 10,
 ('am',): 11,
 ('am', 'a'): 12,
 ('am', 'a', 'how'): 13,
 ('am', 'a', 'sentence'): 14,
 ('am', 'a', 'test'): 15,
 ('am', 'another'): 16,
 ('am', 'another', 'sentence'): 17,
 ('am', 'another', 'test'): 18,
 ('am', 'how'): 19,
 ('am', 'sentence'): 20,
 ('am', 'sentence', 'how'): 21,
 ('am', 'test'): 22,
 ('am', 'test', 'how'): 23,
 ('am', 'test', 'sentence'): 24,
 ('another',): 25,
 ('another', 'sentence'): 26,
 ('another', 'test'): 27,
 ('another', 'test', 'sentence'): 28,
 ('cool',): 29,
 ('cool', 'is'): 30,
 ('cool', 'is', 'joe'): 31,
 ('cool', 'is', 'mr.'): 32,
 ('cool', 'is', 'that'): 33,
 ('cool', 'joe'): 34,
 ('cool', 'mr.'): 35,
 ('cool', 'mr.', 'joe'): 36,
 ('cool', 'that'): 37,
 ('cool', 'that', 'joe'): 38,
 ('cool', 't

In [31]:
from sklearn.decomposition import NMF

In [32]:
# Fix random_state so the factorisation is reproducible from run to run.
nmf = NMF(n_components=2, random_state=0)

In [35]:
# Factorise the dense TF-IDF matrix T; W is the transformed data returned by
# NMF (presumably one row per document, one column per component — confirm).
W = nmf.fit_transform(T)
W

array([[  2.74026997e-05,   1.05356462e+00],
       [  8.81770694e-01,   0.00000000e+00]])

In [41]:
# NOTE(review): interactive IPython help lookup for the array's dump method;
# a leftover from exploration that should be removed before sharing the notebook.
W.dump?