# No intelligence

In [1]:
from nltk import ngrams

In [2]:
# Naive bigrams: split on whitespace only and pair up adjacent tokens.
# No linguistic knowledge is used, so punctuation stays glued to words
# (e.g. 'god,' and '15%').
sentence = "oh my god, the chocolate is 15% of today"
n = 2
tokens = sentence.split()
bigrams = ngrams(tokens, n)
for pair in bigrams:
    print(pair)

('oh', 'my')
('my', 'god,')
('god,', 'the')
('the', 'chocolate')
('chocolate', 'is')
('is', '15%')
('15%', 'of')
('of', 'today')


# Some intelligence

In [3]:
from nltk import RegexpParser

In [4]:
# Chunk grammar for RegexpParser. Each rule assigns a label to a two-tag sequence:
#   NA = noun followed by adjective
#   AN = adjective followed by noun
#   NN = noun followed by noun
chunkbiGram = r"""NA: {<NOUN><ADJ> }
                        AN: {<ADJ><NOUN> }
                        NN: {<NOUN><NOUN> }
               """

In [5]:
# Build a chunk parser from the bigram grammar string.
chunkparserbigram = RegexpParser(chunkbiGram)

In [6]:
# Hand-tagged sentence as (token, part-of-speech tag) pairs.
# Unlike the whitespace-split `sentence` above, punctuation and '%' are
# separate tokens here, and the adjective 'dark' has been added.
example = [
    ("oh", "INTJ"),
    ("my", "INTJ"),
    ("god", "INTJ"),
    (",", "PUNCT"),
    ("the", "DET"),
    ("dark", "ADJ"),
    ("chocolate", "NOUN"),
    ("is", "VERB"),
    ("15", "NUM"),
    ("%", "NOUN"),
    ("of", "ADP"),
    ("today", "NOUN"),
]

In [7]:
# Run the chunk parser over the tagged tokens; the result is a tree whose
# subtrees are inspected by label (NA / AN / NN) in the next cell.
chunked = chunkparserbigram.parse(example)

In [8]:
# Message to print for each chunk label defined in the grammar.
chunk_messages = {
    'NA': 'found noun + adjective',
    'AN': 'found adjective + noun',
    'NN': 'found noun + noun',
}

# Walk every subtree; subtrees whose label is not one of the grammar's
# chunk labels are skipped.
for chunk in chunked.subtrees():
    message = chunk_messages.get(chunk.label())
    if message is None:
        continue
    print(message)
    # Leaves are (token, tag) pairs; keep only the token text.
    print([token[0] for token in chunk.leaves()])

found adjective + noun
['dark', 'chocolate']


# Intelligence: statistical approach

### Using NLTK

In [9]:
import itertools
from nltk.corpus import genesis
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

In [10]:
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return the `n` best-scoring bigrams found in `words`.

    Args:
        words: Sequence of tokens handed to
            ``BigramCollocationFinder.from_words``.
        score_fn: Association measure used to rank candidate bigrams
            (defaults to ``BigramAssocMeasures.chi_sq``).
        n: How many top-ranked bigrams to return.

    Returns:
        The result of ``nbest(score_fn, n)`` on the collocation finder
        (shown in this notebook as a list of 2-tuples of tokens).
    """
    finder = BigramCollocationFinder.from_words(words)
    return finder.nbest(score_fn, n)

In [11]:
# Top 25 bigrams from the Genesis corpus file 'english-web.txt', ranked by the default score_fn.
# NOTE(review): presumably requires the NLTK 'genesis' corpus to be available locally — confirm.
# This rebinds `bigrams`, which an earlier cell used for the naive n-gram iterator.
bigrams = bigram_word_feats(genesis.words('english-web.txt'), n=25)

In [12]:
# Display the 25 top-scoring bigrams.
bigrams

[('Allon', 'Bacuth'),
 ('Ashteroth', 'Karnaim'),
 ('Baal', 'Hanan'),
 ('Beer', 'Lahai'),
 ('Ben', 'Ammi'),
 ('En', 'Mishpat'),
 ('Jegar', 'Sahadutha'),
 ('Kiriath', 'Arba'),
 ('Lahai', 'Roi'),
 ('Most', 'High'),
 ('Salt', 'Sea'),
 ('Whoever', 'sheds'),
 ('appoint', 'overseers'),
 ('aromatic', 'resin'),
 ('cutting', 'instrument'),
 ('direct', 'descendants'),
 ('droves', 'apart'),
 ('during', 'mating'),
 ('falls', 'backward'),
 ('fig', 'leaves'),
 ('flaming', 'torch'),
 ('fresh', 'poplar'),
 ('fully', 'pay'),
 ('fury', 'turns'),
 ('gray', 'hairs')]

### Using Gensim

In [13]:
from nltk.corpus import inaugural

In [14]:
from gensim.models.phrases import Phraser, Phrases



In [15]:
# One token sequence per file in the inaugural corpus; each whole file is
# passed to Phrases as a single "sentence".
all_words = [inaugural.words(fileid) for fileid in inaugural.fileids()]

In [16]:
# Train the phrase (collocation) model on the inaugural token sequences.
phrases = Phrases(
    all_words,
    min_count=100,
    threshold=10,
)

In [17]:
# Wrap the trained Phrases model in a Phraser, which is then applied to token lists by indexing.
bigram = Phraser(phrases)

In [18]:
# Apply the phrase model to a token list; a detected bigram comes back as one
# token joined with '_' (see 'United_States' in the output).
bigram[["Finest","people","of","the","United","States"]]

['Finest', 'people', 'of', 'the', 'United_States']