In [1]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

In [2]:
text = [['a','b','c'],['a','c','d','c','e','f']]
text

[['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]

In [3]:
list(bigrams(text[0]))

[('a', 'b'), ('b', 'c')]

In [4]:
list(ngrams(text[1], n=3))

[('a', 'c', 'd'), ('c', 'd', 'c'), ('d', 'c', 'e'), ('c', 'e', 'f')]

In [5]:
index = 1 #@param {type: "number"}
data = text[index] 

In [6]:
from nltk.util import pad_sequence
# Pad the selected sentence by hand. With n=3 the displayed result carries two
# "<s>" symbols in front and two "</s>" symbols at the end (n - 1 per side).
list(pad_sequence(data,
                  pad_left=True, left_pad_symbol="<s>",
                  pad_right=True, right_pad_symbol="</s>",
                  n=3 #@param {type: "number"}
                  )) 

['<s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>', '</s>']

In [7]:
# Same padding call with n=2: a single "<s>" / "</s>" is added on each side.
# The padded tokens are kept in `padded_sent` so the next cell can reuse them.
padded_sent = list(pad_sequence(data, 
                                pad_left=True, 
                                left_pad_symbol="<s>", 
                                pad_right=True, 
                                right_pad_symbol="</s>", 
                                n=2 #@param {type: "number"}
                                ))
padded_sent

['<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In [8]:
# Extract trigrams from the padded token list.
# NOTE(review): `padded_sent` was padded with n=2, so only one pad symbol per
# side is present while n=3 is used here; confirm the mismatch is intentional.
list(ngrams(padded_sent, 
            n=3 #@param {type: "number"}
            )) 

[('<s>', 'a', 'c'),
 ('a', 'c', 'd'),
 ('c', 'd', 'c'),
 ('d', 'c', 'e'),
 ('c', 'e', 'f'),
 ('e', 'f', '</s>')]

In [9]:
from nltk.lm.preprocessing import pad_both_ends
list(pad_both_ends(data, 
                   n=3 #@param {type: "number"}
                   ))

['<s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>', '</s>']

In [10]:
list(bigrams(pad_both_ends(data, 
                           n=2 #@param {type: "number"}
                           )))

[('<s>', 'a'),
 ('a', 'c'),
 ('c', 'd'),
 ('d', 'c'),
 ('c', 'e'),
 ('e', 'f'),
 ('f', '</s>')]

In [11]:
from nltk.util import everygrams
# Pad the sentence with n=2 and keep the result for the everygrams demo.
# NOTE(review): despite its name, `padded_bigrams` holds the padded token list
# (plain strings), not bigram tuples.
padded_bigrams = list(pad_both_ends(data, 
                                    n=2 #@param {type: "number"}
                                    ))

padded_bigrams

['<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In [12]:
# Every n-gram of length 1 up to `max_len` over the padded tokens; with
# max_len=3 the result lists the unigrams, then the bigrams, then the trigrams.
list(everygrams(padded_bigrams, 
                max_len=3 #@param {type: "number"}
                ))

[('<s>',),
 ('a',),
 ('c',),
 ('d',),
 ('c',),
 ('e',),
 ('f',),
 ('</s>',),
 ('<s>', 'a'),
 ('a', 'c'),
 ('c', 'd'),
 ('d', 'c'),
 ('c', 'e'),
 ('e', 'f'),
 ('f', '</s>'),
 ('<s>', 'a', 'c'),
 ('a', 'c', 'd'),
 ('c', 'd', 'c'),
 ('d', 'c', 'e'),
 ('c', 'e', 'f'),
 ('e', 'f', '</s>')]

In [13]:
from nltk.lm.preprocessing import flatten
# Pad every sentence of the toy corpus with n=2 and chain the padded
# sentences into one flat token stream.
list(flatten(pad_both_ends(sent, 
                           n=2 #@param {type: "number"}
                           ) for sent in text))

['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In [14]:
from nltk.lm.preprocessing import padded_everygram_pipeline
# `padded_everygram_pipeline` returns a pair. The first element is displayed
# as a generator object, i.e. it is lazy and yields nothing until iterated.
# NOTE(review): `train` and `vocab` are not used again in the visible cells.
train, vocab = padded_everygram_pipeline(4, text)
train

<generator object padded_everygram_pipeline.<locals>.<genexpr> at 0x7fb30ded64c0>

In [15]:
# Build the pipeline with order 3 and materialise both outputs for inspection:
# first one list of everygrams per sentence, then the flattened padded tokens.
# NOTE: iterating presumably exhausts these iterators (the first element of
# the pipeline is a generator), so rebuild the pipeline before reusing them.
training_ngrams, padded_sentences = padded_everygram_pipeline(3, text)
for ngramlize_sent in training_ngrams:
    print(list(ngramlize_sent))
print('#############')
list(padded_sentences)

[('<s>',), ('<s>',), ('a',), ('b',), ('c',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'a'), ('<s>', 'a', 'b'), ('a', 'b', 'c'), ('b', 'c', '</s>'), ('c', '</s>', '</s>')]
[('<s>',), ('<s>',), ('a',), ('c',), ('d',), ('c',), ('e',), ('f',), ('</s>',), ('</s>',), ('<s>', '<s>'), ('<s>', 'a'), ('a', 'c'), ('c', 'd'), ('d', 'c'), ('c', 'e'), ('e', 'f'), ('f', '</s>'), ('</s>', '</s>'), ('<s>', '<s>', 'a'), ('<s>', 'a', 'c'), ('a', 'c', 'd'), ('c', 'd', 'c'), ('d', 'c', 'e'), ('c', 'e', 'f'), ('e', 'f', '</s>'), ('f', '</s>', '</s>')]
#############


['<s>',
 '<s>',
 'a',
 'b',
 'c',
 '</s>',
 '</s>',
 '<s>',
 '<s>',
 'a',
 'c',
 'd',
 'c',
 'e',
 'f',
 '</s>',
 '</s>']

In [16]:
try:  # Prefer the default NLTK tokenizers.
    from nltk import word_tokenize, sent_tokenize
    # Smoke-test them: the import can succeed while the call itself still
    # fails on some machines because of setup issues.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except Exception:  # Fall back to a naive sentence splitter plus toktok.
    # `Exception` (not a bare `except:`) so that KeyboardInterrupt and
    # SystemExit are not silently swallowed by the fallback.
    import re
    from nltk.tokenize import ToktokTokenizer

    # See https://stackoverflow.com/a/25736515/610569
    def sent_tokenize(x):
        """Split the string `x` into sentences with a regex heuristic."""
        return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)

    # Use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = toktok.tokenize

In [17]:
import os
import requests
import io #codecs


In [18]:
# Text version of https://kilgarriff.co.uk/Publications/2005-K-lineer.pdf
# The document is cached next to the notebook: read the local copy when it
# exists, otherwise download it once and write the cache file.
if os.path.isfile('language-never-random.txt'):
    with io.open('language-never-random.txt', encoding='utf8') as fin:
        text = fin.read()
else:
    url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of caching an error page as corpus.
    response.raise_for_status()
    text = response.content.decode('utf8')
    with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
        fout.write(text)

In [19]:
!ls

data			   language-never-random.txt  Practica_2.ipynb
kilgariff_ngram_model.pkl  Practica_1.ipynb	      Practica_5.ipynb


In [20]:
# Keep only the first 500 characters of the document for the rest of the
# notebook.
# NOTE: `text` is reused; it held the toy list of token lists at the top of
# the notebook and the full document string just before this cell.
text = text[:500]
text

'                       Language is never, ever, ever, random\n\n                                                               ADAM KILGARRIFF\n\n\n\n\nAbstract\nLanguage users never choose words randomly, and language is essentially\nnon-random. Statistical hypothesis testing uses a null hypothesis, which\nposits randomness. Hence, when we look at linguistic phenomena in cor-\npora, the null hypothesis will never be true. Moreover, where there is enough\ndata, we shall (almost) always be able to establish '

In [21]:
# Split the excerpt into sentences, tokenize each sentence into words, and
# lowercase every token; the result is one list of tokens per sentence.
tokenized_text = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
]

print(tokenized_text)

[['language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.'], ['statistical', 'hypothesis', 'testing', 'uses', 'a', 'null', 'hypothesis', ',', 'which', 'posits', 'randomness', '.'], ['hence', ',', 'when', 'we', 'look', 'at', 'linguistic', 'phenomena', 'in', 'cor-', 'pora', ',', 'the', 'null', 'hypothesis', 'will', 'never', 'be', 'true', '.'], ['moreover', ',', 'where', 'there', 'is', 'enough', 'data', ',', 'we', 'shall', '(', 'almost', ')', 'always', 'be', 'able', 'to', 'establish']]


In [22]:
print(text[:500])

                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish 


In [23]:
# Preprocess the tokenized text for 3-grams language modelling
from nltk.lm.preprocessing import padded_everygram_pipeline
n = 3 #@param {type: "number"}
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

In [24]:
from nltk.lm import MLE
n = 3
model = MLE(n) # Lets train a 3-grams model

In [25]:
len(model.vocab)

0

In [26]:
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 56 items>


In [27]:
len(model.vocab)

56

In [28]:
print(model.vocab.lookup(tokenized_text[0]))

('language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.')


In [29]:
print(model.vocab.lookup('language is never fgfhj .'.split()))

('language', 'is', 'never', '<UNK>', '.')


In [30]:
print(model.counts)

<NgramCounter with 3 ngram orders and 261 ngrams>


In [31]:
# i.e. Count('language')
model.counts['language'] 

3

In [32]:
model.counts[['language']]['is'] 

2

In [33]:
model.counts[['language', 'is']]['never'] 

1

In [34]:
model.score('language') 

0.03296703296703297

In [35]:
model.score('is', 'language'.split())  

0.6666666666666666

In [36]:
model.score('never', 'language is'.split())  

0.5

In [37]:
words =  10#@param {type: "number"}
seed =  11 #@param {type: "number"}
# Generate `words` tokens with an explicit seed.
print(model.generate(num_words= words, random_seed=seed))
# Generate `words` tokens that continue the seed context ['random'].
# NOTE(review): no `random_seed` is passed here, so this line's output may
# differ between runs whenever a context has more than one continuation.
print(model.generate(num_words= words, text_seed=['random']))

['cor-', 'pora', ',', 'the', 'null', 'hypothesis', ',', 'which', 'posits', 'randomness']
['adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',']


In [38]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed=42):
    """
    Generate one detokenized sentence from an n-gram language model.

    Padding symbols never reach the output: every ``<s>`` token is skipped
    and the generated sequence is cut at the first ``</s>``.

    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    :return: The kept tokens joined into one string by the module-level
        ``detokenize`` callable.
    """
    generated = model.generate(num_words, random_seed=random_seed)
    # `generate` returns a bare string rather than a list when a single word
    # is requested; wrap it so the loop walks tokens instead of characters.
    if isinstance(generated, str):
        generated = [generated]

    content = []
    for token in generated:
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

In [39]:
words = 10 #@param {type: "number"}
seed =  15#@param {type: "number"}
generate_sent(model, num_words = words, random_seed=seed)

'where there is enough data, we shall (almost'

In [40]:
import dill as pickle 

# Serialize the fitted model to disk; `pickle` here is the `dill` module,
# imported under that alias in this cell.
with open('kilgariff_ngram_model.pkl', 'wb') as fout:
    pickle.dump(model, fout)

In [41]:
# Read the serialized model back from disk into `model_loaded`.
# SECURITY: unpickling can execute arbitrary code, so only load pickle files
# that you created yourself or otherwise fully trust.
with open('kilgariff_ngram_model.pkl', 'rb') as fin:
    model_loaded = pickle.load(fin)

In [42]:
generate_sent(model_loaded, words, random_seed=seed)

'where there is enough data, we shall (almost'

In [43]:
import pandas as pd
df = pd.read_csv('data/Donald-Tweets.csv.zip')
df.head()

Unnamed: 0,Date,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11
0,16-11-11,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,
1,16-11-11,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,
2,16-11-11,11:14:20,Love the fact that the small groups of protest...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,183729,50039,,
3,16-11-11,2:19:44,Just had a very open and successful presidenti...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,214001,67010,,
4,16-11-11,2:10:46,A fantastic day in D.C. Met with President Oba...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,178499,36688,,


In [44]:
trump_corpus = list(df['Tweet_Text'].apply(word_tokenize))

In [45]:
# Preprocess the tokenized text for 3-grams language modelling
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, trump_corpus)

In [46]:
from nltk.lm import MLE
trump_model = MLE(n) # Lets train a 3-grams model, previously we set n=3
trump_model.fit(train_data, padded_sents)

In [47]:
generate_sent(trump_model, num_words=10, random_seed=42)

'do, and almost everyone of my beautiful mother ,'

In [48]:
generate_sent(trump_model, num_words=20, random_seed=42)

'do, and almost everyone of my beautiful mother, amazing father, @realDonaldTrump leading GOP in #Wisconsin.'

In [49]:
generate_sent(trump_model, num_words=50, random_seed=15)

'were AWESOME. WISDOM and HUMOR ARE A POWERFUL COMBINATION!!! #Trump2016 #MakeAmericaGreatAgain 3 : 30pm EST). Will be doing a great baseball player, he wouldnt have nominated John Roberts. Really? https://t.co/gBnsNgomom'

In [50]:
generate_sent(trump_model, num_words=100, random_seed=15)

'were AWESOME. WISDOM and HUMOR ARE A POWERFUL COMBINATION!!! #Trump2016 #MakeAmericaGreatAgain 3 : 30pm EST). Will be doing a great baseball player, he wouldnt have nominated John Roberts. Really? https://t.co/gBnsNgomom'

In [51]:
# Preprocess the tokenized text for 2-grams language modelling
n = 2 #@param {type: "number"}
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

# NOTE: this rebinds `model`, replacing the trigram model trained earlier.
model = MLE(n) # Let's train a 2-grams model (n is set to 2 in this cell)

# The vocabulary is still empty because the model has not been fitted yet.
len(model.vocab)

0

In [52]:
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 56 items>


In [53]:
len(model.vocab)

56

In [54]:
print(model.vocab.lookup(tokenized_text[0]))

('language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.')


In [55]:
print(model.vocab.lookup('language is never fgfhj .'.split()))

('language', 'is', 'never', '<UNK>', '.')


In [56]:
print(model.counts)

<NgramCounter with 2 ngram orders and 162 ngrams>


In [57]:
model.counts['language'] 


3

In [58]:
model.counts[['language']]['is']

2

In [59]:
# This model only counts 2 n-gram orders, so a two-word context has no
# entries and the count is 0 (the trigram model returned 1 for this query).
model.counts[['language', 'is']]['never'] 

0

In [60]:
model.score('language') 

0.03614457831325301

In [61]:
model.score('is', 'language'.split()) 

0.6666666666666666

In [62]:
# Scoring with a two-word context on the order-2 model yields 0 here,
# whereas the trigram model scored the same query 0.5.
model.score('never', 'language is'.split()) 

0

In [63]:
words =  10#@param {type: "number"}
seed =  11 #@param {type: "number"}
# Generate `words` tokens with an explicit seed.
print(model.generate(num_words= words, random_seed=seed))
# Generate `words` tokens that continue the seed context ['random'].
# NOTE(review): no `random_seed` is passed here, so this line's output may
# differ between runs whenever a context has more than one continuation.
print(model.generate(num_words= words, text_seed=['random']))

['establish', '</s>', 'we', 'look', 'at', 'linguistic', 'phenomena', 'in', 'cor-', 'pora']
['adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'be', 'true', '.', '</s>']


In [64]:
detokenize = TreebankWordDetokenizer().detokenize

In [65]:
# NOTE: `generate_sent` is also defined earlier in this notebook; this later
# definition shadows it for every cell that runs after this one.
def generate_sent(model, num_words, random_seed=42):
    """
    Generate one detokenized sentence from an n-gram language model.

    Padding symbols never reach the output: every ``<s>`` token is skipped
    and the generated sequence is cut at the first ``</s>``.

    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    :return: The kept tokens joined into one string by the module-level
        ``detokenize`` callable.
    """
    generated = model.generate(num_words, random_seed=random_seed)
    # `generate` returns a bare string rather than a list when a single word
    # is requested; wrap it so the loop walks tokens instead of characters.
    if isinstance(generated, str):
        generated = [generated]

    content = []
    for token in generated:
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

In [66]:
words = 10 #@param {type: "number"}
seed =  15#@param {type: "number"}
generate_sent(model, num_words = words, random_seed=seed)

'which posits randomness.'

In [67]:
# Write the current `model` to the same file name used for the earlier dump,
# overwriting that pickle.
with open('kilgariff_ngram_model.pkl', 'wb') as fout:
    pickle.dump(model, fout)

# Read it straight back into `model_loaded`.
# SECURITY: unpickling can execute arbitrary code, so only load pickle files
# that you created yourself or otherwise fully trust.
with open('kilgariff_ngram_model.pkl', 'rb') as fin:
    model_loaded = pickle.load(fin)


In [68]:
generate_sent(model_loaded, words, random_seed=seed)

'which posits randomness.'

In [69]:
df = pd.read_csv('data/Donald-Tweets.csv.zip')
df.head()

Unnamed: 0,Date,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11
0,16-11-11,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,
1,16-11-11,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,
2,16-11-11,11:14:20,Love the fact that the small groups of protest...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,183729,50039,,
3,16-11-11,2:19:44,Just had a very open and successful presidenti...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,214001,67010,,
4,16-11-11,2:10:46,A fantastic day in D.C. Met with President Oba...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,178499,36688,,


In [70]:
trump_corpus = list(df['Tweet_Text'].apply(word_tokenize))

In [71]:
train_data, padded_sents = padded_everygram_pipeline(n, trump_corpus)


In [72]:
# NOTE: this rebinds `trump_model`, replacing the trigram tweet model.
trump_model = MLE(n) # Let's train a 2-grams model; n is still 2 from the bigram preprocessing cell
trump_model.fit(train_data, padded_sents)

In [73]:
generate_sent(trump_model, num_words=10, random_seed=42)

'friends! #VoteTrump #MakeAmericaGreatAgain Thank you think @megynkelly I agree'

In [74]:
generate_sent(trump_model, num_words=20, random_seed=42)

'friends! #VoteTrump #MakeAmericaGreatAgain Thank you think @megynkelly I agree!'

In [75]:
generate_sent(trump_model, num_words=50, random_seed=15)

'white Evangelicals. https://t.co/6ohwIh1Q24'

In [76]:
generate_sent(trump_model, num_words=100, random_seed=15)

'white Evangelicals. https://t.co/6ohwIh1Q24'