In [1]:
from deeppavlov.models.bidirectional_lms import elmo_bilm
from deeppavlov.models.tokenizers.lazy_tokenizer import LazyTokenizer
import kenlm
import numpy as np
from scipy.stats.mstats import gmean

[nltk_data] Downloading package punkt to /home/sultanov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sultanov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/sultanov/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/sultanov/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


In [9]:
class ELMoAug:
    """Per-token vocabulary distributions from a bidirectional ELMo language model.

    For every token of every sentence the ELMo model yields two distributions
    over its vocabulary (index 0 and index 1 on the second axis, referred to
    here as the left and right context). They are merged into one distribution
    per token, optionally discounted by a unigram kenlm score.

    NOTE(review): ``elmo_path`` and ``kenlm_path`` are accepted but not used;
    the model locations are hard-coded below. Callers in this notebook pass
    placeholder values, so wiring the parameters through needs a coordinated
    change of the call sites.
    NOTE(review): ``standard_cases_only`` is annotated as ``dict`` although its
    name suggests a flag; it is only stored. Confirm the intended type.
    """

    def __init__(self,
                 language: str,
                 elmo_path: str,
                 kenlm_path: str,
                 isalpha_only: bool,
                 standard_cases_only: dict,
                 ):
        """Load the ELMo LM and the kenlm model and index the ELMo vocabulary.

        Args:
            language: 'rus' or 'eng'; anything else fails the assertion.
            elmo_path: currently unused (see class note).
            kenlm_path: currently unused (see class note).
            isalpha_only: stored on the instance; not read by this class yet.
            standard_cases_only: stored on the instance; not read by this class yet.
        """
        self.lang = language
        assert self.lang in ['rus', 'eng'], 'It supports only russian and english languages'
        self.isalpha_only = isalpha_only
        self.standard_cases_only = standard_cases_only

        self.elmo = elmo_bilm.ELMoEmbedder(model_dir="/cephfs/home/sultanov/elmo_lm/lib/python3.6/site-packages/download/bidirectional_lms/elmo_en_news")
        klm = kenlm.Model('/cephfs/home/sultanov/elmo_lm/lib/python3.6/site-packages/download/ngram_lm/en_wiki_no_punkt.arpa.binary')

        # Fetch the vocabulary once; it is needed for both lookup tables.
        vocab = self.elmo.get_vocab()
        # Context-free score of every vocabulary token under the n-gram model.
        self.elmo_vocab_scores = np.array([klm.score(token, bos=False, eos=False) for token in vocab])
        # Token -> column index in the ELMo output distribution.
        self.token2idx = {token: position for position, token in enumerate(vocab)}

    def _softmax(self, a, axis):
        """Numerically stable softmax of ``a`` along ``axis``.

        The maximum is taken along ``axis`` (not globally) so that a slice
        whose values are all far below the global maximum does not underflow
        to an all-zero numerator. ``keepdims=True`` keeps the reduced axis so
        the division broadcasts for any input rank.
        """
        shifted = a - np.max(a, axis=axis, keepdims=True)
        numerator = np.exp(shifted)
        denominator = np.sum(numerator, axis=axis, keepdims=True)
        return numerator / denominator

    def _unite_distr(self, left_and_right_distr, method):
        """Merge the two directional distributions of one sentence.

        Args:
            left_and_right_distr: array indexed as ``[token, direction, vocab]``
                with direction 0 = left and 1 = right (as used by the 'left'
                and 'right' branches).
            method: one of 'left', 'right', 'max', 'min', 'both', 'gmean'.

        Returns:
            Array indexed as ``[token, vocab]``, softmax-normalised over the
            vocabulary axis.

        Raises:
            ValueError: if ``method`` is not one of the supported names.

        NOTE(review): for every method except 'both' the softmax is applied to
        the raw values rather than to their logarithm; if the inputs are
        probabilities this flattens the result towards uniform. Confirm
        whether that is intended.
        """
        if method == 'left':
            res = left_and_right_distr[:, 0, :]
        elif method == 'right':
            res = left_and_right_distr[:, 1, :]
        elif method == 'max':
            res = np.max(left_and_right_distr, axis=1)
        elif method == 'min':
            res = np.min(left_and_right_distr, axis=1)
        elif method == 'both':
            res = np.log(left_and_right_distr)  # move to log space
            res = np.sum(res, axis=1)  # add the left and right context
            # Subtract the stand-alone score of every token.
            # NOTE(review): np.log is the natural log; verify that the kenlm
            # scores use the same base before relying on the exact weighting.
            res = res - self.elmo_vocab_scores
        elif method == 'gmean':
            res = gmean(left_and_right_distr, axis=1)
        else:
            raise ValueError(
                "Unknown method {!r}; expected one of "
                "'left', 'right', 'max', 'min', 'both', 'gmean'".format(method))
        return self._softmax(res, 1)

    def _predict(self, corpus, method):
        """Run ELMo on ``corpus`` and merge directions per sentence with ``method``."""
        return [self._unite_distr(sent_distr, method) for sent_distr in self.elmo(corpus)]

    def _get_perplexity(self, corpus, method):
        """Perplexity of ``corpus`` under the merged distribution.

        Tokens missing from the ELMo vocabulary are counted with probability 1,
        i.e. they add nothing to the log-likelihood but do count in the mean.

        Args:
            corpus: iterable of tokenised sentences (each a sequence of str).
            method: combination strategy, see ``_unite_distr``.

        Returns:
            ``exp(-mean(log p))`` over all tokens; NaN when there are no tokens.
        """
        elmo_distr = self._predict(corpus, method)
        token_probs = []
        for num_sent, sentence in enumerate(corpus):
            sent_distr = elmo_distr[num_sent]
            for num_token, token in enumerate(sentence):
                idx_token = self.token2idx.get(token, -1)
                if idx_token == -1:
                    token_probs.append(1)
                else:
                    token_probs.append(sent_distr[num_token, idx_token])
        if not token_probs:
            # Avoid numpy's "mean of empty slice" warning; the result is undefined.
            return float('nan')
        return np.exp(-np.mean(np.log(token_probs)))

    def __call__(self, corpus):
        """Return one ``[token, vocab]`` distribution per sentence using the 'both' strategy."""
        return self._predict(corpus, 'both')
                        

In [10]:
import nltk

In [11]:
# Pre-tokenized sentences of "Alice in Wonderland" from NLTK's Gutenberg corpus
# (each sentence is a list of token strings, see the preview below).
# Assumes the 'gutenberg' corpus is already downloaded — TODO confirm.
alice = nltk.corpus.gutenberg.sents('carroll-alice.txt')

In [12]:
# Build the augmenter. Only `language` is validated by the constructor; the
# other values are placeholders (the model paths are hard-coded in the class).
el = ELMoAug(
    language='eng',
    elmo_path='hz',
    kenlm_path='e',
    isalpha_only='e',
    standard_cases_only='e',
)

Using TensorFlow backend.


****************************************************************************************************
/cephfs/home/sultanov/elmo_lm/lib/python3.6/site-packages/download/bidirectional_lms/elmo_en_news
Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Use the `axis` argument instead


Instructions for updating:
Use the `axis` argument instead


USING SKIP CONNECTIONS
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API


Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API


Instructions for updating:
Use standard file APIs to check for files with this prefix.


Instructions for updating:
Use standard file APIs to check for files with this prefix.


INFO:tensorflow:Restoring parameters from /cephfs/home/sultanov/elmo_lm/lib/python3.6/site-packages/download/bidirectional_lms/elmo_en_news/model.ckpt-935588


2019-04-01 22:03:20.158 INFO in 'tensorflow'['saver'] at line 1270: Restoring parameters from /cephfs/home/sultanov/elmo_lm/lib/python3.6/site-packages/download/bidirectional_lms/elmo_en_news/model.ckpt-935588


In [27]:
# Sample comments (neutral and toxic) used to inspect the model's substitutions.
# Brackets already allow line continuation, so no trailing backslashes are needed.
test_sentences = [
    "Almost half of all iPhone owners have broken their screens, not just once but an average of two times each.",
    "i really don't understand your point.\xa0 It seems that you are mixing apples and oranges.",
    "shut the fuck up. you and the rest of your faggot friends should be burned at the stake",
    "That you are an idiot who understands neither taxation nor women's health.",
    "What on Earth is that about? Is it what's going to get him fired eventually?",
    "This is a doctrine of constitutional interpretation that says that a constitution is organic and must be read in a broad and liberal manner so as to adapt it to changing times.",
    "In the 2000s, music notation typically means the written expression of music notes and rhythms on paper using symbols.",
    "Most of the mathematical notation in use today was not invented until the 16th century.[52] Before that, mathematics was written out in words, limiting mathematical discovery.",
    "Physical geography deals with the study of processes and patterns in the natural environment like the atmosphere, hydrosphere, biosphere, and geosphere.",
    "An autobiography is written by the person himself or herself, sometimes with the assistance of a collaborator or ghostwriter.",
    "You fuck your dad.",
    "Yeah and where are you now?",
    "shut the fuck up. you and the rest of your faggot friends should be burned at the stake",
    "you are a land creature. You would drown....",
    "But how would you actually get the key out?",
    "fucking behave then you prick!",
    "You right if you are relaxe then you can give better result or perform and your identity should be from your work.",
    "The laughs you two heard were triggered by memories of his own high-flying exits off moving beasts",
    "Well, you guys have gone and done it now. You put the words 'China' and 'Chinese' up the required number of times for the dating Asians ad to come up. Evidently, Ms. Zhang, 50Kg and 168cm [for a BMI of 17.8] from 'HuNan China' wants to meet me. She has her little mouth open like she's speaking. What's that you ask, Zhang? Well, yes, as a matter of fact I am a physician.  Why are you clapping your hands together and jumping up and down?  Stop that squealing, young lady and 'exprain' yourself!",
    "Fact : Georgia passed a strict immigration policy and most of the Latino farm workers left the area. Vidalia Georgia now has over 3000 agriculture job openings and they have been able to fill about 250 of them in past year. All you White Real Americans who are looking for work that the Latinos stole from you..Where are you ? The jobs are i Vadalia just waiting for you..Or maybe its the fact that you would rather collect unemployment like the rest of the Tea Klaners.. You scream..you complain..and you sit at home in your wife beaters and drink beer..Typical Real White Tea Klan..."
]
# Whitespace tokenization: each sentence becomes a list of tokens.
# NOTE(review): punctuation stays attached to words (e.g. "point.", "screens,"),
# unlike the NLTK-tokenized `alice` corpus — confirm this is intended.
test_sentences = [sentence.split() for sentence in test_sentences]

In [28]:
# Preview a few corpus sentences to see how NLTK tokenized them.
alice[1:5]

[['CHAPTER', 'I', '.'],
 ['Down', 'the', 'Rabbit', '-', 'Hole'],
 ['Alice',
  'was',
  'beginning',
  'to',
  'get',
  'very',
  'tired',
  'of',
  'sitting',
  'by',
  'her',
  'sister',
  'on',
  'the',
  'bank',
  ',',
  'and',
  'of',
  'having',
  'nothing',
  'to',
  'do',
  ':',
  'once',
  'or',
  'twice',
  'she',
  'had',
  'peeped',
  'into',
  'the',
  'book',
  'her',
  'sister',
  'was',
  'reading',
  ',',
  'but',
  'it',
  'had',
  'no',
  'pictures',
  'or',
  'conversations',
  'in',
  'it',
  ',',
  "'",
  'and',
  'what',
  'is',
  'the',
  'use',
  'of',
  'a',
  'book',
  ",'",
  'thought',
  'Alice',
  "'",
  'without',
  'pictures',
  'or',
  'conversation',
  "?'"],
 ['So',
  'she',
  'was',
  'considering',
  'in',
  'her',
  'own',
  'mind',
  '(',
  'as',
  'well',
  'as',
  'she',
  'could',
  ',',
  'for',
  'the',
  'hot',
  'day',
  'made',
  'her',
  'feel',
  'very',
  'sleepy',
  'and',
  'stupid',
  '),',
  'whether',
  'the',
  'pleasure',
  'of',


In [29]:
# Preview the whitespace-tokenized test sentences for comparison with `alice`.
test_sentences[1:5]

[['i',
  'really',
  "don't",
  'understand',
  'your',
  'point.',
  'It',
  'seems',
  'that',
  'you',
  'are',
  'mixing',
  'apples',
  'and',
  'oranges.'],
 ['shut',
  'the',
  'fuck',
  'up.',
  'you',
  'and',
  'the',
  'rest',
  'of',
  'your',
  'faggot',
  'friends',
  'should',
  'be',
  'burned',
  'at',
  'the',
  'stake'],
 ['That',
  'you',
  'are',
  'an',
  'idiot',
  'who',
  'understands',
  'neither',
  'taxation',
  'nor',
  "women's",
  'health.'],
 ['What',
  'on',
  'Earth',
  'is',
  'that',
  'about?',
  'Is',
  'it',
  "what's",
  'going',
  'to',
  'get',
  'him',
  'fired',
  'eventually?']]

In [30]:
# One array per sentence, indexed [token, vocabulary entry], produced with the
# 'both' combination strategy used by ELMoAug.__call__.
test_result = el(test_sentences)



In [91]:
# Vocabulary mask: 0 for special symbols and listed punctuation, 1 for every other entry.
mask_punkt = np.array([
    0 if token in ['</S>', '<S>', '<UNK>', ',', '.', '"', ')', '(', '!', '"', '#', '$', '%', '&', "'"] else 1
    for token in el.elmo.get_vocab()
])

In [92]:
# Zero out the masked vocabulary columns in every sentence's distribution
# (the mask broadcasts over the token axis).
masked_test_result = [dist*mask_punkt for dist in test_result]

In [6]:
# Perplexity on the first 500 sentences, 'both': summed log of the two
# directions minus the per-token kenlm score.
el._get_perplexity(alice[:500], 'both')

  


437.9690103768755

In [7]:
# Same corpus slice, 'min': element-wise minimum of the two directions.
el._get_perplexity(alice[:500], 'min')

  


527405.2165161947

In [8]:
# Same corpus slice, 'max': element-wise maximum of the two directions.
el._get_perplexity(alice[:500], 'max')

  


466731.4349294699

In [9]:
# Same corpus slice, 'left': only the direction stored at index 0.
el._get_perplexity(alice[:500], 'left')

  


498326.555375938

In [10]:
# Same corpus slice, 'right': only the direction stored at index 1.
el._get_perplexity(alice[:500], 'right')

  


493966.43769457657

In [11]:
# Same corpus slice, 'gmean': geometric mean of the two directions.
el._get_perplexity(alice[:500], 'gmean')

  


514554.93212502246

In [59]:
def _multi_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray:
        """
        Selects the indices of the n_instances highest values.
        Args:
            values: Contains the values to be selected from.
            n_instances: Specifies how many indices to return.
        Returns:
            Contains the indices of the n_instances largest values.
        """
        assert n_instances <= values.shape[0], 'n_instances must be less or equal than the size of utility'

        max_idx = np.argpartition(-values, n_instances-1, axis=1)[:,:n_instances]
        return max_idx

In [93]:
# Ten candidate vocabulary indices for every token of the first test sentence,
# taken from the masked distribution.
idx = _multi_argmax(values=masked_test_result[0], n_instances=10)
idx.shape

(20, 10)

In [94]:
# (number of tokens in the first sentence, vocabulary size)
test_result[0].shape

(20, 793471)

In [95]:
# Tokens of the first test sentence, to line up with the rows of `idx`.
sent = test_sentences[0]
print(sent)

['Almost', 'half', 'of', 'all', 'iPhone', 'owners', 'have', 'broken', 'their', 'screens,', 'not', 'just', 'once', 'but', 'an', 'average', 'of', 'two', 'times', 'each.']


In [96]:
# Show each original token next to its candidate replacements.
# The vocabulary is fetched once instead of once per token.
vocab = el.elmo.get_vocab()
for token, candidate_ids in zip(sent, idx):
    print(token, [vocab[i] for i in candidate_ids])
    print('\n\n')

Almost ['Almost', 'Nearly', 'But', 'Over', 'that', "'s", 'About', ':', 'U.S.', 'Around']



half ['two-thirds', 'three-quarters', 'one-third', 'half', '1,000', 'three-fourths', 'percent', '2,000', 'all', '3,000']



of [':', 'that', 'said', 'U.S.', 'of', 'say', '--', "'s", 'the', 'think']



all ['those', 'UK', 'that', 'the', "'s", ':', 'U.S.', 'all', 'American', 'British']



iPhone ['business', 'U.S.', 'home', 'the', "'s", 'new', 'phone', 'those', 'iPhone', 'American']



owners ['users', 'owners', 'subscribers', 'customers', 'and', "'s", 'readers', 'apps', 'consumers', 'buyers']



have ['had', "'s", "'ve", 'have', 'are', '--', 'now', 'who', 'and', 'in']



broken ['on', 'at', 'access', 'used', 'use', 'downloaded', 'using', 'to', 'in', 'accessed']



their ['the', "'s", 'their', 'phone', 'on', 'up', 'off', 'two', 'down', 'U.S.']



screens, ['service', '...', '*', ';', 'phone', ':', 'U.S.', 'or', 'hands', "'s"]



not [';', 'for', 'to', 'of', 'at', ':', 'in', 'and', 'or', 'on']



j