In [1]:
from deeppavlov.models.bidirectional_lms import elmo_bilm
from deeppavlov.models.tokenizers.lazy_tokenizer import LazyTokenizer
from nltk.tokenize.moses import MosesDetokenizer
import numpy as np
from typing import List
from scipy.stats import kurtosis
from scipy.stats.mstats import gmean
import re


import pandas as pd

[nltk_data] Downloading package punkt to /home/sultanov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sultanov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/sultanov/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/sultanov/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
class ElmoAug:
    """Augment sentences by replacing words with candidates proposed by an ELMo language model.

    The pipeline (see ``_batch_sent``) is: expand contractions -> tokenize -> obtain
    per-token vocabulary distributions from the bidirectional LM -> blend the two
    context distributions -> mask out unlikely candidates and the source word ->
    sample a replacement for every token that still has candidates -> detokenize.

    Every processed token is recorded as a dict in ``self.logger``.
    """

    def __init__(self, model_dir="/cephfs/home/sultanov/elmo_lm/lib/python3.6/site-packages/download/bidirectional_lms/elmo_en_news"):
        """Load the tokenizer, the ELMo LM found in ``model_dir`` and its vocabulary.

        Args:
            model_dir: directory handed to ``elmo_bilm.ELMoEmbedder``. The default is a
                machine-specific absolute path; pass your own location when running elsewhere.
        """
        self.tokenizer = LazyTokenizer()
        self.elmo_lm   = elmo_bilm.ELMoEmbedder(model_dir=model_dir)
        self.elmo_vocab = np.array(self.elmo_lm.get_vocab())
        self.detokenizer = MosesDetokenizer()
        # Contraction-expansion rules, applied in this order (specific cases before generic ones).
        # Replacement strings are raw so that the \g<1> back-reference is not parsed as an
        # (invalid) string escape sequence.
        self.replacement_patterns = [
             (r'won\'t', 'will not'),
             (r'can\'t', 'cannot'),
             (r'i\'m', 'i am'),
             (r'ain\'t', 'is not'),
             (r'(\w+)\'ll', r'\g<1> will'),
             (r'(\w+)n\'t', r'\g<1> not'),
             (r'(\w+)\'ve', r'\g<1> have'),
             (r'(\w+)\'s', r'\g<1> is'),
             (r'(\w+)\'re', r'\g<1> are'),
             (r'(\w+)\'d', r'\g<1> would')
        ]
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in self.replacement_patterns]
        # One dict per processed token; appended to by _sent_aug.
        self.logger = []
        # Running index of the sentence being augmented (used only for logging).
        self.num_sent = 0

    def _preproccesing(self, text):
        """Return ``text`` with English contractions expanded by ``self.patterns``."""
        for pattern, repl in self.patterns:
            text = pattern.sub(repl, text)
        return text

    def _weighted_sum_distr_by_posistion_in_sent(self, distr):
        """Blend the two context distributions with position-dependent weights.

        Args:
            distr: array indexed as ``[token, context, vocab]`` where context 0 is
                treated as "left" and context 1 as "right"
                (NOTE(review): confirm this ordering against the LM output).

        Returns:
            Array ``[token, vocab]`` equal to ``w * left + (1 - w) * right``, where ``w``
            grows from 0.2 at the first token to 0.8 at the last one for sentences longer
            than four tokens, and is a flat 0.5 otherwise.
        """
        n_tokens = len(distr)
        if n_tokens > 4:
            weights = np.array([0.2, 0.35] + [0.5]*(n_tokens - 4) + [0.65, 0.8])
        else:
            weights = 0.5*np.ones(n_tokens)
        # Add a trailing axis so the per-token weight broadcasts over the vocabulary axis.
        weights = weights[:, np.newaxis]
        return distr[:, 1, :] * (1 - weights) + distr[:, 0, :] * weights

    def _get_index_in_vocab(self, token):
        """Return the index of ``token`` in ``self.elmo_vocab`` or ``None`` if it is absent."""
        indx = np.where(self.elmo_vocab == token)[0]
        return indx[0] if indx.size > 0 else None

    def _blend_dist(self, batch_distr, num_method):
        """
        blending distr from left and right context
        method 0:
            sum two distr along left right context
        method 1:
            weighted sum by place of word in sentence
        method 2:
            element-wise minimum of the two distr
        method 3:
            geometric mean of the two distr

        Raises:
            ValueError: if ``num_method`` is not one of 0, 1, 2, 3.
        """
        if num_method == 0:
            return [np.sum(distr, axis=1) for distr in batch_distr]

        elif num_method == 1:
            return [self._weighted_sum_distr_by_posistion_in_sent(distr) for distr in batch_distr]

        elif num_method == 2:
            return [np.min(distr, axis=1) for distr in batch_distr]

        elif num_method == 3:
            return [gmean(distr, axis=1) for distr in batch_distr]

        # Previously an unknown method fell through and returned None, which only
        # surfaced later as an unrelated TypeError in the caller.
        raise ValueError("num_method must be one of 0, 1, 2, 3, got {!r}".format(num_method))

    def _softmax(self, x):
        """Compute softmax values for each sets of scores in x."""
        # Subtracting the maximum keeps np.exp from overflowing.
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    def _sent_aug(self, source_sentence: List[str], distr: np.ndarray, threshold: float=0.3, replace_freq: float=0.5):
        """Sampling words from the distribution

        Args:
            source_sentence: tokens of the original sentence.
            distr: masked array ``[token, vocab]`` as produced by ``_get_threshold_masked``;
                a fully masked row means "keep the source token".
            threshold: unused here (the masking already happened); kept for interface compatibility.
            replace_freq: unused here; kept for interface compatibility.

        Returns:
            List of tokens with some of them replaced by sampled vocabulary words.
            A log entry per token is appended to ``self.logger``.
        """
        result = []
        for i, token in enumerate(source_sentence):
            if distr.mask[i].all():
                #source token
                result.append(token)
                log = {'num_sent': self.num_sent, 'num_token': i, 'source': token,
                       'result': token, 'replaced': False, 'distr': None, 'num_of_candidate': 0}
            else:
                #replace
                # Softmax over the unmasked candidates; masked entries get probability 0.
                probs = self._softmax(distr[i]).filled(0)
                idx_word = np.random.choice(len(distr[i]), replace=False, p=probs)
                candidate_idx = distr[i].nonzero()[0]
                result.append(self.elmo_vocab[idx_word])
                log = {'num_sent': self.num_sent, 'num_token': i, 'source': token,
                       'result': self.elmo_vocab[idx_word], 'replaced': True, 'distr': probs,
                       'num_of_candidate': len(candidate_idx),
                       'candidates': self.elmo_vocab[candidate_idx]}
            self.logger.append(log)
        self.num_sent += 1
        return result

    def _get_threshold_masked(self, data, indx_source_token, threshold, replace_freq):
        """
        Creating mask:
            source word - deleted
            word with probability less that threshold - deleted
            amount of words that will be replaced ≈ replace_freq * len(sentence)
            if amount of words with probability > threshold <= replace_freq * len(sentence)
            then will printed warning and all words with probability > threshold will be replaced

        Args:
            data: array ``[token, vocab]`` with the blended distribution of one sentence.
            indx_source_token: per-token vocabulary index of the source word, ``None`` when
                the word is out of vocabulary.
            threshold: candidates with a value <= threshold are masked out.
            replace_freq: target fraction of tokens to replace.

        Returns:
            ``np.ma.masked_array`` over ``data`` in which only admissible candidates are unmasked.
        """
        #Creating a mask based on threshold
        assert(len(data) == len(indx_source_token))
        mask = data > threshold
        #Creating a mask that marks the source tokens, and merging it with previous mask
        # Uses the instance vocabulary (not a notebook-level global) so the class is self-contained.
        onehot_indx = np.zeros((len(indx_source_token), len(self.elmo_vocab)))
        for i, indx in enumerate(indx_source_token):
            # Compare against None explicitly: vocabulary index 0 is a valid (but falsy) hit.
            if indx is not None:
                onehot_indx[i, indx] = 1
        mask = mask & ~onehot_indx.astype(bool)
        #Creating a mask based on replace frequence, and merging it with previous
        word_mask = mask.any(axis=1)
        n_words = len(word_mask)
        n_candidates = len(word_mask.nonzero()[0])
        # Multiplicative form of `replace_freq >= n_candidates / n_words`; it avoids a
        # ZeroDivisionError for an empty sentence and guarantees freq <= 1 below.
        if replace_freq * n_words >= n_candidates:
            print('warning')
        else:
            freq = (replace_freq * n_words) / n_candidates
            # Keep each replaceable token with probability freq.
            word_mask = (np.random.binomial(1, p=freq, size=n_words) * word_mask).astype(bool)
        mask = (mask.T*word_mask).T
        return np.ma.masked_array(data=data, mask=~mask, fill_value=np.nan)

    def _batch_sent(self, batch_sent: List[str], threshold: float, replace_freq: float, num_method_blend: int) -> List[List[str]]:
        """
        Replaces some words in the original sentence with words from the language model with frequency replace_freq
        Args:
            batch_sent: Sentences to be augmented
            threshold: words with probability < threshold will not be considerated like replacement
            replace_freq: frequency of replacing words
            num_method_blend: method of merging two distributions: left-context and right-context
        Returns:
            Contains the augmented sentences, each as a list of tokens
        """
        batch_sent_prep         = [self._preproccesing(sent) for sent in batch_sent]
        batch_token             = self.tokenizer(batch_sent_prep)
        batch_indx_source_token = [np.array(list(map(self._get_index_in_vocab, sent))) for sent in batch_token]
        batch_distr             = self._blend_dist(self.elmo_lm(batch_token), num_method_blend)
        batch_mask_distr        = [
            self._get_threshold_masked(distr, indx_source_token, threshold, replace_freq)
            for distr, indx_source_token in zip(batch_distr, batch_indx_source_token)
        ]
        # Kept on the instance so the masked distributions can be inspected after a run.
        self.batch_mask_distr = batch_mask_distr
        batch_aug_token         = [
            self._sent_aug(tokens, mask_distr, threshold, replace_freq)
            for tokens, mask_distr in zip(batch_token, batch_mask_distr)
        ]
        return batch_aug_token

    def __call__(self, batch_sent: List[str], threshold: float, replace_freq: float, num_method_blend: int):
        """Augment ``batch_sent`` and return the detokenized sentences as strings."""
        batch_aug_token = self._batch_sent(batch_sent, threshold, replace_freq, num_method_blend)
        return [self.detokenizer.detokenize(i, return_str=True) for i in batch_aug_token]

In [3]:
# Build the augmenter with the default model_dir; the constructor loads the ELMo LM and its vocabulary.
el = ElmoAug()

Using TensorFlow backend.


****************************************************************************************************
/cephfs/home/sultanov/elmo_lm/lib/python3.6/site-packages/download/bidirectional_lms/elmo_en_news
Instructions for updating:
Use the `axis` argument instead


Instructions for updating:
Use the `axis` argument instead


USING SKIP CONNECTIONS
INFO:tensorflow:Restoring parameters from /cephfs/home/sultanov/elmo_lm/lib/python3.6/site-packages/download/bidirectional_lms/elmo_en_news/model.ckpt-935588


2018-10-31 19:19:25.564 INFO in 'tensorflow'['tf_logging'] at line 115: Restoring parameters from /cephfs/home/sultanov/elmo_lm/lib/python3.6/site-packages/download/bidirectional_lms/elmo_en_news/model.ckpt-935588


In [4]:
test_sentences = \
["Almost half of all iPhone owners have broken their screens, not just once but an average of two times each.",\
   "i really don't understand your point.\xa0 It seems that you are mixing apples and oranges.",\
   "shut the fuck up. you and the rest of your faggot friends should be burned at the stake",\
   "That you are an idiot who understands neither taxation nor women's health.",\
   "What on Earth is that about? Is it what's going to get him fired eventually?",\
   "This is a doctrine of constitutional interpretation that says that a constitution is organic and must be read in a broad and liberal manner so as to adapt it to changing times.",\
   "In the 2000s, music notation typically means the written expression of music notes and rhythms on paper using symbols.",\
   "Most of the mathematical notation in use today was not invented until the 16th century.[52] Before that, mathematics was written out in words, limiting mathematical discovery.",\
   "Physical geography deals with the study of processes and patterns in the natural environment like the atmosphere, hydrosphere, biosphere, and geosphere.",\
   "An autobiography is written by the person himself or herself, sometimes with the assistance of a collaborator or ghostwriter.",\
    "You fuck your dad.",\
    "Yeah and where are you now?",\
    "shut the fuck up. you and the rest of your faggot friends should be burned at the stake",\
    "you are a land creature. You would drown....",\
    "But how would you actually get the key out?",\
    "fucking behave then you prick!",\
    "You right if you are relaxe then you can give better result or perform and your identity should be from your work.",\
    "The laughs you two heard were triggered by memories of his own high-flying exits off moving beasts",\
 "Well, you guys have gone and done it now. You put the words 'China' and 'Chinese' up the required number of times for the dating Asians ad to come up. Evidently, Ms. Zhang, 50Kg and 168cm [for a BMI of 17.8] from 'HuNan China' wants to meet me. She has her little mouth open like she's speaking. What's that you ask, Zhang? Well, yes, as a matter of fact I am a physician.  Why are you clapping your hands together and jumping up and down?  Stop that squealing, young lady and 'exprain' yourself!",\
 "Fact : Georgia passed a strict immigration policy and most of the Latino farm workers left the area. Vidalia Georgia now has over 3000 agriculture job openings and they have been able to fill about 250 of them in past year. All you White Real Americans who are looking for work that the Latinos stole from you..Where are you ? The jobs are i Vadalia just waiting for you..Or maybe its the fact that you would rather collect unemployment like the rest of the Tea Klaners.. You scream..you complain..and you sit at home in your wife beaters and drink beer..Typical Real White Tea Klan..."
]
len(test_sentences)

20

In [5]:
%%time
# Positional arguments: threshold=3e-4, replace_freq=0.4, num_method_blend=2 (element-wise minimum blend).
el(test_sentences, 3e-4, 0.4, 2)

CPU times: user 2h 54min 55s, sys: 39min 30s, total: 3h 34min 25s
Wall time: 4min 53s


['But three-quarters of all iPhone owners have hit his date two not merely on but an most of few times each.',
 'They really do still understand your hand., seems for you are mixing apples and <UNK> from',
 'Obama the two up., and the good of your faggot friends should be burned at the stake',
 'That you are an official really knows about taxation for women ( health "',
 "How on hell are that about 'Is it a really just to get him's this?",
 'This of this study of The interpretation that says that a policy even and and must be done in a broad and straightforward one "as or adapt -- in changing and (',
 'In the 2006, music notation typically only the written expression of music notes and rhythms on paper using symbols from',
 ', now the mathematical notation at use today was not built until this 16th century -- "12] Before that, writing was written and in words, limiting mathematical,.',
 'Physical geography - with the study like processes but material including the natural settings like

In [7]:
# Dump the per-token augmentation log collected in el.logger to CSV for offline inspection.
d = pd.DataFrame(el.logger)
d.to_csv('./result_last.csv')

In [8]:
a  = ['But three-quarters of all iPhone owners have hit his date two not merely on but an most of few times each.',
 'They really do still understand your hand., seems for you are mixing apples and <UNK> from',
 'Obama the two up., and the good of your faggot friends should be burned at the stake',
 'That you are an official really knows about taxation for women ( health "',
 "How on hell are that about 'Is it a really just to get him's this?",
 'This of this study of The interpretation that says that a policy even and and must be done in a broad and straightforward one "as or adapt -- in changing and (',
 'In the 2006, music notation typically only the written expression of music notes and rhythms on paper using symbols from',
 ', now the mathematical notation at use today was not built until this 16th century -- "12] Before that, writing was written and in words, limiting mathematical,.',
 'Physical geography - with the study like processes but material including the natural settings like the atmosphere and power and food, and,.',
 'that order ( written by their person himself or herself, "with the benefit or a collaborator or the.',
 "Parents may old dad '",
 'I and please are you just?',
 'shut the stock up. you and the, are your old friends should be burned that the stake',
 'you are a land open; You would disagree:.',
 'But how would she <UNK> want the key benefit?',
 'fucking does "you prick!',
 'You right like you are relaxe then you are you better ( or perform and your identity should work about your work before',
 'Those laughs you two heard those triggered and memories of his new in, off moving beasts',
 'Well, this, have -- and done enough?. "put the <UNK>\'China on and\'Chinese\'s along <UNK> record good as times for their... search ad time come again. Evidently out Ms. and, a and above [ to 2 BMI of 17.8] from\'HuNan China. goes to meet me. She has her and nose open like she or speaking. What is why do ask, Zhang? And, yes the as a matter, <UNK> and saw <UNK> physician <UNK> Why were" <UNK> your "together and jumping up and in" Stop something about, it lady and\'exprain bring is!',
 "Fact: Georgia had a strict new and for most of the Latino farm drivers left the area. Vidalia also now's over 3000 agriculture industry openings and few have not hoping to fill by 40,000 of them of past week. Just The White will Americans who like voted at is that the obama stole from you..Where are you? The right are a feel) waiting through you..Or maybe of the fact that <UNK> really rather wage unemployment like... the or the real) just <UNK> complain..and will sit back home calling 'wife, and... beer..Typical of White Tea Klan..."]

In [None]:
# Write the augmented sentences held in `a` to a text file, one sentence per line.
# The original cell had an unfinished `for line in` header and passed the whole list to
# f.write(), which only accepts a string.
with open('./ex_last.txt', 'w') as f:
    for line in a:
        f.write(line + '\n')