In [1]:
from utils.seed_setter import set_seed
set_seed()

In [2]:
import pickle,os

with open(os.path.join('./pickle_data/train_test_data/test_data.pickle'), 'rb') as f:
    x_test, y_test = pickle.load(f)
f.close()

## DISTANCE MATRIX CALCULATION

```
using counter fitted word vectors
```

In [3]:
import os
import numpy as np

embeddings_dict = {}


with open(os.path.join("./counter_fitted_word_vectors/counter-fitted-vectors.txt"), "r",errors ='ignore', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_dict[word] = coefs
f.close()

In [4]:
############ PREPROCESSING #############

In [5]:
import string
latin_similar = "’'‘ÆÐƎƏƐƔĲŊŒẞÞǷȜæðǝəɛɣĳŋœĸſßþƿȝĄƁÇĐƊĘĦĮƘŁØƠŞȘŢȚŦŲƯY̨Ƴąɓçđɗęħįƙłøơşșţțŧųưy̨ƴÁÀÂÄǍĂĀÃÅǺĄÆǼǢƁĆĊĈČÇĎḌĐƊÐÉÈĖÊËĚĔĒĘẸƎƏƐĠĜǦĞĢƔáàâäǎăāãåǻąæǽǣɓćċĉčçďḍđɗðéèėêëěĕēęẹǝəɛġĝǧğģɣĤḤĦIÍÌİÎÏǏĬĪĨĮỊĲĴĶƘĹĻŁĽĿʼNŃN̈ŇÑŅŊÓÒÔÖǑŎŌÕŐỌØǾƠŒĥḥħıíìiîïǐĭīĩįịĳĵķƙĸĺļłľŀŉńn̈ňñņŋóòôöǒŏōõőọøǿơœŔŘŖŚŜŠŞȘṢẞŤŢṬŦÞÚÙÛÜǓŬŪŨŰŮŲỤƯẂẀŴẄǷÝỲŶŸȲỸƳŹŻŽẒŕřŗſśŝšşșṣßťţṭŧþúùûüǔŭūũűůųụưẃẁŵẅƿýỳŷÿȳỹƴźżžẓ"
safe_characters = string.ascii_letters + string.digits + latin_similar + ' '
safe_characters += "'"

In [6]:
glove_chars = [c for c in list(embeddings_dict.keys()) if len(c) == 1]
glove_symbols = [c for c in glove_chars if not c in safe_characters]
print('Non latin symbols in the Counter Fitted GloVe Embeddings:')
print(glove_symbols)

Non latin symbols in the Counter Fitted GloVe Embeddings:
['º', 'ª', 'с', 'δ', 'в', 'и']


In [7]:
test_chars = set(w for sentence in x_test for w in sentence)
test_symbols = [c for c in test_chars if not c in safe_characters]
print('Non latin symbols in the Train Data:')
print(test_symbols)

Non latin symbols in the Train Data:
['\x80', '^', '¦', '·', '”', '–', '\\', 'א', '!', 'ן', 'ו', '«', '=', ';', '\t', '~', '¾', 'כ', '\xad', 'ל', '³', '$', 'ג', ')', '¿', '?', '"', '|', '\x91', 'ר', '}', '>', '.', '&', ':', '*', ']', '@', '▼', 'º', '§', '#', '\xa0', '+', '“', '(', '\x97', '¨', '%', '°', '\x95', '\x96', '<', 'י', '\x84', '/', '…', '_', ',', '\x85', '½', '»', '{', '¢', '¡', '`', '£', '´', '-', '[', 'מ']


In [8]:
symbols_to_delete = [c for c in test_symbols if not c in glove_symbols]
print('Symbols to delete in the Train Data:')
print(symbols_to_delete)

Symbols to delete in the Train Data:
['\x80', '^', '¦', '·', '”', '–', '\\', 'א', '!', 'ן', 'ו', '«', '=', ';', '\t', '~', '¾', 'כ', '\xad', 'ל', '³', '$', 'ג', ')', '¿', '?', '"', '|', '\x91', 'ר', '}', '>', '.', '&', ':', '*', ']', '@', '▼', '§', '#', '\xa0', '+', '“', '(', '\x97', '¨', '%', '°', '\x95', '\x96', '<', 'י', '\x84', '/', '…', '_', ',', '\x85', '½', '»', '{', '¢', '¡', '`', '£', '´', '-', '[', 'מ']


In [9]:
symbols_to_isolate = [c for c in test_symbols if c in glove_symbols]
print('Symbols to isolate in the Train Data:')
print(symbols_to_isolate)

Symbols to isolate in the Train Data:
['º']


In [10]:
def handle_symbols(text):
    for symbol in symbols_to_delete:
        text = text.replace(symbol, ' ')
    for symbol in symbols_to_isolate:
        text = text.replace(symbol, ' ' + symbol + ' ')
    return text

from nltk.tokenize.treebank import TreebankWordTokenizer


def handle_contractions(text):
    tokenizer = TreebankWordTokenizer()
    text = tokenizer.tokenize(text)
    text = ' '.join(text)
    return text

def fix_quote(text):
    return ' '.join(w[1:] if w.startswith("'") and len(w) > 1 else w for w in text.split())

In [11]:
def test_data_preprocessing_for_tokenization(text, embeddings_dict):
    text = text.lower()
    text = handle_symbols(text)
    text = handle_contractions(text)
    text = fix_quote(text)
    words_in_embedding_dict = [w for w in text.split() if w in embeddings_dict.keys()]
    text = ' '.join(words_in_embedding_dict)
    return text

In [12]:
'''%%writefile ./utils/attack_preprocessing.py

import pickle
from nltk.tokenize.treebank import TreebankWordTokenizer

class AttackPreprocesser(object):
    
    def __init__(self):
        with open('./pickle_data/preprocesser_utils/utils.pickle', 'rb') as f:
            symbols_to_delete, symbols_to_isolate = pickle.load(f)
            self.__symbols_to_delete = symbols_to_delete
            self.__symbols_to_isolate = symbols_to_isolate
        f.close()
        self.__tree_bank_word_tokenizer = TreebankWordTokenizer()
    
    def __handle_symbols(self, text):
        for symbol in self.__symbols_to_delete:
            text = text.replace(symbol, ' ')
        for symbol in self.__symbols_to_isolate:
            text = text.replace(symbol, ' ' + symbol + ' ')
        return text
    
    def __handle_contractions(self, text):
        text = self.__tree_bank_word_tokenizer.tokenize(text)
        return ' '.join(text)
    
    def __fix_quotes(self, text):
        return ' '.join(w[1:] if w.startswith("'") and len(w) > 1 else w for w in text.split())
        
    def preprocess_text(self, text):
        text = text.lower()
        text = self.__handle_symbols(text)
        text = self.__handle_contractions(text)
        text = self.__fix_quotes(text)
        return text
'''

'%%writefile ./utils/attack_preprocessing.py\n\nimport pickle\nfrom nltk.tokenize.treebank import TreebankWordTokenizer\n\nclass AttackPreprocesser(object):\n    \n    def __init__(self):\n        with open(\'./pickle_data/preprocesser_utils/utils.pickle\', \'rb\') as f:\n            symbols_to_delete, symbols_to_isolate = pickle.load(f)\n            self.__symbols_to_delete = symbols_to_delete\n            self.__symbols_to_isolate = symbols_to_isolate\n        f.close()\n        self.__tree_bank_word_tokenizer = TreebankWordTokenizer()\n    \n    def __handle_symbols(self, text):\n        for symbol in self.__symbols_to_delete:\n            text = text.replace(symbol, \' \')\n        for symbol in self.__symbols_to_isolate:\n            text = text.replace(symbol, \' \' + symbol + \' \')\n        return text\n    \n    def __handle_contractions(self, text):\n        text = self.__tree_bank_word_tokenizer.tokenize(text)\n        return \' \'.join(text)\n    \n    def __fix_quotes(self

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer() #(MAXLEN)

tokenizer.fit_on_texts([test_data_preprocessing_for_tokenization(text, embeddings_dict) for text in x_test])

print(len(tokenizer.word_index))

40567


In [14]:
'''
def words_in_embedding(embeddings_dict, tokenizer):
    elem = []
    for w in tokenizer.word_index.keys():
        if w not in embeddings_dict.keys():
            elem += [w]
    print(elem)

words_in_embedding(embeddings_dict, tokenizer)
'''

'\ndef words_in_embedding(embeddings_dict, tokenizer):\n    elem = []\n    for w in tokenizer.word_index.keys():\n        if w not in embeddings_dict.keys():\n            elem += [w]\n    print(elem)\n\nwords_in_embedding(embeddings_dict, tokenizer)\n'

In [15]:
#################################################################################
def embeddings_not_in_tokenizer(embeddings_dict, tokenizer):
    elems = []
    for w in embeddings_dict.keys():
        if w not in tokenizer.word_index.keys():
            elems += [w]
    return elems
    
embeddings_not_in_tokenizer(embeddings_dict, tokenizer)

['fawn',
 'schlegel',
 'tilton',
 'clotted',
 'trawling',
 'kalmar',
 'tasos',
 'canes',
 'sprague',
 'brockton',
 'mutinies',
 'vano',
 'crossbar',
 'hermano',
 'jemmy',
 'grenadiers',
 'stipulate',
 'capoeira',
 'broward',
 'caramels',
 'chameleons',
 'asami',
 'immunities',
 'fuera',
 'thrace',
 'kublai',
 'gaskets',
 'snuggles',
 'splendiferous',
 'scraper',
 'ffor',
 'deadheads',
 'selassie',
 'centimeter',
 'opportunists',
 'warmongering',
 'numeral',
 'widget',
 'zlotys',
 'chine',
 'chino',
 'sheung',
 'quart',
 'naturel',
 'kumbaya',
 'kido',
 'millimetres',
 'topography',
 'jäger',
 'battista',
 'ramstein',
 'caned',
 'grahams',
 'excu',
 'borstal',
 'hermana',
 'expeditionary',
 'unpack',
 'murchison',
 'lomax',
 'matilde',
 'zinnias',
 'hyatt',
 'wudang',
 'pooper',
 'pinta',
 'carew',
 'rayon',
 'cocksucker',
 'mcmuffin',
 'sugarless',
 'clews',
 'cutback',
 'essie',
 'canaries',
 'shaitan',
 'stoller',
 'pigment',
 'domed',
 'souci',
 'amaya',
 'tulio',
 'farmlands',
 'di

In [16]:
#################################################################################
tokenizer.word_index[list(tokenizer.word_index.keys())[-1]]

40567

In [17]:
#################################################################################
for w in embeddings_not_in_tokenizer(embeddings_dict, tokenizer):
    tokenizer.word_index[w] = len(tokenizer.word_index) + 1  

In [18]:
#################################################################################
len(tokenizer.word_index)

65713

In [19]:
tokens_dictionary = tokenizer.word_index

inverse_tokens_dictionary = {v : k for (k, v) in tokenizer.word_index.items()}

In [20]:
import os
import pickle

os.makedirs(os.path.join('./pickle_data/attack_utils'), exist_ok=True)

with open(os.path.join('./pickle_data/attack_utils/tokens_dictionary.pickle'), 'wb') as f:
    pickle.dump([tokens_dictionary, inverse_tokens_dictionary], f)
f.close()

In [21]:
list(tokens_dictionary.items())[:3]

[('the', 1), ('and', 2), ('a', 3)]

In [22]:
list(inverse_tokens_dictionary.items())[:3]

[(1, 'the'), (2, 'and'), (3, 'a')]

In [23]:
'''import pickle

with open('pickle\\tokens_dicts.pickle', 'wb') as f:
    pickle.dump([tokens_dictionary,inverse_tokens_dictionary], f)
f.close()'''

"import pickle\n\nwith open('pickle\\tokens_dicts.pickle', 'wb') as f:\n    pickle.dump([tokens_dictionary,inverse_tokens_dictionary], f)\nf.close()"

In [24]:
'''import pickle

with open('pickle\\tokens_dicts.pickle', 'rb') as f:
    tokens_dictionary,inverse_tokens_dictionary = pickle.load(f)
f.close()'''

"import pickle\n\nwith open('pickle\\tokens_dicts.pickle', 'rb') as f:\n    tokens_dictionary,inverse_tokens_dictionary = pickle.load(f)\nf.close()"

In [25]:
MAXLEN = 45_000#50_000

embedding_matrix = np.zeros(shape = (MAXLEN + 1, 300), dtype= 'float32')

for w, i in tokenizer.word_index.items():
    if w in embeddings_dict and i < MAXLEN + 1:
        embedding_matrix[i,:] = embeddings_dict[w]

In [26]:
embedding_matrix.shape

(45001, 300)

In [27]:
from sklearn.metrics.pairwise import cosine_distances
from scipy import sparse

distance_matrix = cosine_distances(embedding_matrix, embedding_matrix)

In [28]:
distance_matrix.shape

(45001, 45001)

In [29]:
import os

os.makedirs('numpy_files', exist_ok=True)
np.save(os.path.join('./numpy_files/distance_matrix.npy'), distance_matrix)

In [30]:
#distance_matrix = np.load(os.path.join('./numpy_files/distance_matrix.npy'))

In [31]:
#tokens_dictionary['king']

In [32]:
def most_similar(word, delta = 0.5, num_words = 20):
    
    try:
        index = tokens_dictionary[word]
    except:
        return []
    
    if (index > distance_matrix.shape[0]):
        return []
    
    dist_order = np.argsort(distance_matrix[index,:])[1:num_words+1]
    dist_list = distance_matrix[index][dist_order]
    
    mask = np.ones_like(dist_list)
    mask = np.where(dist_list < delta)
    return [inverse_tokens_dictionary[index] for index in dist_order[mask]]

In [33]:
nearest_words_to_great = most_similar('great')

In [34]:
print('Most similar words to "great" are:')
print(nearest_words_to_great)

Most similar words to "great" are:
['grand', 'tremendous', 'awesome', 'phenomenal', 'terrific', 'prodigious', 'magnificent', 'wonderful', 'splendid', 'fantastic', 'marvelous', 'fabulous', 'marvellous', 'whopping', 'wondrous', 'formidable', 'huge', 'super', 'resplendent', 'excellent']


In [35]:
################################################################

## GOOGLE 1 BILLION WORDS LANGUAGE MODEL

To filter out words that are not in the context of a sentence


https://github.com/tensorflow/models/tree/archive/research/lm_1b

DOWNLOADS:<br>
* http://download.tensorflow.org/models/LM_LSTM_CNN/graph-2016-09-10.pbtxt
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-base
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-char-embedding
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-lstm
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax0
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax1
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax2
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax3
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax4
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax5
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax6
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax7
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax8
* http://download.tensorflow.org/models/LM_LSTM_CNN/vocab-2016-09-10.txt

Il codice nella seguente cella è modificato da una base il cui copyright è espresso di seguito:

```
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
```

In [36]:
import licensed_scripts.lm_1b_eval as google_language_model_utils

In [37]:
google_language_model = google_language_model_utils.LM()

LM vocab loading done


Recovering graph.


INFO:tensorflow:Recovering Graph google_language_model\graph-2016-09-10.pbtxt


Recovering checkpoint google_language_model\ckpt-*


In [None]:
print('Original phrase:')
print('"It has been such a great holiday"')

prefix = "It has been such a"
suffix = 'holiday'

lm_preds = google_language_model.get_words_probs(prefix, nearest_words_to_great, suffix)

print('The most probable substitute is:') 
print(nearest_words_to_great[np.argmax(lm_preds)])