In [1]:
import numpy as np
import tensorflow as tf
import random
import os

SEED = 123
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
from scripts.blackBox import BlackBox

In [4]:
black_box = BlackBox()



In [5]:
import pickle

with open('pickle\\data.pickle', 'rb') as f:
    x_test, y_test = pickle.load(f)

In [6]:
x_test[0]

'Unhinged follows the typical plot of the early 80\'s slasher trend. Pretty Young Girls In Peril. I have to give it up for the filmmaker who used a helicopter for some of the early road-trip shots, you actually think for a second there\'s going to be quality in the production. Watching "Unhinged" was like seeing an amateur acting class go through it\'s warm-up. Some of the most awkward, badly lit, overlong scenes are played out with the gusto of a Valium overdose. I wondered why they didn\'t just put the cue-cards on camera so the actresses wouldn\'t have to constantly shift their gaze. The two main girls were obviously chosen for their T&A factor rather than talent. Laurel Munson as the main chick Terry is as exciting as watching paint dry. Two nude scenes make for an adolescent thrill. Janet Penner and Virginia Settle as the crazy/creepy daughter and mother the chicks find themselves stranded with compete for Worst Acting Ever. Long pauses, weird expressions, emphasis on the wrong wo

In [7]:
y_test[0]

0

In [8]:
print(black_box.predict_sentiment(x_test[0]))

0.0061943284


## Perturb Algorythm

In [9]:
population = list(zip(x_test,y_test))

In [10]:
population[:3]

[('Unhinged follows the typical plot of the early 80\'s slasher trend. Pretty Young Girls In Peril. I have to give it up for the filmmaker who used a helicopter for some of the early road-trip shots, you actually think for a second there\'s going to be quality in the production. Watching "Unhinged" was like seeing an amateur acting class go through it\'s warm-up. Some of the most awkward, badly lit, overlong scenes are played out with the gusto of a Valium overdose. I wondered why they didn\'t just put the cue-cards on camera so the actresses wouldn\'t have to constantly shift their gaze. The two main girls were obviously chosen for their T&A factor rather than talent. Laurel Munson as the main chick Terry is as exciting as watching paint dry. Two nude scenes make for an adolescent thrill. Janet Penner and Virginia Settle as the crazy/creepy daughter and mother the chicks find themselves stranded with compete for Worst Acting Ever. Long pauses, weird expressions, emphasis on the wrong 

In [11]:
population[0][0]
population[0][1]
#a,b = zip(*population)

0

## DISTANCE MATRIX CALCULATION

```
using counter fitted word vectors
```

In [12]:
embeddings_dict = {}

with open("counter_fitted_word_vectors\\counter-fitted-vectors.txt", "r",errors ='ignore', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_dict[word] = coefs
f.close()

In [13]:
from scripts.preprocessing import Preprocesser

x_test = [Preprocesser.raw_text_preprocessing(sentence) for sentence in x_test]

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer

MAXLEN = 50_000 #60_000

tokenizer = Tokenizer() #(MAXLEN)

tokenizer.fit_on_texts([Preprocesser.test_data_preprocessing_for_tokenization(text, embeddings_dict) for text in x_test])

print(len(tokenizer.word_index))

40570


In [15]:
'''
def words_in_embedding(embeddings_dict, tokenizer):
    elem = []
    for w in tokenizer.word_index.keys():
        if w not in embeddings_dict.keys():
            elem += [w]
    print(elem)

words_in_embedding(embeddings_dict, tokenizer)
'''

'\ndef words_in_embedding(embeddings_dict, tokenizer):\n    elem = []\n    for w in tokenizer.word_index.keys():\n        if w not in embeddings_dict.keys():\n            elem += [w]\n    print(elem)\n\nwords_in_embedding(embeddings_dict, tokenizer)\n'

In [16]:
#################################################################################
def embeddings_not_in_tokenizer(embeddings_dict, tokenizer):
    elems = []
    for w in embeddings_dict.keys():
        if w not in tokenizer.word_index.keys():
            elems += [w]
    return elems
    
embeddings_not_in_tokenizer(embeddings_dict, tokenizer)

['fawn',
 'schlegel',
 'tilton',
 'clotted',
 'trawling',
 'kalmar',
 'tasos',
 'canes',
 'sprague',
 'brockton',
 'mutinies',
 'vano',
 'crossbar',
 'hermano',
 'jemmy',
 'grenadiers',
 'stipulate',
 'capoeira',
 'broward',
 'caramels',
 'chameleons',
 'asami',
 'immunities',
 'fuera',
 'thrace',
 'kublai',
 'gaskets',
 'snuggles',
 'splendiferous',
 'scraper',
 'ffor',
 'deadheads',
 'selassie',
 'centimeter',
 'opportunists',
 'warmongering',
 'numeral',
 'widget',
 'zlotys',
 'chine',
 'chino',
 'sheung',
 'quart',
 'naturel',
 'kumbaya',
 'kido',
 'millimetres',
 'topography',
 'jäger',
 'battista',
 'ramstein',
 'caned',
 'grahams',
 'excu',
 'borstal',
 'hermana',
 'expeditionary',
 'unpack',
 'murchison',
 'lomax',
 'matilde',
 'zinnias',
 'hyatt',
 'wudang',
 'pooper',
 'pinta',
 'carew',
 'rayon',
 'cocksucker',
 'mcmuffin',
 'sugarless',
 'clews',
 'cutback',
 'essie',
 'canaries',
 'shaitan',
 'stoller',
 'pigment',
 'domed',
 'souci',
 'amaya',
 'tulio',
 'farmlands',
 'di

In [17]:
#################################################################################
tokenizer.word_index[list(tokenizer.word_index.keys())[-1]]

40570

In [18]:
#################################################################################
for w in embeddings_not_in_tokenizer(embeddings_dict, tokenizer):
    tokenizer.word_index[w] = len(tokenizer.word_index) + 1  

In [19]:
#################################################################################
len(tokenizer.word_index)

65713

In [20]:
tokens_dictionary = tokenizer.word_index

inverse_tokens_dictionary = {v : k for (k, v) in tokenizer.word_index.items()}

In [21]:
list(tokens_dictionary.items())[:27]

[('the', 1),
 ('and', 2),
 ('a', 3),
 ('of', 4),
 ('to', 5),
 ('is', 6),
 ('it', 7),
 ('in', 8),
 ('i', 9),
 ('this', 10),
 ('that', 11),
 ('s', 12),
 ('was', 13),
 ('as', 14),
 ('with', 15),
 ('movie', 16),
 ('for', 17),
 ('but', 18),
 ('film', 19),
 ('on', 20),
 ('you', 21),
 ('t', 22),
 ('not', 23),
 ('are', 24),
 ('his', 25),
 ('he', 26),
 ('have', 27)]

In [22]:
list(inverse_tokens_dictionary.items())[:3]

[(1, 'the'), (2, 'and'), (3, 'a')]

In [19]:
import pickle

with open('pickle\\tokens_dicts.pickle', 'wb') as f:
    pickle.dump([tokens_dictionary,inverse_tokens_dictionary], f)
f.close()

In [20]:
import pickle

with open('pickle\\tokens_dicts.pickle', 'rb') as f:
    tokens_dictionary,inverse_tokens_dictionary = pickle.load(f)
f.close()

In [24]:
embedding_matrix = np.zeros(shape = (MAXLEN+1, 300), dtype= 'float32')
#embedding_matrix = np.zeros(shape = (len(tokenizer.word_index) + 1, 300), dtype= 'float32')

for w, i in tokenizer.word_index.items():
    if w in embeddings_dict and i < MAXLEN+1:
        embedding_matrix[i,:] = embeddings_dict[w]

In [22]:
import os

os.makedirs('numpy_files', exist_ok=True)
np.save('numpy_files\embedding_matrix.npy', embedding_matrix)

In [23]:
embedding_matrix = np.load('numpy_files\embedding_matrix.npy')

In [24]:
#from sklearn.preprocessing import normalize #minmax_scale

#embedding_matrix = minmax_scale(embedding_matrix, feature_range=(0, 1), axis = 1)

In [25]:
#len(embeddings_dict['king'])

In [25]:
embedding_matrix

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.022847, -0.01317 , -0.025261, ..., -0.039248,  0.001481,
         0.055489],
       [ 0.012515, -0.019482, -0.005424, ..., -0.079507,  0.019481,
        -0.01417 ],
       ...,
       [-0.039857,  0.030305,  0.01695 , ...,  0.038111, -0.062805,
        -0.033791],
       [-0.012134,  0.049292, -0.029643, ...,  0.094657, -0.053364,
         0.007016],
       [ 0.08014 , -0.018318,  0.038116, ...,  0.099899,  0.062242,
         0.08629 ]], dtype=float32)

In [27]:
'''
embedding_matrix = np.zeros((len(embeddings_dict), 300))

for w, emb in embeddings_dict.items():
    embedding_matrix[words_glove_dictionary[w],:] = emb
'''

'\nembedding_matrix = np.zeros((len(embeddings_dict), 300))\n\nfor w, emb in embeddings_dict.items():\n    embedding_matrix[words_glove_dictionary[w],:] = emb\n'

In [28]:
#embedding_matrix.shape

In [26]:
embedding_matrix.shape

(65714, 300)

In [25]:
from sklearn.metrics.pairwise import cosine_distances
from scipy import sparse

distance_matrix = cosine_distances(embedding_matrix, embedding_matrix)

In [26]:
'''
from scipy import spatial

def compute_euclidean_distance(X):
    V = spatial.distance.pdist(X.T, 'sqeuclidean')
    return spatial.distance.squareform(V)

distance_matrix = compute_euclidean_distance(embedding_matrix)
#distance_matrix = spatial.distance_matrix(embedding_matrix, embedding_matrix)
'''

"\nfrom scipy import spatial\n\ndef compute_euclidean_distance(X):\n    V = spatial.distance.pdist(X.T, 'sqeuclidean')\n    return spatial.distance.squareform(V)\n\ndistance_matrix = compute_euclidean_distance(embedding_matrix)\n#distance_matrix = spatial.distance_matrix(embedding_matrix, embedding_matrix)\n"

In [27]:
distance_matrix

array([[0.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.        , 0.6060601 , ..., 1.1508511 , 1.1063254 ,
        0.95153755],
       [1.        , 0.6060601 , 0.        , ..., 1.036566  , 1.0173967 ,
        0.9427275 ],
       ...,
       [1.        , 1.1508511 , 1.036566  , ..., 0.        , 0.83871764,
        0.94371986],
       [1.        , 1.1063254 , 1.0173967 , ..., 0.83871764, 0.        ,
        0.94929343],
       [1.        , 0.95153755, 0.9427275 , ..., 0.94371986, 0.94929343,
        0.        ]], dtype=float32)

In [33]:
import os

os.makedirs('numpy_files', exist_ok=True)
np.save('numpy_files\distance_matrix.npy', distance_matrix)

#distance_matrix = np.load('numpy_files\distance_matrix.npy')

In [11]:
distance_matrix = np.load('numpy_files\distance_matrix.npy')

In [34]:
tokens_dictionary['king']

689

In [35]:
#max(distance_matrix[:,10])

In [36]:
'''with open('pickle\\distance_matrix.pickle', 'wb') as f:
    pickle.dump(distance_matrix, f)
f.close()'''

"with open('pickle\\distance_matrix.pickle', 'wb') as f:\n    pickle.dump(distance_matrix, f)\nf.close()"

In [37]:
#distance_matrix.shape

In [38]:
'''from sklearn.preprocessing import normalize

normalized_distance_matrix = normalize(distance_matrix, axis = 1, norm = 'l1')'''

"from sklearn.preprocessing import normalize\n\nnormalized_distance_matrix = normalize(distance_matrix, axis = 1, norm = 'l1')"

In [28]:
def most_similar(word, delta = 0.5, num_words = 20):
    
    try:
        index = tokens_dictionary[word]
    except:
        return []
    
    if (index > distance_matrix.shape[0]):
        return []
    
    dist_order = np.argsort(distance_matrix[index,:])[1:num_words+1]
    dist_list = distance_matrix[index][dist_order]
    
    mask = np.ones_like(dist_list)
    mask = np.where(dist_list < delta)
    return dist_order[mask]#, dist_list[mask]

In [40]:
'''def most_similar(word, delta = 0.5, num_words = 20):
    
    try:
        index = tokenizer.word_index[word]
    except:
        return [], []
    
    if (index > distance_matrix.shape[0]):
        return [], []
    
    dist_order = np.argsort(distance_matrix[index,:])[1:num_words+1]
    dist_list = distance_matrix[index][dist_order]
    
    print(dist_order)
    print(dist_list)
    
    #return dist_order, dist_list

    #if dist_list[-1] == 0:
    #    return [], []
    
    mask = np.ones_like(dist_list)
    #print(mask)
    mask = np.where(dist_list < delta)
    return dist_order[mask], dist_list[mask]'''

'def most_similar(word, delta = 0.5, num_words = 20):\n    \n    try:\n        index = tokenizer.word_index[word]\n    except:\n        return [], []\n    \n    if (index > distance_matrix.shape[0]):\n        return [], []\n    \n    dist_order = np.argsort(distance_matrix[index,:])[1:num_words+1]\n    dist_list = distance_matrix[index][dist_order]\n    \n    print(dist_order)\n    print(dist_list)\n    \n    #return dist_order, dist_list\n\n    #if dist_list[-1] == 0:\n    #    return [], []\n    \n    mask = np.ones_like(dist_list)\n    #print(mask)\n    mask = np.where(dist_list < delta)\n    return dist_order[mask], dist_list[mask]'

In [29]:
a = most_similar('great')

In [30]:
a

array([ 1898,  3156,  1218,  6437,  1241, 29888,  1921,   379,  4199,
         806,  2906,  3300,  7644, 19952, 10235,  9447,   617,  1104,
       28723,   303], dtype=int64)

In [31]:
[inverse_tokens_dictionary[index] for index in a]

['grand',
 'tremendous',
 'awesome',
 'phenomenal',
 'terrific',
 'prodigious',
 'magnificent',
 'wonderful',
 'splendid',
 'fantastic',
 'marvelous',
 'fabulous',
 'marvellous',
 'whopping',
 'wondrous',
 'formidable',
 'huge',
 'super',
 'resplendent',
 'excellent']

In [44]:
################################################################

In [94]:
### ALTERNATIVE ###
from scipy import spatial

def find_nearest_neighbours(word, n = 20, delta = 0.5):
    embedding = embeddings_dict[word]
    return sorted(embeddings_dict.keys(), key=lambda w: spatial.distance.cosine(embeddings_dict[w], embedding))[1:20+1]

In [45]:
find_nearest_neighbours('fear')

NameError: name 'find_nearest_neighbours' is not defined

## GOOGLE 1 BILLION WORDS LANGUAGE MODEL

To filter out words that are not in the context of a sentence


https://github.com/tensorflow/models/tree/archive/research/lm_1b

DOWNLOADS:<br>
* http://download.tensorflow.org/models/LM_LSTM_CNN/graph-2016-09-10.pbtxt
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-base
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-char-embedding
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-lstm
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax0
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax1
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax2
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax3
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax4
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax5
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax6
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax7
* http://download.tensorflow.org/models/LM_LSTM_CNN/all_shards-2016-09-10/ckpt-softmax8
* http://download.tensorflow.org/models/LM_LSTM_CNN/vocab-2016-09-10.txt

Il codice nella seguente cella è modificato da una base il cui copyright è espresso di seguito:

```
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
```

In [32]:
import licensed_scripts.lm_1b_eval as google_language_model_utils

In [33]:
google_language_model = google_language_model_utils.LM()

LM vocab loading done


Recovering graph.


INFO:tensorflow:Recovering Graph google_language_model\graph-2016-09-10.pbtxt


Recovering checkpoint google_language_model\ckpt-*


In [34]:
nearest_indexes = most_similar('beautiful')
nearest_words = [inverse_tokens_dictionary[index] for index in nearest_indexes]
print('Closest to having are %s' %(nearest_words))

Closest to having are ['gorgeous', 'wonderful', 'splendid', 'handsome', 'resplendent', 'wondrous', 'marvelous', 'marvellous', 'fantastic', 'sumptuous', 'magnificent', 'terrific', 'lovely', 'ravishing', 'sublime', 'magnifique', 'exquisite', 'fabulous', 'delightful', 'superb']


In [35]:
nearest_words

['gorgeous',
 'wonderful',
 'splendid',
 'handsome',
 'resplendent',
 'wondrous',
 'marvelous',
 'marvellous',
 'fantastic',
 'sumptuous',
 'magnificent',
 'terrific',
 'lovely',
 'ravishing',
 'sublime',
 'magnifique',
 'exquisite',
 'fabulous',
 'delightful',
 'superb']

In [36]:
x_test[0]

'Unhinged follows the typical plot of the early 80\'s slasher trend. Pretty Young Girls In Peril. I have to give it up for the filmmaker who used a helicopter for some of the early road-trip shots, you actually think for a second there\'s going to be quality in the production. Watching "Unhinged" was like seeing an amateur acting class go through it\'s warm-up. Some of the most awkward, badly lit, overlong scenes are played out with the gusto of a Valium overdose. I wondered why they didn\'t just put the cue-cards on camera so the actresses wouldn\'t have to constantly shift their gaze. The two main girls were obviously chosen for their T&A factor rather than talent. Laurel Munson as the main chick Terry is as exciting as watching paint dry. Two nude scenes make for an adolescent thrill. Janet Penner and Virginia Settle as the crazy/creepy daughter and mother the chicks find themselves stranded with compete for Worst Acting Ever. Long pauses, weird expressions, emphasis on the wrong wo

In [37]:
splitted_text = x_test[0].split()
splitted_text

['Unhinged',
 'follows',
 'the',
 'typical',
 'plot',
 'of',
 'the',
 'early',
 "80's",
 'slasher',
 'trend.',
 'Pretty',
 'Young',
 'Girls',
 'In',
 'Peril.',
 'I',
 'have',
 'to',
 'give',
 'it',
 'up',
 'for',
 'the',
 'filmmaker',
 'who',
 'used',
 'a',
 'helicopter',
 'for',
 'some',
 'of',
 'the',
 'early',
 'road-trip',
 'shots,',
 'you',
 'actually',
 'think',
 'for',
 'a',
 'second',
 "there's",
 'going',
 'to',
 'be',
 'quality',
 'in',
 'the',
 'production.',
 'Watching',
 '"Unhinged"',
 'was',
 'like',
 'seeing',
 'an',
 'amateur',
 'acting',
 'class',
 'go',
 'through',
 "it's",
 'warm-up.',
 'Some',
 'of',
 'the',
 'most',
 'awkward,',
 'badly',
 'lit,',
 'overlong',
 'scenes',
 'are',
 'played',
 'out',
 'with',
 'the',
 'gusto',
 'of',
 'a',
 'Valium',
 'overdose.',
 'I',
 'wondered',
 'why',
 'they',
 "didn't",
 'just',
 'put',
 'the',
 'cue-cards',
 'on',
 'camera',
 'so',
 'the',
 'actresses',
 "wouldn't",
 'have',
 'to',
 'constantly',
 'shift',
 'their',
 'gaze.',


In [38]:
import re
pattern = r"[\w]+|[^\s\w]"

re.findall(pattern, splitted_text[51])

['"', 'Unhinged', '"']

In [39]:
import nltk.data
import nltk

nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utente\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [52]:
import re
from scripts.preprocessing import Preprocesser
import nltk.data
import nltk


def substitute_with_best_word(test, sentence_position = 0, text_position = 3):
    
    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sentence_tokenizer.tokenize(x_test[0])
    
    splitted_text = sentences[sentence_position].split()
    
    pattern = r"[\w]+|[^\s\w]"

    splitted_word = re.findall(pattern, splitted_text[text_position])
    
    for i,w in enumerate(splitted_word):
        preprocessed_w = Preprocesser.test_data_preprocessing_for_tokenization(w, embeddings_dict)
        if len(preprocessed_w.split()):
            print('substituting: %s' %(preprocessed_w))
            nearest_indexes = most_similar(preprocessed_w)
            nearest_words = [inverse_tokens_dictionary[index] for index in nearest_indexes]
            if len(nearest_words):
                prefix = ' '.join(splitted_text[:text_position])
                print(prefix)
                suffix = ' '.join(splitted_text[text_position + 1:])
                print(suffix)
                lm_preds = google_language_model.get_words_probs(prefix, nearest_words, suffix)
                splitted_word[i] = nearest_words[np.argmax(lm_preds)]
    splitted_text[text_position] = ''.join(splitted_word)
    sentences[sentence_position] = ' '.join(splitted_text)
    return ' '.join(sentences)

In [53]:
'''
import re
from scripts.preprocessing import Preprocesser

def substitute_with_best_word(test, text_position = 10, offset  = 1):
    splitted_text = test.split()
    
    pattern = r"[\w]+|[^\s\w]"

    splitted_word = re.findall(pattern, splitted_text[text_position])
    
    for i,w in enumerate(splitted_word):
        preprocessed_w = Preprocesser.test_data_preprocessing_for_tokenization(w)
        if len(preprocessed_w.split()):
            print('substituting: %s' %(preprocessed_w))
            nearest_indexes = most_similar(preprocessed_w)
            nearest_words = [inverse_tokens_dictionary[index] for index in nearest_indexes]
            if len(nearest_words):
                minimum = 0 if text_position-offset < 0 else text_position-offset
                maximum = len(splitted_text) - 1 if text_position+offset+1 > len(splitted_text) - 1 else text_position+offset+1  
                prefix = ' '.join(splitted_text[minimum : text_position])
                print(prefix)
                suffix = ' '.join(splitted_text[text_position + 1 : maximum])
                print(suffix)
                lm_preds = google_language_model.get_words_probs(prefix, nearest_words, suffix)
                splitted_word[i] = nearest_words[np.argmax(lm_preds)]
    splitted_text[text_position] = ''.join(splitted_word)
    return ' '.join(splitted_text)
    
'''

'\nimport re\nfrom scripts.preprocessing import Preprocesser\n\ndef substitute_with_best_word(test, text_position = 10, offset  = 1):\n    splitted_text = test.split()\n    \n    pattern = r"[\\w]+|[^\\s\\w]"\n\n    splitted_word = re.findall(pattern, splitted_text[text_position])\n    \n    for i,w in enumerate(splitted_word):\n        preprocessed_w = Preprocesser.test_data_preprocessing_for_tokenization(w)\n        if len(preprocessed_w.split()):\n            print(\'substituting: %s\' %(preprocessed_w))\n            nearest_indexes = most_similar(preprocessed_w)\n            nearest_words = [inverse_tokens_dictionary[index] for index in nearest_indexes]\n            if len(nearest_words):\n                minimum = 0 if text_position-offset < 0 else text_position-offset\n                maximum = len(splitted_text) - 1 if text_position+offset+1 > len(splitted_text) - 1 else text_position+offset+1  \n                prefix = \' \'.join(splitted_text[minimum : text_position])\n      

In [54]:
x_test[0]

'Unhinged follows the typical plot of the early 80\'s slasher trend. Pretty Young Girls In Peril. I have to give it up for the filmmaker who used a helicopter for some of the early road-trip shots, you actually think for a second there\'s going to be quality in the production. Watching "Unhinged" was like seeing an amateur acting class go through it\'s warm-up. Some of the most awkward, badly lit, overlong scenes are played out with the gusto of a Valium overdose. I wondered why they didn\'t just put the cue-cards on camera so the actresses wouldn\'t have to constantly shift their gaze. The two main girls were obviously chosen for their T&A factor rather than talent. Laurel Munson as the main chick Terry is as exciting as watching paint dry. Two nude scenes make for an adolescent thrill. Janet Penner and Virginia Settle as the crazy/creepy daughter and mother the chicks find themselves stranded with compete for Worst Acting Ever. Long pauses, weird expressions, emphasis on the wrong wo

In [55]:
substitute_with_best_word(x_test[0])

substituting: typical
Unhinged follows the
plot of the early 80's slasher trend.


'Unhinged follows the usual plot of the early 80\'s slasher trend. Pretty Young Girls In Peril. I have to give it up for the filmmaker who used a helicopter for some of the early road-trip shots, you actually think for a second there\'s going to be quality in the production. Watching "Unhinged" was like seeing an amateur acting class go through it\'s warm-up. Some of the most awkward, badly lit, overlong scenes are played out with the gusto of a Valium overdose. I wondered why they didn\'t just put the cue-cards on camera so the actresses wouldn\'t have to constantly shift their gaze. The two main girls were obviously chosen for their T&A factor rather than talent. Laurel Munson as the main chick Terry is as exciting as watching paint dry. Two nude scenes make for an adolescent thrill. Janet Penner and Virginia Settle as the crazy/creepy daughter and mother the chicks find themselves stranded with compete for Worst Acting Ever. Long pauses, weird expressions, emphasis on the wrong word

In [114]:
nearest_indexes = most_similar('trend')
nearest_words = [inverse_tokens_dictionary[index] for index in nearest_indexes]
print('Closest to having are %s' %(nearest_words))

Closest to having are ['trends', 'tendencies', 'tendency', 'tended', 'tending', 'tend', 'propensity', 'inclination', 'proclivities', 'leanings', 'penchant', 'tends', 'orientation', 'directions', 'sensibilities', 'evolution', 'drift', 'thrusts', 'gravitate', 'minded']


In [83]:
prefix = "I am a"
suffix = 'man'
lm_preds = google_language_model.get_words_probs(prefix, nearest_words, suffix)
print('most probable is ', nearest_words[np.argmax(lm_preds)])

ValueError: attempt to get argmax of an empty sequence

In [84]:
for index in np.argsort(lm_preds)[::-1]:
    print(nearest_words[index]) 