In [47]:
import numpy as np
import tensorflow as tf
import random
import os

# Single seed shared by every random number generator used in this notebook.
SEED = 123
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# assigning it here presumably only affects child processes launched later,
# not string hashing in the already-running kernel. Confirm, or set it before
# starting Jupyter.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)       # NumPy legacy global generator
random.seed(SEED)          # Python standard-library generator
tf.random.set_seed(SEED)   # TensorFlow global seed
# NOTE(review): this is set after `import tensorflow`; verify that the installed
# TensorFlow version still honours the variable at this point.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [48]:
from __future__ import absolute_import, division, print_function, unicode_literals
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [49]:
from scripts.blackBox import BlackBox

In [50]:
black_box = BlackBox()



In [51]:
import pickle

# Load the held-out split as a (texts, labels) pair.
# The path uses a Windows separator, so the notebook must be run from the
# project root on Windows for this file to be found.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('pickle\\data.pickle', 'rb') as f:
    x_test, y_test = pickle.load(f)

In [52]:
x_test[0]

'Unhinged follows the typical plot of the early 80\'s slasher trend. Pretty Young Girls In Peril. I have to give it up for the filmmaker who used a helicopter for some of the early road-trip shots, you actually think for a second there\'s going to be quality in the production. Watching "Unhinged" was like seeing an amateur acting class go through it\'s warm-up. Some of the most awkward, badly lit, overlong scenes are played out with the gusto of a Valium overdose. I wondered why they didn\'t just put the cue-cards on camera so the actresses wouldn\'t have to constantly shift their gaze. The two main girls were obviously chosen for their T&A factor rather than talent. Laurel Munson as the main chick Terry is as exciting as watching paint dry. Two nude scenes make for an adolescent thrill. Janet Penner and Virginia Settle as the crazy/creepy daughter and mother the chicks find themselves stranded with compete for Worst Acting Ever. Long pauses, weird expressions, emphasis on the wrong wo

In [53]:
y_test[0]

0

In [54]:
print(black_box.predict_sentiment(x_test[0]))

0.006194331


## Perturb Algorithm

In [55]:
population = list(zip(x_test,y_test))

In [56]:
population[:3]

[('Unhinged follows the typical plot of the early 80\'s slasher trend. Pretty Young Girls In Peril. I have to give it up for the filmmaker who used a helicopter for some of the early road-trip shots, you actually think for a second there\'s going to be quality in the production. Watching "Unhinged" was like seeing an amateur acting class go through it\'s warm-up. Some of the most awkward, badly lit, overlong scenes are played out with the gusto of a Valium overdose. I wondered why they didn\'t just put the cue-cards on camera so the actresses wouldn\'t have to constantly shift their gaze. The two main girls were obviously chosen for their T&A factor rather than talent. Laurel Munson as the main chick Terry is as exciting as watching paint dry. Two nude scenes make for an adolescent thrill. Janet Penner and Virginia Settle as the crazy/creepy daughter and mother the chicks find themselves stranded with compete for Worst Acting Ever. Long pauses, weird expressions, emphasis on the wrong 

In [57]:
population[0][0]
population[0][1]
#a,b = zip(*population)

0

## GLOVE

```
@inproceedings{pennington2014glove,
  author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning},
  booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},
  title = {GloVe: Global Vectors for Word Representation},
  year = {2014},
  pages = {1532--1543},
  url = {http://www.aclweb.org/anthology/D14-1162},
}
```

In [58]:
# Lookup table {word: float32 vector} parsed from the word-vector text file.
# Each non-empty line is "<word> <v1> <v2> ...", split on whitespace.
embeddings_dict = {}

# errors='ignore' silently drops bytes that are not valid UTF-8 instead of raising.
with open("counter_fitted_word_vectors\\counter-fitted-vectors.txt", "r",errors ='ignore', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline): nothing to parse.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_dict[word] = coefs
# No explicit f.close(): the `with` block has already closed the file.

In [59]:
from scripts.preprocessing import Preprocesser

# Apply the project's raw-text preprocessing to every review.
# NOTE(review): this overwrites x_test with its own preprocessed version, so
# re-running the cell preprocesses the text a second time. `population` was
# built earlier from the un-preprocessed x_test and is not refreshed here.
x_test = [Preprocesser.raw_text_preprocessing(sentence) for sentence in x_test]

In [60]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Vocabulary cap: passed to Tokenizer below and used later to size the
# embedding matrix (MAXLEN + 1 rows).
# NOTE(review): despite the name this looks like a vocabulary size
# (presumably Tokenizer's num_words), not a sequence length. Confirm.
MAXLEN = 10_000 #60_000

tokenizer = Tokenizer(MAXLEN)

# The vocabulary is fitted on the test texts themselves.
tokenizer.fit_on_texts(x_test)

# word_index is not truncated to MAXLEN: the printed size is 76229, so indices
# above MAXLEN exist and must be filtered by anything that indexes a
# MAXLEN-sized array.
print(len(tokenizer.word_index))

76229


In [61]:
# Forward map (word -> integer index): the tokenizer's own vocabulary dict.
tokens_dictionary = tokenizer.word_index

# Reverse map (integer index -> word), used to turn indices back into text.
inverse_tokens_dictionary = {
    token_id: token for token, token_id in tokenizer.word_index.items()
}

In [67]:
list(tokens_dictionary.items())[:3]

[('the', 1), ('and', 2), ('a', 3)]

In [68]:
list(inverse_tokens_dictionary.items())[:3]

[(1, 'the'), (2, 'and'), (3, 'a')]

In [69]:
import pickle

# Persist both vocabulary maps as a two-element list: [word->index, index->word].
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open('pickle\\tokens_dicts.pickle', 'wb') as f:
    pickle.dump([tokens_dictionary,inverse_tokens_dictionary], f)

In [70]:
# Row i holds the 300-d vector of the word whose tokenizer index is i.
# Rows stay all-zero for index 0 and for words missing from embeddings_dict.
embedding_matrix = np.zeros(shape=(MAXLEN + 1, 300), dtype='float32')

for token, token_id in tokenizer.word_index.items():
    # word_index also contains indices beyond the cap; skip those rows.
    if token_id > MAXLEN:
        continue
    vector = embeddings_dict.get(token)
    if vector is not None:
        embedding_matrix[token_id, :] = vector

In [71]:
import os

os.makedirs('numpy_files', exist_ok=True)
# Raw string: '\e' is not a valid escape sequence, so a normal literal triggers
# an invalid-escape warning. The resulting path value is unchanged.
np.save(r'numpy_files\embedding_matrix.npy', embedding_matrix)

In [72]:
# Reload the cached matrix. Raw string avoids the invalid '\e' escape while
# keeping the path value unchanged.
embedding_matrix = np.load(r'numpy_files\embedding_matrix.npy')

In [73]:
#from sklearn.preprocessing import normalize #minmax_scale

#embedding_matrix = minmax_scale(embedding_matrix, feature_range=(0, 1), axis = 1)

In [74]:
#len(embeddings_dict['king'])

In [75]:
embedding_matrix

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.022847, -0.01317 , -0.025261, ..., -0.039248,  0.001481,
         0.055489],
       [ 0.012515, -0.019482, -0.005424, ..., -0.079507,  0.019481,
        -0.01417 ],
       ...,
       [-0.001293, -0.011681, -0.02068 , ..., -0.013038, -0.032717,
         0.10921 ],
       [-0.051234, -0.010331, -0.043611, ..., -0.032198, -0.006921,
        -0.025031],
       [-0.033761, -0.095648, -0.071449, ...,  0.089715, -0.011441,
         0.021954]], dtype=float32)

In [76]:
'''
embedding_matrix = np.zeros((len(embeddings_dict), 300))

for w, emb in embeddings_dict.items():
    embedding_matrix[words_glove_dictionary[w],:] = emb
'''

'\nembedding_matrix = np.zeros((len(embeddings_dict), 300))\n\nfor w, emb in embeddings_dict.items():\n    embedding_matrix[words_glove_dictionary[w],:] = emb\n'

In [77]:
#embedding_matrix.shape

In [78]:
embedding_matrix.shape

(10001, 300)

In [79]:
from sklearn.metrics.pairwise import cosine_distances

# Pairwise cosine distance between every pair of embedding rows.
# embedding_matrix is (10001, 300) float32, so the result is a dense
# 10001 x 10001 float32 array (roughly 400 MB).
# In the displayed result, the all-zero row 0 is at distance 1.0 from every
# other row and 0.0 from itself.
distance_matrix = cosine_distances(embedding_matrix, embedding_matrix)

In [80]:
'''
from scipy import spatial

def compute_euclidean_distance(X):
    V = spatial.distance.pdist(X.T, 'sqeuclidean')
    return spatial.distance.squareform(V)

distance_matrix = compute_euclidean_distance(embedding_matrix)
#distance_matrix = spatial.distance_matrix(embedding_matrix, embedding_matrix)
'''

"\nfrom scipy import spatial\n\ndef compute_euclidean_distance(X):\n    V = spatial.distance.pdist(X.T, 'sqeuclidean')\n    return spatial.distance.squareform(V)\n\ndistance_matrix = compute_euclidean_distance(embedding_matrix)\n#distance_matrix = spatial.distance_matrix(embedding_matrix, embedding_matrix)\n"

In [81]:
distance_matrix

array([[0.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.        , 0.6060601 , ..., 0.9962219 , 0.87981826,
        1.0407182 ],
       [1.        , 0.6060601 , 0.        , ..., 0.9895136 , 0.8719094 ,
        0.8907523 ],
       ...,
       [1.        , 0.9962219 , 0.9895136 , ..., 0.        , 1.0547255 ,
        0.926983  ],
       [1.        , 0.87981826, 0.8719094 , ..., 1.0547255 , 0.        ,
        0.83378804],
       [1.        , 1.0407182 , 0.8907523 , ..., 0.926983  , 0.83378804,
        0.        ]], dtype=float32)

In [82]:
import os

os.makedirs('numpy_files', exist_ok=True)
# Raw string: '\d' is not a valid escape sequence, so a normal literal triggers
# an invalid-escape warning. The resulting path value is unchanged.
np.save(r'numpy_files\distance_matrix.npy', distance_matrix)

#distance_matrix = np.load('numpy_files\distance_matrix.npy')

In [83]:
# Reload the cached matrix. Raw string avoids the invalid '\d' escape while
# keeping the path value unchanged.
distance_matrix = np.load(r'numpy_files\distance_matrix.npy')

In [84]:
tokens_dictionary['king']

784

In [85]:
#max(distance_matrix[:,10])

In [86]:
'''with open('pickle\\distance_matrix.pickle', 'wb') as f:
    pickle.dump(distance_matrix, f)
f.close()'''

"with open('pickle\\distance_matrix.pickle', 'wb') as f:\n    pickle.dump(distance_matrix, f)\nf.close()"

In [87]:
#distance_matrix.shape

In [88]:
'''from sklearn.preprocessing import normalize

normalized_distance_matrix = normalize(distance_matrix, axis = 1, norm = 'l1')'''

"from sklearn.preprocessing import normalize\n\nnormalized_distance_matrix = normalize(distance_matrix, axis = 1, norm = 'l1')"

In [105]:
def most_similar(word, delta = 0.5, num_words = 20):
    """Return the token indices of the closest neighbours of ``word``.

    Reads the module-level ``tokens_dictionary`` (word -> index) and
    ``distance_matrix`` (square matrix of pairwise distances between indices).

    Parameters
    ----------
    word : str
        Word to look up.
    delta : float
        Only neighbours whose distance is strictly below this value are kept.
    num_words : int
        Maximum number of nearest candidates examined before applying ``delta``.

    Returns
    -------
    list or numpy.ndarray
        An empty list when ``word`` is not in the vocabulary or its index has
        no row in ``distance_matrix``; otherwise a 1-D array of neighbour
        indices ordered from nearest to farthest.
    """
    index = tokens_dictionary.get(word)
    if index is None:
        # Unknown word: nothing to substitute.
        return []

    # Valid rows are 0 .. shape[0]-1, so an index equal to shape[0] is already
    # out of range (the vocabulary can be larger than the matrix).
    if index >= distance_matrix.shape[0]:
        return []

    # Position 0 of the ordering is skipped: it is expected to be the word
    # itself, at distance 0.
    dist_order = np.argsort(distance_matrix[index, :])[1:num_words + 1]
    dist_list = distance_matrix[index][dist_order]

    return dist_order[dist_list < delta]#, dist_list[mask]

In [106]:
'''def most_similar(word, delta = 0.5, num_words = 20):
    
    try:
        index = tokenizer.word_index[word]
    except:
        return [], []
    
    if (index > distance_matrix.shape[0]):
        return [], []
    
    dist_order = np.argsort(distance_matrix[index,:])[1:num_words+1]
    dist_list = distance_matrix[index][dist_order]
    
    print(dist_order)
    print(dist_list)
    
    #return dist_order, dist_list

    #if dist_list[-1] == 0:
    #    return [], []
    
    mask = np.ones_like(dist_list)
    #print(mask)
    mask = np.where(dist_list < delta)
    return dist_order[mask], dist_list[mask]'''

'def most_similar(word, delta = 0.5, num_words = 20):\n    \n    try:\n        index = tokenizer.word_index[word]\n    except:\n        return [], []\n    \n    if (index > distance_matrix.shape[0]):\n        return [], []\n    \n    dist_order = np.argsort(distance_matrix[index,:])[1:num_words+1]\n    dist_list = distance_matrix[index][dist_order]\n    \n    print(dist_order)\n    print(dist_list)\n    \n    #return dist_order, dist_list\n\n    #if dist_list[-1] == 0:\n    #    return [], []\n    \n    mask = np.ones_like(dist_list)\n    #print(mask)\n    mask = np.where(dist_list < delta)\n    return dist_order[mask], dist_list[mask]'

In [125]:
a = most_similar('fear')

In [126]:
a

array([7706, 1650, 1829, 2581, 5315, 5009, 5396, 2409, 3356, 5998, 8992,
       6061, 3680, 4962,  179, 4288,  679, 9200, 4037, 9026], dtype=int64)

In [127]:
[inverse_tokens_dictionary[index] for index in a]

['fright',
 'afraid',
 'scared',
 'scare',
 'frightened',
 'panic',
 'angst',
 'terror',
 'worry',
 'terrified',
 'anxiety',
 'dread',
 'worried',
 'bang',
 'horror',
 'concern',
 'scary',
 'feared',
 'fears',
 'worries']

In [93]:
################################################################

In [94]:
### ALTERNATIVE ###
from scipy import spatial

def find_nearest_neighbours(word, n = 20, delta = 0.5):
    """Return the ``n`` words closest to ``word`` by cosine distance.

    Scans every entry of the module-level ``embeddings_dict`` and sorts the
    whole vocabulary by distance, so each call is a full pass over the
    embedding table.

    Parameters
    ----------
    word : str
        Query word; must be a key of ``embeddings_dict``.
    n : int
        Number of neighbours to return.
    delta : float
        Accepted for signature parity with ``most_similar`` but not applied:
        no distance threshold is enforced here.

    Returns
    -------
    list of str
        Up to ``n`` words ordered from nearest to farthest. The first entry of
        the ordering is dropped because it is expected to be the query word
        itself.

    Raises
    ------
    KeyError
        If ``word`` is not in ``embeddings_dict``.
    """
    embedding = embeddings_dict[word]
    ranked = sorted(
        embeddings_dict.keys(),
        key=lambda w: spatial.distance.cosine(embeddings_dict[w], embedding),
    )
    # Honour `n` instead of the previously hard-coded 20.
    return ranked[1:n + 1]

In [128]:
find_nearest_neighbours('fear')

['fright',
 'afraid',
 'scared',
 'scare',
 'frightened',
 'panic',
 'fearful',
 'affraid',
 'angst',
 'terror',
 'worry',
 'terrified',
 'freaked',
 'spooked',
 'anxiety',
 'frighten',
 'dread',
 'trepidation',
 'worried',
 'apprehensive']