In [None]:
!pip install pandas==0.25.3
!pip install numpy==1.17.3
!pip install Keras==2.3.1
!pip install tensorflow==2.0.0
!pip install tqdm==4.43.0
!pip install matplotlib==3.1.3

In [11]:
import re
import numpy as np

def create_unique_word_dict(text:list) -> dict:
    """
    A method that creates a dictionary where the keys are unique words
    and key values are indices
    """
    # Getting all the unique words from our text and sorting them alphabetically
    words = list(set(text))
    words.sort()

    # Creating the dictionary for the unique words
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({
            word: i
        })
    print(unique_word_dict)
    return unique_word_dict    

def text_preprocessing(
    text:list,
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_“~''',
    stop_words=['and', 'a', 'is', 'the', 'in', 'be', 'will']
    )->list:
    """
    A method to preproces text
    """
    for x in text.lower(): 
        if x in punctuations: 
            text = text.replace(x, "")

    # Removing words that have numbers in them
    text = re.sub(r'\w*\d\w*', '', text)

    # Removing digits
    text = re.sub(r'[0-9]+', '', text)

    # Cleaning the whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Setting every word to lower
    text = text.lower()

    # Converting all our text to a list 
    text = text.split(' ')

    # Droping empty strings
    text = [x for x in text if x!='']

    # Droping stop words
    text = [x for x in text if x not in stop_words]

    return text

# Functions to find the most similar word 
def euclidean(vec1:np.array, vec2:np.array) -> float:
    """
    A function to calculate the euclidean distance between two vectors
    """
    return np.sqrt(np.sum((vec1 - vec2)**2))

def find_similar(word:str, embedding_dict:dict, top_n=10)->list:
    """
    A method to find the most similar word based on the learnt embeddings
    """
    dist_dict = {}
    word_vector = embedding_dict.get(word, [])
    if len(word_vector) > 0:
        for key, value in embedding_dict.items():
            if key!=word:
                dist = euclidean(word_vector, value)
                dist_dict.update({
                    key: dist
                })

        return sorted(dist_dict.items(), key=lambda x: x[1])[0:top_n]       

In [6]:
texts

['The future king is the prince',
 'Daughter is the princess ',
 'Son is the prince',
 'Only a man can be a king ',
 'Only a woman can be a queen',
 'The princess will be a queen',
 'Queen and king rule the realm',
 'The prince is a strong man',
 'The princess is a beautiful woman ',
 'The royal family is the king and queen and their children',
 'Prince is only a boy now',
 'A boy will be a man']

In [12]:
import itertools
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm

# Drawing the embeddings
import matplotlib.pyplot as plt

# Deep learning: 
from keras.models import Input, Model
from keras.layers import Dense

from scipy import sparse

# Custom functions
#from utility import text_preprocessing, create_unique_word_dict

# Reading the text from the input folder
texts = pd.read_csv('input/sample.csv')
texts = [x for x in texts['text']]

# Defining the window for context
window = 2

# Creating a placeholder for the scanning of the word list
word_lists = []
all_text = []

for text in texts:

    # Cleaning the text
    text = text_preprocessing(text)
    print(text)
    # Appending to the all text list
    all_text += text 

    # Creating a context dictionary
    for i, word in enumerate(text):
        print("......i....." ,i,"....word....",word)
        for w in range(3):
            print(w)
            # Getting the context that is ahead by *window* words
            if i + 1 + w < len(text): 
                word_lists.append([word] + [text[(i + 1 + w)]])
            # Getting the context that is behind by *window* words    
            if i - w - 1 >= 0:
                word_lists.append([word] + [text[(i - w - 1)]])

                
print("word list " , word_lists)
unique_word_dict = create_unique_word_dict(all_text)
print("unique_word_dict",unique_word_dict)

# Defining the number of features (unique words)
n_words = len(unique_word_dict)

# Getting all the unique words 
words = list(unique_word_dict.keys())

print("words",words)

# Creating the X and Y matrices using one hot encoding
X = []
Y = []

for i, word_list in tqdm(enumerate(word_lists)):
    # Getting the indices
    
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])
    print("......i....." ,i,"....word_list....",word_list , "word_list[0]",word_list[0],"word_list[1] ",word_list[1])
    # Creating the placeholders   
    X_row = np.zeros(n_words)
    Y_row = np.zeros(n_words)

    # One hot encoding the main word
    X_row[main_word_index] = 1

    # One hot encoding the Y matrix words 
    Y_row[context_word_index] = 1

    # Appending to the main matrices
    X.append(X_row)
    Y.append(Y_row)

# Converting the matrices into a sparse format because the vast majority of the data are 0s
X = sparse.csr_matrix(X)
Y = sparse.csr_matrix(Y)

# Defining the size of the embedding
embed_size = 2

# Defining the neural network
inp = Input(shape=(X.shape[1],))
x = Dense(units=embed_size, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

# Optimizing the network weights
model.fit(
    x=X, 
    y=Y, 
    batch_size=256,
    epochs=1000
    )

# Obtaining the weights from the neural network. 
# These are the so called word embeddings

# The input layer 
weights = model.get_weights()[0]

# Creating a dictionary to store the embeddings in. The key is a unique word and 
# the value is the numeric vector
embedding_dict = {}
for word in words: 
    embedding_dict.update({
        word: weights[unique_word_dict.get(word)]
        })

# Ploting the embeddings
plt.figure(figsize=(10, 10))
for word in list(unique_word_dict.keys()):
    coord = embedding_dict.get(word)
    plt.scatter(coord[0], coord[1])
    plt.annotate(word, (coord[0], coord[1]))       

# Saving the embedding vector to a txt file
try:
    os.mkdir(f'{os.getcwd()}\\output')        
except Exception as e:
    print(f'Cannot create output folder: {e}')

with open(f'{os.getcwd()}\\output\\embedding.txt', 'w') as f:
    for key, value in embedding_dict.items():
        try:
            f.write(f'{key}: {value}\n')   
        except Exception as e:
            print(f'Cannot write word {key} to dict: {e}') 

['future', 'king', 'prince']
......i..... 0 ....word.... future
0
1
2
......i..... 1 ....word.... king
0
1
2
......i..... 2 ....word.... prince
0
1
2
['daughter', 'princess']
......i..... 0 ....word.... daughter
0
1
2
......i..... 1 ....word.... princess
0
1
2
['son', 'prince']
......i..... 0 ....word.... son
0
1
2
......i..... 1 ....word.... prince
0
1
2
['only', 'man', 'can', 'king']
......i..... 0 ....word.... only
0
1
2
......i..... 1 ....word.... man
0
1
2
......i..... 2 ....word.... can
0
1
2
......i..... 3 ....word.... king
0
1
2
['only', 'woman', 'can', 'queen']
......i..... 0 ....word.... only
0
1
2
......i..... 1 ....word.... woman
0
1
2
......i..... 2 ....word.... can
0
1
2
......i..... 3 ....word.... queen
0
1
2
['princess', 'queen']
......i..... 0 ....word.... princess
0
1
2
......i..... 1 ....word.... queen
0
1
2
['queen', 'king', 'rule', 'realm']
......i..... 0 ....word.... queen
0
1
2
......i..... 1 ....word.... king
0
1
2
......i..... 2 ....word.... rule
0
1
2
......i.

98it [00:00, 631.99it/s]

......i..... 0 ....word_list.... ['future', 'king'] word_list[0] future word_list[1]  king
......i..... 1 ....word_list.... ['future', 'prince'] word_list[0] future word_list[1]  prince
......i..... 2 ....word_list.... ['king', 'prince'] word_list[0] king word_list[1]  prince
......i..... 3 ....word_list.... ['king', 'future'] word_list[0] king word_list[1]  future
......i..... 4 ....word_list.... ['prince', 'king'] word_list[0] prince word_list[1]  king
......i..... 5 ....word_list.... ['prince', 'future'] word_list[0] prince word_list[1]  future
......i..... 6 ....word_list.... ['daughter', 'princess'] word_list[0] daughter word_list[1]  princess
......i..... 7 ....word_list.... ['princess', 'daughter'] word_list[0] princess word_list[1]  daughter
......i..... 8 ....word_list.... ['son', 'prince'] word_list[0] son word_list[1]  prince
......i..... 9 ....word_list.... ['prince', 'son'] word_list[0] prince word_list[1]  son
......i..... 10 ....word_list.... ['only', 'man'] word_list[0]




Epoch 1/1000


2022-06-24 21:24:38.140825: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


TypeError: in user code:

    File "/Users/dms/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "/Users/dms/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/dms/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "/Users/dms/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 860, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/dms/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 918, in compute_loss
        return self.compiled_loss(
    File "/Users/dms/opt/anaconda3/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/dms/opt/anaconda3/lib/python3.9/site-packages/keras/losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/dms/opt/anaconda3/lib/python3.9/site-packages/keras/losses.py", line 242, in call  **
        y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(y_pred, y_true)
    File "/Users/dms/opt/anaconda3/lib/python3.9/site-packages/keras/utils/losses_utils.py", line 187, in squeeze_or_expand_dimensions
        y_true, y_pred = remove_squeezable_dimensions(
    File "/Users/dms/opt/anaconda3/lib/python3.9/site-packages/keras/utils/losses_utils.py", line 117, in remove_squeezable_dimensions
        labels = tf.convert_to_tensor(labels)

    TypeError: Failed to convert elements of SparseTensor(indices=Tensor("DeserializeSparse_1:0", shape=(None, 2), dtype=int64), values=Tensor("DeserializeSparse_1:1", shape=(None,), dtype=float32), dense_shape=Tensor("stack_1:0", shape=(2,), dtype=int64)) to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.


In [3]:
embedding_dict

NameError: name 'embedding_dict' is not defined

In [4]:
model.get_weights()[0]

array([[-0.18738952, -0.37835294],
       [ 0.2355836 ,  0.33867437],
       [-0.15907553,  0.39721954],
       [ 0.4589963 ,  0.08188581],
       [-0.38270926,  0.12757683],
       [ 0.2276771 ,  0.06439   ],
       [ 0.43437314,  0.0524224 ],
       [ 0.10861504,  0.12535983],
       [-0.0515455 , -0.14022624],
       [-0.4143938 ,  0.41141826],
       [-0.22932553, -0.16115797],
       [ 0.36094117, -0.02135694],
       [-0.12745845,  0.10817689],
       [-0.03604123, -0.36303377],
       [-0.07821628, -0.06119713],
       [-0.41177335,  0.26991826],
       [-0.49090514, -0.0929864 ],
       [ 0.21016878, -0.00394106],
       [-0.4260587 , -0.29772568],
       [ 0.0184477 ,  0.07268143],
       [-0.10561186, -0.43330798]], dtype=float32)

In [5]:
X.shape[1]

21

In [6]:
Y.shape[1]

21

In [7]:
n_words

21

In [8]:
unique_word_dict

{'beautiful': 0,
 'boy': 1,
 'can': 2,
 'children': 3,
 'daughter': 4,
 'family': 5,
 'future': 6,
 'king': 7,
 'man': 8,
 'now': 9,
 'only': 10,
 'prince': 11,
 'princess': 12,
 'queen': 13,
 'realm': 14,
 'royal': 15,
 'rule': 16,
 'son': 17,
 'strong': 18,
 'their': 19,
 'woman': 20}

In [None]:
len(word_lists)

In [None]:
[['future', 'king'], ['future', 'prince'], ['king', 'prince'], ['king', 'future'], 
  ['prince', 'king'], ['prince', 'future'], ['daughter', 'princess'], ['princess', 'daughter'],
  ['son', 'prince'], ['prince', 'son'], ['only', 'man'], ['only', 'can'], ['only', 'king'], ['man', 'can'],
  ['man', 'only'], ['man', 'king'], ['can', 'king'], ['can', 'man'], ['can', 'only'], ['king', 'can'], ['king', 'man'],
  ['king', 'only'], ['only', 'woman'], ['only', 'can'], ['only', 'queen'], ['woman', 'can'], ['woman', 'only'], 
  ['woman', 'queen'], ['can', 'queen'], ['can', 'woman'], ['can', 'only'], ['queen', 'can'], ['queen', 'woman'], 
  ['queen', 'only'], ['princess', 'queen'], ['queen', 'princess'], ['queen', 'king'], ['queen', 'rule'],
  ['queen', 'realm'], ['king', 'rule'], ['king', 'queen'], ['king', 'realm'], ['rule', 'realm'], 
  ['rule', 'king'], ['rule', 'queen'], ['realm', 'rule'], ['realm', 'king'], ['realm', 'queen'], ['prince', 'strong'], 
  ['prince', 'man'], ['strong', 'man'], ['strong', 'prince'], ['man', 'strong'], ['man', 'prince'], 
  ['princess', 'beautiful'], ['princess', 'woman'], ['beautiful', 'woman'], ['beautiful', 'princess'],
  ['woman', 'beautiful'], ['woman', 'princess'], ['royal', 'family'], ['royal', 'king'], ['royal', 'queen'], 
  ['family', 'king'], ['family', 'royal'], ['family', 'queen'], ['family', 'their'], ['king', 'queen'], 
  ['king', 'family'], ['king', 'their'], ['king', 'royal'], ['king', 'children'], ['queen', 'their'], ['queen', 'king'],
  ['queen', 'children'], ['queen', 'family'], ['queen', 'royal'], ['their', 'children'], ['their', 'queen'],
  ['their', 'king'], ['their', 'family'], ['children', 'their'], ['children', 'queen'], ['children', 'king'],
  ['prince', 'only'], ['prince', 'boy'], ['prince', 'now'], ['only', 'boy'], ['only', 'prince'], ['only', 'now'],
  ['boy', 'now'], ['boy', 'only'], ['boy', 'prince'], ['now', 'boy'], ['now', 'only'], ['now', 'prince'], 
  ['boy', 'man'], ['man', 'boy']]

In [15]:
word_lists

[['future', 'king'],
 ['future', 'prince'],
 ['king', 'prince'],
 ['king', 'future'],
 ['prince', 'king'],
 ['prince', 'future'],
 ['daughter', 'princess'],
 ['princess', 'daughter'],
 ['son', 'prince'],
 ['prince', 'son'],
 ['only', 'man'],
 ['only', 'can'],
 ['only', 'king'],
 ['man', 'can'],
 ['man', 'only'],
 ['man', 'king'],
 ['can', 'king'],
 ['can', 'man'],
 ['can', 'only'],
 ['king', 'can'],
 ['king', 'man'],
 ['king', 'only'],
 ['only', 'woman'],
 ['only', 'can'],
 ['only', 'queen'],
 ['woman', 'can'],
 ['woman', 'only'],
 ['woman', 'queen'],
 ['can', 'queen'],
 ['can', 'woman'],
 ['can', 'only'],
 ['queen', 'can'],
 ['queen', 'woman'],
 ['queen', 'only'],
 ['princess', 'queen'],
 ['queen', 'princess'],
 ['queen', 'king'],
 ['queen', 'rule'],
 ['queen', 'realm'],
 ['king', 'rule'],
 ['king', 'queen'],
 ['king', 'realm'],
 ['rule', 'realm'],
 ['rule', 'king'],
 ['rule', 'queen'],
 ['realm', 'rule'],
 ['realm', 'king'],
 ['realm', 'queen'],
 ['prince', 'strong'],
 ['prince', 'ma

In [14]:
text='Prince'
for i, word in enumerate(text):
        print("......i....." ,i,"....word....",word)
        for w in range(3):
            print(w)
            # Getting the context that is ahead by *window* words
            if i + 1 + w < len(text): 
                word_lists.append([word] + [text[(i + 1 + w)]])
            # Getting the context that is behind by *window* words    
            if i - w - 1 >= 0:
                word_lists.append([word] + [text[(i - w - 1)]])


......i..... 0 ....word.... P
0
1
2
......i..... 1 ....word.... r
0
1
2
......i..... 2 ....word.... i
0
1
2
......i..... 3 ....word.... n
0
1
2
......i..... 4 ....word.... c
0
1
2
......i..... 5 ....word.... e
0
1
2
