#### Loading Data

In [1]:
# Read the corpus, one sentence per line. The context manager closes the
# file even when readlines() raises, and the explicit encoding keeps the
# decoding independent of the platform's default locale.
with open("data.txt", "r", encoding="utf-8") as file:
    data = file.readlines()

In [2]:
# Show the raw lines exactly as read from data.txt (newlines still attached).
data

['The future king is the prince\n',
 'Daughter is the princess\n',
 'Son is the prince\n',
 'Only a man can be a king\n',
 'Only a woman can be a queen\n',
 'The princess will be a queen\n',
 'The prince is a strong man\n',
 'The princess is a beautiful woman\n',
 'Prince is only a boy now\n',
 'Prince will be king\n',
 'A boy will be a man']

#### Removing '\n' from the end of every sentence and converting each sentence to lowercase

In [3]:
# Normalise every sentence in place: lowercase it and remove the newline
# character that each line read from the file still carries.
data[:] = [sentence.lower().replace('\n', '') for sentence in data]
print(data)

['the future king is the prince', 'daughter is the princess', 'son is the prince', 'only a man can be a king', 'only a woman can be a queen', 'the princess will be a queen', 'the prince is a strong man', 'the princess is a beautiful woman', 'prince is only a boy now', 'prince will be king', 'a boy will be a man']


#### Removing stop words

In [5]:
# Words that are dropped from every sentence before the word pairs are built.
stopwords = ['the', 'is', 'will', 'be', 'a', 'only', 'can', 'their', 'now', 'and', 'at', 'it']

# One token list per sentence: split on whitespace, keep the original word
# order, and leave out every token that appears in `stopwords`.
filtered_data = [
    [token for token in sentence.split() if token not in stopwords]
    for sentence in data
]

In [6]:
# Show the per-sentence token lists that remain after stop-word removal.
filtered_data

[['future', 'king', 'prince'],
 ['daughter', 'princess'],
 ['son', 'prince'],
 ['man', 'king'],
 ['woman', 'queen'],
 ['princess', 'queen'],
 ['prince', 'strong', 'man'],
 ['princess', 'beautiful', 'woman'],
 ['prince', 'boy'],
 ['prince', 'king'],
 ['boy', 'man']]

#### Creating Bigrams

In [10]:
# Within each sentence, pair every word with every word that follows it
# (not only its direct neighbour) and store the pair in both orders.
bigrams = []
for word_list in filtered_data:
    for position, first in enumerate(word_list):
        for second in word_list[position + 1:]:
            bigrams.extend(([first, second], [second, first]))

In [11]:
# Show every [word, other_word] pair; each pair appears in both orders.
bigrams

[['future', 'king'],
 ['king', 'future'],
 ['future', 'prince'],
 ['prince', 'future'],
 ['king', 'prince'],
 ['prince', 'king'],
 ['daughter', 'princess'],
 ['princess', 'daughter'],
 ['son', 'prince'],
 ['prince', 'son'],
 ['man', 'king'],
 ['king', 'man'],
 ['woman', 'queen'],
 ['queen', 'woman'],
 ['princess', 'queen'],
 ['queen', 'princess'],
 ['prince', 'strong'],
 ['strong', 'prince'],
 ['prince', 'man'],
 ['man', 'prince'],
 ['strong', 'man'],
 ['man', 'strong'],
 ['princess', 'beautiful'],
 ['beautiful', 'princess'],
 ['princess', 'woman'],
 ['woman', 'princess'],
 ['beautiful', 'woman'],
 ['woman', 'beautiful'],
 ['prince', 'boy'],
 ['boy', 'prince'],
 ['prince', 'king'],
 ['king', 'prince'],
 ['boy', 'man'],
 ['man', 'boy']]

#### Get Vocabulary

In [16]:
# Vocabulary: every distinct word that occurs in at least one pair,
# deduplicated through a set and returned in alphabetical order.
all_words = sorted({word for bigram in bigrams for word in bigram})
print("Total number of unique words are:", len(all_words))
all_words

Total number of unique words are: 12


['beautiful',
 'boy',
 'daughter',
 'future',
 'king',
 'man',
 'prince',
 'princess',
 'queen',
 'son',
 'strong',
 'woman']

#### Creating a dictionary that maps each word to an index

In [17]:
# Map each vocabulary word to its position in the sorted word list
# (first word -> 0, second word -> 1, ...).
words_dict = {word: index for index, word in enumerate(all_words)}
print(words_dict)

{'beautiful': 0, 'boy': 1, 'daughter': 2, 'future': 3, 'king': 4, 'man': 5, 'prince': 6, 'princess': 7, 'queen': 8, 'son': 9, 'strong': 10, 'woman': 11}


#### Performing one-hot encoding

In [18]:
import numpy as np

In [23]:
# Square identity matrix with one row per vocabulary word: row k holds a
# single 1.0 at column k and zeros elsewhere, i.e. the one-hot vector of
# the k-th word.
onehot_data = np.eye(len(all_words))

# Pair every word with the matrix row at the same position.
onehot_dict = dict(zip(all_words, onehot_data))
onehot_dict

{'beautiful': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'boy': array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'daughter': array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'future': array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'king': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]),
 'man': array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]),
 'prince': array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]),
 'princess': array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
 'queen': array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]),
 'son': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]),
 'strong': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]),
 'woman': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])}

In [24]:
# Print one "word : vector" line per vocabulary entry.
for word, vector in onehot_dict.items():
    print(word, ":", vector)

beautiful : [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
boy : [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
daughter : [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
future : [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
king : [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
man : [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
prince : [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
princess : [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
queen : [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
son : [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
strong : [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
woman : [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


In [27]:
# Training data, one row per pair: X holds the one-hot vector of the pair's
# first word, Y the one-hot vector of its second word.
X = np.array([onehot_dict[pair[0]] for pair in bigrams])
Y = np.array([onehot_dict[pair[1]] for pair in bigrams])

[array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]), array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]), array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0.,

In [28]:
# Sanity check: (number of word pairs, vocabulary size).
Y.shape

(34, 12)

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Width of the hidden layer, i.e. the number of components in each learned
# word vector.
embed_size = 2

# Two stacked Dense layers: a linear projection down to `embed_size` units,
# followed by a softmax over as many outputs as Y has columns.
model = Sequential()
model.add(Dense(embed_size, activation='linear'))
model.add(Dense(Y.shape[1], activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy')


In [None]:
# Train on the word pairs; verbose=False keeps the per-epoch progress log
# out of the cell output.
model.fit(
    x=X,
    y=Y,
    batch_size=256,
    epochs=1000,
    verbose=False,
)

In [None]:
# Take the first weight array of the trained model and index it by word id.
# NOTE(review): presumably this is the kernel of the first Dense layer, with
# one row per vocabulary word -- confirm for the Keras version in use.
weights = model.get_weights()[0]

# Look up each word's row through its index in `words_dict`.
word_embeddings = {word: weights[words_dict[word]] for word in all_words}

In [None]:
import matplotlib.pyplot as plt

# Draw one labelled point per vocabulary word, using the first two
# components of its embedding as the x/y position. Each word gets its own
# scatter call, as in the usual one-series-per-call pyplot workflow.
for word in words_dict:
    point = word_embeddings.get(word)
    x_pos, y_pos = point[0], point[1]
    plt.scatter(x_pos, y_pos)
    plt.annotate(word, (x_pos, y_pos))

# Write the finished figure to disk.
plt.savefig('img.jpg')