<a href="https://colab.research.google.com/github/Nourhan-Adell/Natural-language-processing/blob/main/Word_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
!{sys.executable} -m pip install emoji

Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[?25l[K     |█▉                              | 10 kB 16.0 MB/s eta 0:00:01[K     |███▊                            | 20 kB 13.5 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 10.0 MB/s eta 0:00:01[K     |███████▌                        | 40 kB 3.6 MB/s eta 0:00:01[K     |█████████▍                      | 51 kB 3.6 MB/s eta 0:00:01[K     |███████████▏                    | 61 kB 4.2 MB/s eta 0:00:01[K     |█████████████                   | 71 kB 4.4 MB/s eta 0:00:01[K     |███████████████                 | 81 kB 4.4 MB/s eta 0:00:01[K     |████████████████▉               | 92 kB 4.9 MB/s eta 0:00:01[K     |██████████████████▊             | 102 kB 4.1 MB/s eta 0:00:01[K     |████████████████████▌           | 112 kB 4.1 MB/s eta 0:00:01[K     |██████████████████████▍         | 122 kB 4.1 MB/s eta 0:00:01[K     |████████████████████████▎       | 133 kB 4.1 MB/s eta 0:00:01[K     |████████

In [7]:
import re
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize 
import emoji
from utils2 import get_dict

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# **Data Preparation**
**Steps**

*    Clean and tokenize the corpus.

*    Extract the pairs of context words and center word that will make up the training data set for the CBOW model. The context words are the features that will be fed into the model, and the center words are the target values that the model will learn to predict.

*    Create simple vector representations of the context words (features) and center words (targets) that can be used by the neural network of the CBOW model.


**Cleaning and tokenization:**

In [12]:
def tokenize(corpus):
  """Clean a raw text corpus and split it into lowercase tokens.

  Runs of the punctuation characters ``, ? ! ; -`` are replaced by a single
  full stop, the text is split with ``nltk.word_tokenize``, and only tokens
  that are alphabetic, a full stop, or contain an emoji are kept.

  Args:
    corpus: The raw text as a single string.

  Returns:
    A list of lowercased token strings.
  """
  data = re.sub(r'[,?!;-]+', '.', corpus)
  data = nltk.word_tokenize(data)
  # The emoji pattern has to be matched against the token itself. Calling
  # search() without an argument raised TypeError for every token that was
  # neither alphabetic nor '.' (numbers, emoji, leftover punctuation).
  # NOTE(review): get_emoji_regexp() is only present in emoji 1.x — confirm
  # the installed version.
  data = [ch.lower() for ch in data
          if ch.isalpha() or ch == '.' or emoji.get_emoji_regexp().search(ch)]
  return data

In [13]:
# Small example corpus used to demonstrate the data-preparation steps.
corpus = 'I am happy because I am learning'
print(f'Corpus:  {corpus}')
# `words` is a notebook-level variable: later cells reuse it to build the
# vocabulary and to generate the training set.
words = tokenize(corpus)
print('Words (tokens): ', words)

Corpus:  I am happy because I am learning
Words (tokens):  ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


**Sliding window of words:**

In [16]:
def get_windows(words, c):
  """Slide a window over ``words`` and yield training pairs.

  For every position that has ``c`` tokens available on both sides, yields
  ``(context_words, center_word)`` where ``context_words`` is the list of
  the ``c`` tokens before and the ``c`` tokens after the center token.

  Args:
    words: Sequence of tokens (a list, since slices are concatenated).
    c: Number of context tokens taken on each side of the center token.

  Yields:
    Tuples ``(context_words, center_word)``.
  """
  for center in range(c, len(words) - c):
    left = words[center - c:center]
    right = words[center + 1:center + c + 1]
    yield left + right, words[center]

In [17]:
# Demo: slide a window with c = 2 context words on each side over the
# example tokens and print each (context words, center word) pair.
for x, y in get_windows(['i', 'am', 'happy', 'because', 'i', 'am', 'learning'], 2):
    print(f'{x}\t{y}')

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


**Transforming words into vectors for the training set**

In [26]:
# Transforming of the central word
# get_dict is imported from utils2 (not shown in this notebook). The printed
# word2Ind is a word -> index mapping; Ind2word is presumably the inverse
# mapping — TODO confirm against utils2.
word2Ind, Ind2word = get_dict(words)
# V is the vocabulary size: the number of entries in word2Ind.
V = len(word2Ind)

def word_to_one_hot_vector(word, word2Ind, V):
  """Encode ``word`` as a one-hot vector of length ``V``.

  Args:
    word: The token to encode; must be a key of ``word2Ind``.
    word2Ind: Mapping from token to its integer index.
    V: Length of the returned vector (vocabulary size).

  Returns:
    A 1-D float array of zeros with a single 1 at ``word2Ind[word]``.

  Raises:
    KeyError: If ``word`` is not in ``word2Ind``.
  """
  encoded = np.zeros(V)
  position = word2Ind[word]
  encoded[position] = 1
  return encoded

In [27]:
# Show the word -> index mapping, then display the one-hot vector for 'happy'
# (the last expression is rendered as the cell output).
print(word2Ind)
print()
word_to_one_hot_vector('happy', word2Ind, V)

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}



array([0., 0., 1., 0., 0.])

In [31]:
# Transforming of the context words
def context_words_to_vectors(context_words, word2Ind, V):
  """Encode a list of context words as the mean of their one-hot vectors.

  Args:
    context_words: Iterable of tokens, each a key of ``word2Ind``.
    word2Ind: Mapping from token to its integer index.
    V: Vocabulary size (length of each one-hot vector).

  Returns:
    The element-wise average of the one-hot vectors of ``context_words``.
  """
  one_hot_rows = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
  return np.mean(one_hot_rows, axis=0)

In [32]:
# Demo: average the one-hot vectors of the four context words around 'happy'.
context_words_to_vectors(['i', 'am', 'because', 'i'], word2Ind, V)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

**Building the training set:**

In [33]:
def get_training_set(words, c, word2Ind, V):
  """Generate (context vector, center vector) training examples.

  Args:
    words: List of tokens making up the corpus.
    c: Number of context tokens on each side of the center token.
    word2Ind: Mapping from token to its integer index.
    V: Vocabulary size.

  Yields:
    Tuples ``(x, y)`` where ``x`` is the averaged one-hot vector of the
    context words and ``y`` is the one-hot vector of the center word.
  """
  for context_words, center_word in get_windows(words, c):
    features = context_words_to_vectors(context_words, word2Ind, V)
    target = word_to_one_hot_vector(center_word, word2Ind, V)
    yield features, target
    

In [35]:
# Print every (context vector, center vector) training pair produced from
# the example tokens with c = 2 context words on each side.
for context_words_vector, center_word_vector in get_training_set(words, 2, word2Ind, V):
    print(f'Context words vector:  {context_words_vector}')
    print(f'Center word vector:  {center_word_vector}')
    print()

Context words vector:  [0.25 0.25 0.   0.5  0.  ]
Center word vector:  [0. 0. 1. 0. 0.]

Context words vector:  [0.5  0.   0.25 0.25 0.  ]
Center word vector:  [0. 1. 0. 0. 0.]

Context words vector:  [0.25 0.25 0.25 0.   0.25]
Center word vector:  [0. 0. 0. 1. 0.]



# **The continuous bag-of-words model:**
**Steps:**


*    The two activation functions used in the neural network.

*    Forward propagation.

*    Cross-entropy loss.

*    Backpropagation.

*   Gradient descent.

*    Extracting the word embedding vectors from the weight matrices once the neural network has been trained.


### **1. Activation functions:**

In [45]:
# ReLU function
def ReLU(z):
  """Apply the rectified linear unit element-wise.

  Args:
    z: NumPy array of pre-activation values.

  Returns:
    A copy of ``z`` in which every negative entry is replaced by 0; the
    input array is left untouched.
  """
  rectified = z.copy()
  negative_entries = rectified < 0
  rectified[negative_entries] = 0
  return rectified

In [38]:
# Softmax function
def softmax(z):
  """Compute the softmax of ``z`` over all of its elements.

  The maximum is subtracted before exponentiating. This leaves the result
  mathematically unchanged but keeps ``np.exp`` from overflowing to ``inf``
  (and the output from becoming NaN) when the scores are large.

  Args:
    z: Array-like of scores (the notebook passes a ``(V, 1)`` column).

  Returns:
    An array of the same shape as ``z`` whose entries are positive and
    sum to 1.
  """
  z = np.asarray(z)
  if z.size == 0:
    # Nothing to normalize; np.max would raise on an empty array.
    return np.exp(z)
  shifted = z - np.max(z)
  result = np.exp(shifted)
  sum_result = np.sum(result)
  return result / sum_result

### **2. Forward Propagation:**

In [39]:
# Initialize the weight and bias matrices with fixed literal values.
# W1 has 3 rows and 5 columns: it multiplies the (V, 1) input column in the
# forward pass, producing the 3 hidden-layer values.
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

# W2 has 5 rows and 3 columns: it maps the 3 hidden values to 5 output scores.
W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])

# b1 is a (3, 1) column added to the hidden-layer pre-activation.
b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])

# b2 is a (5, 1) column added to the output-layer pre-activation.
b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

In [53]:
# Take the first (context vector, center vector) example from the generator.
training_set = get_training_set(words, 2, word2Ind, V)
x_array, y_array = next(training_set)

# Work on a copy of the input, reshaped into a (V, 1) column vector.
x = x_array.copy().reshape(V, 1)

# Hidden layer: affine transform followed by ReLU.
z1 = W1 @ x + b1
h = ReLU(z1)

# Output layer: affine transform followed by softmax.
z2 = W2 @ h + b2
y_predict = softmax(z2)

### **3. Cross-entropy loss:**

In [54]:
def cross_entropy_loss(y, y_predict):
  """Cross-entropy between a target distribution and a prediction.

  Computes ``sum(-y * log(y_predict))`` using the convention
  ``0 * log(0) = 0``: entries whose target is 0 are skipped, so a predicted
  probability of exactly 0 for a non-target class no longer turns the whole
  loss into NaN (``0 * inf``).

  Args:
    y: Array-like of target values (one-hot in this notebook).
    y_predict: Array-like of predicted probabilities, broadcastable
      against ``y``.

  Returns:
    The scalar loss.
  """
  y, y_predict = np.broadcast_arrays(y, y_predict)
  active = y != 0
  loss = np.sum(-np.log(y_predict[active]) * y[active])
  return loss

### **4. Back Propagation:**


In [57]:
# Reshape the target into a (V, 1) column so it lines up with y_predict.
y = y_array.copy()
y.shape = (V, 1)

# Gradient of the loss w.r.t. the output-layer pre-activation z2 (softmax +
# cross-entropy); this is also the gradient w.r.t. b2.
grad_b2 = y_predict - y

grad_W2 = np.dot(grad_b2, h.T)

# Back-propagating through h = ReLU(z1) multiplies the upstream gradient by
# ReLU's derivative (1 where z1 > 0, else 0). Applying ReLU to the gradient
# itself would drop negative gradient components of active units and let
# positive ones through for inactive units.
grad_b1 = np.dot(W2.T, grad_b2) * (z1 > 0)

grad_W1 = np.dot(grad_b1, x.T)

### **5. Gradient Descent:**

In [58]:
# Learning rate for a single gradient-descent step.
alpha = 0.03

# Step every parameter against its gradient: new = old - alpha * gradient.
W1_new, W2_new, b1_new, b2_new = (
    parameter - (alpha * gradient)
    for parameter, gradient in (
        (W1, grad_W1),
        (W2, grad_W2),
        (b1, grad_b1),
        (b2, grad_b2),
    )
)

### **6. Extracting word embedding vectors:**

In [60]:
# Option 1: extract embedding vectors from W1.
# Each column of W1 is the embedding of the word with that index.
for word, column in word2Ind.items():
  word_embedding_vector = W1[:, column]
  print(f'{word}: {word_embedding_vector}')

am: [0.41687358 0.32735501 0.26637602]
because: [ 0.08854191  0.22795148 -0.23846886]
happy: [-0.23495225 -0.23951958 -0.37770863]
i: [ 0.28320538  0.4117634  -0.11399446]
learning: [ 0.41800106 -0.23924344  0.34008124]


In [61]:
# Option 2: extract embedding vectors from W2.
# Each row of W2 (a column of W2.T) is the embedding of the word with that index.
for word, row in word2Ind.items():
  word_embedding_vector2 = W2[row, :]
  print(f'{word}: {word_embedding_vector2}')

am: [-0.22182064 -0.43008631  0.13310965]
because: [0.08476603 0.08123194 0.1772054 ]
happy: [ 0.1871551  -0.06107263 -0.1790735 ]
i: [ 0.07055222 -0.02015138  0.36107434]
learning: [ 0.33480474 -0.39423389 -0.43959196]


In [65]:
# Option 3: extract embedding vectors from the average of W1 and W2 transposed.
W3 = (W1 + W2.T) * 0.5

# Each column of W3 is the embedding of the word with that index.
for word, column in word2Ind.items():
  word_embedding_vector3 = W3[:, column]
  print(f'{word}: {word_embedding_vector3}')

am: [ 0.09752647 -0.05136565  0.19974284]
because: [ 0.08665397  0.15459171 -0.03063173]
happy: [-0.02389858 -0.15029611 -0.27839106]
i: [0.1768788  0.19580601 0.12353994]
learning: [ 0.3764029  -0.31673866 -0.04975536]
