In [1]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Trung\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

<a name='1'></a>
## 0 - Data preparation
In the data preparation phase, starting with a corpus of text, you will:

- Clean and tokenize the corpus.

- Extract the pairs of context words and center word that will make up the training data set for the CBOW model. The context words are the features that will be fed into the model, and the center words are the target values that the model will learn to predict.

- Create simple vector representations of the context words (features) and center words (targets) that can be used by the neural network of the CBOW model.

In [2]:
import re
with open("./data/shakespeare.txt") as f:
    data = f.read()
data = re.sub(r"[,!?;-]", ".", data)
data = nltk.word_tokenize(data)
data = [word.lower() for word in data if word.isalpha() or word == "."]
print("Number of tokens: ", len(data), "\n", data[:15])

Number of tokens:  60976 
 ['o', 'for', 'a', 'muse', 'of', 'fire', '.', 'that', 'would', 'ascend', 'the', 'brightest', 'heaven', 'of', 'invention']


In [3]:
fdist = nltk.FreqDist(word for word in data)
print("Size of vocabulary: ", len(fdist))
print("Most frequent tokens: ", fdist.most_common(20)) # Print the 20 most common word

Size of vocabulary:  5775
Most frequent tokens:  [('.', 9630), ('the', 1521), ('and', 1394), ('i', 1257), ('to', 1159), ('of', 1093), ('my', 857), ('that', 781), ('in', 770), ('a', 752), ('you', 748), ('is', 630), ('not', 559), ('for', 467), ('it', 460), ('with', 441), ('his', 434), ('but', 417), ('me', 417), ('your', 397)]


In [4]:
def get_dict(data):
    '''
    Inputs:
        data: data input under token format
    Outputs:
        word2Ind: dict with key is word and value is index
        Ind2word: dict with key is index and value is word
    '''
    vocab = sorted(list(set(data)))
    word2Ind = {}
    Ind2word = {}
    
    for ind, word in enumerate(vocab):
        word2Ind[word] = ind
        Ind2word[ind] = word
    return word2Ind, Ind2word

In [5]:
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
print("Size of vocabulary: ", V)

Size of vocabulary:  5775


In [6]:
# example of word to index mapping
print("Index of the word 'king' :  ",word2Ind['king'] )
print("Word which has index 2743:  ",Ind2word[2743] )

Index of the word 'king' :   2744
Word which has index 2743:   kinds


## Sliding window of words
Now that you have transformed the corpus into a list of clean tokens, you can slide a window of words across this list. For each window you can extract a center word and the context words.

The `get_windows` function in the next cell was introduced in the lecture.

In [7]:
def get_windows(words, C):
    i = C
    while i < len(words) - C:
        center_word = words[i]
        context_words = words[(i - C):i] + words[(i+1):(i+C+1)]
        yield context_words, center_word
        i += 1

## Transforming words into vectors for the training set
To finish preparing the training set, you need to transform the context words and center words into vectors.

### Getting one-hot word vectors

Recall from the lecture that you can easily convert an integer, $n$, into a one-hot vector.

Consider the word "happy". First, retrieve its numeric index.

In [8]:
def word_to_one_hot_vector(word, word2Ind, V):
    one_hot_vector = np.zeros(V)
    one_hot_vector[word2Ind[word]] = 1
    
    return one_hot_vector

In [9]:
tmp_v = word_to_one_hot_vector('king', word2Ind, V)
print(tmp_v)
print(np.sum(tmp_v))
print(tmp_v.shape)

[0. 0. 0. ... 0. 0. 0.]
1.0
(5775,)


### Getting context word vectors
To create the vectors that represent context words, you will calculate the average of the one-hot vectors representing the individual words.

In [10]:
def context_words_to_vector(context_words, word2Ind, V):
    context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
    context_words_vectors = np.mean(context_words_vectors, axis=0)
    
    return context_words_vectors

In [11]:
tmp_v = context_words_to_vector(['a', 'kingdom', 'a', 'stage'], word2Ind, V)
set(tmp_v)

{0.0, 0.25, 0.5}


## Building the training set
You can now combine the functions that you created in the previous sections, to build a training set for the CBOW model, starting from the following tokenized corpus.

In [12]:
def get_training_example(words, C, word2Ind, V):
    for context_words, center_word in get_windows(words, C):
        yield context_words_to_vector(context_words, word2Ind, V), word_to_one_hot_vector(center_word, word2Ind, V)

In [13]:
new_data = data[:5000]

In [14]:
x_data = np.random.rand(0, V)
y_data = np.random.rand(0, V)
for context_words, center_word in get_training_example(new_data, 2, word2Ind, V):
    context_words.shape = (1, V)
    center_word.shape = (1, V)
    x_data = np.append(x_data, context_words, axis = 0)
    y_data = np.append(y_data, center_word, axis = 0)
    print(x_data.shape)

(1, 5775)
(2, 5775)
(3, 5775)
(4, 5775)
(5, 5775)
(6, 5775)
(7, 5775)
(8, 5775)
(9, 5775)
(10, 5775)
(11, 5775)
(12, 5775)
(13, 5775)
(14, 5775)
(15, 5775)
(16, 5775)
(17, 5775)
(18, 5775)
(19, 5775)
(20, 5775)
(21, 5775)
(22, 5775)
(23, 5775)
(24, 5775)
(25, 5775)
(26, 5775)
(27, 5775)
(28, 5775)
(29, 5775)
(30, 5775)
(31, 5775)
(32, 5775)
(33, 5775)
(34, 5775)
(35, 5775)
(36, 5775)
(37, 5775)
(38, 5775)
(39, 5775)
(40, 5775)
(41, 5775)
(42, 5775)
(43, 5775)
(44, 5775)
(45, 5775)
(46, 5775)
(47, 5775)
(48, 5775)
(49, 5775)
(50, 5775)
(51, 5775)
(52, 5775)
(53, 5775)
(54, 5775)
(55, 5775)
(56, 5775)
(57, 5775)
(58, 5775)
(59, 5775)
(60, 5775)
(61, 5775)
(62, 5775)
(63, 5775)
(64, 5775)
(65, 5775)
(66, 5775)
(67, 5775)
(68, 5775)
(69, 5775)
(70, 5775)
(71, 5775)
(72, 5775)
(73, 5775)
(74, 5775)
(75, 5775)
(76, 5775)
(77, 5775)
(78, 5775)
(79, 5775)
(80, 5775)
(81, 5775)
(82, 5775)
(83, 5775)
(84, 5775)
(85, 5775)
(86, 5775)
(87, 5775)
(88, 5775)
(89, 5775)
(90, 5775)
(91, 5775)
(92, 577

(698, 5775)
(699, 5775)
(700, 5775)
(701, 5775)
(702, 5775)
(703, 5775)
(704, 5775)
(705, 5775)
(706, 5775)
(707, 5775)
(708, 5775)
(709, 5775)
(710, 5775)
(711, 5775)
(712, 5775)
(713, 5775)
(714, 5775)
(715, 5775)
(716, 5775)
(717, 5775)
(718, 5775)
(719, 5775)
(720, 5775)
(721, 5775)
(722, 5775)
(723, 5775)
(724, 5775)
(725, 5775)
(726, 5775)
(727, 5775)
(728, 5775)
(729, 5775)
(730, 5775)
(731, 5775)
(732, 5775)
(733, 5775)
(734, 5775)
(735, 5775)
(736, 5775)
(737, 5775)
(738, 5775)
(739, 5775)
(740, 5775)
(741, 5775)
(742, 5775)
(743, 5775)
(744, 5775)
(745, 5775)
(746, 5775)
(747, 5775)
(748, 5775)
(749, 5775)
(750, 5775)
(751, 5775)
(752, 5775)
(753, 5775)
(754, 5775)
(755, 5775)
(756, 5775)
(757, 5775)
(758, 5775)
(759, 5775)
(760, 5775)
(761, 5775)
(762, 5775)
(763, 5775)
(764, 5775)
(765, 5775)
(766, 5775)
(767, 5775)
(768, 5775)
(769, 5775)
(770, 5775)
(771, 5775)
(772, 5775)
(773, 5775)
(774, 5775)
(775, 5775)
(776, 5775)
(777, 5775)
(778, 5775)
(779, 5775)
(780, 5775)
(781

(1353, 5775)
(1354, 5775)
(1355, 5775)
(1356, 5775)
(1357, 5775)
(1358, 5775)
(1359, 5775)
(1360, 5775)
(1361, 5775)
(1362, 5775)
(1363, 5775)
(1364, 5775)
(1365, 5775)
(1366, 5775)
(1367, 5775)
(1368, 5775)
(1369, 5775)
(1370, 5775)
(1371, 5775)
(1372, 5775)
(1373, 5775)
(1374, 5775)
(1375, 5775)
(1376, 5775)
(1377, 5775)
(1378, 5775)
(1379, 5775)
(1380, 5775)
(1381, 5775)
(1382, 5775)
(1383, 5775)
(1384, 5775)
(1385, 5775)
(1386, 5775)
(1387, 5775)
(1388, 5775)
(1389, 5775)
(1390, 5775)
(1391, 5775)
(1392, 5775)
(1393, 5775)
(1394, 5775)
(1395, 5775)
(1396, 5775)
(1397, 5775)
(1398, 5775)
(1399, 5775)
(1400, 5775)
(1401, 5775)
(1402, 5775)
(1403, 5775)
(1404, 5775)
(1405, 5775)
(1406, 5775)
(1407, 5775)
(1408, 5775)
(1409, 5775)
(1410, 5775)
(1411, 5775)
(1412, 5775)
(1413, 5775)
(1414, 5775)
(1415, 5775)
(1416, 5775)
(1417, 5775)
(1418, 5775)
(1419, 5775)
(1420, 5775)
(1421, 5775)
(1422, 5775)
(1423, 5775)
(1424, 5775)
(1425, 5775)
(1426, 5775)
(1427, 5775)
(1428, 5775)
(1429, 5775)

(1984, 5775)
(1985, 5775)
(1986, 5775)
(1987, 5775)
(1988, 5775)
(1989, 5775)
(1990, 5775)
(1991, 5775)
(1992, 5775)
(1993, 5775)
(1994, 5775)
(1995, 5775)
(1996, 5775)
(1997, 5775)
(1998, 5775)
(1999, 5775)
(2000, 5775)
(2001, 5775)
(2002, 5775)
(2003, 5775)
(2004, 5775)
(2005, 5775)
(2006, 5775)
(2007, 5775)
(2008, 5775)
(2009, 5775)
(2010, 5775)
(2011, 5775)
(2012, 5775)
(2013, 5775)
(2014, 5775)
(2015, 5775)
(2016, 5775)
(2017, 5775)
(2018, 5775)
(2019, 5775)
(2020, 5775)
(2021, 5775)
(2022, 5775)
(2023, 5775)
(2024, 5775)
(2025, 5775)
(2026, 5775)
(2027, 5775)
(2028, 5775)
(2029, 5775)
(2030, 5775)
(2031, 5775)
(2032, 5775)
(2033, 5775)
(2034, 5775)
(2035, 5775)
(2036, 5775)
(2037, 5775)
(2038, 5775)
(2039, 5775)
(2040, 5775)
(2041, 5775)
(2042, 5775)
(2043, 5775)
(2044, 5775)
(2045, 5775)
(2046, 5775)
(2047, 5775)
(2048, 5775)
(2049, 5775)
(2050, 5775)
(2051, 5775)
(2052, 5775)
(2053, 5775)
(2054, 5775)
(2055, 5775)
(2056, 5775)
(2057, 5775)
(2058, 5775)
(2059, 5775)
(2060, 5775)

(2616, 5775)
(2617, 5775)
(2618, 5775)
(2619, 5775)
(2620, 5775)
(2621, 5775)
(2622, 5775)
(2623, 5775)
(2624, 5775)
(2625, 5775)
(2626, 5775)
(2627, 5775)
(2628, 5775)
(2629, 5775)
(2630, 5775)
(2631, 5775)
(2632, 5775)
(2633, 5775)
(2634, 5775)
(2635, 5775)
(2636, 5775)
(2637, 5775)
(2638, 5775)
(2639, 5775)
(2640, 5775)
(2641, 5775)
(2642, 5775)
(2643, 5775)
(2644, 5775)
(2645, 5775)
(2646, 5775)
(2647, 5775)
(2648, 5775)
(2649, 5775)
(2650, 5775)
(2651, 5775)
(2652, 5775)
(2653, 5775)
(2654, 5775)
(2655, 5775)
(2656, 5775)
(2657, 5775)
(2658, 5775)
(2659, 5775)
(2660, 5775)
(2661, 5775)
(2662, 5775)
(2663, 5775)
(2664, 5775)
(2665, 5775)
(2666, 5775)
(2667, 5775)
(2668, 5775)
(2669, 5775)
(2670, 5775)
(2671, 5775)
(2672, 5775)
(2673, 5775)
(2674, 5775)
(2675, 5775)
(2676, 5775)
(2677, 5775)
(2678, 5775)
(2679, 5775)
(2680, 5775)
(2681, 5775)
(2682, 5775)
(2683, 5775)
(2684, 5775)
(2685, 5775)
(2686, 5775)
(2687, 5775)
(2688, 5775)
(2689, 5775)
(2690, 5775)
(2691, 5775)
(2692, 5775)

(3247, 5775)
(3248, 5775)
(3249, 5775)
(3250, 5775)
(3251, 5775)
(3252, 5775)
(3253, 5775)
(3254, 5775)
(3255, 5775)
(3256, 5775)
(3257, 5775)
(3258, 5775)
(3259, 5775)
(3260, 5775)
(3261, 5775)
(3262, 5775)
(3263, 5775)
(3264, 5775)
(3265, 5775)
(3266, 5775)
(3267, 5775)
(3268, 5775)
(3269, 5775)
(3270, 5775)
(3271, 5775)
(3272, 5775)
(3273, 5775)
(3274, 5775)
(3275, 5775)
(3276, 5775)
(3277, 5775)
(3278, 5775)
(3279, 5775)
(3280, 5775)
(3281, 5775)
(3282, 5775)
(3283, 5775)
(3284, 5775)
(3285, 5775)
(3286, 5775)
(3287, 5775)
(3288, 5775)
(3289, 5775)
(3290, 5775)
(3291, 5775)
(3292, 5775)
(3293, 5775)
(3294, 5775)
(3295, 5775)
(3296, 5775)
(3297, 5775)
(3298, 5775)
(3299, 5775)
(3300, 5775)
(3301, 5775)
(3302, 5775)
(3303, 5775)
(3304, 5775)
(3305, 5775)
(3306, 5775)
(3307, 5775)
(3308, 5775)
(3309, 5775)
(3310, 5775)
(3311, 5775)
(3312, 5775)
(3313, 5775)
(3314, 5775)
(3315, 5775)
(3316, 5775)
(3317, 5775)
(3318, 5775)
(3319, 5775)
(3320, 5775)
(3321, 5775)
(3322, 5775)
(3323, 5775)

(3878, 5775)
(3879, 5775)
(3880, 5775)
(3881, 5775)
(3882, 5775)
(3883, 5775)
(3884, 5775)
(3885, 5775)
(3886, 5775)
(3887, 5775)
(3888, 5775)
(3889, 5775)
(3890, 5775)
(3891, 5775)
(3892, 5775)
(3893, 5775)
(3894, 5775)
(3895, 5775)
(3896, 5775)
(3897, 5775)
(3898, 5775)
(3899, 5775)
(3900, 5775)
(3901, 5775)
(3902, 5775)
(3903, 5775)
(3904, 5775)
(3905, 5775)
(3906, 5775)
(3907, 5775)
(3908, 5775)
(3909, 5775)
(3910, 5775)
(3911, 5775)
(3912, 5775)
(3913, 5775)
(3914, 5775)
(3915, 5775)
(3916, 5775)
(3917, 5775)
(3918, 5775)
(3919, 5775)
(3920, 5775)
(3921, 5775)
(3922, 5775)
(3923, 5775)
(3924, 5775)
(3925, 5775)
(3926, 5775)
(3927, 5775)
(3928, 5775)
(3929, 5775)
(3930, 5775)
(3931, 5775)
(3932, 5775)
(3933, 5775)
(3934, 5775)
(3935, 5775)
(3936, 5775)
(3937, 5775)
(3938, 5775)
(3939, 5775)
(3940, 5775)
(3941, 5775)
(3942, 5775)
(3943, 5775)
(3944, 5775)
(3945, 5775)
(3946, 5775)
(3947, 5775)
(3948, 5775)
(3949, 5775)
(3950, 5775)
(3951, 5775)
(3952, 5775)
(3953, 5775)
(3954, 5775)

(4510, 5775)
(4511, 5775)
(4512, 5775)
(4513, 5775)
(4514, 5775)
(4515, 5775)
(4516, 5775)
(4517, 5775)
(4518, 5775)
(4519, 5775)
(4520, 5775)
(4521, 5775)
(4522, 5775)
(4523, 5775)
(4524, 5775)
(4525, 5775)
(4526, 5775)
(4527, 5775)
(4528, 5775)
(4529, 5775)
(4530, 5775)
(4531, 5775)
(4532, 5775)
(4533, 5775)
(4534, 5775)
(4535, 5775)
(4536, 5775)
(4537, 5775)
(4538, 5775)
(4539, 5775)
(4540, 5775)
(4541, 5775)
(4542, 5775)
(4543, 5775)
(4544, 5775)
(4545, 5775)
(4546, 5775)
(4547, 5775)
(4548, 5775)
(4549, 5775)
(4550, 5775)
(4551, 5775)
(4552, 5775)
(4553, 5775)
(4554, 5775)
(4555, 5775)
(4556, 5775)
(4557, 5775)
(4558, 5775)
(4559, 5775)
(4560, 5775)
(4561, 5775)
(4562, 5775)
(4563, 5775)
(4564, 5775)
(4565, 5775)
(4566, 5775)
(4567, 5775)
(4568, 5775)
(4569, 5775)
(4570, 5775)
(4571, 5775)
(4572, 5775)
(4573, 5775)
(4574, 5775)
(4575, 5775)
(4576, 5775)
(4577, 5775)
(4578, 5775)
(4579, 5775)
(4580, 5775)
(4581, 5775)
(4582, 5775)
(4583, 5775)
(4584, 5775)
(4585, 5775)
(4586, 5775)

<a name='1'></a>
## 1 - The Continuous Bag of Words Model

Let's take a look at the following sentence: 
>**'I am happy because I am learning'**. 

- In continuous bag of words (CBOW) modeling, we try to predict the center word given a few context words (the words around the center word).
- For example, if you were to choose a context half-size of say $C = 2$, then you would try to predict the word **happy** given the context that includes 2 words before and 2 words after the center word:

> $C$ words before: [I, am] 

> $C$ words after: [because, I] 

- In other words:

$$context = [I,am, because, I]$$
$$target = happy$$

The structure of your model will look like this:

<div style="width:image width px; font-size:100%; text-align:center;"><img src='images/word2.png' alt="alternate text" width="width" height="height" style="width:600px;height:250px;" /> Figure 1 </div>

Where $\bar x$ is the average of all the one hot vectors of the context words. 

<div style="width:image width px; font-size:100%; text-align:center;"><img src='images/mean_vec2.png' alt="alternate text" width="width" height="height" style="width:600px;height:250px;" /> Figure 2 </div>

Once you have encoded all the context words, you can use $\bar x$ as the input to your model. 

The architecture you will be implementing is as follows:

\begin{align}
 h &= W_1 \  X + b_1  \tag{1} \\
 a &= ReLU(h)  \tag{2} \\
 z &= W_2 \  a + b_2   \tag{3} \\
 \hat y &= softmax(z)   \tag{4} \\
\end{align}

<a name='2'></a>
## 2 - Training the Model

<a name='2.1'></a>
### 2.1 - Initializing the Model

You will now initialize two matrices and two vectors. 
- The first matrix ($W_1$) is of dimension $N \times V$, where $V$ is the number of words in your vocabulary and $N$ is the dimension of your word vector.
- The second matrix ($W_2$) is of dimension $V \times N$. 
- Vector $b_1$ has dimensions $N\times 1$
- Vector $b_2$ has dimensions  $V\times 1$. 
- $b_1$ and $b_2$ are the bias vectors of the linear layers from matrices $W_1$ and $W_2$.

The overall structure of the model will look as in Figure 1, but at this stage we are just initializing the parameters. 

In [15]:
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
print("Size of vocabulary: ", V)

Size of vocabulary:  5775


In [16]:
# example of word to index mapping
print("Index of the word 'king' :  ",word2Ind['king'] )
print("Word which has index 2743:  ",Ind2word[2743] )

Index of the word 'king' :   2744
Word which has index 2743:   kinds


In [17]:
def initialize_model(N, V, batche_size, random_seed = 1):
    '''
    Inputs: 
        N:  dimension of hidden vector 
        V:  dimension of vocabulary
        random_seed: random seed for consistent results in the unit tests
     Outputs: 
        W1, W2, b1, b2: initialized weights and biases
    '''
    np.random.seed(random_seed)
    W1 = np.random.rand(N, V)
    W2 = np.random.rand(V, N)
    b1 = np.random.rand(N, batche_size)
    b2 = np.random.rand(V, batche_size)
    return W1, W2, b1, b2

In [18]:
# Test your function example.
tmp_N = 4
tmp_V = 10
tmp_batch_size = 128
tmp_W1, tmp_W2, tmp_b1, tmp_b2 = initialize_model(tmp_N,tmp_V, tmp_batch_size)
assert tmp_W1.shape == ((tmp_N,tmp_V))
assert tmp_W2.shape == ((tmp_V,tmp_N))
print(f"tmp_W1.shape: {tmp_W1.shape}")
print(f"tmp_W2.shape: {tmp_W2.shape}")
print(f"tmp_b1.shape: {tmp_b1.shape}")
print(f"tmp_b2.shape: {tmp_b2.shape}")

tmp_W1.shape: (4, 10)
tmp_W2.shape: (10, 4)
tmp_b1.shape: (4, 128)
tmp_b2.shape: (10, 128)


<a name='2.2'></a>
### 2.2 - Softmax
Before we can start training the model, we need to implement the softmax function as defined in equation 5:  

<br>
$$ \text{softmax}(z_i) = \frac{e^{z_i} }{\sum_{i=0}^{V-1} e^{z_i} }  \tag{5} $$

- Array indexing in code starts at 0.
- $V$ is the number of words in the vocabulary (which is also the number of rows of $z$).
- $i$ goes from 0 to |V| - 1.

In [19]:
def softmax(z):
    '''
    Inputs: 
        z: output scores from the hidden layer
    Outputs: 
        yhat: prediction (estimate of y)
    '''
    e_z = np.exp(z)
    sum_ez = np.sum(e_z, axis = 0)
    yhat = e_z / sum_ez
    return yhat

In [20]:
# Test the function
tmp = np.array([[1,2,3],
                [1,1,1]
               ])
tmp_sm = softmax(tmp)
display(tmp_sm)

array([[0.5       , 0.73105858, 0.88079708],
       [0.5       , 0.26894142, 0.11920292]])

<a name='2.3'></a>
### 2.3 - Forward Propagation
Implement the forward propagation $z$ according to equations (1) to (3). <br>

\begin{align}
 h &= W_1 \  X + b_1  \tag{1} \\
 h &= ReLU(h)  \tag{2} \\
 z &= W_2 \  h + b_2   \tag{3} \\
\end{align}

For that, you will use as activation the Rectified Linear Unit (ReLU) given by:

$$f(h)=\max (0,h) \tag{6}$$

In [21]:
def forward_prop(x, W1, W2, b1, b2):
    '''
    Inputs: 
        x:  average one hot vector for the context 
        W1, W2, b1, b2:  matrices and biases to be learned
     Outputs: 
        z:  output score vector
    '''
    # Calculate h
    h = np.dot(W1, x) + b1
    # ReLU activation
    h = np.maximum(0, h)
    
    # Calculate z
    z = np.dot(W2, h) + b2
    # Softmax activation
#    z = softmax(z)
    
    return z, h

In [22]:
# Test the function

# Create some inputs
tmp_N = 2
tmp_V = 3
tmp_x = np.array([[0,1,0],
                 [0,0,1]]).T
#print(tmp_x)
tmp_W1, tmp_W2, tmp_b1, tmp_b2 = initialize_model(N=tmp_N,V=tmp_V,batche_size= tmp_x.shape[1], random_seed=1)

print(f"x has shape {tmp_x.shape}")
print(f"N is {tmp_N} and vocabulary size V is {tmp_V}")

# call function
tmp_z, tmp_h = forward_prop(tmp_x, tmp_W1, tmp_W2, tmp_b1, tmp_b2)
print("call forward_prop")
print()
# Look at output
print(f"z has shape {tmp_z.shape}")
print("z has values:")
print(tmp_z)

print()

print(f"h has shape {tmp_h.shape}")
print("h has values:")
print(tmp_h)

x has shape (3, 2)
N is 2 and vocabulary size V is 3
call forward_prop

z has shape (3, 2)
z has values:
[[0.64973106 0.9858653 ]
 [0.60113969 0.957568  ]
 [1.30773242 1.85910115]]

h has shape (2, 2)
h has values:
[[0.92477674 0.87823181]
 [0.17414348 0.7628061 ]]


<a name='2.4'></a>
### 2.4 - Cost Function

In [23]:
# compute_cost: cross-entropy cost function
def compute_cost(y, yhat, batch_size):
    logprobs = np.multiply(np.log(yhat),y)
    cost = - 1/batch_size * np.sum(logprobs)
    cost = np.squeeze(cost)
    return cost

In [24]:
# Test the function
tmp_C = 2
tmp_N = 50
tmp_word2Ind, tmp_Ind2word = get_dict(data)
tmp_V = len(word2Ind)

tmp_x = x_data.T
tmp_y = y_data.T
tmp_batch_size = x_data.shape[0]
        
print(f"tmp_x.shape {tmp_x.shape}")
print(f"tmp_y.shape {tmp_y.shape}")

tmp_W1, tmp_W2, tmp_b1, tmp_b2 = initialize_model(tmp_N,tmp_V, tmp_batch_size)

print(f"tmp_W1.shape {tmp_W1.shape}")
print(f"tmp_W2.shape {tmp_W2.shape}")
print(f"tmp_b1.shape {tmp_b1.shape}")
print(f"tmp_b2.shape {tmp_b2.shape}")

tmp_z, tmp_h = forward_prop(tmp_x, tmp_W1, tmp_W2, tmp_b1, tmp_b2)
print(f"tmp_z.shape: {tmp_z.shape}")
print(f"tmp_h.shape: {tmp_h.shape}")

tmp_yhat = softmax(tmp_z)
print(f"tmp_yhat.shape: {tmp_yhat.shape}")

tmp_cost = compute_cost(tmp_y, tmp_yhat, tmp_batch_size)
print("call compute_cost")
print(f"tmp_cost {tmp_cost:.4f}")

tmp_x.shape (5775, 4996)
tmp_y.shape (5775, 4996)
tmp_W1.shape (50, 5775)
tmp_W2.shape (5775, 50)
tmp_b1.shape (50, 4996)
tmp_b2.shape (5775, 4996)
tmp_z.shape: (5775, 4996)
tmp_h.shape: (50, 4996)
tmp_yhat.shape: (5775, 4996)
call compute_cost
tmp_cost 10.6016


<a name='2.5'></a>
### 2.5 - Training the Model - Backpropagation

In [25]:
def back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
    '''
    Inputs: 
        x:  average one hot vector for the context 
        yhat: prediction (estimate of y)
        y:  target vector
        h:  hidden vector (see eq. 1)
        W1, W2, b1, b2:  matrices and biases  
        batch_size: batch size 
     Outputs: 
        grad_W1, grad_W2, grad_b1, grad_b2:  gradients of matrices and biases   
    '''
    # Compute l1 as W2^T (Yhat - Y)
    l1 = np.dot(W2.T, (yhat - y))

    # if z1 < 0, then l1 = 0
    # otherwise l1 = l1
    # (this is already implemented for you)
    
    l1[l1 < 0] = 0 # use "l1" to compute gradients below

    # compute the gradient for W1
    grad_W1 = 1/batch_size * np.dot(l1, x.T)

    # Compute gradient of W2
    grad_W2 = 1/batch_size * np.dot(yhat - y, h.T)
    
    # compute gradient for b1
    ones = np.ones(batch_size)
    ones.shape = (batch_size, 1)
    grad_b1 = 1/batch_size * np.dot(l1, ones)

    # compute gradient for b2
    grad_b2 = 1/batch_size * np.dot(yhat - y, ones)
    
    return grad_W1, grad_W2, grad_b1, grad_b2

In [26]:
# Test the function
tmp_C = 2
tmp_N = 50
tmp_word2Ind, tmp_Ind2word = get_dict(data)
tmp_V = len(word2Ind)


# get a batch of data
tmp_x = x_data.T
tmp_y = y_data.T
tmp_batch_size = x_data.shape[0]

print("get a batch of data")
print(f"tmp_x.shape {tmp_x.shape}")
print(f"tmp_y.shape {tmp_y.shape}")

print()
print("Initialize weights and biases")
tmp_W1, tmp_W2, tmp_b1, tmp_b2 = initialize_model(tmp_N,tmp_V, tmp_batch_size)

print(f"tmp_W1.shape {tmp_W1.shape}")
print(f"tmp_W2.shape {tmp_W2.shape}")
print(f"tmp_b1.shape {tmp_b1.shape}")
print(f"tmp_b2.shape {tmp_b2.shape}")

print()
print("Forwad prop to get z and h")
tmp_z, tmp_h = forward_prop(tmp_x, tmp_W1, tmp_W2, tmp_b1, tmp_b2)
print(f"tmp_z.shape: {tmp_z.shape}")
print(f"tmp_h.shape: {tmp_h.shape}")

print()
print("Get yhat by calling softmax")
tmp_yhat = softmax(tmp_z)
print(f"tmp_yhat.shape: {tmp_yhat.shape}")

tmp_m = (2*tmp_C)
tmp_grad_W1, tmp_grad_W2, tmp_grad_b1, tmp_grad_b2 = back_prop(tmp_x, tmp_yhat, tmp_y, tmp_h, tmp_W1, tmp_W2, tmp_b1, tmp_b2, tmp_batch_size)

print()
print("call back_prop")
print(f"tmp_grad_W1.shape {tmp_grad_W1.shape}")
print(f"tmp_grad_W2.shape {tmp_grad_W2.shape}")
print(f"tmp_grad_b1.shape {tmp_grad_b1.shape}")
print(f"tmp_grad_b2.shape {tmp_grad_b2.shape}")

get a batch of data
tmp_x.shape (5775, 4996)
tmp_y.shape (5775, 4996)

Initialize weights and biases
tmp_W1.shape (50, 5775)
tmp_W2.shape (5775, 50)
tmp_b1.shape (50, 4996)
tmp_b2.shape (5775, 4996)

Forwad prop to get z and h
tmp_z.shape: (5775, 4996)
tmp_h.shape: (50, 4996)

Get yhat by calling softmax
tmp_yhat.shape: (5775, 4996)

call back_prop
tmp_grad_W1.shape (50, 5775)
tmp_grad_W2.shape (5775, 50)
tmp_grad_b1.shape (50, 1)
tmp_grad_b2.shape (5775, 1)


<a name='2.6'></a>
### 2.6 - Gradient Descent

In [27]:
def gradient_descent(x_data, y_data, word2Ind, N, V, num_iters=150, alpha=0.03, 
                     random_seed=282, initialize_model=initialize_model, 
                     forward_prop=forward_prop, softmax=softmax, 
                     compute_cost=compute_cost, 
                     back_prop=back_prop):
    
    '''
    This is the gradient_descent function
    
      Inputs: 
        x_data:      text
        word2Ind:  words to Indices
        N:         dimension of hidden vector  
        V:         dimension of vocabulary 
        num_iters: number of iterations  
        random_seed: random seed to initialize the model's matrices and vectors
        initialize_model: your implementation of the function to initialize the model
        forward_prop: your implementation of the function to perform forward propagation
        softmax: your implementation of the softmax function
        compute_cost: cost function (Cross entropy)
        back_prop: your implementation of the function to perform backward propagation
     Outputs: 
        W1, W2, b1, b2:  updated matrices and biases after num_iters iterations

    '''
    batch_size = x_data.shape[0]
    x = x_data.T
    y = y_data.T
    W1, W2, b1, b2 = initialize_model(N,V,batch_size,random_seed=random_seed) #W1=(N,V) and W2=(V,N)
    C = 2
    
    for it in range(num_iters):
        # get z and h
        z, h = forward_prop(x,W1,W2,b1,b2)
                
        # get yhat
        yhat = softmax(z)
        
        # get cost
        cost = compute_cost(y,yhat,batch_size)
        if ( (it+1) % 10 == 0):
            print(f"iters: {it + 1} cost: {cost:.6f}")
            
        # get gradients
        grad_W1, grad_W2, grad_b1, grad_b2 = back_prop(x,yhat,y,h,W1,W2,b1,b2,batch_size)
        
        # update weights and biases
        W1 = W1 - (alpha * grad_W1)
        W2 = W2 - (alpha * grad_W2)
        b1 = b1 - (alpha * grad_b1)
        b2 = b2 - (alpha * grad_b2)

        ### END CODE HERE ###
        if it == num_iters: 
            break
        if it % 100 == 0:
            alpha *= 0.66
            
    return W1, W2, b1, b2

In [28]:
C = 2
N = 50
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
num_iters = 500
print("Call gradient_descent")
W1, W2, b1, b2 = gradient_descent(x_data, y_data, word2Ind, N, V, num_iters)

Call gradient_descent
iters: 10 cost: 10.123619
iters: 20 cost: 9.827763
iters: 30 cost: 9.576239
iters: 40 cost: 9.368893
iters: 50 cost: 9.201100
iters: 60 cost: 9.062347
iters: 70 cost: 8.942650
iters: 80 cost: 8.836100
iters: 90 cost: 8.739790
iters: 100 cost: 8.652287
iters: 110 cost: 8.593666
iters: 120 cost: 8.544033
iters: 130 cost: 8.497551
iters: 140 cost: 8.454063
iters: 150 cost: 8.413435
iters: 160 cost: 8.375632
iters: 170 cost: 8.340536
iters: 180 cost: 8.308011
iters: 190 cost: 8.277979
iters: 200 cost: 8.250324
iters: 210 cost: 8.231671
iters: 220 cost: 8.215760
iters: 230 cost: 8.200806
iters: 240 cost: 8.186779
iters: 250 cost: 8.173661
iters: 260 cost: 8.161444
iters: 270 cost: 8.150107
iters: 280 cost: 8.139598
iters: 290 cost: 8.129856
iters: 300 cost: 8.120856
iters: 310 cost: 8.114770
iters: 320 cost: 8.109608
iters: 330 cost: 8.104771
iters: 340 cost: 8.100264
iters: 350 cost: 8.096057
iters: 360 cost: 8.092163
iters: 370 cost: 8.088564
iters: 380 cost: 8.08526

In [29]:
W1.shape, W2.shape

((50, 5775), (5775, 50))

In [30]:
W1[:,0]

array([ 0.28249817,  0.12850495, -0.01243035, -0.03096692, -0.00569351,
        0.67277207,  0.00822595,  0.42176682,  0.88942825,  0.18123361,
        0.32378112,  0.29247569,  0.49456305,  0.3926878 ,  0.89470578,
        0.49481512,  0.86737739,  0.17113436,  0.7440892 ,  0.17423119,
        0.61718773,  0.737241  ,  0.0484013 ,  0.13569248, -0.03882672,
        0.27470466,  0.49807149,  0.78103274, -0.08591464,  0.32424947,
        0.46970676, -0.03503543,  0.88257509,  0.39246283,  0.60509994,
        0.81164849,  0.3566652 ,  0.01156602,  0.28020506,  0.76344848,
        0.65472547,  0.40756415,  0.90630922,  0.17431762,  0.32189679,
        0.80079741,  0.01431713,  0.91492909,  0.62260657,  0.88287618])