An intuitive example that shows how GloVe word embeddings work. This notebook also downloads the GloVe dataset from Stanford's NLP repository for use in our project's feature extraction.

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


In [10]:
# Toy vocabulary: a handful of single words for the tokenizer to index
texts = [
    'text',
    'the',
    'leader',
    'prime',
    'natural',
    'language',
]

In [12]:
# Fit a fresh Keras tokenizer on the example words to build the word -> index map
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Read the fitted mapping once and reuse it for both summary lines
word_index = tokenizer.word_index
print("Number of unique words in dictionary =", len(word_index))
print("Dictionary is =", word_index)

Number of unique words in dictionary = 6
Dictionary is = {'text': 1, 'the': 2, 'leader': 3, 'prime': 4, 'natural': 5, 'language': 6}


In [13]:
def embedding_for_vocab(filepath, word_index, embedding_dim):
    """Build an embedding matrix for a vocabulary from a GloVe-style text file.

    Each non-blank line of the file is expected to hold a token followed by
    its whitespace-separated vector components. Blank lines are skipped.

    Parameters
    ----------
    filepath : str or path-like
        Path to the embeddings text file (read as UTF-8).
    word_index : dict
        Mapping from word to its integer row index. Row 0 is reserved for the
        padding token, so indices are expected to lie in 1..len(word_index).
    embedding_dim : int
        Number of leading vector components to keep for each word.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(word_index) + 1, embedding_dim). Rows of words
        that never appear in the file (and row 0) stay all zeros.

    Raises
    ------
    ValueError
        If a line for a vocabulary word carries fewer than ``embedding_dim``
        components, or a kept component is not a valid number.
    """
    vocab_size = len(word_index) + 1  # +1 for padding token (index 0)
    embedding_matrix_vocab = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line_number, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue

            word, vector = parts[0], parts[1:]
            if word not in word_index:
                continue

            # Check the length explicitly: NumPy would otherwise broadcast a
            # single component across the whole row without any error.
            if len(vector) < embedding_dim:
                raise ValueError(
                    f"{filepath}, line {line_number}: vector for {word!r} has "
                    f"{len(vector)} components, expected at least {embedding_dim}"
                )

            # Parsed as float32 (the precision the values were previously read
            # with), then stored in the float64 matrix.
            embedding_matrix_vocab[word_index[word]] = np.array(
                vector[:embedding_dim], dtype=np.float32
            )

    return embedding_matrix_vocab

In [None]:
# Download the GloVe 6B archive from Stanford's NLP repository and unpack it.
# Both steps are skipped when their results are already on disk, so re-running
# the cell is cheap. Every local glove.6B* file is then printed as "Found: ...".
import urllib.request
import zipfile
import os
from pathlib import Path

zip_path = Path("glove.6B.zip")
extract_dir = Path(".")

url = "https://nlp.stanford.edu/data/glove.6B.zip"
if not zip_path.exists():
    # Download under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated archive that a later run would accept.
    partial_path = zip_path.with_name(zip_path.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, zip_path)
    finally:
        if partial_path.exists():
            partial_path.unlink()

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    # Extract only members that are absent or whose size differs from the
    # archive entry (e.g. left over from an interrupted extraction).
    pending = []
    for info in zip_ref.infolist():
        target = extract_dir / info.filename
        if not target.is_file() or target.stat().st_size != info.file_size:
            pending.append(info)
    if pending:
        zip_ref.extractall(extract_dir, members=pending)

# Sorted so the listing is the same on every platform.
for file in sorted(os.listdir(extract_dir)):
    if file.startswith("glove.6B"):
        print(f"Found: {file}")



Found: glove.6B.100d.txt
Found: glove.6B.200d.txt
Found: glove.6B.300d.txt
Found: glove.6B.50d.txt
Found: glove.6B.zip


In [15]:
# Embedding size to load. The GloVe file name is derived from it so the two
# cannot drift apart (the glove.6B archive provides 50d, 100d, 200d and 300d).
embedding_dim = 50
glove_path = f'./glove.6B.{embedding_dim}d.txt'

embedding_matrix_vocab = embedding_for_vocab(glove_path, tokenizer.word_index, embedding_dim)

In [16]:
# Tokenizer indices are 1-based, so row 1 holds the first word's vector
first_word_index = 1
first_word_vector = embedding_matrix_vocab[first_word_index]
print("Dense vector for word with index 1 =>", first_word_vector)

Dense vector for word with index 1 => [ 0.32615     0.36686    -0.0074905  -0.37553     0.66715002  0.21646
 -0.19801    -1.10010004 -0.42221001  0.10574    -0.31292     0.50953001
  0.55774999  0.12019     0.31441    -0.25042999 -1.06369996 -1.32130003
  0.87797999 -0.24627     0.27379    -0.51091999  0.49324     0.52243
  1.16359997 -0.75322998 -0.48052999 -0.11259    -0.54595    -0.83920997
  2.98250008 -1.19159997 -0.51958001 -0.39365    -0.1419     -0.026977
  0.66295999  0.16574    -1.1681      0.14443     1.63049996 -0.17216
 -0.17436001 -0.01049    -0.17794     0.93076003  1.0381      0.94265997
 -0.14805    -0.61109   ]
