# Word2Vec

“you shall know a word by the company it keeps” — (Firth, J. R. 1957:11)

As the goal of this resource is to teach NLP for low-resource languages, we are going to start with learning word embeddings for some low resource languages e.g Yoruba, Igbo, Swahili e.t.c from scratch using word2vec. Word2Vec was introduced in [Efficient Estimation of Word Representations in
Vector Space](https://arxiv.org/pdf/1301.3781.pdf) and the overarching idea is that the meaning of a word is dependent on the context of which it is often used.

In this tutorial, we would learn and visualize word embeddings for different languages using the [mC4](https://huggingface.co/datasets/mc4) dataset and pytorch.


## Important terminologies to note

- CBOW:
- SkipGram:
- Corpus:
- Vocabulary:
- Word Subsampling:


In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 7.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 67.9 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 70.1 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 76.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 68.0 MB/s 
Installing collected package

# Dataset

- First we need to build a corpus which consists of all th

In [2]:
#@title String fields

Select_Language = 'Yoruba' #@param ["English", "Yoruba", "Igbo", "Swahili", "Hausa"]

LANG_2_CODE = {"English" : "en", "Yoruba" : "yo", "Igbo": "ig", "Swahili": "sw", "Hausa": "ha"} 

In [3]:
import warnings
from tqdm import tqdm
from datasets import load_dataset
warnings.filterwarnings("ignore")

def fetch_igbo_corpus() -> list:
    """
    This function generates a a corpus for Igbo language which is a list of 
    sentences from a huggingface dataset
    Dataset: https://huggingface.co/datasets/igbo_monolingual
    """

    igbo_monolingual = ['eze_goes_to_school', 'bbc-igbo', 'igbo-radio', 'jw-ot-igbo', \
                        'jw-nt-igbo', 'jw-books', 'jw-teta', 'jw-ulo_nche', 'jw-ulo_nche_naamu']

    sentences = []
    for conf in tqdm(igbo_monolingual):
        dataset = load_dataset("igbo_monolingual", conf)
        if conf == 'eze_goes_to_school':
            sentences.extend([sentence for sentence in dataset['train']['chapters'][0]['title']])
            for i in range(len(dataset['train']['chapters'][0]['content'])):
                sentences.extend([sentence for sentence in dataset['train']['chapters'][0]['content'][i].split(".")])
        elif conf == 'bbc-igbo':
            sentences.extend([sentence for sentence in dataset['train']['title']])
            sentences.extend([sentence for sentence in dataset['train']['description']])
            for i in range(len(dataset['train']['content'])):
                sentences.extend([sentence for sentence in dataset['train']['content'][i].split(".")])
        elif conf == 'igbo-radio':
            for i in range(len(dataset['train']['content'])):
                sentences.extend([sentence for sentence in dataset['train']['content'][i].split(".")])
                sentences.extend([sentence for sentence in dataset['train']['description'][i].split(".")])
        elif conf in ['jw-ot-igbo', 'jw-nt-igbo']:
            for i in range(len(dataset['train'])):
                for j in range(len(dataset['train'][i]['chapters'])):
                    try:
                        sentences.extend([sentence for sentence in dataset['train']['chapters'][i]['content'][j].split(".")])
                    except IndexError as e:
                        continue
        else:
            for i in range(len(dataset['train'])):
                sentences.extend([sentence for sentence in dataset['train'][i]['content'].split(".")])

    return sentences


def fetch_english_corpus() -> list:
    """
    This function generates a a corpus for English language which is a list of 
    sentences from a huggingface dataset
    Dataset: https://huggingface.co/datasets/wikitext
    """
    sentences = []

    dataset = load_dataset("wikitext", 'wikitext-103-v1')
    for split in ["test", "train", "validation"]:
        for i in tqdm(range(len(dataset[split]['text']))):
            sentences.extend([sentence for sentence in dataset[split]['text'][i].split(".")])
    
    return sentences

def fetch_yoruba_corpus() -> list:
    """
    This function generates a a corpus for Yoruba language which is a list of 
    sentences from a huggingface dataset
    Dataset: https://huggingface.co/datasets/yoruba_text_c3
    """
    sentences = []

    dataset = load_dataset("yoruba_text_c3")
    # for i in tqdm(range(len(dataset["train"]["text"]))):
    #     sentences.extend([sentence for sentence in dataset["train"]["text"][i].split(".")])
    
    for sentence in tqdm(dataset["train"]["text"]):
        sentences.extend(sentence.split("."))

    sentences = [w.split() for w in sentences if w != '']
    return sentences


def fetch_swahili_corpus() -> list:
    """
    This function generates a a corpus for Swahili language which is a list of 
    sentences from a huggingface dataset
    Dataset: https://huggingface.co/datasets/swahili
    """
    sentences = []

    dataset = load_dataset("swahili")
    for split in ["test", "train", "validation"]:
        for i in tqdm(range(len(dataset[split]["text"]))):
            sentences.extend([sentence for sentence in dataset[split]["text"][i].split(".")])

    return sentences



sentences = fetch_yoruba_corpus()

Downloading builder script:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/942 [00:00<?, ?B/s]

Downloading and preparing dataset yoruba_text_c3/yoruba_text_c3 (download: 71.91 MiB, generated: 73.52 MiB, post-processed: Unknown size, total: 145.44 MiB) to /root/.cache/huggingface/datasets/yoruba_text_c3/yoruba_text_c3/1.0.0/dbf0b0085c03d98b73f35fe80f4e75928025b483958c24f86440104921830d98...


Downloading data:   0%|          | 0.00/75.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/562238 [00:00<?, ? examples/s]

Dataset yoruba_text_c3 downloaded and prepared to /root/.cache/huggingface/datasets/yoruba_text_c3/yoruba_text_c3/1.0.0/dbf0b0085c03d98b73f35fe80f4e75928025b483958c24f86440104921830d98. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 562238/562238 [00:00<00:00, 1727237.30it/s]


In [4]:
import re

def process_text(corpus_list: list) -> list:
    """
    This function is for preprocessing the downloaded texts
    1. lowercase
    2. remove symbols and numbers
    3. remove whitespace
    """

    processed_corpus = []

    for sentence in tqdm(corpus_list):
        sentence = [re.sub(r"\W", "", x) for x in sentence]
        sentence = [re.sub(r"\d+", "", x) for x in sentence]
        sentence = [re.sub(r"\s", '', x) for x in sentence]
        sentence = [x.lower() for x in sentence if x.strip() != "" and len(x) > 1]
        processed_corpus.append(sentence)

    return processed_corpus

sentences = process_text(sentences)

100%|██████████| 569382/569382 [01:03<00:00, 8929.95it/s] 


In [5]:
sentences[0]

['lílo',
 'àkàbà',
 'njé',
 'máa',
 'ṣe',
 'àyèwò',
 'wònyí',
 'tó',
 'lè',
 'dáàbò',
 'bò']

# Random Words Subsampling

In [6]:
from collections import Counter
import random, math
import itertools

def subsample_words(corpus_list: list) -> list:
    """
    """
    filtered_corpus = []

    word_counts = dict(Counter(itertools.chain.from_iterable(corpus_list)))
    sum_word_count = sum(word_counts.values())
    word_proportion = {word: word_counts[word]/sum_word_count for word in word_counts}

    for sentence in tqdm(corpus_list):
        filtered_corpus.append([])

        for word in sentence:
            subsampling_score = (1e-3/word_proportion[word]) * (math.sqrt(1e3*word_proportion[word])+1)
            if random.random() < subsampling_score:
                filtered_corpus[-1].append(word)
    
    return filtered_corpus




sentences = subsample_words(sentences)

100%|██████████| 569382/569382 [00:08<00:00, 71162.85it/s]


In [7]:
sentences[0]

['lílo', 'àkàbà', 'njé', 'máa', 'àyèwò', 'wònyí', 'lè', 'dáàbò', 'bò']

In [8]:
final_corpus = subsample_words(sentences)
final_vocabulary = set(itertools.chain.from_iterable(final_corpus))

word_2_index  = {w: i for i, w in enumerate(final_vocabulary)}
index_2_word  = {i: w for i, w in enumerate(final_vocabulary)}


print(f"There are {len(final_vocabulary)} words in the training vocabulary (unique words)")
print(f"There are {len(final_corpus)} sentences in the training corpus")

100%|██████████| 569382/569382 [00:05<00:00, 98423.83it/s]


There are 99196 words in the training vocabulary (unique words)
There are 569382 sentences in the training corpus


In [9]:
index_2_word[0]

'foún'

In [10]:
def gen_context_words(corpus: list, context_window: int = 3) -> list:
    """
    Without Negative Sampling
    """
    context_words = [] #list of tuples containing (word, context_word)

    for sentence_list in tqdm(corpus):
        for i, word in enumerate(sentence_list):
            start_index = max(0, i-context_window)
            end_index = min(len(sentence_list), i+context_window)
            for j in range(start_index, end_index):
                if i!=j:
                    context_words.append((word, sentence_list[j]))
    
    return context_words

context_words = gen_context_words(final_corpus)
print("\n There are {} pairs of target and context words".format(len(context_words)))

100%|██████████| 569382/569382 [00:14<00:00, 40145.93it/s]


 There are 31351693 pairs of target and context words





In [11]:
context_words[:10]

[('lílo', 'àkàbà'),
 ('lílo', 'njé'),
 ('àkàbà', 'lílo'),
 ('àkàbà', 'njé'),
 ('àkàbà', 'àyèwò'),
 ('njé', 'lílo'),
 ('njé', 'àkàbà'),
 ('njé', 'àyèwò'),
 ('njé', 'wònyí'),
 ('àyèwò', 'lílo')]

# Negative Sampling

In [13]:
import numpy as np
from numpy.random import multinomial

def sample_negative(corpus: list, sample_size: int=8):
    sample_probability = {}
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    normalizing_factor = sum([v**0.75 for v in word_counts.values()])
    for word in word_counts:
        sample_probability[word] = word_counts[word]**0.75 / normalizing_factor
    words = np.array(list(word_counts.keys()))
    while True:
        word_list = []
        sampled_index = np.array(multinomial(sample_size, list(sample_probability.values())))
        for index, count in enumerate(sampled_index):
            for _ in range(count):
                 word_list.append(words[index])
        yield word_list

def gen_context_words_with_negative_sampling(corpus: list, context_window: int=3) -> list:
    """
    With Negative Sampling
    """
    context_words = [] #list of tuples containing (word, context_word, negative_samples)

    neg_samples = sample_negative(final_corpus)

    for sentence_list in tqdm(corpus[:20]):
        for i, word in enumerate(sentence_list):
            start_index = max(0, i-context_window)
            end_index = min(len(sentence_list), i+context_window)
            for j in range(start_index, end_index):
                if i!=j:
                    context_words.append((word, sentence_list[j], next(neg_samples)))
    
    return context_words


context_words = gen_context_words_with_negative_sampling(final_corpus)
print("\n There are {} pairs of target and context words".format(len(context_words)))

100%|██████████| 20/20 [00:20<00:00,  1.02s/it]


 There are 555 pairs of target and context words





In [14]:
context_words[0]

('lílo',
 'àkàbà',
 ['ẹrù', 'ńṣe', 'ìṣe', 'sábé', 'yòówù', 'jae', 'iṣẹlẹ', 'egberun'])

# Model

In [36]:
from numpy.core.memmap import dtype
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class word2vecDataset(Dataset):
    def __init__(self, corpus: list) -> None:
        super().__init__()
        self.context_words = gen_context_words_with_negative_sampling(corpus)
    
    def __len__(self) -> int:
        return len(self.context_words)

    def __getitem__(self, i: int):
        target_word, context_word, negative_samples = self.context_words[i]
        target_tensor = torch.tensor(np.array(word_2_index[target_word]), dtype=torch.long)
        context_tensor = torch.tensor(np.array(word_2_index[context_word]), dtype=torch.long)
        negative_tensor = torch.tensor(np.array([word_2_index[word] for word in negative_samples]), dtype=torch.long)

        return target_tensor, context_tensor, negative_tensor



dataset = word2vecDataset(final_corpus)
train_loader = DataLoader(dataset, shuffle=True, batch_size=16)


print(next(iter(train_loader)))


100%|██████████| 20/20 [00:19<00:00,  1.00it/s]

[tensor([79565, 80341, 35287, 54106, 31178, 57992, 10673, 42177, 91794,  4186,
        81089, 54106, 90680, 91794, 42177,  1197]), tensor([47463, 16202, 77602, 92111, 59236,  4186, 22928, 84166, 35254, 47463,
        81022,  7575, 17334, 50924, 55996, 46635]), tensor([[ 1197, 94219,  8311, 87419, 38456, 57697, 49939, 26770],
        [50678, 93500,  8086, 48208, 42698, 61921, 82094, 71516],
        [97805, 51987, 49652, 82358, 86661, 14706, 83529, 62256],
        [ 4186, 44891, 67772, 30506, 19696, 96559, 10328, 30984],
        [35824, 94180, 72945, 49320, 47562, 45865, 20336, 74802],
        [36770, 34002, 93467, 69006, 10185,  8074, 53860, 12647],
        [98278, 74412, 11997, 85019, 58167, 68985, 98207, 78079],
        [16296, 13057, 57697, 71397, 58044, 93287, 37787, 50545],
        [34002, 41020, 16853, 48704, 18337, 31980, 82054, 41126],
        [10918, 75802, 61900, 85903, 60869, 16918, 10229, 66283],
        [56126, 32171, 20702,  8585, 90917, 39249, 95414, 76630],
        [1046




In [37]:

class Word2Vec(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        """
        """
        super().__init__()
        self.target_embedding = nn.Embedding(vocab_size, embedding_size)
        self.context_embedding = nn.Embedding(vocab_size, embedding_size)

    def forward(self, target_word, context_word, negative_words):
        """
        """
        targ_emb = self.target_embedding(target_word)       # output_size=torch.Size([batch_size, embedding_size])
        cont_emb = self.context_embedding(context_word)     # output_size=torch.Size([batch_size, embedding_size])
        emb_product = torch.mul(targ_emb, cont_emb)         # output_size=torch.Size([batch_size, embedding_size])
        emb_product = torch.sum(emb_product, dim=1)         # output_size=torch.Size([batch_size])
        out = torch.sum(F.logsigmoid(emb_product))
        
        
        

model = Word2Vec(256, len(final_vocabulary))
sample_target_batch,  sample_context_batch, sample_negatives_batch = next(iter(train_loader))

print(f"Shape of sample input target tensor batch ==> {sample_target_batch.size()}")
print(f"Shape of sample input context tensor batch ==> {sample_context_batch.size()}")
print(f"Shape of sample input negative samples tensor batch ==> {sample_negatives_batch.size()}")


model(sample_target_batch, sample_context_batch, sample_negatives_batch)

Shape of sample input target tensor batch ==> torch.Size([16])
Shape of sample input context tensor batch ==> torch.Size([16])
Shape of sample input negative samples tensor batch ==> torch.Size([16, 8])
tensor(-87.2528, grad_fn=<SumBackward0>)
