# Embeddings from Wikipedia

Train a simple embedding model by calling random Wikipedia article (abstracts only). This is to practise training on streamline data.

1. Data Cleaning
2. Extract Context and Label
3. Dynamically growing Vocabulary
4. Define a CBow model
5. Insights

In [2]:
### 1 Data Cleaning

import re
import requests
from bs4 import BeautifulSoup

def clean_text(text):
    # remove non characters
    text = re.sub("[^a-zA-Z ]", "", text)
    # remove duplicate white spaces
    text = " ".join(text.split())
    # remove citations
    text = re.sub("\[\d*\]", "", text)
    # lower case
    text = text.lower()
    return text
  
def get_random_wiki():
    # get random article
    URL = "https://en.wikipedia.org/wiki/Special:Random"
    page = requests.get(URL)
    target_url = page.request.url
    soup = BeautifulSoup(page.content, 'html.parser')

    # extract intro
    texts = []
    for o in soup.find_all(class_="mw-parser-output"):
        texts.extend([s.text for s in o.find_all("p")])
        texts = [clean_text(t) for t in texts if t not in ["\n", ""]]
    return texts, target_url


### 2 Extract Context and Label
def get_training(corpus, window_size, pad_token=0):
    inputs = []
    labels = []
    ws = 2 * window_size
    for sentence in corpus:
        n = len(sentence)
        for i, word in enumerate(sentence):
            # get indices of context
            choose = [i + k for k in range(-window_size, window_size + 1) if
                    k != 0 and i + k >= 0 and i + k < n]
            # choose elements
            context = tf.gather(sentence, choose)
            context = tf.concat([context, [pad_token] * (ws - len(choose))], axis=0)  # pad with 0
            y = word
            inputs.append(context)
            labels.append(y)
    return tf.stack(inputs), tf.stack(labels)

from collections import defaultdict


### 3. Dynamically increasing Vocabulary

class Vocab:
    """
    Have a constantly learning Vocabulary. Stores key-value and value-dict dictionaries, adds new elements
    to the dictionary and also acts as generator for Wikipedia abstract training data. Includes a word counter.
    :param max_size: Maximum size of Vocabulary. (Idea: delete rare words if too full)
    """
    def __init__(self, max_size = 300000):
        self.pad_id = 0 
        self.word_index = 1
        self.vocab = {}
        self.vocab_inv = {}
        self.word_counts = defaultdict(lambda: 0)
        self.max_size = max_size
        self.full_dict_warning = False # notify if dict is full

    def add_key(self, key):
        # add a new key if vocabulary not yet full
        if self.word_index > self.max_size and not self.full_dict_warning:
            print("ATTENTION: Dictionary is full - no more words are added")
            self.full_dict_warning = True
        else:
            self.vocab[key] = self.word_index
            self.vocab_inv[self.word_index] = key
            self.word_index += 1

    def get_one(self, key):
        # input a word and return the corresponding integer token
        if key not in self.vocab:
            self.add_key(key)
        self.word_counts[key] += 1    
        return self.vocab[key]

    def get_sentence(self, sentence):
        # input a sentence (list of words) and return a list of integer tokens
        return [self.get_one(key) for key in sentence.split()]

    def get(self, text):
        # input list of sentences and return list of list of integer tokens
        return [self.get_sentence(sentence) for sentence in text]

    def get_vocabulary(self):
        # return entire vocabulary
        return list(self.vocab.keys())

    def get_training_data(self, size=100, n_articles=5, window_size=2, pad_token=0):
        # yield tuples of labels and context
        # n_articles: how man articles to yield at a time
        # window_size: size of context
        for j in range(size):
            texts = []
            urls = []
            for i in range(n_articles):
                text, url = get_random_wiki()
                texts.extend(text)
                urls.append(url)
            texts_encoded = self.get(texts)
            yield get_training(corpus=texts_encoded, window_size=window_size, pad_token=pad_token), urls

### CBow model

Implement a CBow model as in the *Embeddings_Low_Level.ipynb* but use keras implementation of specific layers.

In [3]:
import tensorflow as tf

# Create a shorter word2vec version than before
class CbowCustom(tf.keras.models.Model):
    """
    Simplec Cbow model.
    """
    def __init__(self, vocab_size, embed_size):
        super(CbowCustom, self).__init__()
        self.embed = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.soft = tf.keras.layers.Dense(vocab_size-1, activation="softmax")

    def call(self, inputs):
        x = self.embed(inputs)
        x = tf.reduce_mean(x, axis=1)
        x = self.soft(x)
        return x

In [5]:
# define maximum vocabulary and embedding size
vocab_size = 300000
embed_size = 25
# create a vocabulary instance
V = Vocab(max_size=vocab_size)
# create a CBow instance
m = CbowCustom(vocab_size=vocab_size, embed_size=embed_size)
# combile
m.compile(optimizer="rmsprop", loss=tf.keras.losses.sparse_categorical_crossentropy)

### Train model

Use Vocabulary to create training data and train the model.

In [6]:
# store all wikipedia articles used for training
urls = []
# Parameter to choose number of "epochs"
k = 5

# create a generator
gen = V.get_training_data(size=k, n_articles=2)
for i in range(k):
    # progress bar
    print(f"\r{i+1}/{k}")
    # try - sometimes errors in cleaning
    try:
        temp, urls = next(gen)
        urls.extend(urls)
        # train step
        m.fit(*temp, batch_size=10)
    except:
        pass

1/5
2/5
3/5
4/5
5/5


### Insights

Check if similar words are more similar than very different words. Seems legit :-)

In [25]:
def cor(x, y):
    # simple pearson correlation
    xc = tf.math.square(x - tf.reduce_mean(x))
    yc = tf.math.square(y - tf.reduce_mean(y))
    nom = tf.reduce_sum(tf.multiply(xc, yc))
    denom = tf.math.reduce_std(x) * tf.math.reduce_std(y)
    return float(nom/denom)

def compare_words(x,y):
    print(f"{x}-{y}: {round(cor(m.embed(V.vocab[x]), m.embed(V.vocab[y])), 4)}")

compare_words("england", "new")
compare_words("england", "cricket")
compare_words("england", "small")
compare_words("england", "convincing")

england-new: 0.5517
england-cricket: 0.5174
england-small: 0.2995
england-convincing: 0.278
