# Word2Vec Embedding

In [1]:
import torch
import torch.nn.functional as F
from torch import nn, optim

In [2]:
import numpy as np

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt

### Description

Word2Vec word embedding using the skip-gram architecture.

### GPU

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Data Set


### Loading

In [5]:
# Load text
with open("data/text8") as fin:
    text = fin.read()

print(text[:100])

anarchism originated as a term of abuse first used against early working class radicals including t


### Pre-Processing

Transform punctuation into words:

In [6]:
from string import punctuation

punctuation_names = {
    ".": " <PERIOD> ",
    ",": " <COMMA> ",
    "\"": " <QUOTATIONMARK> ", 
    ":": " <COLON> ", 
    ";": " <SEMICOLON> ", 
    "!": " <EXCLAMATIONMARK> ", 
    "?": " <QUESTIONMARK> ", 
    "(": " <LPAREN> ", 
    ")": " <RPAREN> ",
}

text = text.lower()

for p in punctuation:
    try:
        print(f"Replacing {p} with {punctuation_names[p]}")
        text.replace(p, punctuation_names[p])
    except KeyError:
        # Remove punctuation
        text.replace(p, "")

Replacing ! with  <EXCLAMATIONMARK> 
Replacing " with  <QUOTATIONMARK> 
Replacing ( with  <LPAREN> 
Replacing ) with  <RPAREN> 
Replacing , with  <COMMA> 
Replacing . with  <PERIOD> 
Replacing : with  <COLON>
Replacing ; with  <SEMICOLON> 
Replacing ? with  <QUESTIONMARK> 


In [7]:
words = text.split()
print(words[:25])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes']


In [14]:
from collections import Counter

wordcounts = Counter(words)

# Remove rare words
words = [word for word in words if wordcounts[word] > 5]

In [13]:
print(f"Total words: {len(words)}")
print(f"Unique words: {len(set(words))}")

Total words: 16680599
Unique words: 63641


### Maps

In [15]:
int_to_word = {i: word for i, word in enumerate(words)}
word_to_int = {word: i for i, word in int_to_word.items()}

We can now transform the whole text into integer numbers representing different words:

In [18]:
iwords = [word_to_int[word] for word in words]

print(iwords[:25])

[14656338, 16667876, 16680491, 16680543, 16679204, 16680577, 16656223, 16679786, 16680352, 16680326, 16678804, 16675832, 16674838, 16555448, 16679417, 16680575, 14486722, 16680577, 16680575, 16673274, 16675992, 16680586, 16680575, 16659558, 12996717]


### Narrowing the Context

Very common words, usually do not provide useful context information (because they can be used in many different context). We can therefore subsample the dataset in order to obtain a more meaningful context reprsentation. Mikolov's subsampling consists in removing a word $w_i$ with probability $p_i$ given by

$$
    p_i(t) = 1 - \sqrt{\frac{t}{f(w_i)}}
$$

where $t$ is a given threshold parameter and $f(w_i)$ is the frequency of word $w_i$. 

In [22]:
import random

# Threshold
t = 1e-5

n_iwords = len(iwords)

iwordcounts = Counter(iwords)

frequencies = {iword: count / n_iwords for iword, count in iwordcounts.items()}
pdrop = {iword: 1 - np.sqrt(t / frequencies[iword]) for iword in iwords}

trainset = [iword for iword in iwords if random.random() < (1 - pdrop[iword])]

### Batches