We will use the following method to build simple word embeddings. 

We create a matrix where we put the co-occurences of all the words. 

We factorize that matrix. 

In [207]:
import numpy as np
import scipy as sp
import pandas as pd

In [208]:
sentences = [
    "a dog is a sweet animal",
    "a cat is a mean beast", 
    "a human is a different creature",
    "a cat is a nice pet",
    "a dog is a nice pet also"
]

In [209]:
from collections import defaultdict
from itertools import product, combinations

Nij_counts = defaultdict(int)

N = 1
k = 5 # The window size
window_size = 2

vocab = set()
for sentence in sentences: 
    for idx_a, word_a in enumerate(sentence.split(" ")): 
        start = idx_a - 1 
        stop = idx_a + 2
        for word_b in sentence.split(" ")[start:stop]:
            if word_a == word_b:
              continue
            Nij_counts[(word_a, word_b)] += 1
            N += 1
            vocab.add(word_a)
            vocab.add(word_b)

Ni_counts = defaultdict(int)
Nj_counts = defaultdict(int)
for (i,j), N_ij in Nij_counts.items():
  Ni_counts[ i ] += N_ij
  Nj_counts[ j ] += N_ij


Pi = {k:v/N for k,v in Ni_counts.items()}
Pj = {k:v/N for k,v in Nj_counts.items()}
Pij = {k:v/N for k,v in Nij_counts.items()}

In [220]:
pmi_matrix = np.zeros((len(vocab), len(vocab)))
for i, word_i in enumerate(vocab): 
    for j, word_j in enumerate(vocab):
        pmi_matrix[i][j] = np.log( Pij.get((word_i, word_j), 0) / (Pi[word_i] * Pj[word_j] ))  #- np.log(k)

pmi_matrix[ pmi_matrix < 0] = 0
pd.DataFrame(pmi_matrix, columns=vocab, index=vocab)

  pmi_matrix[i][j] = np.log( Pij.get((word_i, word_j), 0) / (Pi[word_i] * Pj[word_j] ))  #- np.log(k)


Unnamed: 0,cat,different,animal,dog,sweet,nice,is,beast,pet,creature,also,a,mean,human
cat,0.0,0.0,0.0,0.0,0.0,0.0,0.875469,0.0,0.0,0.0,0.0,0.470004,0.0,0.0
different,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.178054,0.0,0.470004,0.0,0.0
animal,0.0,0.0,0.0,0.0,3.178054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dog,0.0,0.0,0.0,0.0,0.0,0.0,0.875469,0.0,0.0,0.0,0.0,0.470004,0.0,0.0
sweet,0.0,0.0,3.178054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.470004,0.0,0.0
nice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.079442,0.0,0.0,0.470004,0.0,0.0
is,1.568616,0.0,0.0,1.568616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.470004,0.0,1.568616
beast,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.178054,0.0
pet,0.0,0.0,0.0,0.0,0.0,2.079442,0.0,0.0,0.0,0.0,2.772589,0.0,0.0,0.0
creature,0.0,3.178054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [221]:
U, sigma, Vt = np.linalg.svd(pmi_matrix)
word_embeddings = U * sigma

In [222]:
pd.DataFrame(U * sigma, index=vocab)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
cat,-0.125528,-0.082352,-0.056639,-0.154977,1.770129e-16,-3.472768e-16,2.250913e-16,-5.898841e-17,0.103016,0.950212,-0.152147,0.03033,-5.896389e-17,-8.886491e-18
different,-0.036187,-0.397648,-0.036706,-1.837233,-0.8467333,-0.5539689,-2.376266,-0.2507431,-0.210186,-0.083805,0.012984,0.002184,1.2238740000000001e-32,6.412951e-34
animal,-1.399474,0.009853,-1.144664,0.061518,-2.343343,0.9292952,0.60959,0.08310343,-0.019071,-0.265032,-0.152087,0.016544,-4.3439790000000005e-33,-1.361987e-33
dog,-0.125528,-0.082352,-0.056639,-0.154977,-1.407845e-15,1.465704e-16,3.570005e-16,5.723916e-16,0.103016,0.950212,-0.152147,0.03033,-6.870092000000001e-17,8.445844e-18
sweet,-0.036187,-0.397648,-0.036706,-1.837233,0.07382504,-0.8159933,1.762162,-1.719547,-0.210186,-0.083805,0.012984,0.002184,1.219035e-32,1.3777100000000002e-33
nice,-0.03659,-2.088265,0.044529,0.326219,1.017894e-15,-1.242717e-15,-1.230756e-15,1.1678e-15,0.001746,0.08202,-0.166661,-0.199703,4.333732e-31,-4.584136e-33
is,-0.019859,-0.1712,-0.013028,-0.387229,-1.320726e-15,0.0,0.0,0.0,2.720756,-0.141082,0.01809,0.002996,-4.1702200000000005e-31,0.0
beast,-1.399474,0.009853,-1.144664,0.061518,0.6504474,-2.190208,0.1499843,1.220968,-0.019071,-0.265032,-0.152087,0.016544,2.060855e-32,3.8354760000000005e-33
pet,-2.170444,0.092111,2.695585,-0.024651,-8.828112e-16,-1.487032e-15,-1.830279e-15,2.331132e-15,-0.0071,-0.134905,-0.083034,0.009092,7.846944000000001e-33,2.243644e-34
creature,-1.399474,0.009853,-1.144664,0.061518,1.692895,1.260913,-0.7595744,-1.304071,-0.019071,-0.265032,-0.152087,0.016544,-1.541832e-32,1.128522e-33


In [223]:
U_embeddings = {word: (U * sigma)[index, :] for index, word in enumerate(vocab)}
V_embeddings = {word: Vt[index, :] for index, word in enumerate(vocab)}

In [224]:
(U_embeddings['cat'] @ U_embeddings['dog'],
U_embeddings['cat'] @ U_embeddings['is'],
U_embeddings['cat'] @ U_embeddings['human'])

(0.987348921588194, 0.22090341150415482, 0.987348921588194)