# Distributed representations

In [1]:
import gc
import time
from tqdm import tqdm
from functools import partial

import numpy as np
import pandas as pd
import scipy.sparse
import sklearn.preprocessing

import utils

%load_ext autoreload
%autoreload 2
%matplotlib widget

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [8]:
preprocessor = partial(utils.preprocess_text, regexes=True, start_end_symbols=True)
dataset = utils.IMDBDataset(preprocessor=preprocessor)
df = dataset.dataframe
df.shape

(50000, 5)

In [9]:
df.head()

Unnamed: 0,file_id,score,sentiment,split,text
0,2257,7,1,train,"<s> sarafina was a fun movie, and some of the ..."
1,4778,9,1,train,"<s> like his early masterpiece ""the elephant m..."
2,7284,8,1,train,<s> when i was young i had seen very few movie...
3,4845,9,1,train,<s> hello playmates.i recently watched this fi...
4,6822,7,1,train,"<s> ""opening night"" released in tries to be an..."


In [10]:
small_df = dataset.get_portion(amount=500, seed=RANDOM_SEED)
small_df.shape

(500, 5)

In [11]:
small_df.head()

Unnamed: 0,file_id,score,sentiment,split,text
33553,4989,7,1,test,<s> too much added with too much taken away fr...
9427,6186,10,1,train,<s> released just before the production code c...
199,8806,9,1,train,<s> i chose to see the this film on the day it...
12447,9759,10,1,train,<s> i believe i received this film when i was ...
39489,445,3,0,test,<s> once upon a time there was a great america...


In [12]:
def build_vocabulary(df):
    """
    Given a dataset, builds the corresponding word vocabulary
    """
    doc_str = " ".join(df["text"].tolist()).replace("\n", " ").replace("\r", " ")
    words = sorted(set(doc_str.split()))
    vocabulary, inverse_vocabulary = dict(), dict()
    for i, w in tqdm(enumerate(words)):
        vocabulary[i] = w
        inverse_vocabulary[w] = i
    return vocabulary, inverse_vocabulary, words

In [13]:
idx_to_word, word_to_idx, word_listing = build_vocabulary(small_df)

19383it [00:00, 732752.84it/s]


In [14]:
len(word_listing)

19383

In [41]:
print(list(idx_to_word.items())[:5])
print(list(word_to_idx.items())[-5:])

[(0, '!'), (1, '!!!'), (2, '!....'), (3, '!....being'), (4, '"')]
[('£', 19378), ('£for', 19379), ('£it', 19380), ('½', 19381), ('½/*****).', 19382)]


In [15]:
def dict_to_csr(term_dict):
    """
    Given a dictionary like {(i, j): v}, returns a sparse matrix m
    s.t. m[i, j] = v
    """
    keys = list(term_dict.keys())
    values = list(term_dict.values())
    shape = list(np.repeat(np.asarray(keys).max() + 1, 2))
    csr = scipy.sparse.csr_matrix((values, zip(*keys)), shape=shape)
    return csr

In [16]:
def co_occurrence_count(df, idx_to_word, word_to_idx, window_size=4):
    """
    Builds word-word co-occurrence matrix based on word counts
    """
    counts = dict()
    for doc in tqdm(df["text"]):
        doc_words = doc.split()
        for doc_word_index, central_word in enumerate(doc_words):
            central_word_index = word_to_idx[central_word]
            context = (
                doc_words[max(0, doc_word_index - window_size) : doc_word_index] + 
                doc_words[doc_word_index + 1 : min(doc_word_index + window_size + 1, len(doc_words))]
            )
            for context_word in context:   
                context_word_index = word_to_idx[context_word]
                key = (central_word_index, context_word_index)
                counts[key] = counts.get(key, 0) + 1
    sparse_matrix = dict_to_csr(counts)
    del counts
    return sparse_matrix

In [66]:
tmp_df = pd.DataFrame({'text': ["ciao sono alessio ciao come stai", "ciao sono lorenzo chi sei"]})
tmp_idx_to_word, tmp_word_to_idx, tmp_word_listing = build_vocabulary(tmp_df)
#tmp_word_listing = ['ciao', 'sono', 'alessio', 'lorenzo', 'come', 'chi', 'sei', 'stai']
#tmp_idx_to_word = {0: 'ciao', 1: 'sono', 2: 'alessio', 3: 'lorenzo', 4: 'come', 5:'chi', 6:'sei', 7:'stai', 8:'test'}
#tmp_word_to_idx = {'ciao':0, 'sono':1, 'alessio':2, 'lorenzo':3, 'come':4, 'chi':5, 'sei':6, 'stai':7, 'test':8}
print(tmp_idx_to_word)
print(tmp_word_to_idx)
print(tmp_word_listing)
my_toy_co_occurrence_matrix = fast_co_occurrence_count(tmp_df, tmp_idx_to_word, tmp_word_to_idx, window_size=1)
my_toy_co_occurrence_matrix.toarray()

8it [00:00, 65793.00it/s]
100%|██████████| 2/2 [00:00<00:00, 9372.75it/s]

{0: 'alessio', 1: 'chi', 2: 'ciao', 3: 'come', 4: 'lorenzo', 5: 'sei', 6: 'sono', 7: 'stai'}
{'alessio': 0, 'chi': 1, 'ciao': 2, 'come': 3, 'lorenzo': 4, 'sei': 5, 'sono': 6, 'stai': 7}
['alessio', 'chi', 'ciao', 'come', 'lorenzo', 'sei', 'sono', 'stai']
{(2, 6): 2, (6, 2): 2, (6, 0): 1, (0, 6): 1, (0, 2): 1, (2, 0): 1, (2, 3): 1, (3, 2): 1, (3, 7): 1, (7, 3): 1, (6, 4): 1, (4, 6): 1, (4, 1): 1, (1, 4): 1, (1, 5): 1, (5, 1): 1}





array([[0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 0, 0],
       [1, 0, 0, 1, 0, 0, 2, 0],
       [0, 0, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 2, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0]])

In [17]:
if "co_occurrence_matrix" in globals():
    del co_occurrence_matrix
    gc.collect()
    time.sleep(10.0)

window_size = 4
co_occurrence_matrix = co_occurrence_count(
    small_df, idx_to_word, word_to_idx, window_size
)

100%|██████████| 500/500 [00:01<00:00, 357.54it/s]


In [18]:
co_occurrence_matrix

<19383x19383 sparse matrix of type '<class 'numpy.int64'>'
	with 506894 stored elements in Compressed Sparse Row format>

In [20]:
coo_svd = utils.reduce_svd(co_occurrence_matrix, seed=RANDOM_SEED)

In [21]:
utils.visualize_embeddings(coo_svd, ['good', 'love', 'beautiful'], word_to_idx)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [23]:
coo_tsne = utils.reduce_tsne(co_occurrence_matrix, seed=RANDOM_SEED)

In [24]:
utils.visualize_embeddings(coo_tsne, ['good', 'love', 'beautiful'], word_to_idx)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [25]:
def cosine_similarity(p, q, transpose_p=False, transpose_q=False):
    """
    Computes the cosine similarity of two d-dimensional matrices,
    where their second dimension should match
    """
    # If it is a vector, consider it as a single sample matrix
    if len(p.shape) == 1:
        p = p.reshape(1, -1)
    if len(q.shape) == 1:
        q = q.reshape(1, -1)

    # Check if dimensions match
    assert p.shape[1] == q.shape[1]

    # Check for sparsity
    if not hasattr(scipy.sparse, type(p).__name__):
        p = scipy.sparse.csr_matrix(p)
    if not hasattr(scipy.sparse, type(q).__name__):
        q = scipy.sparse.csr_matrix(q)

    # Compute cosine similarity
    p_norm = np.sqrt(p.dot(p.T).diagonal())
    q_norm = np.sqrt(q.dot(q.T).diagonal())
    norms_prod = np.outer(p_norm, q_norm)
    if transpose_p:
        res = q.dot(p.T) / norms_prod
    else:
        res = p.dot(q.T) / norms_prod
        
    return scipy.sparse.csr_matrix(res)

In [26]:
def fast_cosine_similarity(p, q, transpose_p=False, transpose_q=False):
    """
    Computes the cosine similarity of two d-dimensional matrices,
    using pre-normalization
    """
    # If it is a vector, consider it as a single sample matrix
    if len(p.shape) == 1:
        p = p.reshape(1, -1)
    if len(q.shape) == 1:
        q = q.reshape(1, -1)

    # Check for sparsity
    if not hasattr(scipy.sparse, type(p).__name__):
        p = scipy.sparse.csr_matrix(p)
    if not hasattr(scipy.sparse, type(q).__name__):
        q = scipy.sparse.csr_matrix(q)
    
    # Normalize inputs
    normalized_p = sklearn.preprocessing.normalize(p, axis=0)
    normalized_q = sklearn.preprocessing.normalize(q, axis=0)
    
    # Compute cosine similarity
    return (
        normalized_p.T * normalized_q if transpose_p 
        else normalized_q.T * normalized_p
    )

In [29]:
coo_similarity_matrix = fast_cosine_similarity(
    co_occurrence_matrix, co_occurrence_matrix, transpose_q=True
)

In [30]:
coo_similarity_matrix

<19383x19383 sparse matrix of type '<class 'numpy.float64'>'
	with 227364251 stored elements in Compressed Sparse Row format>

In [31]:
def nearest_neighbors(word, similarity_matrix, word_to_idx, k=1, farthest=False):
    index = word_to_idx[word]
    similarities = []
    for w, i in word_to_idx.items():
        similarities.append((w, similarity_matrix[index, i]))
    return sorted(similarities, key=lambda t: t[1], reverse=(not farthest))[1 : k + 1]

In [32]:
print(nearest_neighbors("film", coo_similarity_matrix, word_to_idx, k=5))
print(nearest_neighbors("amazing", coo_similarity_matrix, word_to_idx, k=5, farthest=True))
print(nearest_neighbors("good", coo_similarity_matrix, word_to_idx, k=5))
print(nearest_neighbors("good", coo_similarity_matrix, word_to_idx, k=5, farthest=True))

[('movie', 0.9700280746239791), ('film,', 0.9463668360456202), ('film.', 0.9461475062242477), ('in', 0.9428380248016573), ('is', 0.9400285409566773)]
[('"euro".', 0.0), ('"good"haha.', 0.0), ('"india', 0.0), ('"left-wing', 0.0), ('"nearly', 0.0)]
[('very', 0.9252070826651666), ('little', 0.9155640236149524), ('great', 0.9123573420111016), ('nice', 0.8878501392324601), ('as', 0.8862166197402453)]
[('(golden', 0.0), ('(little', 0.0), ('(spark,', 0.0), ('-),', 0.0), ('-episodes', 0.0)]


In [None]:
top_K_words, top_K_values = get_top_K_word_ranking(co_occurrence_matrix,
                                                   idx_to_word,
                                                   word_to_idx,
                                                   ['doctor', 'man'],
                                                   ['nurse'],
                                                   K)
print('Top K words: ', top_K_words)
print('Top K values: ', top_K_values)

In [35]:
def convert_ppmi(co_occurrence_matrix, to_dense=False):
    """
    Converts a count-based co-occurrence matrix to a PPMI matrix
    """
    # Compute sums
    total_sum = float(co_occurrence_matrix.sum())
    row_col_sums = np.array(
        co_occurrence_matrix.sum(axis=1), dtype=np.float64
    ).flatten()

    # Get CSR matrix elements
    if not hasattr(scipy.sparse, type(co_occurrence_matrix).__name__):
        co_occurrence_matrix = scipy.sparse.csr_matrix(co_occurrence_matrix)
    data, indices, indptr = (
        list(enumerate(co_occurrence_matrix.data)),
        co_occurrence_matrix.indices,
        co_occurrence_matrix.indptr,
    )

    # Compute PPMI matrix
    ppmi_data, ppmi_indices, ppmi_indptr = [], [], [0]
    for row in tqdm(range(len(indptr) - 1)):
        for col, elem in data[indptr[row] : indptr[row + 1]]:
            pmi = np.log2(
                (elem * total_sum) / (row_col_sums[row] * row_col_sums[indices[col]])
            )
            if pmi > 0:
                ppmi_data.append(pmi)
                ppmi_indices.append(indices[col])
        if ppmi_indptr[-1] != len(ppmi_data):
            ppmi_indptr.append(len(ppmi_data))

    # Re-format as sparse matrix
    res = scipy.sparse.csr_matrix(
        (ppmi_data, ppmi_indices, ppmi_indptr), dtype=np.float64
    )
    res.eliminate_zeros()
    return res if not to_dense else res.toarray()

In [36]:
ppmi_occurrence_matrix = convert_ppmi(co_occurrence_matrix)

100%|██████████| 19383/19383 [00:07<00:00, 2641.40it/s]


In [37]:
ppmi_occurrence_matrix

<19383x19383 sparse matrix of type '<class 'numpy.float64'>'
	with 478480 stored elements in Compressed Sparse Row format>

In [38]:
ppmi_svd = utils.reduce_svd(ppmi_occurrence_matrix, seed=RANDOM_SEED)

In [40]:
utils.visualize_embeddings(ppmi_svd, ['good', 'love', 'beautiful'], word_to_idx)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [None]:
ppmi_tsne = utils.reduce_tsne(ppmi_occurrence_matrix, seed=RANDOM_SEED)

In [None]:
utils.visualize_embeddings(ppmi_tsne, ['good', 'love', 'beautiful'], word_to_idx)

In [38]:
embedding_model_type = "glove"
embedding_dimension = 50
embedding_model = utils.load_embedding_model(embedding_model_type, embedding_dimension)

In [39]:
def check_oov_terms(embedding_model, word_listing):
    """
    Checks differences between pre-trained embedding model vocabulary
    and dataset specific vocabulary in order to highlight out-of-vocabulary terms.
    """
    oov_terms = []
    for word in word_listing:
        if word not in embedding_model.vocab:
            oov_terms.append(word)
    return oov_terms

In [40]:
oov_terms = check_oov_terms(embedding_model, word_listing)
print(f"Total OOV terms: {len(oov_terms)} ({round(len(oov_terms) / len(word_listing), 2)}%)")

Total OOV terms: 9390 (0.48%)


In [29]:
def build_embedding_matrix(
    embedding_model,
    embedding_dimension,
    word_to_idx,
    idx_to_word,
    oov_terms,
    co_occurrence_count_matrix,
    method="mean",
):
    """
    Builds the embedding matrix of a specific dataset given a pre-trained Gensim word embedding model
    """

    def random_embedding(embedding_dimension, interval=(-1, 1)):
        return interval[0] + np.random.sample(embedding_dimension) + interval[1]

    embedding_matrix = np.zeros((len(word_to_idx), embedding_dimension))
    for word, index in word_to_idx.items():
        # Words that are no OOV are taken from the Gensim model
        if word not in oov_terms:
            word_vector = embedding_model[word]
        # OOV words computed as the mean of not OOV neighboring words in the dataset
        elif method == "mean":
            neighboring_word_indices = co_occurrence_count_matrix.indices[
                co_occurrence_count_matrix.indptr[index]:co_occurrence_count_matrix.indptr[index + 1]
            ]
            neighboring_word_vectors = np.array(
                [
                    embedding_model[idx_to_word[k]]
                    for k in neighboring_word_indices
                    if idx_to_word[k] in embedding_model
                ]
            )
            # Check if at least one neighboring word is in the Gensim model vocabulary
            if len(neighboring_word_vectors) > 0:
                word_vector = np.mean(neighboring_word_vectors, axis=0)
            # If not, resort to random vectors
            else:
                word_vector = random_embedding(embedding_dimension)
        # OOV words computed as random vectors in range [-1, 1]
        elif method == "random":
            word_vector = random_embedding(embedding_dimension)
        embedding_matrix[index, :] = word_vector
    return embedding_matrix

In [30]:
embedding_matrix = build_embedding_matrix(
    embedding_model,
    embedding_dimension,
    word_to_idx,
    idx_to_word,
    oov_terms,
    co_occurrence_matrix,
)

In [31]:
embedding_matrix.shape

(19383, 50)