# Glove or Spacy encodings

Takes text inputs of varying lengths and outputs GloVe (or spaCy) encodings padded to a common length.

1. Tokenize with Spacy
2. Get word embeddings (start with Spacy's, then use Glove if needed)
3. Let torch handle uneven text lengths with `pad_sequence`?

In [None]:
# One-time setup (kept commented out): download the two spaCy models loaded below.
# The interpreter path points at one specific conda env — adjust it to your own environment.
#!/usr/local/anaconda3/envs/spike_basicoV5/bin/python -m spacy download en_core_web_sm
#!/usr/local/anaconda3/envs/spike_basicoV5/bin/python -m spacy download en_core_web_lg

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# local `src` module are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import src

In [3]:
import spacy
import numpy as np
# Load the small English pipeline; it is enough to demonstrate tokenization.
nlp = spacy.load("en_core_web_sm")

# Tokenize one sentence and show each token on its own line.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
print(*(token.text for token in doc), sep="\n")

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [4]:
# Switch to the large English pipeline and inspect its word vectors.
nlp = spacy.load("en_core_web_lg")
tokens = nlp("dog cat banana afskfsd")

# For each token: text, whether it has a vector, the vector norm, and the out-of-vocabulary flag.
for token in tokens:
    fields = (token.text, token.has_vector, token.vector_norm, token.is_oov)
    print(*fields)

dog True 7.0336733 False
cat True 6.6808186 False
banana True 6.700014 False
afskfsd False 0.0 True


In [5]:
# Toy corpus: three short texts of different lengths used to exercise the encoders.
example_list = [
    "hiya, what's up?",
    "another description of stuff",
    "and a final one! Yay!",
]


+ index_item_matrix (n_items x max_text_length)
+ embedding_matrix  (n_words + 1 x n_latent_dims)

In [7]:
# Encode every example text with the loaded spaCy pipeline, asking for 300-d vectors.
encodings_of_all_sentences = src.encode_sequence(
    example_list,
    nlp,
    n_of_dims=300,
)

# Expect one entry per text; the saved output is (3, 7, 300) for the three examples.
encodings_of_all_sentences.shape

(3, 7, 300)

In [9]:
# Build the index matrix, the embedding matrix and the token -> index dict
# for the toy examples (300-d embeddings).
(index_item_matrix,
 embedding_matrix,
 unique_tokens_d) = src.get_embedding_and_index_item_matrix(
    example_list, nlp, n_of_embedding_dims=300
)

### Pre-process Movie Data

In [10]:

import pandas as pd

# Read id, name and the category_0 column of every row in EVIC.movies from BigQuery.
project_id = "spike-sandbox"
query = """
SELECT movie_id, movie_name, category_0
FROM EVIC.movies
"""
movies = pd.read_gbq(query, project_id=project_id)

In [13]:
# Text to embed per movie: the name followed by its category, separated by one space.
movies["full_string"] = movies["movie_name"] + " " + movies["category_0"]

In [15]:
# Preview the first three rows to check the concatenated `full_string` column.
movies.head(3)

Unnamed: 0,movie_id,movie_name,category_0,full_string
0,777,Pharaoh's Army (1995),War,Pharaoh's Army (1995) War
1,966,"Walk in the Sun, A (1945)",War,"Walk in the Sun, A (1945) War"
2,1450,Prisoner of the Mountains (Kavkazsky plennik) ...,War,Prisoner of the Mountains (Kavkazsky plennik) ...


In [17]:
# Same helper as for the toy examples, now applied to the movie strings.
# This rebinds index_item_matrix, embedding_matrix and unique_tokens_d.
movie_texts = movies["full_string"].values
(index_item_matrix,
 embedding_matrix,
 unique_tokens_d) = src.get_embedding_and_index_item_matrix(
    movie_texts, nlp, n_of_embedding_dims=300
)


In [21]:
import pickle

# Bundle the three artefacts computed above so they can be reloaded together.
export_d = dict(
    index_item_matrix=index_item_matrix,
    embedding_matrix=embedding_matrix,
    unique_tokens_d=unique_tokens_d,
)

# Serialize with the highest pickle protocol available to this interpreter.
with open('data/index_item_and_embedding_matrix_dict.pickle', 'wb') as out_file:
    pickle.dump(export_d, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [61]:
# List every token next to the integer index assigned to it.
for token_text, token_index in unique_tokens_d.items():
    print(token_text, token_index)

hiya 0
, 1
what 2
's 3
up 4
? 5
another 6
description 7
of 8
stuff 9
and 10
a 11
final 12
one 13
! 14
Yay 15


In [53]:
# Display the token -> index mapping.
# NOTE(review): the saved output lists the toy-example tokens, although reading top to
# bottom this name was last rebound by the movie-data call; the execution counts are out
# of order, so re-run from a fresh kernel to confirm what is shown here.
unique_tokens_d

{'hiya': 0,
 ',': 1,
 'what': 2,
 "'s": 3,
 'up': 4,
 '?': 5,
 'another': 6,
 'description': 7,
 'of': 8,
 'stuff': 9,
 'and': 10,
 'a': 11,
 'final': 12,
 'one': 13,
 '!': 14,
 'Yay': 15}

In [39]:
# Rebuild the vocabulary for the toy examples with an explicit pad token.
# The helper is only reachable through the `src` module (the notebook does `import src`),
# so the bare function name would raise NameError on a fresh kernel.
# The other calls in this notebook unpack three return values
# (index_item_matrix, embedding_matrix, unique_tokens_d); only the token dict is kept here.
# NOTE(review): n_of_embedding_dims=96 differs from the 300 used elsewhere while `nlp` was
# last bound to en_core_web_lg — confirm the intended model/dimension and that the current
# `src` signature still accepts `pad_string`.
*_unused_matrices, unique_tokens = src.get_embedding_and_index_item_matrix(
    example_list, nlp, pad_string="xxxpad", n_of_embedding_dims=96
)

In [40]:
# Display the token -> index mapping built in the previous cell.
unique_tokens

{'hiya': 0,
 ',': 1,
 'what': 2,
 "'s": 3,
 'up': 4,
 '?': 5,
 'another': 6,
 'description': 7,
 'of': 8,
 'stuff': 9,
 'and': 10,
 'a': 11,
 'final': 12,
 'one': 13,
 '!': 14,
 'Yay': 15}

## Do Padding with pytorch instead?

In [6]:
from torch.nn.utils.rnn import pad_sequence
import torch
# Three dummy "texts" of 25, 22 and 15 tokens, each token a 300-d vector of ones.
a, b, c = (torch.ones(n_tokens, 300) for n_tokens in (25, 22, 15))

# pad_sequence pads every sequence to the longest one. With its default arguments the
# result here is laid out as (max_len, batch, dim) = (25, 3, 300), i.e. the batch axis is
# second, unlike the (items, max_len, dim) shape produced by src.encode_sequence above.
pad_sequence([a, b, c]).shape

torch.Size([25, 3, 300])