# GloVe or spaCy encodings

Takes text inputs of varying lengths and outputs word-vector encodings (spaCy or GloVe) that are padded so every text has the same length.

1. Tokenize with Spacy
2. Get word embeddings (start with spaCy's built-in vectors, then switch to GloVe if needed)
3. Let torch handle uneven text lengths with `pad_sequence`?

In [None]:
#!/usr/local/anaconda3/envs/spike_basicoV5/bin/python -m spacy download en_core_web_sm
#!/usr/local/anaconda3/envs/spike_basicoV5/bin/python -m spacy download en_core_web_lg

In [2]:
import spacy

# Load the small English pipeline to try out spaCy's tokenizer.
nlp = spacy.load('en_core_web_sm')

doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# Show the text of every token, one per line.
print(*(token.text for token in doc), sep="\n")

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [4]:
# Switch to the large English pipeline and inspect the vector-related
# attributes of a few tokens, including one made-up word.
nlp = spacy.load("en_core_web_lg")
tokens = nlp("dog cat banana afskfsd")
for tok in tokens:
    # Columns: text, has_vector, vector_norm, is_oov
    row = (tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)
    print(*row)

dog True 7.0336733 False
cat True 6.6808186 False
banana True 6.700014 False
afskfsd False 0.0 True


In [9]:
# Sample texts of varying lengths used as input for the encoding experiments.
example_list = [
    "hiya, what's up?",
    "another description of stuff",
    "and a final one! Yay!",
]


In [18]:
import numpy as np

# np.empty only allocates the array; the displayed values are whatever was
# already in memory, not zeros.
np.empty((10,), dtype='int')

array([3758096384178, 1563368096057,  244813136091, 3186865734196,
       3874060501017,  764504178876, 3874060501017,  764504178876,
       3874060501017,  764504178876])

In [22]:
# Padding idea: append the pad word (with a leading space) three times.
example_list[1] + 3 * ' xxxpad'

'another description of stuff xxxpad xxxpad xxxpad'

In [47]:
def encode_sequence(list_of_text, nlp_model, pad_text="xxxpad",
                   n_of_dims=300):
    """
    Encode a list of texts as one array of per-token vectors.

    Every text is tokenized with ``nlp_model``. Texts with fewer tokens than
    the longest one are extended by appending ``pad_text`` (separated by a
    space) once per missing token, and the padded text is then run through
    ``nlp_model`` again to collect its token vectors.

    Parameters
    ----------
    list_of_text : sequence of str
        The texts to encode.
    nlp_model : callable
        Maps a string to a sized iterable of tokens, each exposing a
        ``.vector`` attribute (for example a loaded spaCy pipeline).
    pad_text : str, optional
        Filler word appended to short texts. It must be tokenized by
        ``nlp_model`` as exactly one token, otherwise the padded text ends up
        with the wrong number of tokens.
    n_of_dims : int, optional
        Width of each token vector; must match what ``nlp_model`` produces.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(list_of_text), max_tokens, n_of_dims)`` where
        ``max_tokens`` is the token count of the longest text.

    Raises
    ------
    ValueError
        If a padded text does not yield ``max_tokens`` vectors of width
        ``n_of_dims``.
    """
    # First pass: count the tokens of every text to find the common length.
    lengths = [len(nlp_model(sentence)) for sentence in list_of_text]
    max_length = max(lengths, default=0)

    array_of_encodings = np.empty((len(list_of_text), max_length, n_of_dims))
    if max_length == 0:
        # No text or no tokens at all: there is nothing to fill in.
        return array_of_encodings

    # Second pass: pad each text up to max_length tokens and store its vectors.
    for i, sentence in enumerate(list_of_text):
        padded_sentence = sentence + (' ' + pad_text) * (max_length - lengths[i])
        # Use the model that was passed in rather than a notebook-level global.
        tokens = nlp_model(padded_sentence)
        vectors = np.array([token.vector for token in tokens])
        if vectors.shape != (max_length, n_of_dims):
            raise ValueError(
                "expected token vectors of shape "
                f"{(max_length, n_of_dims)} for text {i}, got {vectors.shape}; "
                "check that pad_text is a single token and that n_of_dims "
                "matches the model's vector width"
            )
        array_of_encodings[i, :, :] = vectors

    return array_of_encodings
    
# Encode the sample texts with whatever pipeline `nlp` currently refers to;
# n_of_dims is left at its default of 300.
encodings_of_all_sentences = encode_sequence(list_of_text=example_list, nlp_model=nlp)

## Do padding with PyTorch instead?

In [6]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Three stand-in "texts" of 25, 22 and 15 rows, each row a 300-wide vector of ones.
a, b, c = (torch.ones(n_rows, 300) for n_rows in (25, 22, 15))

# With default arguments the result is laid out as (longest length, batch, width).
pad_sequence([a, b, c]).shape

torch.Size([25, 3, 300])