# GloVe or spaCy encodings

Takes text inputs of varying lengths and outputs GloVe (or spaCy) encodings padded to a common length.

1. Tokenize with Spacy
2. Get word embeddings (start with spaCy's, then use GloVe if needed)
3. Let torch handle uneven text lengths with `pad_sequence`?

In [None]:
# One-off setup: uncomment the lines below to download the two spaCy models
# that this notebook loads (`en_core_web_sm` and `en_core_web_lg`).
# NOTE(review): the interpreter path is hard-coded to one specific conda
# environment — adjust it to your own environment before running.
#!/usr/local/anaconda3/envs/spike_basicoV5/bin/python -m spacy download en_core_web_sm
#!/usr/local/anaconda3/envs/spike_basicoV5/bin/python -m spacy download en_core_web_lg

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import src

In [3]:
import spacy
import numpy as np
# Tokenization smoke test with the `en_core_web_sm` pipeline: run one sample
# sentence through it and list the resulting token texts, one per line.
nlp = spacy.load("en_core_web_sm")

doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
print("\n".join(token.text for token in doc))

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [4]:
# Rebind `nlp` to the `en_core_web_lg` pipeline (the cells below use this one)
# and report, per token: its text, whether it has a vector, the vector norm,
# and whether it is out-of-vocabulary.
nlp = spacy.load("en_core_web_lg")
tokens = nlp("dog cat banana afskfsd")
for token in tokens:
    vector_report = (token.text, token.has_vector, token.vector_norm, token.is_oov)
    print(*vector_report)

dog True 7.0336733 False
cat True 6.6808186 False
banana True 6.700014 False
afskfsd False 0.0 True


In [5]:
# Three short texts of different lengths, used to exercise the encoding
# helpers in the cells below.
example_list = [
    "hiya, what's up?",
    "another description of stuff",
    "and a final one! Yay!",
]


+ index_item_matrix: items x maximum description length (`largo_maximo_glosas`)
+ embedding_matrix: (number of words + 1) x number of latent dimensions (`npalabras + 1 x nlatentes`)

In [6]:
# Encode every example text with the currently loaded spaCy pipeline, asking
# for 300 dimensions; the trailing expression displays the result's shape.
encodings_of_all_sentences = src.encode_sequence(
    example_list,
    nlp,
    n_of_dims=300,
)
encodings_of_all_sentences.shape

(3, 7, 300)

In [9]:
# Build the index/item matrix, the embedding matrix and the token -> index
# dictionary for the example texts, then unpack the three results.
example_outputs = src.get_embedding_and_index_item_matrix(
    example_list,
    nlp,
    n_of_embedding_dims=300,
)
index_item_matrix, embedding_matrix, unique_tokens_d = example_outputs

### Pre-process Movie Data

In [7]:
import pandas as pd

# Project identifier handed to `pd.read_gbq` below.
project_id = "spike-sandbox"

# Pull the id, name and first category of every row in EVIC.movies.
query = """
SELECT movie_id, movie_name, category_0
FROM EVIC.movies
"""

# Run the query and keep the result as the `movies` DataFrame.
movies = pd.read_gbq(query, project_id=project_id)

In [8]:
# Text that will be embedded for each movie: its name followed by its
# `category_0` value, separated by a single space.
# NOTE(review): a null in either column yields NaN here — confirm there are
# none before this column is tokenized.
movies['full_string'] = movies['movie_name'] + ' ' + movies['category_0']

In [12]:
# Order the movies by id (in place), then rebuild the index/item matrix, the
# embedding matrix and the token -> index dictionary from the movie texts.
# This rebinds the same three names that the example-text cell assigned.
movies.sort_values(by='movie_id', inplace=True)
movie_outputs = src.get_embedding_and_index_item_matrix(
    movies['full_string'].values,
    nlp,
    n_of_embedding_dims=300,
)
index_item_matrix, embedding_matrix, unique_tokens_d = movie_outputs


In [13]:
# Peek at the texts that were passed to the embedding helper.
movies['full_string'].values

array(['Toy Story (1995) Adventure', 'Jumanji (1995) Adventure',
       'Grumpier Old Men (1995) Comedy', ..., 'Choke (2008) Comedy',
       'Revolutionary Road (2008) Drama',
       'Blackadder Back & Forth (1999) Comedy'], dtype=object)

In [16]:
import pickle

# Bundle the two matrices and the token -> index dictionary under fixed keys
# so they can be written out as a single object.
export_d = {
    'index_item_matrix': index_item_matrix,
    'embedding_matrix': embedding_matrix,
    'unique_tokens_d': unique_tokens_d,
}

# Serialize the bundle with the highest pickle protocol available.
with open('data/index_item_and_embedding_matrix_dict.pickle', 'wb') as pickle_file:
    pickle.dump(export_d, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
# List every token next to its integer index, one pair per line.
for token_text, token_index in unique_tokens_d.items():
    print(token_text, token_index)

Toy 0
Story 1
( 2
1995 3
) 4
Adventure 5
Jumanji 6
Grumpier 7
Old 8
Men 9
Comedy 10
Waiting 11
to 12
Exhale 13
Father 14
of 15
the 16
Bride 17
Part 18
II 19
Heat 20
Action 21
Sabrina 22
Tom 23
and 24
Huck 25
Sudden 26
Death 27
GoldenEye 28
American 29
President 30
, 31
The 32
Dracula 33
: 34
Dead 35
Loving 36
It 37
Balto 38
Animation 39
Nixon 40
Drama 41
Cutthroat 42
Island 43
Casino 44
Crime 45
Sense 46
Sensibility 47
Four 48
Rooms 49
Ace 50
Ventura 51
When 52
Nature 53
Calls 54
Money 55
Train 56
Get 57
Shorty 58
Copycat 59
Assassins 60
Powder 61
Leaving 62
Las 63
Vegas 64
Othello 65
Now 66
Then 67
Persuasion 68
City 69
Lost 70
Children 71
Cité 72
des 73
enfants 74
perdus 75
La 76
Shanghai 77
Triad 78
Yao 79
a 80
yao 81
dao 82
waipo 83
qiao 84
Dangerous 85
Minds 86
12 87
Monkeys 88
Twelve 89
Sci 90
- 91
Fi 92
Wings 93
Courage 94
Babe 95
Carrington 96
Man 97
Walking 98
Across 99
Sea 100
Time 101
Documentary 102
Takes 103
Two 104
Clueless 105
Cry 106
Beloved 107
Country 108
Richard 109


In [53]:
# Display the token -> index dictionary.
# NOTE(review): the recorded output of this cell lists the tokens of
# `example_list`, not the movie vocabulary printed by the previous cell, and
# its execution count (53) is out of sequence — it appears to have been run
# while `unique_tokens_d` still held the example mapping. Re-run the notebook
# top to bottom to refresh it.
unique_tokens_d

{'hiya': 0,
 ',': 1,
 'what': 2,
 "'s": 3,
 'up': 4,
 '?': 5,
 'another': 6,
 'description': 7,
 'of': 8,
 'stuff': 9,
 'and': 10,
 'a': 11,
 'final': 12,
 'one': 13,
 '!': 14,
 'Yay': 15}

## Do padding with PyTorch instead?

In [6]:
from torch.nn.utils.rnn import pad_sequence
import torch
# Three all-ones "sequences" with 25, 22 and 15 rows of 300 features each.
a, b, c = (torch.ones(n_rows, 300) for n_rows in (25, 22, 15))

# Pad them to a common length with the default arguments and show the size of
# the stacked result.
pad_sequence([a, b, c]).size()

torch.Size([25, 3, 300])