# doc2vec: How To Prep Document Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in data, clean it, split it into train/test, and then train a doc2vec model
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# Fixed seed so the train/test split is identical on every Restart & Run All.
SPLIT_SEED = 42

# Load the raw spam CSV (read as latin-1).
messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
# Drop the three unnamed columns, then give the two remaining columns readable names.
messages = messages.drop(labels=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
messages.columns = ["label", "text"]
# Tokenize every message with gensim's simple_preprocess (one token list per row).
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SPLIT_SEED)

# Doc2Vec trains on TaggedDocument objects; tag each training message with its
# position in X_train.
tagged_docs_tr = [gensim.models.doc2vec.TaggedDocument(tokens, [idx])
                  for idx, tokens in enumerate(X_train)]

# Model hyperparameters are unchanged. NOTE(review): per the gensim docs, fully
# repeatable training would also need workers=1 and a fixed PYTHONHASHSEED.
d2v_model = gensim.models.Doc2Vec(tagged_docs_tr,
                                  vector_size=50,
                                  window=2,
                                  min_count=2)

In [2]:
# Reminder of what a single document vector looks like: infer one for a short
# list of example tokens and display it as the cell's output.
sample_tokens = ['convert', 'words', 'to', 'vectors']
d2v_model.infer_vector(sample_tokens)

array([ 0.00789178,  0.00179726, -0.00867785, -0.02377464,  0.01347704,
        0.01534288, -0.01276833, -0.0040705 , -0.01572244,  0.00372224,
        0.01325616,  0.01769315,  0.00371708,  0.01176185,  0.02344394,
       -0.00152305,  0.00475951,  0.00444851,  0.01137068, -0.00524353,
       -0.00312748,  0.00836366,  0.00295096,  0.00052437, -0.01073263,
        0.0056195 , -0.0064914 , -0.01604193,  0.01453729,  0.01601758,
        0.00298155,  0.00949255, -0.00374035,  0.01031712,  0.00238959,
        0.00439079, -0.01720384, -0.02651107, -0.01355453,  0.01799411,
       -0.00503105,  0.00260428,  0.01283077, -0.01282513, -0.00190053,
       -0.0080715 ,  0.0101283 ,  0.00883396, -0.00948556, -0.00480423],
      dtype=float32)

In [3]:
# How do we prepare these vectors to be used in a machine learning model?
# Infer one document vector per test message. Each vector is wrapped in a
# one-element list, so vectors[i] has the form [array([...])].
vectors = [[d2v_model.infer_vector(words)] for words in X_test]

# Notes:
# - The w2v vectors were stored as arrays because we needed to do element-wise
#   averaging across all of the arrays to create our single vector
#   representation of a text message. Element-wise averaging is much easier to
#   do with an array than with a list.
# - Secondly, document vectors are not deterministic, so these vectors are
#   slightly different each time this cell is run.

In [4]:
# Looks random to us, but there is a meaning that the model is learning.
vectors[0]

[array([-1.8864933e-03,  7.5172829e-03,  2.7195187e-03, -9.6630324e-03,
         1.0285481e-02,  9.2690727e-03, -4.8824595e-03, -5.5104955e-03,
        -1.2425218e-02,  9.8774373e-04, -9.0716407e-04,  2.3203853e-03,
         1.1044599e-02,  1.1648269e-02,  1.1151540e-02, -5.3891726e-03,
         7.5852433e-03,  1.1432619e-03, -2.4092987e-03,  5.4307231e-03,
         4.0494613e-03, -9.0388861e-03, -8.6892759e-03, -8.3465257e-04,
        -9.2695502e-04,  1.6014901e-03,  4.4474592e-03,  1.5120900e-03,
         7.7567738e-03, -2.7301288e-03,  1.0844887e-03, -4.4334084e-03,
        -7.6287156e-03,  3.0590678e-03,  4.5597740e-03, -1.7824781e-03,
        -1.0330799e-02, -5.8183079e-03,  4.5669070e-03, -9.5598938e-05,
        -3.4995831e-04, -7.6616248e-03, -1.0572328e-03, -9.6582826e-03,
         2.1980891e-03,  8.2624739e-04,  7.0422245e-03,  5.5585732e-03,
        -7.4954093e-03, -2.8866485e-03], dtype=float32)]