<a href="https://colab.research.google.com/github/Preranakh/800LevelProject/blob/main/doc2vec%3A%20How%20To%20Prep%20Document%20Vectors%20For%20Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# doc2vec: How To Prep Document Vectors For Modeling

### Train Our Own Model

In [None]:
# Read in data, clean it, split it into train/test, and then train a doc2vec model
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
# Show up to 100 characters per column when a DataFrame is displayed
pd.set_option('display.max_colwidth', 100)

# Single seed shared by the train/test split and the doc2vec model so that
# Restart & Run All produces the same split on every run.
SEED = 42

# Load the raw file, drop the "Unnamed: 2/3/4" columns, and rename the two
# columns that remain to "label" and "text".
messages = pd.read_csv('spam.csv', encoding='latin-1')
messages = messages.drop(labels=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
messages.columns = ["label", "text"]

# Tokenize every message with gensim's simple_preprocess (one token list per row)
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

# Tag each training document with its position in X_train (0..n-1); these
# tags are positions within the training split, not the DataFrame index.
tagged_docs_tr = [gensim.models.doc2vec.TaggedDocument(tokens, [idx])
                  for idx, tokens in enumerate(X_train)]

# Train 50-dimensional document vectors on the tagged training documents.
# NOTE: gensim's documentation states that fully deterministic training also
# requires workers=1 (and a fixed PYTHONHASHSEED); the worker count is left
# at its default here, so vectors may still vary slightly between runs.
d2v_model = gensim.models.Doc2Vec(tagged_docs_tr,
                                  vector_size=50,
                                  window=2,
                                  min_count=2,
                                  seed=SEED)



In [None]:
# Reminder of what a document vector looks like: infer one for a short,
# already-tokenized example phrase and display it as the cell output.
sample_tokens = ['convert', 'words', 'to', 'vectors']
d2v_model.infer_vector(sample_tokens)

array([ 0.00151944,  0.01132164,  0.03140726, -0.00106518, -0.02262434,
        0.00023972,  0.03444552, -0.00729175,  0.00968945,  0.0086614 ,
       -0.01126313,  0.01954444, -0.00801179,  0.01314732, -0.02157699,
        0.02634576, -0.00615448, -0.01450423, -0.04639507, -0.00788815,
       -0.00713077, -0.00187751, -0.01582085,  0.00790736, -0.00328182,
        0.00675943,  0.01246577,  0.00300645, -0.01500604, -0.00033372,
        0.00301996, -0.01699701,  0.00275437, -0.01818444,  0.00390828,
        0.0195324 ,  0.02644457,  0.02060623,  0.01175738,  0.00793391,
        0.00034175, -0.00802218, -0.00554608,  0.01175212,  0.0055935 ,
       -0.00954805,  0.00841827, -0.02477973,  0.00583086, -0.00660794],
      dtype=float32)

In [None]:
# Prepare model inputs: infer a document vector for every tokenized test message.
# NOTE(review): each entry is a one-element list wrapping the inferred array,
# so `vectors` is a list of [array] items — confirm downstream code expects this.
vectors = []
for tokens in X_test:
    vectors.append([d2v_model.infer_vector(tokens)])

In [None]:
# Display the first entry of `vectors`: a one-element list holding the inferred
# vector for the first test message.
vectors[0]

[array([ 0.00783376,  0.00775851,  0.02024805,  0.0049935 , -0.00150692,
        -0.00203217,  0.01880531,  0.00567714,  0.00793545,  0.00866023,
        -0.01281821,  0.01232355, -0.02073684,  0.00752395, -0.01973525,
         0.0174582 , -0.01828633, -0.0167623 , -0.02213873,  0.00389899,
         0.00523663,  0.01536538, -0.00166003,  0.01063706, -0.00159474,
         0.0028676 ,  0.02435752, -0.00245815, -0.01784284,  0.00704234,
         0.00260431,  0.00143783,  0.00879969, -0.02413778,  0.00830294,
         0.0102398 ,  0.02272806,  0.00852937,  0.01124532,  0.01234766,
        -0.00562462, -0.00117927,  0.00712122,  0.01221628, -0.00363981,
         0.00792707, -0.00399919, -0.0162257 ,  0.00325484, -0.00716213],
       dtype=float32)]