<a href="https://colab.research.google.com/github/Venture-Coding/Linkedin_Learning/blob/main/NLP/doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# doc2vec: Prep Document Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in data, clean it, split it into train/test, and then train a doc2vec model
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
# Show up to 100 characters per column so message text is readable in previews.
pd.options.display.max_colwidth = 100

# Load the raw CSV (decoded as latin-1) and preview the first rows.
messages = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,


In [2]:
# Clean the raw frame, split it into train/test, and train a doc2vec model.

# One seed shared by the split and the model so results can be reproduced.
# NOTE(review): gensim documents that fully deterministic training also needs
# a single worker thread and a fixed PYTHONHASHSEED -- confirm if exact
# run-to-run equality of the vectors is required.
SEED = 42

# Drop the unused trailing columns and give the remaining two readable names.
# `errors="ignore"` plus a name-based `rename` (rather than assigning a
# two-item list to `.columns`) keeps this cell safe to re-run after `messages`
# has already been cleaned and gained the `text_clean` column.
messages = (
    messages
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "label", "v2": "text"})
)

# Tokenize every message with gensim's simple_preprocess.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages for testing.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

# Each training document is tagged with its position in X_train.
tagged_docs_tr = [gensim.models.doc2vec.TaggedDocument(tokens, [idx])
                  for idx, tokens in enumerate(X_train)]

d2v_model = gensim.models.Doc2Vec(tagged_docs_tr,
                                  vector_size=50,
                                  window=2,
                                  min_count=2,
                                  seed=SEED)

In [3]:
# Reminder of what a document vector looks like: infer one for a short,
# already-tokenized example sentence.
example_tokens = ['convert', 'words', 'to', 'vectors']
d2v_model.infer_vector(example_tokens)

array([-0.00412556,  0.01186739, -0.01328432,  0.0120802 ,  0.01250968,
       -0.00693019,  0.00597198,  0.01932476,  0.01145053, -0.00652549,
        0.01859386,  0.0086034 ,  0.00553505,  0.00817771, -0.01272245,
        0.00758645, -0.00031357,  0.009814  , -0.00713333, -0.0105396 ,
       -0.0022726 ,  0.01486406,  0.00603506,  0.00583262,  0.00992863,
       -0.0027992 , -0.00962885, -0.00443396,  0.01463754, -0.00441251,
        0.00905803, -0.01508413, -0.01684954, -0.00693312, -0.00899978,
       -0.00354817, -0.00183986, -0.00364084, -0.01269242, -0.01931383,
       -0.00709434, -0.02532249, -0.00351395,  0.01385808, -0.01524033,
       -0.00355795,  0.0175324 ,  0.001019  ,  0.00845837, -0.00021805],
      dtype=float32)

In [4]:
# Prepare vectors for a machine learning model: infer one vector per
# tokenized test document, in X_test order.
# NOTE(review): every vector is wrapped in its own one-element list, so
# `vectors` is a list of `[array]` entries rather than a flat list of arrays;
# presumably this needs flattening before being passed to an estimator --
# confirm against the downstream modelling code.
vectors = []
for doc_tokens in X_test:
    vectors.append([d2v_model.infer_vector(doc_tokens)])

In [5]:
# Inspect the first entry: a one-element list holding the vector inferred
# for the first document in X_test.
vectors[0]

[array([ 0.0208026 ,  0.11048983, -0.01253386,  0.04126347,  0.01871553,
        -0.00270041,  0.03049139,  0.08216374,  0.01014216, -0.01790841,
         0.07473028,  0.01102621, -0.01920313,  0.01651726, -0.031496  ,
        -0.00206541,  0.02457395,  0.04318288, -0.00089882, -0.10173186,
        -0.01984756,  0.04450674,  0.00857184, -0.00938395,  0.01071824,
         0.0023073 , -0.06346518,  0.00948959,  0.08795197, -0.00359903,
         0.06724188, -0.06668112, -0.07647999, -0.00305523,  0.00206878,
        -0.02936345, -0.0154579 , -0.00300377, -0.05961602, -0.14162768,
         0.01364166, -0.08451587, -0.0202428 ,  0.03441074, -0.02783517,
         0.02623395,  0.07394551, -0.02555393,  0.04044015,  0.03027109],
       dtype=float32)]