## Basics Of Doc2Vec

In [23]:
import pandas as pd
import gensim
from gensim.models import Doc2Vec
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import train_test_split
# Let text columns show up to 100 characters before pandas truncates them with "...".
pd.options.display.max_colwidth = 100


In [9]:
# Load the SMS dataset and discard the three unnamed filler columns in one
# expression, so re-running this cell always rebuilds the frame from the file.
messagesData = (
    pd.read_csv('spam.csv', encoding='latin-1')
      .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)

In [11]:
# Report the (rows, columns) shape, then preview the first five rows.
print("Shape Of the dataset is : ", messagesData.shape, sep=" ")
messagesData.head(n=5)

Shape Of the dataset is :  (5572, 2)


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [13]:
# Count the messages per label to see how balanced the two classes are.
print("Distribution of Spam and Ham in the dataset are : ")
messagesData.loc[:, 'label'].value_counts()

Distribution of Spam and Ham in the dataset are : 


ham     4825
spam     747
Name: label, dtype: int64

### Text Cleaning

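Tokenize each message with gensim's `simple_preprocess`, which lowercases the text and drops punctuation, digits, and single-character tokens. The resulting token lists are stored in a new `text_clean` column.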

In [18]:
# Tokenize every message; simple_preprocess takes the raw string directly,
# so it can be handed to apply() without a wrapper.
messagesData['text_clean'] = messagesData['text'].apply(simple_preprocess)

In [19]:
# Preview the first five rows to compare the raw text with its token list.
messagesData.head(n=5)

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [26]:
# Split the tokenized messages into train (80%) and test (20%) sets.
# random_state pins the shuffle so the split -- and every model trained on it --
# is the same on each Restart & Run All.
# stratify keeps the ham/spam ratio identical in both partitions, which matters
# here because spam is the minority class (747 of 5572 messages).
X_train, X_test, y_train, y_test = train_test_split(
    messagesData['text_clean'],
    messagesData['label'],
    test_size=0.2,
    random_state=42,
    stratify=messagesData['label'],
)

In [51]:
# Wrap each training token list in a TaggedDocument whose single tag is the
# document's position in X_train, rendered as a string.
tagged_docs = [
    TaggedDocument(tokens, [str(position)])
    for position, tokens in enumerate(X_train)
]

In [55]:
# Peek at the first tagged document to check its words/tags structure.
tagged_docs[0]

TaggedDocument(words=['gud', 'ni', 'dear', 'slp', 'well', 'take', 'care', 'swt', 'dreams', 'muah'], tags=['0'])

In [56]:
# Build a Doc2Vec model from the tagged training documents, using
# 100-dimensional vectors, a context window of 5, and min_count=2.
model_d2v = Doc2Vec(
    documents=tagged_docs,
    vector_size=100,
    window=5,
    min_count=2,
)

In [57]:
# Infer a document vector for an ad-hoc, already-tokenized sentence using the trained model.
# NOTE(review): these tokens bypass simple_preprocess, unlike the training text -- confirm that is intended.
model_d2v.infer_vector(['i','am', 'learning', 'nlp'])

array([-6.15543732e-03,  7.02834735e-03, -2.35414365e-03,  4.24405513e-03,
        3.08401370e-03, -1.07090292e-03,  1.91943184e-03,  4.42718435e-03,
       -7.00676395e-03,  4.38497495e-03,  1.56381782e-02,  4.93112020e-04,
        6.60312548e-03,  9.75890644e-03, -3.62993637e-03,  1.44379528e-03,
       -1.03157246e-02,  1.20594315e-02,  2.40801671e-03, -3.29758483e-03,
        5.55190444e-03,  1.24145678e-04,  2.64947908e-03,  3.12144373e-04,
       -5.47410338e-04,  3.67438816e-03, -2.14254507e-03,  1.75181404e-03,
       -7.45737180e-03,  2.03588349e-03, -1.14017814e-04,  1.07255857e-03,
        9.10566375e-03, -3.80413351e-03, -5.10922819e-03, -3.99098685e-03,
        2.09977245e-03,  2.85463524e-03, -7.01277424e-03,  8.19729734e-03,
       -1.49877975e-03, -2.91218981e-03,  2.70821969e-03, -1.53051992e-03,
       -6.75039273e-03, -3.83164757e-03, -1.96160702e-03, -1.49537373e-04,
        3.93639645e-03,  3.77238076e-03,  1.74163142e-04,  9.65790171e-03,
       -1.02547063e-02,  

In [59]:
# Infer one Doc2Vec vector per test document; each element of X_test is a
# whole token list, not a single word.
# NOTE(review): every vector is wrapped in its own one-element list, so
# `vectors` nests as documents -> [array]. This likely needs flattening before
# it is passed to an estimator that expects 2-D input -- confirm against the
# downstream cells.
vectors = [
    [model_d2v.infer_vector(tokens)]
    for tokens in X_test
]

In [64]:
# Show the first test-set entry: a one-element list wrapping the inferred vector.
vectors[0]

[array([-5.3671678e-03,  5.1020185e-04, -4.0908796e-03,  9.0921402e-04,
         4.0739886e-03, -4.7100205e-03, -3.7805827e-03,  3.4590655e-03,
        -1.8986421e-02,  9.7225793e-03,  1.0686611e-02,  3.4462474e-03,
         1.2066212e-03,  4.6618679e-03,  1.5893063e-03, -2.2070017e-03,
        -1.0551234e-02,  1.2079314e-02, -1.3269404e-03,  1.7862121e-03,
         4.4846102e-03, -9.2857977e-04,  3.2806767e-03, -1.8103214e-03,
         5.8145646e-04,  5.4084095e-03, -7.1187275e-03,  7.4259588e-03,
        -1.9709599e-04,  4.4738976e-03, -1.8609380e-03, -4.1548279e-03,
         6.9660009e-03, -1.2002401e-02,  1.5237051e-03, -6.7705852e-03,
         9.4192292e-05,  4.8808581e-03, -4.0833391e-03,  4.8780679e-03,
        -4.7803614e-03, -7.4840141e-03,  2.7903200e-03, -5.0829852e-04,
        -4.9716174e-03, -8.1258826e-03,  5.1625711e-03, -4.2339740e-03,
         8.4361713e-03,  7.5110891e-03,  6.0809311e-04,  1.2092414e-02,
        -5.0695315e-03, -5.4128296e-03,  3.5129662e-03, -1.08819