In [1]:
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('spam.csv',encoding='latin-1')

In [3]:
df.shape

(5572, 5)

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [6]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)

In [7]:
df.columns = ['label','text']

In [8]:
df['text_clean'] = df['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

In [9]:
df.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."


In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['text_clean'],
                                                    df['label'], test_size=0.2)

In [11]:
# Create tagged document objects to prepare to train the model
tagged_docs = [gensim.models.doc2vec.TaggedDocument(v,[i]) for i,v in enumerate(X_train)]

Here we are using index for the tag. It should generally be a topic related tag. We can later use it for topic modelling like clustering the same documents.

In [12]:
X_train.shape

(4457,)

In [13]:
len(tagged_docs)

4457

In [14]:
tagged_docs[0]

TaggedDocument(words=['haha', 'good', 'to', 'hear', 'officially', 'paid', 'and', 'on', 'the', 'market', 'for', 'an', 'th'], tags=[0])

In [15]:
d2v_model = gensim.models.Doc2Vec(tagged_docs,vector_size=100, window= 5 )

In [16]:
#It will not work on a single string
d2v_model.infer_vector('King')

TypeError: Parameter doc_words of infer_vector() must be a list of strings (not a single string).

In [17]:
d2v_model.infer_vector(['King','is','dead','.'])

array([-0.00983823,  0.01675   ,  0.01327844, -0.00272052,  0.00229653,
       -0.03447121,  0.00473601,  0.04951475, -0.02171814, -0.02437157,
       -0.02791663, -0.0441206 ,  0.00753125,  0.01444491,  0.01090808,
       -0.02164501,  0.01657606, -0.02524606,  0.00127753, -0.04965498,
        0.01355841,  0.01531728,  0.01928653, -0.01424606, -0.00648841,
       -0.00340772, -0.01875123, -0.02562121, -0.01504201, -0.01743628,
        0.02807012,  0.01454458,  0.0125759 , -0.01275914, -0.01002979,
        0.02788577,  0.0080764 , -0.02236235, -0.01491709, -0.0418215 ,
        0.00022152, -0.02671449, -0.00338701, -0.0036188 ,  0.01570048,
       -0.00643619, -0.02201773, -0.00658422,  0.0050827 ,  0.00860386,
        0.02234035, -0.01160616, -0.006371  ,  0.00135769, -0.01110563,
        0.01236768,  0.00636389,  0.00095394, -0.02935318,  0.00777448,
        0.0077316 ,  0.00521949, -0.00421635, -0.00085605, -0.03412816,
        0.03138021,  0.01873833,  0.01993297, -0.03335539,  0.02

In [19]:
print(len(d2v_model.infer_vector(['King','is','dead','.'])))
print(type(d2v_model.infer_vector(['King','is','dead','.'])))

100
<class 'numpy.ndarray'>


In [24]:
test_vectors = [ [d2v_model.infer_vector(words)] for words in X_test]

In [26]:
type(test_vectors)

list