<a href="https://colab.research.google.com/github/Venture-Coding/Linkedin_Learning/blob/main/NLP/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# word2vec: Prep Word Vectors For Modeling

### Train Our Own Model

In [14]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv('spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]


messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2)

w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

### Prep Word Vectors

In [15]:
# Generate a list of words the word2vec model learned word vectors for
w2v_model.wv.index2word[:50]

['to',
 'you',
 'the',
 'and',
 'is',
 'in',
 'me',
 'it',
 'my',
 'for',
 'your',
 'of',
 'call',
 'have',
 'that',
 'now',
 'on',
 'are',
 'can',
 'not',
 'so',
 'but',
 'do',
 'or',
 'at',
 'we',
 'get',
 'be',
 'if',
 'will',
 'ur',
 'no',
 'with',
 'just',
 'this',
 'up',
 'gt',
 'lt',
 'when',
 'how',
 'free',
 'what',
 'from',
 'ok',
 'go',
 'all',
 'out',
 'll',
 'know',
 'good']

In [16]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence
w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index2word])
                     for ls in X_test])

  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(w2v_vect):
    print(len(X_test.iloc[i]), len(v))

3 3
6 6
26 26
8 8
16 12
4 3
18 15
10 9
7 6
15 14
14 10
6 6
5 3
31 31
7 7
23 23
2 2
60 60
5 5
18 16
19 19
6 6
7 6
7 5
5 3
20 20
20 16
4 4
26 20
13 12
22 22
4 4
6 6
28 25
11 11
13 9
5 5
8 8
27 25
10 10
5 5
4 3
4 4
18 18
9 8
18 17
26 25
10 9
7 7
11 10
8 8
22 22
19 19
24 20
14 13
25 24
2 0
9 9
10 9
27 26
15 14
5 3
6 6
14 14
16 12
6 6
1 1
23 16
11 10
21 21
8 8
8 8
12 12
21 21
31 27
5 5
16 16
24 23
25 25
5 4
6 6
13 12
6 4
7 7
23 21
16 15
13 12
6 6
16 15
2 2
24 23
8 7
5 3
6 6
4 4
4 3
24 20
8 7
9 9
12 12
26 26
9 9
4 4
24 23
4 4
14 14
9 9
26 23
6 5
11 8
6 5
21 20
13 10
17 16
9 7
26 26
6 6
5 5
10 10
24 19
7 6
42 39
20 18
20 17
8 8
6 5
16 11
7 7
8 8
4 4
14 14
36 34
15 12
9 9
9 9
38 36
11 11
5 4
21 21
23 7
10 10
5 5
8 8
10 9
4 4
22 21
8 6
7 7
9 8
5 5
8 8
13 12
22 20
4 4
12 11
7 5
2 2
23 23
15 15
19 19
26 26
22 20
32 31
5 3
7 6
12 12
21 21
19 19
21 21
3 3
21 20
6 6
8 7
23 23
29 26
39 29
34 32
6 6
22 21
7 5
20 18
14 14
8 6
8 8
9 7
21 21
0 0
5 4
33 28
11 11
6 5
27 26
7 7
23 23
6 5
17 17
15 15
4 3
21 

In [18]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
w2v_vect_avg = []

for vect in w2v_vect:
    if len(vect)!=0:
        w2v_vect_avg.append(vect.mean(axis=0))
    else:
        w2v_vect_avg.append(np.zeros(100))

In [19]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(w2v_vect_avg):
  if i < 30:
    print(len(X_test.iloc[i]), len(v))

3 100
6 100
26 100
8 100
16 100
4 100
18 100
10 100
7 100
15 100
14 100
6 100
5 100
31 100
7 100
23 100
2 100
60 100
5 100
18 100
19 100
6 100
7 100
7 100
5 100
20 100
20 100
4 100
26 100
13 100


Yup, we do have all of them consistent.