In [3]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

from nltk.corpus import stopwords

from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

%matplotlib inline



In [5]:
import csv
import os
import pandas as pd
data = pd.read_csv('D:\\Soong\\Soong\\corpus\\CNNNewsDataset.csv')

In [6]:
data.head()

Unnamed: 0,title,content
0,Claxton hunting first major medal\n,"For the first time, Claxton has only been prep..."
1,O'Sullivan could run in Worlds\n,Athletics Ireland have hinted that the 35-year...
2,Greene sets sights on world title\n,"""I just felt like I was running all alone. ""I ..."
3,IAAF launches fight against drugs\n,Two task forces have been set up to examine do...
4,"Dibaba breaks 5,000m world record\n",Dibaba won in 14 minutes 32.93 seconds to eras...


In [8]:
# get stop words from nltk
stopWords = stopwords.words('english')

# pre processing data
def cleanData(sentence):
    processedList = ""
    
    # convert to lowercase, ignore all special characters - keep only alpha-numericals and spaces (not removing full-stop here)
    sentence = re.sub(r'[^A-Za-z0-9\s.]',r'',str(sentence).lower())
    sentence = re.sub(r'\n',r' ',sentence)
    
    # remove stop words
    sentence = " ".join([word for word in sentence.split() if word not in stopWords])
    
    return sentence

In [9]:
# clean data 
data['content'] = data['content'].map(lambda x: cleanData(x))

In [10]:
data.head()

Unnamed: 0,title,content
0,Claxton hunting first major medal\n,first time claxton preparing campaign hurdles ...
1,O'Sullivan could run in Worlds\n,athletics ireland hinted 35yearold cobh runner...
2,Greene sets sights on world title\n,felt like running alone. believe middle race w...
3,IAAF launches fight against drugs\n,two task forces set examine doping nutrition i...
4,"Dibaba breaks 5,000m world record\n",dibaba 14 minutes 32.93 seconds erase previous...


In [12]:
Tcorpus = data['content'].map(lambda x: x.split('.'))

In [13]:
# Tqdm 是一个快速，可扩展的Python进度条，可以在 Python 长循环中添加一个进度提示信息，
#corpus 包括了所有的语料库的单词
corpus = []
for i in tqdm(range(len(Tcorpus))):
    for line in Tcorpus[i]:
        words = [x for x in line.split()]
        corpus.append(words)

100%|███████████████████████████████████████████████████████████████████████████| 1824/1824 [00:00<00:00, 40618.49it/s]


In [14]:
#获取语料库中句子，单词的长度信息
num_of_sentences = len(corpus)
num_of_words = 0
for line in corpus:
    num_of_words += len(line)

print('Num of sentences - %s'%(num_of_sentences))
print('Num of words - %s'%(num_of_words))

Num of sentences - 13964
Num of words - 133824


In [15]:
phrases = Phrases(sentences=corpus,min_count=25,threshold=50)
bigram = Phraser(phrases)

2019-07-21 15:05:12,811 : INFO : collecting all words and their counts
2019-07-21 15:05:12,812 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-07-21 15:05:12,945 : INFO : PROGRESS: at sentence #10000, processed 95534 words and 85009 word types
2019-07-21 15:05:13,000 : INFO : collected 116446 word types from a corpus of 133824 words (unigram + bigrams) and 13964 sentences
2019-07-21 15:05:13,001 : INFO : using 116446 counts as vocab in Phrases<0 vocab, min_count=25, threshold=50, max_vocab_size=40000000>
2019-07-21 15:05:13,002 : INFO : source_vocab length 116446
2019-07-21 15:05:13,779 : INFO : Phraser built with 20 20 phrasegrams


In [16]:
for index,sentence in enumerate(corpus):
    corpus[index] = bigram[sentence]

In [17]:
# shuffle corpus
def shuffle_corpus(sentences):
    shuffled = list(sentences)
    random.shuffle(shuffled)
    return shuffled

# 100dimension

In [18]:
# sg - skip gram |  window = size of the window | size = vector dimension
size = 100
window_size = 2 # sentences weren't too long, so
epochs = 100
min_count = 2
workers = 4

# train word2vec model using gensim
model = Word2Vec(corpus, sg=1,window=window_size,size=size,
                 min_count=min_count,workers=workers,iter=epochs,sample=0.01)

2019-07-21 15:06:57,029 : INFO : collecting all words and their counts
2019-07-21 15:06:57,030 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-07-21 15:06:57,048 : INFO : PROGRESS: at sentence #10000, processed 94754 words, keeping 15044 word types
2019-07-21 15:06:57,059 : INFO : collected 18547 word types from a corpus of 132827 raw words and 13964 sentences
2019-07-21 15:06:57,060 : INFO : Loading a fresh vocabulary
2019-07-21 15:06:57,082 : INFO : min_count=2 retains 10254 unique words (55% of original 18547, drops 8293)
2019-07-21 15:06:57,083 : INFO : min_count=2 leaves 124534 word corpus (93% of original 132827, drops 8293)
2019-07-21 15:06:57,105 : INFO : deleting the raw counts dictionary of 18547 items
2019-07-21 15:06:57,106 : INFO : sample=0.01 downsamples 0 most-common words
2019-07-21 15:06:57,107 : INFO : downsampling leaves estimated 124534 word corpus (100.0% of prior 124534)
2019-07-21 15:06:57,129 : INFO : estimated required memory for

2019-07-21 15:06:59,419 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:06:59,426 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:06:59,427 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:06:59,443 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:06:59,444 : INFO : EPOCH - 15 : training on 132827 raw words (124534 effective words) took 0.2s, 789216 effective words/s
2019-07-21 15:06:59,574 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:06:59,579 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:06:59,582 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:06:59,591 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:06:59,592 : INFO : EPOCH - 16 : training on 132827 raw words (124534 effective words) took 0.1s, 904397 effective words/s


2019-07-21 15:07:01,773 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:07:01,775 : INFO : EPOCH - 31 : training on 132827 raw words (124534 effective words) took 0.2s, 820446 effective words/s
2019-07-21 15:07:01,943 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:07:01,947 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:07:01,950 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:07:01,967 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:07:01,968 : INFO : EPOCH - 32 : training on 132827 raw words (124534 effective words) took 0.2s, 683793 effective words/s
2019-07-21 15:07:02,135 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:07:02,138 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:07:02,141 : INFO : worker thread finished; awaiting finish of 1 more threads


2019-07-21 15:07:04,359 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:07:04,367 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:07:04,370 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:07:04,381 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:07:04,381 : INFO : EPOCH - 48 : training on 132827 raw words (124534 effective words) took 0.1s, 928290 effective words/s
2019-07-21 15:07:04,509 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:07:04,510 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:07:04,511 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:07:04,526 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:07:04,527 : INFO : EPOCH - 49 : training on 132827 raw words (124534 effective words) took 0.1s, 925746 effective words/s


2019-07-21 15:07:06,518 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:07:06,519 : INFO : EPOCH - 64 : training on 132827 raw words (124534 effective words) took 0.1s, 1019888 effective words/s
2019-07-21 15:07:06,629 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:07:06,630 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:07:06,637 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:07:06,647 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:07:06,648 : INFO : EPOCH - 65 : training on 132827 raw words (124534 effective words) took 0.1s, 1037995 effective words/s
2019-07-21 15:07:06,757 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:07:06,761 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:07:06,767 : INFO : worker thread finished; awaiting finish of 1 more thread

2019-07-21 15:07:08,794 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:07:08,796 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:07:08,800 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:07:08,806 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:07:08,808 : INFO : EPOCH - 81 : training on 132827 raw words (124534 effective words) took 0.1s, 970539 effective words/s
2019-07-21 15:07:08,921 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:07:08,925 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:07:08,926 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:07:08,945 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:07:08,946 : INFO : EPOCH - 82 : training on 132827 raw words (124534 effective words) took 0.1s, 969711 effective words/s


2019-07-21 15:07:10,952 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:07:10,953 : INFO : EPOCH - 97 : training on 132827 raw words (124534 effective words) took 0.1s, 894744 effective words/s
2019-07-21 15:07:11,063 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:07:11,065 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:07:11,070 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:07:11,084 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:07:11,085 : INFO : EPOCH - 98 : training on 132827 raw words (124534 effective words) took 0.1s, 1015902 effective words/s
2019-07-21 15:07:11,214 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:07:11,216 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:07:11,220 : INFO : worker thread finished; awaiting finish of 1 more threads

In [19]:
# save model
model.save('D:\\Soong\\Soong\\corpus\\CNNNewsDatasetEmbedding\\CNNword2vec_model_100')

2019-07-21 15:08:53,275 : INFO : saving Word2Vec object under D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_100, separately None
2019-07-21 15:08:53,276 : INFO : not storing attribute vectors_norm
2019-07-21 15:08:53,277 : INFO : not storing attribute cum_table
2019-07-21 15:08:53,374 : INFO : saved D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_100


In [20]:
# load word2vec model
model = Word2Vec.load('D:\\Soong\\Soong\\corpus\\CNNNewsDatasetEmbedding\\CNNword2vec_model_100')

2019-07-21 15:09:12,662 : INFO : loading Word2Vec object from D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_100
2019-07-21 15:09:12,764 : INFO : loading wv recursively from D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_100.wv.* with mmap=None
2019-07-21 15:09:12,765 : INFO : setting ignored attribute vectors_norm to None
2019-07-21 15:09:12,766 : INFO : loading vocabulary recursively from D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_100.vocabulary.* with mmap=None
2019-07-21 15:09:12,766 : INFO : loading trainables recursively from D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_100.trainables.* with mmap=None
2019-07-21 15:09:12,767 : INFO : setting ignored attribute cum_table to None
2019-07-21 15:09:12,768 : INFO : loaded D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_100


In [22]:
model.wv['hotel']

array([-1.56256612e-02,  2.12111399e-01, -1.79761022e-01,  1.86606914e-01,
       -4.99335617e-01, -1.66405573e-01,  7.84050882e-01, -5.50614178e-01,
        3.09597790e-01,  4.45861183e-02,  6.49209440e-01, -2.52457529e-01,
       -1.98725745e-01, -1.79063082e-01,  3.62796545e-01, -1.30399340e-03,
        1.60638377e-01,  1.19059753e+00,  4.71335083e-01,  4.94677156e-01,
       -1.87464997e-01, -1.31527472e+00, -2.06182867e-01, -5.29919565e-02,
       -6.52990997e-01,  3.55300963e-01, -2.90789366e-01,  6.62089229e-01,
       -1.32811397e-01,  2.23098770e-01, -2.90507466e-01, -2.04251017e-02,
       -8.02846432e-01,  6.08568966e-01, -2.39825889e-01,  1.81732178e-02,
        1.23858607e+00, -7.18100905e-01, -1.08887248e-01,  5.82530677e-01,
       -1.08308387e+00,  6.57818735e-01, -9.70660523e-02, -6.88938677e-01,
       -1.22457691e-01,  6.78172112e-01, -8.59439671e-01,  2.79198825e-01,
        1.04719363e-01,  4.10822809e-01,  6.61870420e-01,  1.19430649e+00,
        9.78288352e-01, -

# 300dimension

In [23]:
# sg - skip gram |  window = size of the window | size = vector dimension
size = 300
window_size = 2 # sentences weren't too long, so
epochs = 100
min_count = 2
workers = 4

# train word2vec model using gensim
model = Word2Vec(corpus, sg=1,window=window_size,size=size,
                 min_count=min_count,workers=workers,iter=epochs,sample=0.01)

2019-07-21 15:10:50,268 : INFO : collecting all words and their counts
2019-07-21 15:10:50,269 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-07-21 15:10:50,287 : INFO : PROGRESS: at sentence #10000, processed 94754 words, keeping 15044 word types
2019-07-21 15:10:50,295 : INFO : collected 18547 word types from a corpus of 132827 raw words and 13964 sentences
2019-07-21 15:10:50,296 : INFO : Loading a fresh vocabulary
2019-07-21 15:10:50,316 : INFO : min_count=2 retains 10254 unique words (55% of original 18547, drops 8293)
2019-07-21 15:10:50,317 : INFO : min_count=2 leaves 124534 word corpus (93% of original 132827, drops 8293)
2019-07-21 15:10:50,343 : INFO : deleting the raw counts dictionary of 18547 items
2019-07-21 15:10:50,344 : INFO : sample=0.01 downsamples 0 most-common words
2019-07-21 15:10:50,344 : INFO : downsampling leaves estimated 124534 word corpus (100.0% of prior 124534)
2019-07-21 15:10:50,366 : INFO : estimated required memory for

2019-07-21 15:10:53,949 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:10:53,951 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:10:53,958 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:10:53,982 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:10:53,983 : INFO : EPOCH - 15 : training on 132827 raw words (124534 effective words) took 0.2s, 617344 effective words/s
2019-07-21 15:10:54,189 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:10:54,198 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:10:54,199 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:10:54,224 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:10:54,225 : INFO : EPOCH - 16 : training on 132827 raw words (124534 effective words) took 0.2s, 536087 effective words/s


2019-07-21 15:10:57,723 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:10:57,724 : INFO : EPOCH - 31 : training on 132827 raw words (124534 effective words) took 0.2s, 608961 effective words/s
2019-07-21 15:10:57,916 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:10:57,917 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:10:57,923 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:10:57,948 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:10:57,949 : INFO : EPOCH - 32 : training on 132827 raw words (124534 effective words) took 0.2s, 577682 effective words/s
2019-07-21 15:10:58,148 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:10:58,155 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:10:58,159 : INFO : worker thread finished; awaiting finish of 1 more threads


2019-07-21 15:11:01,487 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:11:01,493 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:11:01,495 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:11:01,524 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:11:01,525 : INFO : EPOCH - 48 : training on 132827 raw words (124534 effective words) took 0.2s, 557398 effective words/s
2019-07-21 15:11:01,727 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:11:01,737 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:11:01,739 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:11:01,761 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:11:01,762 : INFO : EPOCH - 49 : training on 132827 raw words (124534 effective words) took 0.2s, 549262 effective words/s


2019-07-21 15:11:04,949 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:11:04,950 : INFO : EPOCH - 64 : training on 132827 raw words (124534 effective words) took 0.2s, 647068 effective words/s
2019-07-21 15:11:05,123 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:11:05,128 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:11:05,130 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:11:05,153 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:11:05,154 : INFO : EPOCH - 65 : training on 132827 raw words (124534 effective words) took 0.2s, 640378 effective words/s
2019-07-21 15:11:05,321 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:11:05,326 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:11:05,329 : INFO : worker thread finished; awaiting finish of 1 more threads


2019-07-21 15:11:08,464 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:11:08,471 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:11:08,474 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:11:08,498 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:11:08,499 : INFO : EPOCH - 81 : training on 132827 raw words (124534 effective words) took 0.2s, 595874 effective words/s
2019-07-21 15:11:08,668 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:11:08,674 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:11:08,675 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:11:08,699 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:11:08,700 : INFO : EPOCH - 82 : training on 132827 raw words (124534 effective words) took 0.2s, 647564 effective words/s


2019-07-21 15:11:11,752 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:11:11,753 : INFO : EPOCH - 97 : training on 132827 raw words (124534 effective words) took 0.2s, 693266 effective words/s
2019-07-21 15:11:11,920 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:11:11,925 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:11:11,931 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-07-21 15:11:11,941 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-07-21 15:11:11,942 : INFO : EPOCH - 98 : training on 132827 raw words (124534 effective words) took 0.2s, 694557 effective words/s
2019-07-21 15:11:12,106 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-07-21 15:11:12,107 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-07-21 15:11:12,109 : INFO : worker thread finished; awaiting finish of 1 more threads


In [24]:
# save model
model.save('D:\\Soong\\Soong\\corpus\\CNNNewsDatasetEmbedding\\CNNword2vec_model_300')

2019-07-21 15:11:20,242 : INFO : saving Word2Vec object under D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_300, separately None
2019-07-21 15:11:20,243 : INFO : not storing attribute vectors_norm
2019-07-21 15:11:20,244 : INFO : not storing attribute cum_table
2019-07-21 15:11:20,982 : INFO : saved D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_300


In [28]:
# load word2vec model
model = Word2Vec.load('D:\\Soong\\Soong\\corpus\\CNNNewsDatasetEmbedding\\CNNword2vec_model_300')

2019-07-21 15:12:08,937 : INFO : loading Word2Vec object from D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_300
2019-07-21 15:12:09,101 : INFO : loading wv recursively from D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_300.wv.* with mmap=None
2019-07-21 15:12:09,102 : INFO : setting ignored attribute vectors_norm to None
2019-07-21 15:12:09,103 : INFO : loading vocabulary recursively from D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_300.vocabulary.* with mmap=None
2019-07-21 15:12:09,104 : INFO : loading trainables recursively from D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_300.trainables.* with mmap=None
2019-07-21 15:12:09,105 : INFO : setting ignored attribute cum_table to None
2019-07-21 15:12:09,106 : INFO : loaded D:\Soong\Soong\corpus\CNNNewsDatasetEmbedding\CNNword2vec_model_300


In [29]:
model.wv['beautiful']

array([ 1.04130007e-01, -3.65296036e-01,  2.51829088e-01,  1.28041446e-01,
       -6.06811464e-01, -8.66407454e-02, -4.98714179e-01,  2.11960062e-01,
       -2.91960146e-02,  4.39124890e-02, -1.16787285e-01, -9.89213660e-02,
        1.33559525e-01, -4.15771812e-01, -3.04663070e-02, -1.02605879e-01,
        2.92204618e-01,  5.67032874e-01, -2.90404290e-01, -3.51323605e-01,
       -5.10966301e-01, -1.01204820e-01, -2.83170998e-01,  3.67487371e-02,
       -4.23017554e-02,  3.71706575e-01, -4.59306479e-01,  2.15956047e-01,
        2.39092088e-03, -3.15188840e-02,  1.04260869e-01, -6.04436025e-02,
        7.01874611e-04, -5.61440229e-01,  2.33381078e-01,  8.62297416e-02,
        1.51361987e-01, -3.62693042e-01,  3.53941531e-03, -1.51383609e-01,
       -3.68845742e-03,  2.13216469e-01,  2.37182394e-01, -1.71619549e-01,
        2.58115977e-01, -5.00950336e-01,  1.49099693e-01, -2.76902646e-01,
       -4.19374183e-02,  2.57023033e-02,  1.19096331e-01,  1.67256799e-02,
       -3.08671832e-01, -