In [100]:
sent1 = 'The cat is walking in the bedroom'

In [101]:
sent2 = 'A dog was running across the kitchen'

In [102]:
from sklearn.feature_extraction.text import CountVectorizer

In [103]:
count_vec = CountVectorizer()

In [104]:
sentences = [sent1,sent2]

In [105]:
print count_vec.fit_transform(sentences).toarray()

[[0 1 1 0 1 1 0 0 2 1 0]
 [1 0 0 1 0 0 1 1 1 0 1]]


In [106]:
print count_vec.get_feature_names()

[u'across', u'bedroom', u'cat', u'dog', u'in', u'is', u'kitchen', u'running', u'the', u'walking', u'was']


In [107]:
import nltk

In [108]:
# 对句子进行词汇分割和正规化

In [109]:
tokens_1 = nltk.word_tokenize(sent1)
print tokens_1

['The', 'cat', 'is', 'walking', 'in', 'the', 'bedroom']


In [110]:
tokens_2 = nltk.word_tokenize(sent2)

In [111]:
print tokens_2

['A', 'dog', 'was', 'running', 'across', 'the', 'kitchen']


In [112]:
vocab_1 = sorted(set(tokens_1))

In [113]:
print vocab_1

['The', 'bedroom', 'cat', 'in', 'is', 'the', 'walking']


In [114]:
vocab_2 = sorted(set(tokens_2))

In [115]:
print vocab_2

['A', 'across', 'dog', 'kitchen', 'running', 'the', 'was']


In [116]:
# 初始化stemmer寻找各个词汇最原始的词根

In [117]:
stemmer = nltk.stem.PorterStemmer()

In [118]:
stem_1 = [stemmer.stem(t) for t in tokens_1]

In [119]:
print stem_1

['the', 'cat', 'is', u'walk', 'in', 'the', 'bedroom']


In [120]:
stem_2 = [stemmer.stem(t) for t in tokens_2]

In [121]:
print stem_2

['A', 'dog', u'wa', u'run', u'across', 'the', 'kitchen']


In [122]:
# 初始化词性标注器，对每个词汇进行标注

In [123]:
pos_tag_1 = nltk.tag.pos_tag(tokens_1)

In [124]:
print pos_tag_1

[('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('walking', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('bedroom', 'NN')]


In [125]:
pos_tag_2 = nltk.tag.pos_tag(tokens_2)

In [126]:
print pos_tag_2

[('A', 'DT'), ('dog', 'NN'), ('was', 'VBD'), ('running', 'VBG'), ('across', 'IN'), ('the', 'DT'), ('kitchen', 'NN')]


In [127]:
from sklearn.datasets import  fetch_20newsgroups

In [128]:
news = fetch_20newsgroups(subset="all")

In [129]:
X,y = news.data,news.target

In [130]:
print news.target_names

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [131]:
# 从bs4导入BeautifulSoup

In [132]:
from bs4 import  BeautifulSoup

In [133]:
import nltk,re

In [134]:
# 定义一个名为news_to_sentences的函数，将每条新闻中的句子逐一剥离出来，并返回一个句子的列表

In [151]:
def news_to_sentences(news):
    """Split one news post into sentences of lower-cased alphabetic tokens.

    The post is run through BeautifulSoup to drop markup, segmented into
    sentences with NLTK's pre-trained English Punkt tokenizer, and each
    sentence is then broken into words.

    Args:
        news: raw text of a single post.

    Returns:
        A list with one entry per detected sentence.  Each entry is a list
        of lower-case tokens consisting only of the letters a-z; a sentence
        that contains no letters yields an empty list.
    """
    # Name the parser explicitly: with no argument BeautifulSoup picks
    # whichever parser happens to be installed (so results can differ
    # between machines) and emits a warning about the guess.
    news_text = BeautifulSoup(news, 'html.parser').get_text()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(news_text)
    # Every non-letter character is replaced by a space, so digits and
    # punctuation act as word separators before the whitespace split.
    return [re.sub('[^a-zA-Z]', ' ', sent.lower().strip()).split()
            for sent in raw_sentences]

In [152]:
sentences = []

In [153]:
# Flatten the corpus: every post contributes its tokenised sentences to the
# single training list, grown in place.
for x in X:
    sentences.extend(news_to_sentences(x))

In [154]:
# 从长篇新闻把句子剥离出来，用于训练

In [155]:
# 从gensim.models里导入word2vec

In [156]:
from  gensim.models  import word2vec 

In [157]:
num_features = 300

In [158]:
min_word_count = 20

In [159]:
num_workers = 2

In [160]:
context = 5

In [161]:
downsampling = 1e-3

In [162]:
# 训练词向量模型

In [163]:
# Fit the word-vector model on the tokenised sentences, passing the
# hyper-parameters set in the preceding cells.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling
)

In [164]:
model.init_sims(replace=True)

In [165]:
model.most_similar('email')

[(u'mail', 0.6986672282218933),
 (u'contact', 0.6790420413017273),
 (u'address', 0.6570186614990234),
 (u'request', 0.6553895473480225),
 (u'replies', 0.6443120241165161),
 (u'sas', 0.6435778141021729),
 (u'listserv', 0.629580020904541),
 (u'compuserve', 0.6226867437362671),
 (u'mailed', 0.6165179014205933),
 (u'send', 0.6050395965576172)]

In [166]:
model.most_similar('morning')

[(u'afternoon', 0.807927131652832),
 (u'weekend', 0.7877218127250671),
 (u'evening', 0.7460664510726929),
 (u'night', 0.7307092547416687),
 (u'saturday', 0.7119767665863037),
 (u'friday', 0.703860342502594),
 (u'sunday', 0.6924899816513062),
 (u'summer', 0.6640925407409668),
 (u'thursday', 0.6513188481330872),
 (u'tuesday', 0.6490784883499146)]