# 目录
- 分词
- 词向量
- 参考

In [19]:
import jieba
import gensim.models.word2vec as w2v

# 分词

### 简单

In [20]:
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list)) # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list)) # 精确模式

seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造") # 搜索引擎模式
print(", ".join(seg_list))

Full Mode: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
Default Mode: 我/ 来到/ 北京/ 清华大学
他, 来到, 了, 网易, 杭研, 大厦
小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, ，, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造


### 示例：一篇文章，倚天屠龙记

In [21]:
def stop_word():
    sw = []
    with open('../../data/stop_words.txt', 'r') as fin:
        lines = fin.readlines()
    for line in lines:
        sw.append(line.replace('\n', ''))
    return sw

def tokenizer(line):
    l=line
    for sw in stop_word():
        l=l.replace(sw,'')
    return l

l=tokenizer("    ()　　“春游浩荡，是年年寒食，梨花时节。白锦无纹香烂漫，玉树琼苞堆雪。静夜沉沉，浮光霭霭，冷浸溶溶月。人间天上，烂银霞照通彻。浑似姑射真人，天姿灵秀，意气殊高洁。万蕊参差谁信道，不与群芳同列。浩气清英，仙才卓荦，下土难分别。瑶台归去，洞天方看清绝。")
print(l)


春游浩荡年年寒食梨花时节锦纹香烂漫玉树琼苞堆雪静夜沉沉浮霭霭冷浸溶溶月间天烂银霞通彻浑似姑射真天姿灵秀意气殊高洁万蕊参差信道群芳列浩气清英仙卓荦土难瑶台洞天清


In [23]:
# 语料位置
file_path = '../../data/custom/corpus_original/yitiantulongji.txt'
# 语料分词后位置
file_segment_path = '../../data/custom/tokenizer/yitiantulongji_tokenizer.txt'

fin = open(file_path, 'r', encoding='UTF-8')
fou = open(file_segment_path, 'w', encoding='UTF-8')

line = fin.readline()
while line:
    if line is not '\n':
        newline = jieba.cut(tokenizer(line), cut_all=False) # 分词和tokenizer，先后是否有影响
        print(' '.join(newline), file=fou)
    line = fin.readline()
    
fin.close()
fou.close()

分词后，文件内容已空格分割。

# 词向量模型

## 训练

In [None]:
d=100
window=5
min_count=2
theme='yitiantulongji'

index_model_file_name_tpl='../../data/custom/tokenizer/index_{}_d{}_win{}_mincount{}.w2v'
index_model_file_name = index_model_file_name_tpl.format(theme,d,window,min_count)
print(index_model_file_name)

### LineSentence
传递文件路径，按行读取成句子

In [None]:
sentences = w2v.LineSentence(file_segment_path)

### Text8Corpus

In [24]:
# sentences=w2v.Text8Corpus(file_segment_path)

### File Interator
因为采用迭代器，可以大规模数据

In [28]:
# from gensim import utils

# class MyCorpus(object):
#     """An interator that yields sentences (lists of str)."""

#     def __iter__(self):
#         for line in open(file_segment_path):
#             # assume there's one document per line, tokens separated by whitespace
#             yield utils.simple_preprocess(line)
            
# sentences = MyCorpus()

In [29]:
model = w2v.Word2Vec(sentences, size=d, window=window, min_count=min_count, workers=4)


model end.


## 保存模型

In [None]:
model.save(model_file_name)
print('model end.')

```
Parameters
----------
sentences : iterable of iterables, optional
    The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
    consider an iterable that streams the sentences directly from disk/network.
    See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
    or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
    See also the `tutorial on data streaming in Python
    <https://rare-technologies.com/data-streaming-in-python-generators-iterators-iterables/>`_.
    If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
    in some other way.
corpus_file : str, optional
    Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
    You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
    `corpus_file` arguments need to be passed (or none of them, in that case, the model is left uninitialized).
size : int, optional
    Dimensionality of the word vectors.
window : int, optional
    Maximum distance between the current and predicted word within a sentence.
min_count : int, optional
    Ignores all words with total frequency lower than this.
workers : int, optional
    Use these many worker threads to train the model (=faster training with multicore machines).
sg : {0, 1}, optional
    Training algorithm: 1 for skip-gram; otherwise CBOW.
hs : {0, 1}, optional
    If 1, hierarchical softmax will be used for model training.
    If 0, and `negative` is non-zero, negative sampling will be used.
negative : int, optional
    If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
    should be drawn (usually between 5-20).
    If set to 0, no negative sampling is used.
ns_exponent : float, optional
    The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
    to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
    than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper.
    More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that
    other values may perform better for recommendation applications.
cbow_mean : {0, 1}, optional
    If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
alpha : float, optional
    The initial learning rate.
min_alpha : float, optional
    Learning rate will linearly drop to `min_alpha` as training progresses.
seed : int, optional
    Seed for the random number generator. Initial vectors for each word are seeded with a hash of
    the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
    you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
    from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
    use of the `PYTHONHASHSEED` environment variable to control hash randomization).
max_vocab_size : int, optional
    Limits the RAM during vocabulary building; if there are more unique
    words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
    Set to `None` for no limit.
max_final_vocab : int, optional
    Limits the vocab to a target vocab size by automatically picking a matching min_count. If the specified
    min_count is more than the calculated min_count, the specified min_count will be used.
    Set to `None` if not required.
sample : float, optional
    The threshold for configuring which higher-frequency words are randomly downsampled,
    useful range is (0, 1e-5).
hashfxn : function, optional
    Hash function to use to randomly initialize weights, for increased training reproducibility.
iter : int, optional
    Number of iterations (epochs) over the corpus.
trim_rule : function, optional
    Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
    be trimmed away, or handled using the default (discard if word count < min_count).
    Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
    or a callable that accepts parameters (word, count, min_count) and returns either
    :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
    The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the
    model.

    The input parameters are of the following types:
        * `word` (str) - the word we are examining
        * `count` (int) - the word's frequency count in the corpus
        * `min_count` (int) - the minimum count threshold.
sorted_vocab : {0, 1}, optional
    If 1, sort the vocabulary by descending frequency before assigning word indexes.
    See :meth:`~gensim.models.word2vec.Word2VecVocab.sort_vocab()`.
batch_words : int, optional
    Target size (in words) for batches of examples passed to worker threads (and
    thus cython routines).(Larger batches will be passed if individual
    texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
compute_loss: bool, optional
    If True, computes and stores loss value which can be retrieved using
    :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`.
callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional
    Sequence of callbacks to be executed at specific stages during training.
```

## 加载模型

In [None]:
model=w2v.Word2Vec.load(model_file_name)

# 相似度查询

- `张翠山`与`张翠山`的相似度 float Cosine similarity

In [41]:
print(model.wv.similarity('张翠山','张翠山'))

1.0


- `张翠山`与`殷素素`的相似度 float Cosine similarity

In [42]:
print(model.wv.similarity('张翠山','殷素素'))

0.9871146


- `张翠山`与`殷素素`的相似度 numpy.ndarray Similarities

In [49]:
print(model.wv.n_similarity('张翠山','殷素素'))

0.97939414


- `张翠山`与`柱子`的相似度 float Cosine similarity

In [50]:
print(model.wv.similarity('张翠山','柱子'))

0.02741475


- 与`张翠山`相似度最高的词

In [51]:
for vec in model.wv.most_similar('张翠山'):
    print(vec)

('殷素素', 0.9871145486831665)
('周芷若', 0.9819607138633728)
('张无忌', 0.978989839553833)
('谢逊', 0.978682279586792)
('转头', 0.9668984413146973)
('低声', 0.960997998714447)
('稍宽', 0.9605162143707275)
('嗓子', 0.9601899981498718)
('点头', 0.9592341184616089)
('谢逊忽', 0.9556468725204468)


- 与`张三丰`相似度最高的词

In [53]:
for vec in model.wv.most_similar('张三丰'):
    print(vec)

('自杀身亡', 0.60069340467453)
('隔开', 0.5950946807861328)
('谢礼', 0.5875046253204346)
('激飞', 0.5648559331893921)
('柱子', 0.5570248961448669)
('顺利', 0.5541802644729614)
('矮胖', 0.5491266250610352)
('融会贯通', 0.5483474731445312)
('牵住', 0.5370802879333496)
('问到', 0.5359084606170654)


- `张翠山`与`陆小风`的相似度

In [55]:
print(model.wv.similarity('张翠山','陆小风'))

KeyError: "word '陆小风' not in vocabulary"

# 参考

- [gensim](https://radimrehurek.com/gensim/auto_examples/index.html)