In [1]:
import os

In [2]:
def get_filePath_list(file_dir):
    '''
    Recursively collect the path of every file found beneath file_dir.
    : param file_dir: root directory to walk
    : return list of file paths, each joined onto the directory that holds it,
             in the order os.walk yields them (empty if nothing is found)
    '''
    return [
        os.path.join(dir_path, file_name)
        for dir_path, _sub_dirs, file_names in os.walk(file_dir)
        for file_name in file_names
    ]

def get_files_list(file_dir, postfix='ALL'):
    '''
    Get the sorted list of all files in file_dir (including those under subdirs)
    whose extension equals postfix.
    : param file_dir: root directory to search
    : param postfix: wanted extension, written as 'txt', '.txt' or '*.txt';
                     the default 'ALL' disables filtering
    : return sorted list of matching file paths
    '''
    # Only the text after the last dot matters, so '*.txt' and 'txt' are equivalent.
    postfix = postfix.split('.')[-1]
    filePath_list = get_filePath_list(file_dir)
    if postfix == 'ALL':
        return sorted(filePath_list)

    file_list = []
    for file in filePath_list:
        # rpartition yields an empty separator when the name contains no dot,
        # so an extension-less file called e.g. 'txt' is not mistaken for '*.txt'.
        _stem, dot, extension = os.path.basename(file).rpartition('.')
        if dot and extension == postfix:
            file_list.append(file)
    file_list.sort()
    return file_list

In [3]:
import jieba

In [4]:
def segment_files(file_list, segment_out_dir, stopwords=None):
    '''
    Segment every source document into words with jieba and write the result
    to segment_out_dir as 'segment_<index>.txt', where <index> is the position
    of the document in file_list.
    : param file_list: paths of the documents to segment
    : param segment_out_dir: output directory (created if it does not exist)
    : param stopwords: iterable of words to drop from the output; None or
                       empty keeps every word
    '''
    # A set makes the per-token membership test constant time, and a None
    # default avoids sharing one mutable list between calls.
    stopword_set = frozenset(stopwords) if stopwords else frozenset()

    # Without this, the first open(..., 'wb') fails when the folder is missing.
    os.makedirs(segment_out_dir, exist_ok=True)

    for i, file in enumerate(file_list):
        segment_out_name = os.path.join(segment_out_dir, 'segment_{}.txt'.format(i))
        # The raw bytes are handed to jieba.cut unchanged, as in the original code.
        with open(file, 'rb') as fin:
            document = fin.read()
        kept_words = [word for word in jieba.cut(document) if word not in stopword_set]
        # Tokens are separated by single spaces and stored as UTF-8.
        result = ' '.join(kept_words).encode('utf-8')
        with open(segment_out_name, 'wb') as fout:
            fout.write(result)

In [6]:
# Source documents are read from source_folder; their segmented versions are
# written to segment_folder.
source_folder = './three_kingdoms/source'
segment_folder = './three_kingdoms/segment'

# Make sure the output folder exists so this cell also works on a fresh checkout.
os.makedirs(segment_folder, exist_ok=True)

# '*.txt' is reduced to the extension 'txt' inside get_files_list.
file_list = get_files_list(source_folder, postfix='*.txt')
segment_files(file_list, segment_folder)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\alexs\AppData\Local\Temp\jieba.cache
Loading model cost 0.554 seconds.
Prefix dict has been built successfully.


In [7]:
from gensim.models import word2vec
import multiprocessing

In [9]:
# Corpus iterator over the segmented files in segment_folder. Judging by the
# preview output below, each item is a list of tokens for one line of text.
# NOTE(review): presumably every file in the folder is read - confirm no stale
# segment_*.txt files from earlier runs are left there.
sentences = word2vec.PathLineSentences(segment_folder)

In [11]:
# Preview the first 100 tokenized sentences. Stop as soon as the limit is
# reached instead of silently iterating over the rest of the corpus.
for i, sentence in enumerate(sentences):
    if i >= 100:
        break
    print(sentence)

['三国演义']
['作者', '：', '罗贯中']
['正文', '第一回', '宴', '桃园', '豪杰', '三', '结义', '斩', '黄巾', '英雄', '首', '立功']
['滚滚', '长江', '东', '逝水', '，', '浪花', '淘尽', '英雄', '。', '是非成败', '转头', '空', '。']
['青山', '依旧', '在', '，', '几度', '夕阳红', '。', '白发', '渔樵', '江渚上', '，', '惯']
['看', '秋月春风', '。', '一壶', '浊酒', '喜相逢', '。', '古今', '多少', '事', '，', '都', '付']
['笑谈', '中', '。']
['—', '—', '调寄', '《', '临江仙', '》']
['话', '说', '天下', '大势', '，', '分久必合', '，', '合久必分', '。', '周末', '七', '国', '分争', '，', '并入', '于', '秦', '。', '及', '秦灭', '之后', '，', '楚', '、', '汉', '分争', '，', '又', '并入', '于汉', '。', '汉朝', '自', '高祖', '斩', '白蛇', '而', '起义', '，', '一统天下', '，', '后来', '光武', '中兴', '，', '传至', '献帝', '，', '遂', '分为', '三国', '。', '推其致', '乱', '之', '由', '，', '殆', '始于', '桓', '、', '灵', '二帝', '。', '桓帝', '禁锢', '善类', '，', '崇信', '宦官', '。', '及桓帝', '崩', '，', '灵帝', '即位', '，', '大将军', '窦武', '、', '太傅陈', '蕃', '共', '相', '辅佐', '。', '时有', '宦官', '曹节', '等', '弄权', '，', '窦武', '、', '陈蕃', '谋', '诛', '之', '，', '机事不密', '，', '反为', '所害', '，', '中', '涓', '自此', '愈横', '。']
['建宁', '二年', '四月', '望'

In [14]:
# Train one worker thread per available CPU.
print('Train word2vec model with {} CPUs'.format(multiprocessing.cpu_count()))
# 128-dimensional vectors, context window of 5, words seen fewer than 5 times are ignored.
# NOTE(review): `size` is the keyword of the gensim release this notebook was run
# with; gensim 4 renamed it to `vector_size` - confirm against the installed version.
model = word2vec.Word2Vec(sentences, size=128, window=5, min_count = 5, workers=multiprocessing.cpu_count())

# exist_ok=True covers both "already there" and "created concurrently", which
# the previous try/except attempted via errno (a module that was never imported).
os.makedirs('models', exist_ok=True)
# save model
model.save('./models/word2Vec.model')

Train word2vec model with 12 CPUs


In [15]:
# Nearest neighbours of '曹操' in the learned embedding space.
# NOTE(review): the recorded similarities are all around 0.98, so the ranking
# carries little signal - possibly the corpus/training is too small; confirm.
print(model.wv.most_similar('曹操'))
# Analogy-style query: words close to '曹操' + '刘备' - '张飞'.
print(model.wv.most_similar(positive=['曹操', '刘备'], negative=['张飞']))

[('孙权', 0.9837316870689392), ('众将', 0.9814666509628296), ('定', 0.981120228767395), ('周瑜', 0.9783538579940796), ('自有主张', 0.9776253700256348), ('手书', 0.9773138761520386), ('秀', 0.9770835638046265), ('半晌', 0.9770827889442444), ('关公', 0.9766952991485596), ('先', 0.976443350315094)]
[('吾', 0.9895710945129395), ('丞相', 0.9880417585372925), ('今', 0.9873430728912354), ('臣', 0.9833163022994995), ('汝', 0.983021080493927), ('叹', 0.9819997549057007), ('此', 0.980284571647644), ('耳', 0.9774681925773621), ('陛下', 0.9772478938102722), ('何以', 0.9771377444267273)]
