In [2]:
import numpy as np
from scipy import spatial
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
import xlrd, xlwt
from gensim.models import Word2Vec
import time
from sklearn.decomposition import PCA



In [3]:
def data_preprocessing(data):
    """Clean and tokenize every text in ``data``.

    For each item the function
    1. drops every element (every character, when the item is a ``str``)
       that is contained in the module-level ``pre_stopwords``,
    2. segments the remaining text with ``jieba.lcut``,
    3. drops every token contained in the module-level ``stopwords``.

    NOTE(review): ``pre_stopwords`` is not defined in the visible part of this
    notebook; it must exist in the kernel before this function is called.

    Args:
        data: iterable of texts to process.

    Returns:
        list[list[str]]: one token list per input item, in input order.
    """
    tokenized = []
    for pattern in data:
        # Element-level filter first, so jieba never sees the unwanted symbols.
        cleaned = "".join(ch for ch in pattern if ch not in pre_stopwords)
        # Token-level filter after segmentation.
        tokenized.append([word for word in jieba.lcut(cleaned) if word not in stopwords])
    return tokenized

def build_word2vec(data, w2v_dims, w2v_model=None, update=True):
    """Tokenize ``data`` and create (or update) a Word2Vec model from it.

    Every item of ``data`` is segmented with ``jieba.lcut`` and tokens found in
    the module-level ``stopwords`` are removed before training.

    Args:
        data: iterable of raw texts.
        w2v_dims: dimensionality passed to ``Word2Vec`` when a new model is built.
        w2v_model: existing model, or ``None`` to train a new one.
        update: when a model is supplied and ``update is True`` the model is
            handed to ``update_w2v`` together with the new token lists;
            otherwise the supplied model is returned untouched.

    Returns:
        tuple: ``(w2v_model, max_len)`` where ``max_len`` is the largest
        ``len(item)`` over the raw input items, measured before segmentation
        (i.e. a character count for ``str`` items, not a token count).
    """
    tokenized = []
    max_len = 0

    for pattern in data:
        # Track the longest raw item in the whole data set.
        max_len = max(max_len, len(pattern))
        # Segment, then drop stopwords.
        tokenized.append([word for word in jieba.lcut(pattern) if word not in stopwords])

    if w2v_model is None:
        # No model supplied: build a new one.
        print('正在生成新的词向量模型...')
        # NOTE(review): `size=` is the keyword used by gensim releases before 4.0;
        # confirm the installed version before upgrading.
        w2v_model = Word2Vec(tokenized, sg=1, size=w2v_dims, window=5, min_count=1,
                             negative=1, sample=0.001, hs=1)
        # NOTE(review): the corpus was already passed to the constructor above,
        # so this looks like a second training pass — confirm it is intended.
        w2v_model.train(tokenized, total_examples=len(tokenized), epochs=5)
    elif update is True:
        # A model was supplied: refresh it with the new token lists.
        # NOTE(review): `update_w2v` is not defined in the visible part of this notebook.
        print('正在更新词向量...')
        w2v_model = update_w2v(w2v_model, tokenized)
    return w2v_model, max_len

# Compute the average word vector of a short text
def avg_feature_vector(sentence, index2word_set):
    """Return the mean word vector of ``sentence``.

    The sentence is segmented with ``jieba.lcut``; every token that is a member
    of ``index2word_set`` contributes its vector from the module-level
    ``w2v_model``. Tokens outside ``index2word_set`` are skipped.

    Vectors are read through ``w2v_model.wv[...]``, the same accessor
    ``build_wd2idx_embedMatrix`` uses, instead of indexing the model object
    directly.

    Args:
        sentence: text to embed.
        index2word_set: container supporting ``in`` that holds the known words.
            NOTE(review): every member that can occur as a token must also be
            in the model vocabulary, otherwise the lookup raises ``KeyError``.

    Returns:
        numpy.ndarray: vector of length ``w2v_dims`` (module-level); all zeros
        when no token of the sentence is known.
    """
    feature_vec = np.zeros((w2v_dims, ), dtype='float32')
    n_words = 0
    for word in jieba.lcut(sentence):
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, w2v_model.wv[word])
    # Avoid dividing by zero when nothing matched; the zero vector is returned as is.
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

# Build the word -> row-index lookup and the matching embedding matrix
def build_wd2idx_embedMatrix(w2vModel):
    """Derive an index dictionary and an embedding matrix from ``w2vModel``.

    Row 0 of the matrix stays all zeros and is mapped to the placeholder key
    ``"_stopWord"`` (used to filter stopwords); the vocabulary words occupy
    rows 1..N in the iteration order of ``w2vModel.wv.vocab``.

    Args:
        w2vModel: object exposing ``wv.vocab``, ``wv[word]`` and ``vector_size``.

    Returns:
        tuple: ``(word2idx, embedMatrix)`` — a dict mapping word to row index and
        an array of shape ``(len(vocab) + 1, vector_size)``.
    """
    keyed_vectors = w2vModel.wv
    vocab_words = [word for word, _ in keyed_vectors.vocab.items()]

    embedMatrix = np.zeros((len(vocab_words) + 1, w2vModel.vector_size))
    word2idx = {"_stopWord": 0}  # reserved slot; its row is left as zeros

    for row, word in enumerate(vocab_words, start=1):
        word2idx[word] = row
        embedMatrix[row] = keyed_vectors[word]
    return word2idx, embedMatrix

In [6]:
# Load the corpora, train the word-vector model and embed the cleaned sentences
date = time.strftime("%Y%m%d")
readbook = xlrd.open_workbook('语音转文本_全业务数据集.xlsx')
sheet1 = readbook.sheet_by_name('全集')
sheet2 = readbook.sheet_by_name('清洗后')

# errors='ignore' skips undecodable bytes. The previous value 'ignor' is not a
# registered error handler, so a malformed byte would have raised LookupError.
# The `with` block also makes sure the file handle is closed.
with open('stopwords.txt', encoding='gb18030', errors='ignore') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

# Load the custom jieba dictionary
jieba.load_userdict("催收文本-newdic.txt")

# Word-vector dimensionality; read as a global by avg_feature_vector.
w2v_dims = 200

# Full data set (first column of sheet '全集').
# NOTE(review): this reads rows 0 .. nrows-2, whereas the cleaned sheet below
# reads rows 1 .. nrows-1. Confirm whether this sheet has a header row and
# whether skipping its last row is intended.
total_data = [sheet1.cell(row, 0).value for row in range(sheet1.nrows - 1)]

# Cleaned data set (first column of sheet '清洗后', first row skipped).
data_set = [sheet2.cell(row + 1, 0).value for row in range(sheet2.nrows - 1)]

# Build the word-vector model.
# To reuse a saved model instead: w2v_model = Word2Vec.load('20190521_w2v_model')
w2v_model, max_len = build_word2vec(total_data, w2v_dims, update=True)
w2v_model.save(date+'_词向量_w2v_model')  # persist the model
print('词向量模型保存成功：%s_词向量_w2v_model' %date)

wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)

# One averaged vector per cleaned sentence; `dictionary` keeps text and vector together.
dictionary = []
Sts2Vec = []
for sentence in data_set:
    sentence_vec = avg_feature_vector(sentence, wd2idx)
    Sts2Vec.append(sentence_vec)
    dictionary.append({'sts': sentence, 'sts2vec': sentence_vec})

正在生成新的词向量模型...
词向量模型保存成功：20190607_词向量_w2v_model




In [8]:
# Cluster the sentence vectors with k-means
num_clusters = 200
# Fixed seed so that Restart & Run All reproduces the same labels; with
# n_init=1 the result otherwise depends entirely on one random initialisation.
kmeans_seed = 42
# `n_jobs` is no longer passed: newer scikit-learn releases removed the
# parameter from KMeans, and the previous value 1 meant single-process anyway.
km_cluster = KMeans(n_clusters=num_clusters, max_iter=10000, n_init=1,
                    init='k-means++', random_state=kmeans_seed)

# result[k] is the cluster label assigned to Sts2Vec[k]
result = km_cluster.fit_predict(Sts2Vec)

In [9]:
# Number of sentences per cluster; count[c] is the size of cluster c.
# Kept as a float array so downstream printing looks the same as before.
count = np.bincount(result, minlength=num_clusters).astype(float)

# One record per cluster: 'count' holds the cluster id, 'num' its size.
# sorted() is stable, so clusters of equal size stay in ascending id order,
# which is the same ordering the previous hand-written bubble sort produced.
count_dict = sorted(
    ({'count': cluster_id, 'num': size} for cluster_id, size in enumerate(count)),
    key=lambda entry: entry['num'],
)
            

In [10]:
# Print "<cluster id> <cluster size>" for every record in count_dict
for entry in count_dict:
    cluster_id, size = entry['count'], entry['num']
    print(cluster_id, size)

120 2.0
7 7.0
143 13.0
195 13.0
128 15.0
52 18.0
190 19.0
45 22.0
59 24.0
161 30.0
91 32.0
174 32.0
197 39.0
141 44.0
158 44.0
155 46.0
46 48.0
184 50.0
35 53.0
169 53.0
67 54.0
83 54.0
84 54.0
76 57.0
92 58.0
129 58.0
135 62.0
170 63.0
178 63.0
116 65.0
103 66.0
199 67.0
165 69.0
0 71.0
144 71.0
163 71.0
65 72.0
185 72.0
66 76.0
154 76.0
125 80.0
146 81.0
149 83.0
147 85.0
171 86.0
160 90.0
188 90.0
114 93.0
198 93.0
38 94.0
175 95.0
130 101.0
133 101.0
14 103.0
17 103.0
94 103.0
44 105.0
3 106.0
150 106.0
12 108.0
131 109.0
97 112.0
137 114.0
168 114.0
186 114.0
27 115.0
20 116.0
51 120.0
56 120.0
50 121.0
49 122.0
85 127.0
142 128.0
30 129.0
118 131.0
68 134.0
126 134.0
138 135.0
86 138.0
111 146.0
62 148.0
134 152.0
15 157.0
113 158.0
53 160.0
93 164.0
121 165.0
69 166.0
61 167.0
151 168.0
18 169.0
179 169.0
47 171.0
115 171.0
98 172.0
122 182.0
37 184.0
152 184.0
159 186.0
70 187.0
127 188.0
89 189.0
140 189.0
107 190.0
23 191.0
73 191.0
28 192.0
132 193.0
78 198.0
77 199.0
100 20

In [30]:
# Print the id of every cluster that has fewer than 10 members.
# NOTE(review): the saved output of this cell lists ids far above 199, so it was
# produced by a run with a different num_clusters; re-run after Restart & Run All.
for cluster_id, size in enumerate(count):
    if size < 10:
        print(cluster_id)

2
9
14
15
27
30
37
53
57
69
73
86
87
90
92
94
100
101
102
108
113
118
127
130
132
134
137
140
148
156
166
168
183
187
189
197
198
203
212
214
215
217
224
227
229
230
232
236
239
243
244
247
248
249
252
254
260
261
264
270
271
279
284
287
288
292
295
300
301
303
305
308
313
314
316
322
323
326
328
330
334
337
338
339
342
343
344
348
349
352
359
361
366
368
374
378
379
388
392
396
398
400
403
405
406
407
415
422
424
425
429
431
434
435
440
441
442
446
453
455
456
458
463
464
465
468
473
474
476
485
486
488
492
498
501
502
505
506
511
512
514
518
520
524
529
530
531
534
536
537
538
540
544
545
546
550
554
557
559
569
570
571
572
574
576
577
578
581
584
586
588
591
598
600
601
604
605
615
622
623
628
630
633
641
649
650
653
655
656
657
659
661
664
667
670
672
675
678
685
686
694
696
704
706
708
709
711
714
715
716
722
723
730
733
736
740
747
748
751
752
757
759
762
767
768
769
775
789
791
792
793
794
795
796
797
799
811
812
814
816
818
819
822
824
825
828
831
833
835
836
839
840
841
846
84

In [11]:
# Write one cluster label per line; the `with` block closes the file even if a
# write fails.
# NOTE(review): mode 'a' appends, so re-running this cell on the same day adds
# a second copy of the labels to the same file — confirm that is wanted.
with open(date+'句向量-聚类.txt','a') as fobj:
    for label in result:
        fobj.write('\n'+str(label))
print('句向量聚类结果已保存：'+date+'句向量-聚类.txt')

句向量聚类结果已保存：20190607句向量-聚类.txt
