In [115]:
import numpy as np
import pandas as pd
import jieba
import matplotlib.pyplot as plt
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from joblib import dump, load
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import pickle
import math

In [2]:
# Load the question/answer dataset from a CSV file in the working directory.
data = pd.read_csv('dataset.csv')

In [3]:
# Check the (rows, columns) dimensions of the loaded frame.
data.shape

(32526, 3)

In [4]:
# Preview the first rows to confirm the qid / question / answer columns.
data.head()

Unnamed: 0,qid,question,answer
0,1,补发网银盾,您可以到智慧柜员机办理此项业务，方便快捷，节省您的宝贵时间哦。若去柜台办理，请刷身份证取号。
1,2,代发工资,您要办理的是对公开户、销户、签约及其他业务。请刷身份证，小龙人来帮您取个号吧。
2,3,对帐单查询打印,您要办理的是对公开户、销户、签约及其他业务。请刷身份证，小龙人来帮您取个号吧。
3,4,理财产品取号,请说出您需要办理的业务
4,5,密码修改,请您带上身份证到自助柜员机办理，简单快捷，操作容易哦。


In [70]:
# Spot-check a single value: the question text of the row labelled 0.
data.loc[0, 'question']

'补发网银盾'

In [68]:
# Display the frame; pandas abbreviates the middle rows in the rendered output.
data

Unnamed: 0,qid,question,answer
0,1,补发网银盾,您可以到智慧柜员机办理此项业务，方便快捷，节省您的宝贵时间哦。若去柜台办理，请刷身份证取号。
1,2,代发工资,您要办理的是对公开户、销户、签约及其他业务。请刷身份证，小龙人来帮您取个号吧。
2,3,对帐单查询打印,您要办理的是对公开户、销户、签约及其他业务。请刷身份证，小龙人来帮您取个号吧。
3,4,理财产品取号,请说出您需要办理的业务
4,5,密码修改,请您带上身份证到自助柜员机办理，简单快捷，操作容易哦。
...,...,...,...
32521,69115,我要点咖啡,请选择您要的口味
32522,69116,帮忙点咖啡,请选择您要的口味
32523,69117,问题1,测试
32524,69118,1111,johlhk


In [5]:
# Load the stop-word list and build the tokenised corpus.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open('stopwords.txt', encoding='utf-8') as file:
    stopwords = file.read().split('\n')

# Membership tests run once per token for every question, so use a set
# (O(1) lookup). `stopwords` itself stays a list for any later cell using it.
stopword_set = set(stopwords)

corpus = []

# Each question is segmented with jieba, stop words are dropped, and the
# remaining tokens are joined with single spaces (one string per question).
# NOTE(review): str() turns a missing question (NaN) into the literal text
# 'nan' — confirm the dataset has no empty questions.
for question in data['question']:
    question = str(question).strip()
    corpus.append(" ".join(x for x in jieba.cut(question) if x not in stopword_set))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/k2/dnh2dnfx6937855thyp78q9r0000gn/T/jieba.cache
Loading model cost 0.844 seconds.
Prefix dict has been built successfully.


In [6]:
# Inspect the tokenised corpus.
# NOTE(review): this echoes the entire list; a slice such as corpus[:10]
# would keep the cell output short.
corpus

['补发 网银盾',
 '代发 工资',
 '帐单 查询 打印',
 '理财产品 取号',
 '密码 修改',
 '查询 名下 所有 账户',
 '交易 流水 明细 证明',
 '个人 结售汇',
 '现金 取款',
 '账户 销户',
 '转账 业务',
 '现金 存款',
 '贵金属 业务',
 '个人账户 管理',
 '开 特色 卡',
 '吞卡 证明',
 '补打 发票',
 '查询 取款',
 '本卡 转 本卡',
 '本卡 转 现金',
 '5 万 以内 转账',
 '5 万 20 万 转账',
 '20 万 200 万 转账',
 '200 万 以上 转账',
 '结售汇 不取 现金',
 '结售汇 及取 现金',
 '银星 收款',
 '仅 查询',
 '银星 汇款',
 '西联 收款',
 '西联 汇款',
 '身份证 开卡',
 '证件 开卡',
 '信用卡 非现金 还款',
 '信用卡 业务',
 '建信 保险',
 '保险',
 '社保 业务',
 '存单 业务',
 '查询 外汇 汇率',
 '存折 解挂',
 '纪念币 业务',
 '开 普通 龙卡',
 '结算 通卡 业务',
 '换 手机 银行 手机号',
 '粤 通卡 充值',
 '充值 缴费',
 '柜台 缴费',
 '信用卡 短信 业务',
 '出国 惠 增值 服务',
 '咨询 外币 旅行支票 光票 托收 业务',
 '保费',
 '卡折 挂失',
 '通知 存款 转 活期 咨询',
 '手机 银行 通知 存款 转 活期',
 '网银 首次 购买 理财 风险 测评 提示',
 '信用卡 挂失 生效 时间',
 'II 类 账户 定义',
 '手机 银行 活期 转 定期',
 '私人 银行 客户 助业贷 扩大 抵押物 范围',
 '外币 储蓄 利率',
 '蓝豹 百大 龙卡 联名卡 种类',
 '咨询 汇率 牌价',
 '短信 推送 业务 类型 客户 反感',
 '网银 支付 方式 适用 对象',
 '信用卡 附属 卡开卡 条件',
 '是否 信用卡 账单 日前 交易 都 会 反映 一期 账单 上',
 '外币 储蓄 定期 转存',
 '保管箱 销户',
 '下载 个人 金交所 交易 客户端',
 '私享品 陶瓷',
 '查询 两年 前 交易 明细',
 '活期存款 账户 销户'

In [93]:
# Vectorise the space-separated corpus with TF-IDF and cluster it with K-Means.
# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so single-character words in the corpus are
# dropped from the vocabulary — confirm that this is intended.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(corpus)

# A fixed random_state makes the centroid initialisation, and therefore the
# cluster labels, the inertia and the saved model, reproducible across runs.
kmeans = KMeans(n_clusters=1000, random_state=42).fit(tfidf)

# The cluster centres and per-sample labels are available afterwards as
# kmeans.cluster_centers_ and kmeans.labels_.

# inertia_ is the sum of squared distances from each sample to its nearest
# cluster centre; it is used here to judge the clustering (lower is better).
# Tuning the n_clusters hyper-parameter changes inertia_.
print("inertia: {}".format(kmeans.inertia_))

# Persist the fitted model so later cells can reload it without retraining.
dump(kmeans, 'kmeans_1000.joblib')

inertia: 18842.383989936447


['kmeans_1000.joblib']

In [94]:
# Persist the fitted vectorizer alongside the model; new text must be
# transformed with this same vectorizer before it is passed to kmeans.predict.
dump(tfidf_vectorizer, 'tfidf_vectorizer_1000.joblib')

['tfidf_vectorizer_1000.joblib']

In [163]:
# Reload the K-Means model from the joblib file written by the training cell.
# joblib files are pickle-based: only load files you produced yourself.
kmeans = load('kmeans_1000.joblib')

In [164]:
# Reload the TF-IDF vectorizer from its joblib file.
# joblib files are pickle-based: only load files you produced yourself.
tfidf_vectorizer = load('tfidf_vectorizer_1000.joblib')

In [165]:
# Query text, already segmented into space-separated tokens like the corpus.
user_input = ['补发 网银盾']
# Vectorise the query with the reloaded vectorizer and assign it to a cluster;
# `result` holds one cluster index per input string.
result = kmeans.predict(tfidf_vectorizer.transform(user_input))

In [166]:
# Cluster index predicted for the single query string.
result[0]

3

In [140]:
# Print the cluster label of every training sample, one line per sample,
# in the order the samples were fitted.
for index, label in enumerate(kmeans.labels_):
    print(f"index: {index}, label: {label}")

index: 0, label: 3
index: 1, label: 3
index: 2, label: 537
index: 3, label: 674
index: 4, label: 264
index: 5, label: 343
index: 6, label: 81
index: 7, label: 714
index: 8, label: 440
index: 9, label: 699
index: 10, label: 898
index: 11, label: 440
index: 12, label: 749
index: 13, label: 3
index: 14, label: 3
index: 15, label: 251
index: 16, label: 3
index: 17, label: 399
index: 18, label: 3
index: 19, label: 440
index: 20, label: 60
index: 21, label: 60
index: 22, label: 60
index: 23, label: 60
index: 24, label: 456
index: 25, label: 456
index: 26, label: 368
index: 27, label: 18
index: 28, label: 382
index: 29, label: 368
index: 30, label: 382
index: 31, label: 322
index: 32, label: 322
index: 33, label: 179
index: 34, label: 898
index: 35, label: 401
index: 36, label: 401
index: 37, label: 898
index: 38, label: 542
index: 39, label: 429
index: 40, label: 3
index: 41, label: 898
index: 42, label: 334
index: 43, label: 385
index: 44, label: 232
index: 45, label: 949
index: 46, label: 

index: 1692, label: 136
index: 1693, label: 3
index: 1694, label: 307
index: 1695, label: 569
index: 1696, label: 3
index: 1697, label: 3
index: 1698, label: 3
index: 1699, label: 3
index: 1700, label: 3
index: 1701, label: 119
index: 1702, label: 3
index: 1703, label: 70
index: 1704, label: 252
index: 1705, label: 252
index: 1706, label: 3
index: 1707, label: 3
index: 1708, label: 3
index: 1709, label: 65
index: 1710, label: 423
index: 1711, label: 3
index: 1712, label: 3
index: 1713, label: 638
index: 1714, label: 115
index: 1715, label: 392
index: 1716, label: 3
index: 1717, label: 361
index: 1718, label: 3
index: 1719, label: 3
index: 1720, label: 3
index: 1721, label: 3
index: 1722, label: 3
index: 1723, label: 3
index: 1724, label: 242
index: 1725, label: 3
index: 1726, label: 901
index: 1727, label: 3
index: 1728, label: 377
index: 1729, label: 99
index: 1730, label: 309
index: 1731, label: 417
index: 1732, label: 417
index: 1733, label: 3
index: 1734, label: 665
index: 1735, la

index: 3192, label: 103
index: 3193, label: 103
index: 3194, label: 43
index: 3195, label: 349
index: 3196, label: 349
index: 3197, label: 156
index: 3198, label: 3
index: 3199, label: 904
index: 3200, label: 3
index: 3201, label: 3
index: 3202, label: 3
index: 3203, label: 274
index: 3204, label: 3
index: 3205, label: 3
index: 3206, label: 3
index: 3207, label: 3
index: 3208, label: 38
index: 3209, label: 3
index: 3210, label: 76
index: 3211, label: 856
index: 3212, label: 3
index: 3213, label: 3
index: 3214, label: 523
index: 3215, label: 334
index: 3216, label: 274
index: 3217, label: 3
index: 3218, label: 61
index: 3219, label: 3
index: 3220, label: 119
index: 3221, label: 3
index: 3222, label: 3
index: 3223, label: 154
index: 3224, label: 3
index: 3225, label: 53
index: 3226, label: 3
index: 3227, label: 27
index: 3228, label: 27
index: 3229, label: 27
index: 3230, label: 27
index: 3231, label: 3
index: 3232, label: 3
index: 3233, label: 3
index: 3234, label: 3
index: 3235, label:

index: 4691, label: 195
index: 4692, label: 186
index: 4693, label: 186
index: 4694, label: 166
index: 4695, label: 166
index: 4696, label: 21
index: 4697, label: 21
index: 4698, label: 267
index: 4699, label: 3
index: 4700, label: 3
index: 4701, label: 3
index: 4702, label: 103
index: 4703, label: 23
index: 4704, label: 734
index: 4705, label: 3
index: 4706, label: 3
index: 4707, label: 40
index: 4708, label: 615
index: 4709, label: 40
index: 4710, label: 209
index: 4711, label: 468
index: 4712, label: 43
index: 4713, label: 160
index: 4714, label: 3
index: 4715, label: 370
index: 4716, label: 109
index: 4717, label: 913
index: 4718, label: 269
index: 4719, label: 82
index: 4720, label: 817
index: 4721, label: 43
index: 4722, label: 447
index: 4723, label: 3
index: 4724, label: 490
index: 4725, label: 3
index: 4726, label: 3
index: 4727, label: 3
index: 4728, label: 3
index: 4729, label: 23
index: 4730, label: 3
index: 4731, label: 207
index: 4732, label: 3
index: 4733, label: 3
index

index: 6191, label: 133
index: 6192, label: 133
index: 6193, label: 133
index: 6194, label: 47
index: 6195, label: 133
index: 6196, label: 133
index: 6197, label: 3
index: 6198, label: 3
index: 6199, label: 22
index: 6200, label: 3
index: 6201, label: 12
index: 6202, label: 38
index: 6203, label: 3
index: 6204, label: 577
index: 6205, label: 3
index: 6206, label: 62
index: 6207, label: 285
index: 6208, label: 3
index: 6209, label: 3
index: 6210, label: 3
index: 6211, label: 143
index: 6212, label: 3
index: 6213, label: 3
index: 6214, label: 3
index: 6215, label: 480
index: 6216, label: 3
index: 6217, label: 414
index: 6218, label: 23
index: 6219, label: 468
index: 6220, label: 23
index: 6221, label: 23
index: 6222, label: 3
index: 6223, label: 870
index: 6224, label: 3
index: 6225, label: 3
index: 6226, label: 3
index: 6227, label: 3
index: 6228, label: 598
index: 6229, label: 3
index: 6230, label: 293
index: 6231, label: 3
index: 6232, label: 3
index: 6233, label: 3
index: 6234, label

index: 7690, label: 653
index: 7691, label: 653
index: 7692, label: 653
index: 7693, label: 3
index: 7694, label: 3
index: 7695, label: 323
index: 7696, label: 233
index: 7697, label: 255
index: 7698, label: 3
index: 7699, label: 3
index: 7700, label: 3
index: 7701, label: 3
index: 7702, label: 3
index: 7703, label: 3
index: 7704, label: 550
index: 7705, label: 6
index: 7706, label: 42
index: 7707, label: 3
index: 7708, label: 550
index: 7709, label: 104
index: 7710, label: 104
index: 7711, label: 104
index: 7712, label: 104
index: 7713, label: 104
index: 7714, label: 104
index: 7715, label: 922
index: 7716, label: 155
index: 7717, label: 22
index: 7718, label: 3
index: 7719, label: 151
index: 7720, label: 3
index: 7721, label: 3
index: 7722, label: 607
index: 7723, label: 423
index: 7724, label: 3
index: 7725, label: 3
index: 7726, label: 103
index: 7727, label: 71
index: 7728, label: 3
index: 7729, label: 216
index: 7730, label: 3
index: 7731, label: 307
index: 7732, label: 3
index: 

index: 9190, label: 3
index: 9191, label: 3
index: 9192, label: 3
index: 9193, label: 3
index: 9194, label: 48
index: 9195, label: 340
index: 9196, label: 402
index: 9197, label: 315
index: 9198, label: 47
index: 9199, label: 47
index: 9200, label: 47
index: 9201, label: 47
index: 9202, label: 3
index: 9203, label: 3
index: 9204, label: 3
index: 9205, label: 94
index: 9206, label: 480
index: 9207, label: 608
index: 9208, label: 608
index: 9209, label: 264
index: 9210, label: 3
index: 9211, label: 821
index: 9212, label: 3
index: 9213, label: 3
index: 9214, label: 3
index: 9215, label: 736
index: 9216, label: 736
index: 9217, label: 814
index: 9218, label: 814
index: 9219, label: 258
index: 9220, label: 3
index: 9221, label: 3
index: 9222, label: 105
index: 9223, label: 59
index: 9224, label: 356
index: 9225, label: 3
index: 9226, label: 611
index: 9227, label: 99
index: 9228, label: 611
index: 9229, label: 42
index: 9230, label: 983
index: 9231, label: 318
index: 9232, label: 318
index

index: 10689, label: 832
index: 10690, label: 851
index: 10691, label: 587
index: 10692, label: 750
index: 10693, label: 340
index: 10694, label: 3
index: 10695, label: 3
index: 10696, label: 3
index: 10697, label: 3
index: 10698, label: 3
index: 10699, label: 3
index: 10700, label: 567
index: 10701, label: 3
index: 10702, label: 254
index: 10703, label: 3
index: 10704, label: 15
index: 10705, label: 15
index: 10706, label: 3
index: 10707, label: 3
index: 10708, label: 3
index: 10709, label: 852
index: 10710, label: 871
index: 10711, label: 858
index: 10712, label: 71
index: 10713, label: 709
index: 10714, label: 709
index: 10715, label: 709
index: 10716, label: 709
index: 10717, label: 709
index: 10718, label: 23
index: 10719, label: 23
index: 10720, label: 602
index: 10721, label: 602
index: 10722, label: 602
index: 10723, label: 602
index: 10724, label: 23
index: 10725, label: 602
index: 10726, label: 23
index: 10727, label: 602
index: 10728, label: 23
index: 10729, label: 602
index

index: 12189, label: 144
index: 12190, label: 3
index: 12191, label: 99
index: 12192, label: 104
index: 12193, label: 3
index: 12194, label: 140
index: 12195, label: 501
index: 12196, label: 394
index: 12197, label: 3
index: 12198, label: 3
index: 12199, label: 6
index: 12200, label: 3
index: 12201, label: 3
index: 12202, label: 101
index: 12203, label: 104
index: 12204, label: 558
index: 12205, label: 115
index: 12206, label: 3
index: 12207, label: 826
index: 12208, label: 3
index: 12209, label: 479
index: 12210, label: 479
index: 12211, label: 233
index: 12212, label: 3
index: 12213, label: 3
index: 12214, label: 863
index: 12215, label: 22
index: 12216, label: 3
index: 12217, label: 3
index: 12218, label: 470
index: 12219, label: 2
index: 12220, label: 66
index: 12221, label: 58
index: 12222, label: 3
index: 12223, label: 3
index: 12224, label: 553
index: 12225, label: 3
index: 12226, label: 3
index: 12227, label: 3
index: 12228, label: 50
index: 12229, label: 314
index: 12230, labe

index: 13688, label: 3
index: 13689, label: 966
index: 13690, label: 220
index: 13691, label: 67
index: 13692, label: 140
index: 13693, label: 70
index: 13694, label: 966
index: 13695, label: 23
index: 13696, label: 3
index: 13697, label: 3
index: 13698, label: 307
index: 13699, label: 810
index: 13700, label: 260
index: 13701, label: 133
index: 13702, label: 578
index: 13703, label: 3
index: 13704, label: 61
index: 13705, label: 3
index: 13706, label: 3
index: 13707, label: 3
index: 13708, label: 3
index: 13709, label: 638
index: 13710, label: 3
index: 13711, label: 3
index: 13712, label: 3
index: 13713, label: 82
index: 13714, label: 3
index: 13715, label: 3
index: 13716, label: 376
index: 13717, label: 376
index: 13718, label: 376
index: 13719, label: 376
index: 13720, label: 376
index: 13721, label: 376
index: 13722, label: 58
index: 13723, label: 58
index: 13724, label: 58
index: 13725, label: 3
index: 13726, label: 408
index: 13727, label: 3
index: 13728, label: 3
index: 13729, l

index: 15188, label: 982
index: 15189, label: 12
index: 15190, label: 59
index: 15191, label: 897
index: 15192, label: 982
index: 15193, label: 303
index: 15194, label: 12
index: 15195, label: 678
index: 15196, label: 351
index: 15197, label: 994
index: 15198, label: 3
index: 15199, label: 994
index: 15200, label: 3
index: 15201, label: 3
index: 15202, label: 3
index: 15203, label: 51
index: 15204, label: 3
index: 15205, label: 201
index: 15206, label: 954
index: 15207, label: 111
index: 15208, label: 523
index: 15209, label: 55
index: 15210, label: 46
index: 15211, label: 177
index: 15212, label: 904
index: 15213, label: 3
index: 15214, label: 437
index: 15215, label: 789
index: 15216, label: 3
index: 15217, label: 3
index: 15218, label: 177
index: 15219, label: 94
index: 15220, label: 3
index: 15221, label: 3
index: 15222, label: 3
index: 15223, label: 128
index: 15224, label: 3
index: 15225, label: 438
index: 15226, label: 3
index: 15227, label: 3
index: 15228, label: 74
index: 1522

index: 16687, label: 299
index: 16688, label: 261
index: 16689, label: 261
index: 16690, label: 261
index: 16691, label: 3
index: 16692, label: 3
index: 16693, label: 362
index: 16694, label: 3
index: 16695, label: 3
index: 16696, label: 105
index: 16697, label: 3
index: 16698, label: 3
index: 16699, label: 904
index: 16700, label: 246
index: 16701, label: 904
index: 16702, label: 356
index: 16703, label: 356
index: 16704, label: 356
index: 16705, label: 57
index: 16706, label: 611
index: 16707, label: 611
index: 16708, label: 99
index: 16709, label: 611
index: 16710, label: 384
index: 16711, label: 611
index: 16712, label: 3
index: 16713, label: 377
index: 16714, label: 377
index: 16715, label: 789
index: 16716, label: 377
index: 16717, label: 650
index: 16718, label: 650
index: 16719, label: 650
index: 16720, label: 650
index: 16721, label: 650
index: 16722, label: 3
index: 16723, label: 486
index: 16724, label: 362
index: 16725, label: 2
index: 16726, label: 486
index: 16727, label:

index: 18187, label: 467
index: 18188, label: 58
index: 18189, label: 862
index: 18190, label: 893
index: 18191, label: 3
index: 18192, label: 3
index: 18193, label: 3
index: 18194, label: 3
index: 18195, label: 3
index: 18196, label: 3
index: 18197, label: 3
index: 18198, label: 104
index: 18199, label: 805
index: 18200, label: 292
index: 18201, label: 805
index: 18202, label: 805
index: 18203, label: 805
index: 18204, label: 3
index: 18205, label: 3
index: 18206, label: 3
index: 18207, label: 911
index: 18208, label: 539
index: 18209, label: 539
index: 18210, label: 539
index: 18211, label: 894
index: 18212, label: 3
index: 18213, label: 3
index: 18214, label: 304
index: 18215, label: 3
index: 18216, label: 3
index: 18217, label: 3
index: 18218, label: 492
index: 18219, label: 3
index: 18220, label: 491
index: 18221, label: 3
index: 18222, label: 558
index: 18223, label: 3
index: 18224, label: 3
index: 18225, label: 912
index: 18226, label: 3
index: 18227, label: 58
index: 18228, lab

index: 19686, label: 3
index: 19687, label: 3
index: 19688, label: 3
index: 19689, label: 74
index: 19690, label: 55
index: 19691, label: 221
index: 19692, label: 61
index: 19693, label: 3
index: 19694, label: 3
index: 19695, label: 23
index: 19696, label: 3
index: 19697, label: 606
index: 19698, label: 898
index: 19699, label: 3
index: 19700, label: 2
index: 19701, label: 3
index: 19702, label: 608
index: 19703, label: 3
index: 19704, label: 307
index: 19705, label: 619
index: 19706, label: 34
index: 19707, label: 3
index: 19708, label: 253
index: 19709, label: 3
index: 19710, label: 217
index: 19711, label: 735
index: 19712, label: 762
index: 19713, label: 611
index: 19714, label: 99
index: 19715, label: 946
index: 19716, label: 611
index: 19717, label: 57
index: 19718, label: 3
index: 19719, label: 818
index: 19720, label: 3
index: 19721, label: 3
index: 19722, label: 3
index: 19723, label: 124
index: 19724, label: 134
index: 19725, label: 3
index: 19726, label: 54
index: 19727, lab

index: 21186, label: 955
index: 21187, label: 955
index: 21188, label: 432
index: 21189, label: 995
index: 21190, label: 399
index: 21191, label: 284
index: 21192, label: 507
index: 21193, label: 264
index: 21194, label: 791
index: 21195, label: 791
index: 21196, label: 413
index: 21197, label: 60
index: 21198, label: 264
index: 21199, label: 264
index: 21200, label: 3
index: 21201, label: 960
index: 21202, label: 647
index: 21203, label: 785
index: 21204, label: 785
index: 21205, label: 785
index: 21206, label: 785
index: 21207, label: 52
index: 21208, label: 308
index: 21209, label: 203
index: 21210, label: 726
index: 21211, label: 382
index: 21212, label: 385
index: 21213, label: 385
index: 21214, label: 657
index: 21215, label: 432
index: 21216, label: 432
index: 21217, label: 432
index: 21218, label: 996
index: 21219, label: 445
index: 21220, label: 445
index: 21221, label: 445
index: 21222, label: 633
index: 21223, label: 872
index: 21224, label: 696
index: 21225, label: 722
inde

index: 22685, label: 901
index: 22686, label: 673
index: 22687, label: 673
index: 22688, label: 673
index: 22689, label: 382
index: 22690, label: 382
index: 22691, label: 968
index: 22692, label: 968
index: 22693, label: 865
index: 22694, label: 931
index: 22695, label: 302
index: 22696, label: 674
index: 22697, label: 657
index: 22698, label: 344
index: 22699, label: 484
index: 22700, label: 524
index: 22701, label: 722
index: 22702, label: 122
index: 22703, label: 429
index: 22704, label: 673
index: 22705, label: 673
index: 22706, label: 673
index: 22707, label: 3
index: 22708, label: 239
index: 22709, label: 968
index: 22710, label: 865
index: 22711, label: 865
index: 22712, label: 865
index: 22713, label: 524
index: 22714, label: 524
index: 22715, label: 3
index: 22716, label: 3
index: 22717, label: 137
index: 22718, label: 968
index: 22719, label: 968
index: 22720, label: 968
index: 22721, label: 898
index: 22722, label: 762
index: 22723, label: 597
index: 22724, label: 545
index:

index: 24185, label: 171
index: 24186, label: 440
index: 24187, label: 919
index: 24188, label: 360
index: 24189, label: 122
index: 24190, label: 735
index: 24191, label: 874
index: 24192, label: 270
index: 24193, label: 179
index: 24194, label: 963
index: 24195, label: 477
index: 24196, label: 122
index: 24197, label: 477
index: 24198, label: 881
index: 24199, label: 881
index: 24200, label: 477
index: 24201, label: 963
index: 24202, label: 81
index: 24203, label: 615
index: 24204, label: 963
index: 24205, label: 615
index: 24206, label: 615
index: 24207, label: 477
index: 24208, label: 615
index: 24209, label: 881
index: 24210, label: 171
index: 24211, label: 881
index: 24212, label: 788
index: 24213, label: 806
index: 24214, label: 580
index: 24215, label: 81
index: 24216, label: 477
index: 24217, label: 60
index: 24218, label: 477
index: 24219, label: 477
index: 24220, label: 3
index: 24221, label: 546
index: 24222, label: 122
index: 24223, label: 442
index: 24224, label: 854
index

index: 25684, label: 413
index: 25685, label: 3
index: 25686, label: 343
index: 25687, label: 34
index: 25688, label: 331
index: 25689, label: 625
index: 25690, label: 198
index: 25691, label: 106
index: 25692, label: 198
index: 25693, label: 829
index: 25694, label: 83
index: 25695, label: 485
index: 25696, label: 251
index: 25697, label: 291
index: 25698, label: 291
index: 25699, label: 291
index: 25700, label: 291
index: 25701, label: 659
index: 25702, label: 251
index: 25703, label: 659
index: 25704, label: 659
index: 25705, label: 931
index: 25706, label: 931
index: 25707, label: 3
index: 25708, label: 3
index: 25709, label: 586
index: 25710, label: 270
index: 25711, label: 364
index: 25712, label: 364
index: 25713, label: 364
index: 25714, label: 892
index: 25715, label: 892
index: 25716, label: 892
index: 25717, label: 892
index: 25718, label: 892
index: 25719, label: 892
index: 25720, label: 892
index: 25721, label: 892
index: 25722, label: 892
index: 25723, label: 933
index: 2

index: 27184, label: 3
index: 27185, label: 3
index: 27186, label: 3
index: 27187, label: 701
index: 27188, label: 170
index: 27189, label: 3
index: 27190, label: 3
index: 27191, label: 3
index: 27192, label: 3
index: 27193, label: 3
index: 27194, label: 3
index: 27195, label: 707
index: 27196, label: 3
index: 27197, label: 3
index: 27198, label: 67
index: 27199, label: 2
index: 27200, label: 140
index: 27201, label: 3
index: 27202, label: 687
index: 27203, label: 3
index: 27204, label: 475
index: 27205, label: 930
index: 27206, label: 3
index: 27207, label: 3
index: 27208, label: 27
index: 27209, label: 301
index: 27210, label: 24
index: 27211, label: 480
index: 27212, label: 614
index: 27213, label: 3
index: 27214, label: 3
index: 27215, label: 3
index: 27216, label: 3
index: 27217, label: 468
index: 27218, label: 61
index: 27219, label: 3
index: 27220, label: 3
index: 27221, label: 29
index: 27222, label: 3
index: 27223, label: 80
index: 27224, label: 359
index: 27225, label: 3
inde

index: 28683, label: 3
index: 28684, label: 927
index: 28685, label: 985
index: 28686, label: 490
index: 28687, label: 362
index: 28688, label: 3
index: 28689, label: 918
index: 28690, label: 3
index: 28691, label: 133
index: 28692, label: 67
index: 28693, label: 952
index: 28694, label: 3
index: 28695, label: 358
index: 28696, label: 3
index: 28697, label: 3
index: 28698, label: 3
index: 28699, label: 3
index: 28700, label: 3
index: 28701, label: 983
index: 28702, label: 3
index: 28703, label: 818
index: 28704, label: 259
index: 28705, label: 491
index: 28706, label: 169
index: 28707, label: 143
index: 28708, label: 3
index: 28709, label: 550
index: 28710, label: 480
index: 28711, label: 551
index: 28712, label: 303
index: 28713, label: 69
index: 28714, label: 3
index: 28715, label: 988
index: 28716, label: 936
index: 28717, label: 3
index: 28718, label: 333
index: 28719, label: 169
index: 28720, label: 730
index: 28721, label: 176
index: 28722, label: 12
index: 28723, label: 246
inde

index: 30183, label: 3
index: 30184, label: 3
index: 30185, label: 3
index: 30186, label: 3
index: 30187, label: 3
index: 30188, label: 3
index: 30189, label: 307
index: 30190, label: 3
index: 30191, label: 3
index: 30192, label: 3
index: 30193, label: 3
index: 30194, label: 3
index: 30195, label: 3
index: 30196, label: 3
index: 30197, label: 3
index: 30198, label: 3
index: 30199, label: 3
index: 30200, label: 3
index: 30201, label: 3
index: 30202, label: 27
index: 30203, label: 3
index: 30204, label: 3
index: 30205, label: 91
index: 30206, label: 3
index: 30207, label: 116
index: 30208, label: 3
index: 30209, label: 3
index: 30210, label: 3
index: 30211, label: 3
index: 30212, label: 7
index: 30213, label: 3
index: 30214, label: 3
index: 30215, label: 2
index: 30216, label: 2
index: 30217, label: 7
index: 30218, label: 3
index: 30219, label: 3
index: 30220, label: 353
index: 30221, label: 3
index: 30222, label: 3
index: 30223, label: 3
index: 30224, label: 3
index: 30225, label: 3
ind

index: 31682, label: 774
index: 31683, label: 664
index: 31684, label: 560
index: 31685, label: 524
index: 31686, label: 844
index: 31687, label: 791
index: 31688, label: 620
index: 31689, label: 664
index: 31690, label: 64
index: 31691, label: 791
index: 31692, label: 285
index: 31693, label: 289
index: 31694, label: 378
index: 31695, label: 939
index: 31696, label: 227
index: 31697, label: 413
index: 31698, label: 432
index: 31699, label: 545
index: 31700, label: 271
index: 31701, label: 590
index: 31702, label: 432
index: 31703, label: 545
index: 31704, label: 939
index: 31705, label: 939
index: 31706, label: 939
index: 31707, label: 334
index: 31708, label: 432
index: 31709, label: 432
index: 31710, label: 227
index: 31711, label: 420
index: 31712, label: 366
index: 31713, label: 595
index: 31714, label: 196
index: 31715, label: 590
index: 31716, label: 146
index: 31717, label: 420
index: 31718, label: 939
index: 31719, label: 939
index: 31720, label: 432
index: 31721, label: 432
i

In [206]:
# Inspect the fitted cluster centres (a 2-D array, one row per centre).
kmeans.cluster_centers_

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [188]:
# Vectorise the query and convert the sparse result to a dense matrix.
input_tfidf = tfidf_vectorizer.transform(user_input).todense()
# getA() exposes the matrix as an ndarray; [0] selects the row for the query.
input_tfidf.getA()[0] 

array([0., 0., 0., ..., 0., 0., 0.])

In [112]:
# Cosine similarity implemented in plain Python
def dot_product(v1, v2):
    """Return the sum of the element-wise products of ``v1`` and ``v2``.

    Elements are paired with ``zip``, so anything beyond the length of the
    shorter input is ignored. Two empty inputs give ``0``.
    """
    total = 0
    for left, right in zip(v1, v2):
        total = total + left * right
    return total

def magnitude(vector):
    """Return the Euclidean length of ``vector``.

    Computed as the square root of the vector's dot product with itself.
    """
    squared_length = dot_product(vector, vector)
    return math.sqrt(squared_length)

def similarity(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    A tiny constant is added to the product of the magnitudes, so the
    division is still defined when either vector has zero magnitude; the
    result in that case is ``0.0``.
    """
    numerator = dot_product(v1, v2)
    denominator = magnitude(v1) * magnitude(v2) + .00000000001
    return numerator / denominator

In [189]:
# Similarity between the first row of `weight` and the query vector.
# NOTE(review): `weight` is not assigned in any cell shown in this notebook;
# it must come from earlier kernel state — define it explicitly before use.
similarity(weight[0], input_tfidf.getA()[0])

0.0

In [190]:
def cosine_similarity(x, y, norm=False):
    """Compute the cosine similarity of two equal-length vectors.

    Args:
        x: 1-D sequence or array of numbers.
        y: 1-D sequence or array of numbers, same length as ``x``.
        norm: when True, rescale the cosine from [-1, 1] to [0, 1]
            via ``0.5 * cos + 0.5``.

    Returns:
        float: the cosine of the angle between ``x`` and ``y`` (rescaled when
        ``norm`` is True). If either vector has zero magnitude the cosine is
        undefined and is taken to be 0.0 instead of producing nan.

    Raises:
        AssertionError: if ``x`` and ``y`` differ in length.
    """
    assert len(x) == len(y), "len(x) != len(y)"

    a = np.asarray(x, dtype=float)
    b = np.asarray(y, dtype=float)

    # Vectorised dot products replace the per-element Python loop.
    denominator = np.sqrt(np.dot(a, a)) * np.sqrt(np.dot(b, b))
    # Guard the division: a zero vector would otherwise give 0/0 -> nan.
    cos = float(np.dot(a, b) / denominator) if denominator else 0.0

    return 0.5 * cos + 0.5 if norm else cos  # optionally mapped into [0, 1]

In [203]:
# Cosine similarity between the query vector and every row of `weight`.
# NOTE(review): `weight` is not assigned in any cell shown in this notebook;
# it must come from earlier kernel state — define it explicitly before use.
# Extract the query row once instead of on every loop iteration.
query_vector = input_tfidf.getA()[0]

similar_result = []
for we in weight:
    similar_result.append(cosine_similarity(we, query_vector, norm=False))

In [204]:
# Show the similarity scores, one per row of `weight`.
# NOTE(review): this echoes the whole list; sorting and showing only the top
# few scores would be easier to read.
similar_result

[0.0,
 0.0,
 0.0,
 0.03392726453487765,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.13572688052049464,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.01689116002387832,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.011263084409621973,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,