In [2]:
import graphlab as gl

In [3]:
import pymysql

In [4]:
import os

# Connection settings passed straight to pymysql.connect(**config).
# Host, user and password can be overridden through environment variables so
# real credentials do not have to live in the notebook; the fallbacks keep the
# original local-development values, so behaviour is unchanged when the
# variables are not set.
config = {
    'host': os.environ.get('PROD_DB_HOST', 'localhost'),
    'user': os.environ.get('PROD_DB_USER', 'root'),
    'password': os.environ.get('PROD_DB_PASSWORD', '123456'),
    'db': 'prod_schema',
    'charset': 'utf8mb4',
    # Rows come back as dicts keyed by column name instead of tuples.
    'cursorclass': pymysql.cursors.DictCursor
}
def db_execute(config, sql, args=None):
    """Run one write statement on a fresh connection and commit it.

    Parameters
    ----------
    config : dict
        Keyword arguments forwarded to ``pymysql.connect``.
    sql : str
        Statement to execute. May contain ``%s`` placeholders when ``args``
        is given.
    args : tuple, list or dict, optional
        Values bound to the placeholders by the driver. Prefer this over
        formatting values into ``sql`` by hand, so quoting and escaping are
        handled by PyMySQL. ``None`` (the default) executes ``sql`` as-is.

    Returns
    -------
    bool
        ``True`` once the statement has been executed and committed.

    Raises
    ------
    Any exception raised by PyMySQL propagates to the caller; the connection
    is closed either way and nothing is committed in that case.
    """
    connection = pymysql.connect(**config)
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql, args)
        # Commit only after the statement succeeded.
        connection.commit()
        return True
    finally:
        connection.close()
        
def db_query(config, sql, args=None):
    """Run one read statement on a fresh connection and return all rows.

    Parameters
    ----------
    config : dict
        Keyword arguments forwarded to ``pymysql.connect``.
    sql : str
        Query to execute. May contain ``%s`` placeholders when ``args`` is
        given.
    args : tuple, list or dict, optional
        Values bound to the placeholders by the driver, so callers do not
        need to format values into ``sql`` themselves. ``None`` (the default)
        executes ``sql`` as-is.

    Returns
    -------
    Whatever ``cursor.fetchall()`` returns for the configured cursor class.

    Raises
    ------
    Any exception raised by PyMySQL propagates to the caller; the connection
    is closed either way.
    """
    connection = pymysql.connect(**config)
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql, args)
            return cursor.fetchall()
    finally:
        connection.close()

In [5]:
import jieba
from graphlab import SFrame
from graphlab import SArray
import jieba.analyse

In [6]:
def query_to_SFrame(config, sql):
    """Run ``sql`` through ``db_query`` and return the rows as an SFrame.

    The rows are first wrapped in a single-column SFrame; that column
    ('X1') is then unpacked with an empty prefix, so the resulting columns
    carry the unpacked keys as their names.
    """
    rows = db_query(config, sql)
    return gl.SFrame(rows).unpack('X1', column_name_prefix='')

In [7]:
def column_to_tfidf(column):
    """Turn each text entry of ``column`` into a keyword -> weight dict.

    For every entry, jieba's ``extract_tags`` is asked for at most 8
    keywords together with their weights; the (keyword, weight) pairs are
    collected into a dict. The dicts are returned as an SArray, in the same
    order as the input entries.
    """
    return SArray([
        dict(jieba.analyse.extract_tags(text, topK=8, withWeight=True))
        for text in column
    ])

In [8]:
# Load every row of prod_schema.prod_info into an SFrame.
prod_sf = query_to_SFrame(config, "select * from prod_schema.prod_info")

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1494985539.log


This non-commercial license of GraphLab Create for academic use is assigned to workingjhy@gmail.com and will expire on April 29, 2018.


In [8]:
# Sanity check: show which columns the loaded product frame has.
prod_sf.column_names()

['pid', 'pname', 'pprice']

In [9]:
# Add a 'tf_idf' column holding, per product, the keyword -> weight dict
# extracted from its 'pname' text.
prod_sf['tf_idf'] = column_to_tfidf(prod_sf['pname'])

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/nr/hxpbrpxj6fddzg_1bm2y2_100000gq/T/jieba.cache
Loading model cost 1.553 seconds.
Prefix dict has been built succesfully.


In [10]:
# Nearest-neighbour model over the products, using the 'tf_idf' dict column as
# the only feature and Euclidean distance. Rows are labelled by 'pname', so the
# reference_label column of query results contains product names.
prod_model = gl.nearest_neighbors.create(prod_sf, label='pname', features = ['tf_idf'], 
                                         method='auto', distance = 'euclidean')

In [13]:
# Store, for every product, its 6 nearest neighbours (by tf_idf) in
# prod_schema.prod_nk as (pid, kn_pid, distance, rank) rows.
#
# The neighbour ids written to the table must be integer pids, so a model
# labelled by 'pid' is built here. prod_model is labelled by 'pname', which
# makes its reference_label a product name that cannot be stored as kn_pid.
prod_nk_model = gl.nearest_neighbors.create(prod_sf, label='pid', features=['tf_idf'],
                                            method='auto', distance='euclidean')

# One batch query for all products instead of one filtered query per product.
# Each product's own row is expected to come back as rank 1 with distance 0.
nearest = prod_nk_model.query(prod_sf, label='pid', k=6, verbose=False)

# `rank` is a reserved word in MySQL 8.0.2+, hence the backticks. Values are
# bound by the driver instead of being formatted into the statement.
insert_sql = ("insert into prod_schema.prod_nk (pid, kn_pid, distance, `rank`) "
              "values (%s, %s, %s, %s)")
rows = [
    # Distances are rounded to 4 decimals, matching the precision that was
    # previously written with '%.4f'.
    (int(rec['query_label']), int(rec['reference_label']),
     round(rec['distance'], 4), int(rec['rank']))
    for rec in nearest
]

# A single connection and a single commit for the whole batch: either all
# neighbour rows are stored or, on error, none of them are.
connection = pymysql.connect(**config)
try:
    with connection.cursor() as cursor:
        cursor.executemany(insert_sql, rows)
    connection.commit()
finally:
    connection.close()

In [7]:
# Load every row of prod_schema.prod_attri into an SFrame.
prod_attri_sf = query_to_SFrame(config, "select * from prod_schema.prod_attri")

This non-commercial license of GraphLab Create for academic use is assigned to workingjhy@gmail.com and will expire on April 29, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1494674304.log


In [8]:
# Add a 'tf_idf' column holding, per row, the keyword -> weight dict extracted
# from its 'serie' text.
prod_attri_sf['tf_idf'] = column_to_tfidf(prod_attri_sf['serie'])

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/nr/hxpbrpxj6fddzg_1bm2y2_100000gq/T/jieba.cache
Loading model cost 1.722 seconds.
Prefix dict has been built succesfully.


In [10]:
# Nearest-neighbour model over the attribute rows, using the 'tf_idf' dict
# column as the only feature and Euclidean distance. Rows are labelled by
# 'pid', so the reference_label column of query results contains pids.
prod_model2 = gl.nearest_neighbors.create(prod_attri_sf, label='pid', features = ['tf_idf'], 
                                         method='auto', distance = 'euclidean')

In [17]:
# Inspect the attribute model: ask for up to 200 neighbours of the row(s) with
# pid == 1 and print up to 200 result rows.
prod_model2.query(prod_attri_sf[prod_attri_sf['pid'] == 1], label='pid', k = 200,verbose=False).print_rows(num_rows=200)

+-------------+-----------------+---------------+------+
| query_label | reference_label |    distance   | rank |
+-------------+-----------------+---------------+------+
|      1      |        1        |      0.0      |  1   |
|      1      |        32       | 5.40530225184 |  2   |
|      1      |        9        | 6.68100214129 |  3   |
|      1      |        16       | 6.68100214129 |  4   |
|      1      |        24       | 6.68100214129 |  5   |
|      1      |        25       | 6.68100214129 |  6   |
|      1      |        26       | 6.68100214129 |  7   |
|      1      |        27       | 6.68100214129 |  8   |
|      1      |        42       | 6.68100214129 |  9   |
|      1      |        53       | 6.68100214129 |  10  |
|      1      |        54       | 6.68100214129 |  11  |
|      1      |        55       | 6.68100214129 |  12  |
|      1      |        57       | 6.68100214129 |  13  |
|      1      |        58       | 6.68100214129 |  14  |
|      1      |        59      

In [14]:
# Inspect the name-based model: the 10 nearest neighbours of the product with
# pid == 1, with both query and reference rows identified by product name.
prod_model.query(prod_sf[prod_sf['pid'] == 1], label='pname', k = 10,verbose=False)

query_label,reference_label,distance,rank
LUSH岚舒 日本正品 BIG丰盈洗发露 蓬松 ...,LUSH岚舒 日本正品 BIG丰盈洗发露 蓬松 ...,0.0,1
LUSH岚舒 日本正品 BIG丰盈洗发露 蓬松 ...,杂果宾治洗发露 日 本LUSH岚舒控油舒缓 ...,1.94041065431,2
LUSH岚舒 日本正品 BIG丰盈洗发露 蓬松 ...,LUSH岚舒 日本正品� �罗亮泽洗发露 ...,2.02918189075,3
LUSH岚舒 日本正品 BIG丰盈洗发露 蓬松 ...,LUSH岚舒 日本正品� �果爽洗发露 ...,2.04523130313,4
LUSH岚舒 日本正品 BIG丰盈洗发露 蓬松 ...,LUSH岚舒 日本正品� �曲洗发露 损伤修� ...,2.30896973405,5
LUSH岚舒 日本正品 BIG丰盈洗发露 蓬松 ...,日本LUSH岚舒 薰衣迷手工洗发皂 ...,2.311619626,6
LUSH岚舒 日本正品 BIG丰盈洗发露 蓬松 ...,LUSH岚舒 日本正品� �爱蜜糖儿洗发露 ...,2.40133684412,7
LUSH岚舒 日本正品 BIG丰盈洗发露 蓬松 ...,LUSH岚舒 日本正品 草本洗面膏 ...,2.52654607415,8
LUSH岚舒 日本正品 BIG丰盈洗发露 蓬松 ...,LUSH岚舒 日本正品 黑夜天使洗面膏 控 ...,2.57419932137,9
LUSH岚舒 日本正品 BIG丰盈洗发露 蓬松 ...,LUSH岚舒 英国正品 古铜面部修饰霜 保 ...,2.57488576553,10
