In [1]:
import graphlab as gl

In [2]:
import pymysql

In [3]:
import os

# Connection settings passed as keyword arguments to pymysql.connect().
# Every deployment-specific value can be overridden through an environment
# variable so credentials do not have to live in the notebook; the literals
# are only fallbacks that keep the notebook runnable on the original setup.
config = {
    'host': os.environ.get('PROD_DB_HOST', 'localhost'),
    'user': os.environ.get('PROD_DB_USER', 'root'),
    # NOTE(review): set PROD_DB_PASSWORD and drop this fallback before the
    # notebook is shared or committed.
    'password': os.environ.get('PROD_DB_PASSWORD', '123456'),
    'db': os.environ.get('PROD_DB_NAME', 'prod_schema'),
    'charset': 'utf8mb4',
    # DictCursor makes every fetched row a dict keyed by column name.
    'cursorclass': pymysql.cursors.DictCursor
}
def db_execute(config, sql, params=None):
    """Run a single write statement on a fresh connection and commit it.

    Args:
        config: Keyword arguments forwarded to ``pymysql.connect``.
        sql: The statement to execute. Use ``%s`` placeholders together with
            ``params`` instead of formatting values into the string.
        params: Optional sequence or mapping of values bound to the
            placeholders in ``sql``. ``None`` (the default) executes ``sql``
            verbatim, exactly as before this argument existed.

    Returns:
        True once the statement has been executed and committed.

    Raises:
        Whatever ``pymysql`` raises on connection or execution failure; the
        connection is closed in either case and nothing is committed.
    """
    connection = pymysql.connect(**config)
    try:
        with connection.cursor() as cursor:
            # Let the driver escape the values rather than the caller.
            cursor.execute(sql, params)
        connection.commit()
        return True
    finally:
        connection.close()
        
def db_query(config, sql, params=None):
    """Run a read statement on a fresh connection and return every row.

    Args:
        config: Keyword arguments forwarded to ``pymysql.connect``.
        sql: The query to execute. Use ``%s`` placeholders together with
            ``params`` instead of formatting values into the string.
        params: Optional sequence or mapping of values bound to the
            placeholders in ``sql``. ``None`` (the default) executes ``sql``
            verbatim, exactly as before this argument existed.

    Returns:
        The result of ``cursor.fetchall()``; the row type is decided by the
        ``cursorclass`` entry in ``config``.

    Raises:
        Whatever ``pymysql`` raises on connection or execution failure; the
        connection is closed in either case.
    """
    connection = pymysql.connect(**config)
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql, params)
            return cursor.fetchall()
    finally:
        connection.close()

In [4]:
import jieba
from graphlab import SFrame
from graphlab import SArray
import jieba.analyse

In [5]:
def query_to_SFrame(config, sql):
    """Run ``sql`` through ``db_query`` and return the rows as an SFrame.

    The fetched rows are wrapped in an SFrame and the column named 'X1' is
    then unpacked with an empty prefix, so the resulting columns carry the
    bare key names.

    Args:
        config: Connection settings handed to ``db_query``.
        sql: The query whose rows should be loaded.

    Returns:
        The SFrame produced by unpacking the 'X1' column.
    """
    rows = db_query(config, sql)
    # Relies on the rows landing in a single column called 'X1'.
    return gl.SFrame(rows).unpack('X1', column_name_prefix='')

In [6]:
def column_to_tfidf(column, top_k=8):
    """Turn each text entry of ``column`` into a keyword-to-weight dict.

    Args:
        column: Iterable of text values; each one is passed to
            ``jieba.analyse.extract_tags``.
        top_k: Maximum number of keywords requested per entry. Defaults to 8,
            the value that used to be hard-coded.

    Returns:
        An SArray with one dict per input entry, built from the
        (keyword, weight) pairs returned by jieba, in input order.
    """
    return SArray([
        dict(jieba.analyse.extract_tags(content, topK=top_k, withWeight=True))
        for content in column
    ])

In [7]:
# Load every row of prod_schema.prod_info into an SFrame.
prod_sf = query_to_SFrame(config, "select * from prod_schema.prod_info")

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1494425116.log


This non-commercial license of GraphLab Create for academic use is assigned to workingjhy@gmail.com and will expire on April 29, 2018.


In [8]:
# Sanity check: list the columns that came back from the query.
prod_sf.column_names()

['pid', 'pname', 'pprice']

In [9]:
# Add a 'tf_idf' column holding the keyword-weight dict extracted from each
# product name. Note this mutates prod_sf in place.
prod_sf['tf_idf'] = column_to_tfidf(prod_sf['pname'])

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/nr/hxpbrpxj6fddzg_1bm2y2_100000gq/T/jieba.cache
Loading model cost 0.387 seconds.
Prefix dict has been built succesfully.


In [10]:
# Build a nearest-neighbour model over the keyword-weight dicts, identifying
# each product by its 'pid'.
prod_model = gl.nearest_neighbors.create(
    prod_sf,
    label='pid',
    features=['tf_idf'],
    method='auto',
    distance='euclidean',
)

In [13]:
# Store the nearest neighbours of every product in prod_schema.prod_nk.
#
# The model is queried once with the whole product frame instead of once per
# product through a filtered frame, and all rows are written over a single
# connection in one transaction instead of opening a connection per row.
# NOTE(review): assumes 'pid' is unique in prod_sf, so the batched query yields
# the same (query, neighbour) pairs as the former per-product queries.
# NOTE(review): re-running this cell inserts the same rows again unless the
# table enforces a unique key - confirm against the schema.
NEIGHBOR_COUNT = 6  # presumably the product itself plus five others - confirm

# `rank` is a reserved word from MySQL 8.0.2 on, so it must be back-quoted.
# Values are bound by the driver rather than formatted into the statement.
INSERT_NEIGHBOR_SQL = (
    "insert into prod_schema.prod_nk (pid, kn_pid, distance, `rank`) "
    "values (%s, %s, %s, %s)"
)

nearest = prod_model.query(prod_sf, label='pid', k=NEIGHBOR_COUNT, verbose=False)
neighbor_rows = [
    (
        int(rec['query_label']),
        int(rec['reference_label']),
        # Keep the four-decimal precision the former '%.4f' formatting stored.
        round(float(rec['distance']), 4),
        int(rec['rank']),
    )
    for rec in nearest
]

connection = pymysql.connect(**config)
try:
    with connection.cursor() as cursor:
        cursor.executemany(INSERT_NEIGHBOR_SQL, neighbor_rows)
    connection.commit()
finally:
    connection.close()