In [2]:
import ast
import os
import sys
# Put the project root (the parent of the current working directory) at the
# front of sys.path so the project-local `server` package imported below
# resolves when this notebook is run on its own.
BASE_DIR = os.path.dirname(os.getcwd())
# Remove any earlier copy first: re-running this cell must not pile up
# duplicate sys.path entries.
sys.path[:] = [BASE_DIR] + [entry for entry in sys.path if entry != BASE_DIR]

# Python interpreter PySpark should use; exported for both the workers
# (PYSPARK_PYTHON) and the driver (PYSPARK_DRIVER_PYTHON).
# NOTE(review): this is an absolute, machine-specific path — adjust it to the
# local environment before running.
PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON
from pyspark import SparkConf
from pyspark.sql import SparkSession
from server.utils import HBaseUtils
from server import pool
from pyspark.ml.linalg import DenseVector
from pyspark.ml.classification import LogisticRegressionModel
import pandas as pd


# Spark settings for this ranking session, given as (key, value) pairs.
config = (
    ("spark.app.name", "sort"),
    # Memory the app requests at start-up (default 1g).
    ("spark.executor.memory", "2g"),
    ("spark.master", 'local[2]'),
    # Number of CPU cores each Spark executor uses.
    ("spark.executor.cores", "2"),
)

# Apply every pair to a fresh SparkConf, then get (or reuse) the session.
conf = SparkConf().setAll(config)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [7]:
# 1. Read the user's feature weights from the HBase user feature table.
hbu = HBaseUtils(pool)
try:
    # Use a user id / channel id that actually exists in your database.
    raw_user_feature = hbu.get_table_row('ctr_feature_user',
                                         '{}'.format(1113244157343694848).encode(),
                                         'channel:{}'.format(18).encode())
    # The stored cell is the textual form of a Python list. Parse it with
    # ast.literal_eval instead of eval(): literal_eval only accepts literals,
    # so a corrupted or malicious cell cannot execute code in this kernel.
    if isinstance(raw_user_feature, bytes):
        raw_user_feature = raw_user_feature.decode()
    user_feature = ast.literal_eval(raw_user_feature)
except Exception as e:
    # Best effort: a missing or unparsable row falls back to "no user
    # feature", but the reason is reported instead of being swallowed.
    print("Could not load user feature, falling back to []: {!r}".format(e))
    user_feature = []

In [8]:
# If the user feature exists, fetch the article features and build the samples;
# the number of samples equals the number of articles.
# if user_feature:
user_feature

[0.16443570267384364,
 0.16443570267384364,
 0.16443570267384364,
 0.16443570267384364,
 0.16443570267384364,
 0.16443570267384364,
 0.16443570267384364,
 0.16443570267384364,
 0.16443570267384364,
 0.16443570267384364]

In [9]:
# 2. Read each article's features from the feature table and merge them with
#    the user feature to build one prediction sample per article.
result = []
# Only build samples when a user feature was loaded: with an empty
# user_feature every vector would come out 10 values short of the 121
# the layout below describes.
if user_feature:
    for article_id in [17749, 17748, 44371, 44368]:

        try:
            raw_article_feature = hbu.get_table_row('ctr_feature_article',
                                                    '{}'.format(article_id).encode(),
                                                    'article:{}'.format(article_id).encode())
            # Parse with ast.literal_eval rather than eval() so that stored
            # data can never be executed as code.
            if isinstance(raw_article_feature, bytes):
                raw_article_feature = raw_article_feature.decode()
            article_feature = ast.literal_eval(raw_article_feature)
            # article_feature layout: 1 channel_id, 10 article_weights,
            # 100 article_vector values.
        except Exception as e:
            # Best effort: an article without stored features gets an
            # all-zero feature row, and the failure is reported.
            print("Could not load features for article {}, using zeros: {!r}".format(article_id, e))
            article_feature = [0.0] * 111

        # One user and four articles give four samples. The order must match
        # the training feature order:
        # 'channel_id', 'articlevector', 'weights', 'article_weights'.
        f = []
        # channel_id
        f.append(article_feature[0])
        # articlevector
        f.extend(article_feature[11:])
        # weights (user side)
        f.extend(user_feature)
        # article_weights
        f.extend(article_feature[1:11])
        # f is now the complete 121-dimensional feature list of one sample.
        features = DenseVector(f)
        # NOTE(review): this user id differs from the one used to fetch
        # user_feature earlier in the notebook (1113244157343694848) —
        # confirm which id is intended.
        result.append([1115629498121846784, article_id, features])
else:
    print("user_feature is empty; no prediction samples were built")

In [10]:
result

[[1115629498121846784,
  17749,
  DenseVector([18.0, 0.1227, 0.081, 0.034, 0.1269, -0.0256, 0.0209, -0.0063, 0.0983, -0.1121, 0.0009, -0.1636, 0.0753, 0.0187, 0.1265, -0.0363, 0.0467, -0.0083, 0.0357, -0.0022, 0.0354, -0.0645, 0.0585, -0.0616, 0.0242, -0.1319, -0.0823, 0.0962, 0.1201, 0.0466, -0.1208, -0.0107, 0.0868, 0.1241, 0.112, -0.111, 0.0271, -0.0727, -0.0639, 0.0935, -0.0096, 0.0967, -0.1252, -0.0392, 0.0469, 0.0027, 0.0239, 0.16, 0.0368, -0.1017, 0.0621, -0.029, -0.0463, 0.1203, 0.0851, 0.0117, 0.0537, 0.0098, -0.0483, 0.0364, 0.074, 0.0594, -0.122, 0.0138, -0.1525, -0.0137, 0.0899, -0.0316, -0.0336, 0.0451, -0.0646, 0.0084, -0.0082, -0.0234, -0.0715, 0.0266, 0.0372, 0.0822, -0.0082, 0.1135, -0.038, 0.0764, -0.1058, 0.0339, 0.0042, 0.0242, 0.0618, 0.0107, 0.0233, -0.0336, 0.1457, 0.0417, 0.1061, 0.1189, 0.0262, 0.0553, -0.0316, -0.0822, -0.1244, -0.0507, 0.197, 0.1644, 0.1644, 0.1644, 0.1644, 0.1644, 0.1644, 0.1644, 0.1644, 0.1644, 0.1644, 0.1297, 0.1385, 0.1575, 0.1902, 0.2114

In [11]:
# Wrap the assembled samples in a pandas DataFrame with one row per article
# and the columns user_id, article_id and features.
df = pd.DataFrame(result, columns=['user_id', 'article_id', 'features'])

In [12]:
df

Unnamed: 0,user_id,article_id,features
0,1115629498121846784,17749,"[18.0, 0.12274417509787901, 0.0809537791707485..."
1,1115629498121846784,17748,"[18.0, 0.12336761023537457, 0.0566188635715047..."
2,1115629498121846784,44371,"[18.0, 0.12649150229610565, 0.0465560077913559..."
3,1115629498121846784,44368,"[18.0, 0.07960356197575431, 0.0482095266637195..."


In [13]:
# 3. Predict, then sort and filter.
# Convert the pandas samples into a Spark DataFrame for the model.
spark_df = spark.createDataFrame(df)

In [14]:
spark_df.show()

+-------------------+----------+--------------------+
|            user_id|article_id|            features|
+-------------------+----------+--------------------+
|1115629498121846784|     17749|[18.0,0.122744175...|
|1115629498121846784|     17748|[18.0,0.123367610...|
|1115629498121846784|     44371|[18.0,0.126491502...|
|1115629498121846784|     44368|[18.0,0.079603561...|
+-------------------+----------+--------------------+



In [15]:
# Load the saved logistic regression model from HDFS for prediction.
model = LogisticRegressionModel.load("hdfs://hadoop-master:9000/headlines/models/LR.obj")

In [16]:
# Score every sample; the result keeps the input columns and adds
# rawPrediction, probability and prediction.
prediction = model.transform(spark_df)

In [17]:
prediction.show()

+-------------------+----------+--------------------+--------------------+--------------------+----------+
|            user_id|article_id|            features|       rawPrediction|         probability|prediction|
+-------------------+----------+--------------------+--------------------+--------------------+----------+
|1115629498121846784|     17749|[18.0,0.122744175...|[2.13649261419033...|[0.89439980005788...|       0.0|
|1115629498121846784|     17748|[18.0,0.123367610...|[2.13728984545425...|[0.89447507392007...|       0.0|
|1115629498121846784|     44371|[18.0,0.126491502...|[2.12135632430905...|[0.89296163766490...|       0.0|
|1115629498121846784|     44368|[18.0,0.079603561...|[2.13039215517524...|[0.89382223118747...|       0.0|
+-------------------+----------+--------------------+--------------------+--------------------+----------+



In [18]:
# Rank the articles by the probability that the user clicks each one.
# NOTE(review): the original comment here read "channel_id == article_id";
# its intent is unclear — confirm or remove.
def convert_type(row):
    """Turn one prediction row into plain Python values.

    Args:
        row: an object with an ``article_id`` attribute and an indexable
            ``probability`` attribute.

    Returns:
        A tuple ``(article_id, probability)`` where ``article_id`` is an
        ``int`` and ``probability`` is element 1 of ``row.probability``
        as a ``float``.
    """
    article = int(row.article_id)
    click_probability = float(row.probability[1])
    return article, click_probability

# Keep only the article id and its probability (converted by convert_type),
# then order the rows from the highest probability to the lowest.
res = (
    prediction
    .select(['article_id', 'probability'])
    .rdd
    .map(convert_type)
    .toDF(['article_id', 'probability'])
    .sort('probability', ascending=False)
)

In [19]:
res.show()

+----------+-------------------+
|article_id|        probability|
+----------+-------------------+
|     44371|0.10703836233509091|
|     44368|0.10617776881252859|
|     17749|0.10560019994211393|
|     17748|0.10552492607992216|
+----------+-------------------+



In [20]:
# Filter the ranked rows down to the recommended article list:
# keep at most the first 200 article ids, in ranking order.
article_list = [row.article_id for row in res.collect()][:200]
# Normalise every id to a plain int.
reco_set = [int(article_id) for article_id in article_list]

In [21]:
reco_set

[44371, 44368, 17749, 17748]