In [1]:
import os
import sys
# Put the directory two levels above the current working directory at the front of
# sys.path so that the package imports below resolve when this notebook is run directly
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))

PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# When several Python versions are installed, leaving the interpreter unspecified is likely to cause errors
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

from offline import SparkSessionBase

class UpdateRecall(SparkSessionBase):
    """Holder of the Spark session used by the recall-update cells of this notebook."""

    # Class-level settings; presumably read by SparkSessionBase._create_spark_session,
    # which is defined outside this file -- confirm there.
    ENABLE_HIVE_SUPPORT = True
    SPARK_APP_NAME = "updateRecall"

    def __init__(self):
        """Create the Spark session and expose it as ``self.spark``."""
        session = self._create_spark_session()
        self.spark = session

ur = UpdateRecall()

## 将数据处理成ALS模型需要的数据类型

In [2]:
# Read the data: switch to the `profile` database and keep only the
# user_id / article_id / clicked columns of user_article_basic
ur.spark.sql("use profile")
user_article_click = ur.spark.sql("select * from user_article_basic").select(['user_id', 'article_id', 'clicked'])

In [3]:
user_article_click.show()

+-------------------+----------+-------+
|            user_id|article_id|clicked|
+-------------------+----------+-------+
|1105045287866466304|     14225|  false|
|1106476833370537984|     14208|  false|
|1111189494544990208|     19322|  false|
|1111524501104885760|     44161|  false|
|1112727762809913344|     18172|   true|
|                  1|     44386|   true|
|                  1|     44696|  false|
|                 10|     43907|  false|
|1106473203766657024|     16005|  false|
|1108264901190615040|     15196|  false|
|                 23|     44739|   true|
|                 33|     13570|  false|
|                  1|     17632|  false|
|1106473203766657024|     17665|  false|
|1111189494544990208|     44368|  false|
|                 10|     44368|  false|
|1105093883106164736|     15750|  false|
|1106396183141548032|     19476|  false|
|1111524501104885760|     19233|  false|
|                  2|     44371|   true|
+-------------------+----------+-------+
only showing top

In [4]:
# Convert the type of the `clicked` column
def boolean_to_int(row):
    """Return ``(user_id, article_id, clicked)`` for ``row`` with ``clicked`` cast to ``int``."""
    user_id, article_id = row.user_id, row.article_id
    clicked_as_int = int(row.clicked)
    return (user_id, article_id, clicked_as_int)

user_article_click = user_article_click.rdd.map(boolean_to_int).toDF(['user_id', 'article_id', 'clicked'])

In [5]:
user_article_click.show()

+-------------------+----------+-------+
|            user_id|article_id|clicked|
+-------------------+----------+-------+
|1105045287866466304|     14225|      0|
|1106476833370537984|     14208|      0|
|1111189494544990208|     19322|      0|
|1111524501104885760|     44161|      0|
|1112727762809913344|     18172|      1|
|                  1|     44386|      1|
|                  1|     44696|      0|
|                 10|     43907|      0|
|1106473203766657024|     16005|      0|
|1108264901190615040|     15196|      0|
|                 23|     44739|      1|
|                 33|     13570|      0|
|                  1|     17632|      0|
|1106473203766657024|     17665|      0|
|1111189494544990208|     44368|      0|
|                 10|     44368|      0|
|1105093883106164736|     15750|      0|
|1106396183141548032|     19476|      0|
|1111524501104885760|     19233|      0|
|                  2|     44371|      1|
+-------------------+----------+-------+
only showing top

In [6]:
# Encode user_id and article_id
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
# User and article ids exceed the largest integer value ALS accepts, so they are converted with StringIndexer
# StringIndexer numbers the categories from 0 by frequency of occurrence; the most frequent one gets index 0
user_id_indexer = StringIndexer(inputCol='user_id', outputCol='als_user_id')
article_id_indexer = StringIndexer(inputCol='article_id', outputCol='als_article_id')
# Both indexers are fitted and applied together as one pipeline on the same DataFrame
pip = Pipeline(stages=[user_id_indexer, article_id_indexer])
pip_fit = pip.fit(user_article_click)
als_user_article_click = pip_fit.transform(user_article_click)

In [7]:
als_user_article_click.show()

+-------------------+----------+-------+-----------+--------------+
|            user_id|article_id|clicked|als_user_id|als_article_id|
+-------------------+----------+-------+-----------+--------------+
|1105045287866466304|     14225|      0|        4.0|          15.0|
|1106476833370537984|     14208|      0|        2.0|           2.0|
|1111189494544990208|     19322|      0|        1.0|         133.0|
|1111524501104885760|     44161|      0|        9.0|          37.0|
|1112727762809913344|     18172|      1|       12.0|          54.0|
|                  1|     44386|      1|       10.0|          11.0|
|                  1|     44696|      0|       10.0|          97.0|
|                 10|     43907|      0|        3.0|           1.0|
|1106473203766657024|     16005|      0|        5.0|          32.0|
|1108264901190615040|     15196|      0|        6.0|           7.0|
|                 23|     44739|      1|       17.0|           4.0|
|                 33|     13570|      0|       1

In [8]:
# ALS model training and recommendation
from pyspark.ml.recommendation import ALS
# Train the model on the indexed ids, using the 0/1 `clicked` value as the rating
# NOTE(review): no `seed` is passed, so results presumably differ between runs -- confirm whether that matters
# NOTE(review): checkpointInterval=1 is set but no checkpoint directory is configured in this notebook -- confirm
als = ALS(userCol='als_user_id', itemCol='als_article_id', ratingCol='clicked', checkpointInterval=1)
model = als.fit(als_user_article_click)
# Recommend 100 articles for every user
recall_res = model.recommendForAllUsers(100)

In [9]:
recall_res.show()

+-----------+--------------------+
|als_user_id|     recommendations|
+-----------+--------------------+
|         12|[[206,0.23789279]...|
|          1|[[93,0.43207034],...|
|         13|[[86,0.2539775], ...|
|          6|[[263,0.16947696]...|
|         16|[[0,0.0], [10,0.0...|
|          3|[[194,0.041858666...|
|         20|[[0,0.0], [10,0.0...|
|          5|[[263,0.020656489...|
|         19|[[25,0.23241602],...|
|         15|[[82,0.71069384],...|
|         17|[[120,0.3641395],...|
|          9|[[0,0.0], [10,0.0...|
|          4|[[263,0.018508907...|
|          8|[[263,0.36379915]...|
|          7|[[120,0.063147336...|
|         10|[[206,0.78249323]...|
|         11|[[206,0.93888915]...|
|         14|[[206,0.6858768],...|
|          2|[[263,0.6295867],...|
|          0|[[251,0.64237577]...|
+-----------+--------------------+
only showing top 20 rows



In [10]:
# Keep the mapping between the StringIndexer-encoded user_id / article_id and the original values.
# Grouping by the original id and taking max() of the encoded id yields one row per original id
# (presumably every occurrence of an id carries the same encoded value, so max() just picks it).
refection_user = als_user_article_click.groupBy(['user_id']).max('als_user_id').withColumnRenamed('max(als_user_id)', 'als_user_id')
refection_article = als_user_article_click.groupBy(['article_id']).max('als_article_id').withColumnRenamed('max(als_article_id)', 'als_article_id')

In [11]:
refection_user.show()

+-------------------+-----------+
|            user_id|als_user_id|
+-------------------+-----------+
|1106473203766657024|        5.0|
|1103195673450250240|        7.0|
|1105045287866466304|        4.0|
|1111524501104885760|        9.0|
|1105105185656537088|        8.0|
|1113316420155867136|       18.0|
|                 33|       13.0|
|                  1|       10.0|
|1113244157343694848|       15.0|
|                 10|        3.0|
|1113053603926376448|       20.0|
|1112727762809913344|       12.0|
|                  2|       11.0|
|                  4|       14.0|
|1106476833370537984|        2.0|
|1106396183141548032|        0.0|
|                 38|       16.0|
|                 23|       17.0|
|1108264901190615040|        6.0|
|1111189494544990208|        1.0|
+-------------------+-----------+
only showing top 20 rows



In [12]:
# Join the recommendation result with the refection_user mapping table to recover the original user_id
recall_res = recall_res.join(refection_user, on=['als_user_id'], how='left').select(['als_user_id', 'recommendations', 'user_id'])

In [13]:
recall_res.show()

+-----------+--------------------+-------------------+
|als_user_id|     recommendations|            user_id|
+-----------+--------------------+-------------------+
|          8|[[263,0.36379915]...|1105105185656537088|
|          0|[[251,0.64237577]...|1106396183141548032|
|          7|[[120,0.063147336...|1103195673450250240|
|         18|[[0,0.0], [10,0.0...|1113316420155867136|
|          1|[[93,0.43207034],...|1111189494544990208|
|          4|[[263,0.018508907...|1105045287866466304|
|         11|[[206,0.93888915]...|                  2|
|         14|[[206,0.6858768],...|                  4|
|          3|[[194,0.041858666...|                 10|
|         19|[[25,0.23241602],...|1105093883106164736|
|          2|[[263,0.6295867],...|1106476833370537984|
|         17|[[120,0.3641395],...|                 23|
|         10|[[206,0.78249323]...|                  1|
|         13|[[86,0.2539775], ...|                 33|
|          6|[[263,0.16947696]...|1108264901190615040|
|         

In [14]:
# Explode the recommendations column into one row per (user_id, recommended article) pair,
# then drop the array column and the encoded user id, which are no longer needed
import pyspark.sql.functions as F
recall_res = recall_res.withColumn('recommendation', F.explode('recommendations')).drop('recommendations').drop('als_user_id')

In [15]:
recall_res.show()

+-------------------+----------------+
|            user_id|  recommendation|
+-------------------+----------------+
|1105105185656537088|[263,0.36379915]|
|1105105185656537088| [115,0.2490875]|
|1105105185656537088| [105,0.2490875]|
|1105105185656537088|[251,0.18787636]|
|1105105185656537088|[235,0.18787636]|
|1105105185656537088|[176,0.18787636]|
|1105105185656537088|[222,0.18787636]|
|1105105185656537088|[236,0.18787636]|
|1105105185656537088| [15,0.15890628]|
|1105105185656537088|[100,0.13856897]|
|1105105185656537088|[36,0.113783054]|
|1105105185656537088| [12,0.10552912]|
|1105105185656537088| [5,0.067999065]|
|1105105185656537088|[20,0.039709307]|
|1105105185656537088| [40,0.03743278]|
|1105105185656537088| [6,0.035484646]|
|1105105185656537088|  [3,0.03392289]|
|1105105185656537088|[13,0.032818202]|
|1105105185656537088| [8,0.030078007]|
|1105105185656537088|[38,0.026264708]|
+-------------------+----------------+
only showing top 20 rows



In [16]:
# Pull out the encoded article id so it can be mapped back to the original article_id
def _article_id(row):
    """Return ``(user_id, first element of row.recommendation)`` for ``row``."""
    user_id = row.user_id
    recommendation = row.recommendation
    return (user_id, recommendation[0])

In [17]:
recall_res = recall_res.rdd.map(_article_id).toDF(['user_id', 'als_article_id'])

In [18]:
recall_res.show()

+-------------------+--------------+
|            user_id|als_article_id|
+-------------------+--------------+
|1105105185656537088|           263|
|1105105185656537088|           115|
|1105105185656537088|           105|
|1105105185656537088|           251|
|1105105185656537088|           235|
|1105105185656537088|           176|
|1105105185656537088|           222|
|1105105185656537088|           236|
|1105105185656537088|            15|
|1105105185656537088|           100|
|1105105185656537088|            36|
|1105105185656537088|            12|
|1105105185656537088|             5|
|1105105185656537088|            20|
|1105105185656537088|            40|
|1105105185656537088|             6|
|1105105185656537088|             3|
|1105105185656537088|            13|
|1105105185656537088|             8|
|1105105185656537088|            38|
+-------------------+--------------+
only showing top 20 rows



In [19]:
recall_res = recall_res.join(refection_article, on=['als_article_id'], how='left')

In [20]:
recall_res.show()

+--------------+-------------------+----------+
|als_article_id|            user_id|article_id|
+--------------+-------------------+----------+
|           170|1113316420155867136|    134730|
|           170|1105093883106164736|    134730|
|           170|1113053603926376448|    134730|
|           170|1111524501104885760|    134730|
|           170|                 38|    134730|
|           184|1105093883106164736|     19494|
|           160|1105105185656537088|     18127|
|           160|1106396183141548032|     18127|
|           160|1103195673450250240|     18127|
|           160|1113316420155867136|     18127|
|           160|1111189494544990208|     18127|
|           160|1105045287866466304|     18127|
|           160|                  2|     18127|
|           160|                  4|     18127|
|           160|                 10|     18127|
|           160|1106476833370537984|     18127|
|           160|                 23|     18127|
|           160|                  1|    

In [21]:
# Look up the channel of every article; recommendations are stored per channel for each user
ur.spark.sql("use toutiao")
news_article_basic = ur.spark.sql("select article_id, channel_id from news_article_basic")

# Left join: recommended articles without a match in news_article_basic get a null channel_id
als_recall = recall_res.join(news_article_basic, on=['article_id'], how='left')

In [22]:
als_recall.show()

+----------+--------------+-------------------+----------+
|article_id|als_article_id|            user_id|channel_id|
+----------+--------------+-------------------+----------+
|    134730|           170|1113316420155867136|        18|
|    134730|           170|1105093883106164736|        18|
|    134730|           170|1113053603926376448|        18|
|    134730|           170|1111524501104885760|        18|
|    134730|           170|                 38|        18|
|     19494|           184|1105093883106164736|        18|
|     18127|           160|1105105185656537088|        18|
|     18127|           160|1106396183141548032|        18|
|     18127|           160|1103195673450250240|        18|
|     18127|           160|1113316420155867136|        18|
|     18127|           160|1111189494544990208|        18|
|     18127|           160|1105045287866466304|        18|
|     18127|           160|                  2|        18|
|     18127|           160|                  4|        1

In [23]:
# Every article now carries its channel_id;
# gather the recommended article ids per (user_id, channel_id) pair
als_recall = als_recall.groupBy('user_id', 'channel_id').agg(
    F.collect_list('article_id').alias('article_list'))

In [24]:
# Remove rows that contain missing values
als_recall = als_recall.dropna()
als_recall.show()

+-------------------+----------+--------------------+
|            user_id|channel_id|        article_list|
+-------------------+----------+--------------------+
|1113244157343694848|         7|    [141437, 141469]|
|1108264901190615040|         7|            [141437]|
|                 33|        13|            [141431]|
|1106396183141548032|         7|    [141437, 141469]|
|1103195673450250240|         5|            [141440]|
|1108264901190615040|        18|[18127, 16421, 18...|
|1106473203766657024|        18|[18127, 16421, 18...|
|                  4|         7|    [141437, 141469]|
|                  2|         5|            [141440]|
|                 23|         7|    [141437, 141469]|
|1106396183141548032|        13|            [141431]|
|1113053603926376448|        13|            [141431]|
|1113053603926376448|         7|            [141437]|
|                 10|         7|    [141437, 141469]|
|1106476833370537984|         7|            [141437]|
|1111189494544990208|       

In [25]:
# 基于模型的召回结果存储

def save_offline_recall_hbase(partition):
    """Store the offline (ALS model) recall results of one partition in HBase.

    For every row, the articles already present in the user's recommendation
    history for that channel are removed from ``article_list``. If anything is
    left, it is written to ``cb_recall`` (row ``recall:user:<user_id>``, column
    ``als:<channel_id>``) and to ``history_recall`` (row ``reco:his:<user_id>``,
    column ``channel:<channel_id>``) as the encoded ``str()`` of a Python list.

    Args:
        partition: iterable of rows exposing ``user_id``, ``channel_id`` and
            ``article_list`` attributes (as passed by ``foreachPartition``).

    Returns:
        None. The function only has side effects on HBase.
    """
    # Imported locally so that the function is self-contained when it is
    # shipped to the Spark executors.
    import ast
    import happybase

    pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)
    # One connection serves the whole partition instead of being re-acquired
    # and closed for every single row.
    with pool.connection() as conn:
        try:
            history_table = conn.table('history_recall')
            recall_table = conn.table('cb_recall')

            for row in partition:
                history_key = 'reco:his:{}'.format(row.user_id).encode()
                channel_column = 'channel:{}'.format(row.channel_id).encode()

                # All stored versions of the history cell; every value is the
                # encoded str() of an article id list.
                data = history_table.cells(history_key, channel_column)

                # NOTE(review): as before, history is only used when at least two
                # versions exist and the last returned version is skipped --
                # confirm this is the intended window.
                history = []
                if len(data) >= 2:
                    for cell in data[:-1]:
                        # Cells are raw bytes: decode and parse them. Extending
                        # with the bytes object itself would add byte values,
                        # not article ids. literal_eval only accepts literals.
                        history.extend(ast.literal_eval(cell.decode()))

                # Drop articles that were already recommended; keep the input
                # order and remove duplicates so the stored value is deterministic.
                seen = set(history)
                reco_res = [article for article in dict.fromkeys(row.article_list)
                            if article not in seen]

                # Only store something when articles remain after filtering.
                if not reco_res:
                    continue

                payload = str(reco_res).encode()
                recall_table.put('recall:user:{}'.format(row.user_id).encode(),
                                 {'als:{}'.format(row.channel_id).encode(): payload})
                # Also record the articles in the recommendation history; the
                # column key is encoded like every other key written here.
                history_table.put(history_key, {channel_column: payload})
        finally:
            # The pool is local to this call, so release the socket explicitly.
            conn.close()

# 数据已存，不再运行
# als_recall.foreachPartition(save_offline_recall_hbase)

In [26]:
# Content-based recall
# Keep only the rows of user_article_basic where the article was clicked
ur.spark.sql("use profile")
user_article_basic = ur.spark.sql("select * from user_article_basic").filter('clicked=True')

In [27]:
user_article_basic.show()

+-------------------+-------------------+----------+----------+------+-------+---------+--------+---------+
|            user_id|        action_time|article_id|channel_id|shared|clicked|collected|exposure|read_time|
+-------------------+-------------------+----------+----------+------+-------+---------+--------+---------+
|1112727762809913344|2019-04-03 12:51:57|     18172|        18| false|   true|     true|    true|    19413|
|                  1|2019-03-07 16:57:34|     44386|        18| false|   true|    false|    true|    17850|
|                 23|2019-04-03 08:10:23|     44739|        18| false|   true|    false|    true|    14216|
|                  2|2019-03-05 10:19:54|     44371|        18| false|   true|    false|    true|      938|
|                  2|2019-03-07 10:06:20|     18103|        18| false|   true|    false|    true|      648|
|1111189494544990208|2019-03-28 16:56:55|     44737|        18| false|   true|    false|    true|     4138|
|                  2|2019-03

In [None]:
def save_clicked_similar_article_recall(partition):
    """Write content-based recall results for the clicked articles of one partition.

    For every row, the articles similar to ``row.article_id`` are read from
    ``article_similar``. The 10 most similar ones are kept, articles already in
    the user's recommendation history for the channel are removed, and the rest
    is written to ``cb_recall`` (row ``recall:user:<user_id>``, column
    ``content:<channel_id>``) and to ``history_recall`` (row
    ``reco:his:<user_id>``, column ``channel:<channel_id>``). Both tables may
    keep several timestamped versions of a cell.

    Args:
        partition: iterable of rows exposing ``user_id``, ``article_id`` and
            ``channel_id`` attributes (as passed by ``foreachPartition``).

    Returns:
        None. The function only has side effects on HBase.
    """
    # Imported locally so that the function is self-contained when it is
    # shipped to the Spark executors.
    import ast
    import happybase

    def _similarity(item):
        """Numeric similarity of a (column, value) pair; unparsable values rank last."""
        try:
            return float(item[1])
        except ValueError:
            return float('-inf')

    pool = happybase.ConnectionPool(size=10, host='hadoop-master')

    with pool.connection() as conn:
        try:
            # article_similar -- key: article_id, column: similar:<article_id>
            similar_table = conn.table('article_similar')
            # Table handles do not depend on the row, so create them once.
            history_table = conn.table('history_recall')
            content_table = conn.table('cb_recall')

            for row in partition:
                similar_article = similar_table.row(str(row.article_id).encode(),
                                                    columns=[b'similar'])
                if not similar_article:
                    continue

                # Rank by similarity, most similar first. Values are compared as
                # numbers (assumes each value is the encoded text of a float --
                # TODO confirm); comparing the raw bytes in ascending order
                # selected the least similar articles.
                ranked = sorted(similar_article.items(), key=_similarity, reverse=True)
                # Recall does not need much data: 10 articles per clicked article.
                reco_article = [int(column.split(b':')[1]) for column, _ in ranked[:10]]

                history_key = 'reco:his:{}'.format(row.user_id).encode()
                channel_column = 'channel:{}'.format(row.channel_id).encode()

                # All stored versions of the history cell for this channel.
                data = history_table.cells(history_key, channel_column)

                # NOTE(review): as before, history is only used when at least two
                # versions exist and the last returned version is skipped --
                # confirm this is the intended window.
                history = []
                if len(data) >= 2:
                    for cell in data[:-1]:
                        # literal_eval instead of eval: the cell comes from an
                        # external store and must not be executed as code.
                        history.extend(ast.literal_eval(cell.decode()))

                # Remove already recommended articles, keeping similarity order.
                seen = set(history)
                reco_res = [article for article in dict.fromkeys(reco_article)
                            if article not in seen]

                # Store the result in the recall table and in the history table.
                if not reco_res:
                    continue

                payload = str(reco_res).encode()
                content_table.put("recall:user:{}".format(row.user_id).encode(),
                                  {'content:{}'.format(row.channel_id).encode(): payload})
                history_table.put(history_key, {channel_column: payload})
        finally:
            # The pool is local to this call, so release the socket even when
            # processing a row fails.
            conn.close()

user_article_basic.foreachPartition(save_clicked_similar_article_recall)