In [1]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))

PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LogisticRegressionModel
from offline import SparkSessionBase

class CtrLogisticRegression(SparkSessionBase):

    SPARK_APP_NAME = "ctrLogisticRegression"
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):

        self.spark = self._create_spark_hbase()

ctr = CtrLogisticRegression()

In [2]:
# 2、读取用户点击行为表，与用户画像和文章画像，构造训练样本
ctr.spark.sql('use profile')
news_article_basic = ctr.spark.sql("select user_id, article_id, channel_id, clicked from user_article_basic")

In [3]:
news_article_basic.show()

+-------------------+----------+----------+-------+
|            user_id|article_id|channel_id|clicked|
+-------------------+----------+----------+-------+
|1105045287866466304|     14225|         0|  false|
|1106476833370537984|     14208|         0|  false|
|1111189494544990208|     19322|         0|  false|
|1111524501104885760|     44161|         0|  false|
|1112727762809913344|     18172|        18|   true|
|                  1|     44386|        18|   true|
|                  1|     44696|         0|  false|
|                 10|     43907|         0|  false|
|1106473203766657024|     16005|         0|  false|
|1108264901190615040|     15196|         0|  false|
|                 23|     44739|        18|   true|
|                 33|     13570|         0|  false|
|                  1|     17632|         0|  false|
|1106473203766657024|     17665|         0|  false|
|1111189494544990208|     44368|         0|  false|
|                 10|     44368|         0|  false|
|11050938831

In [4]:
# 获取用户画像的数据
user_profile_hbase = ctr.spark.sql(
    "select user_id, information.birthday, information.gender, article_partial, env from user_profile_hbase")
user_profile_hbase = user_profile_hbase.drop('env')

In [5]:
user_profile_hbase.show()

+--------------------+--------+------+--------------------+
|             user_id|birthday|gender|     article_partial|
+--------------------+--------+------+--------------------+
|              user:1|     0.0|  null|Map(18:Animal -> ...|
|             user:10|     0.0|  null|Map(18:tp2 -> 0.1...|
|             user:11|     0.0|  null|               Map()|
|user:110249052282...|     0.0|  null|               Map()|
|user:110256196274...|     0.0|  null|               Map()|
|user:110319567345...|     0.0|  null|Map(18:Animal -> ...|
|user:110504528786...|     0.0|  null|Map(18:text -> 0....|
|user:110509388310...|     0.0|  null|Map(18:赋值 -> 0.16...|
|user:110510518565...|     0.0|  null|Map(18:SHOldboySt...|
|user:110639618314...|     0.0|  null|Map(18:tp2 -> 0.1...|
|user:110647320376...|     0.0|  null|Map(18:text -> 0....|
|user:110647683337...|     0.0|  null|Map(18:text -> 1....|
|user:110826490119...|    null|  null|Map(18:text -> 0....|
|user:111105316462...|     0.0|  null|  

In [6]:
# 对用户ID做处理
def get_user_id(row):
    return int(row.user_id.split(':')[1]), row.birthday, row.gender, row.article_partial

user_profile_hbase = user_profile_hbase.rdd.map(get_user_id)

In [7]:
# 对于其中toDF存在一些列没办法确定类型，手动指定DataFrame列的类型
_schema = StructType([
    StructField('user_id', LongType()),
    StructField('birthday', DoubleType()),
    StructField('gender', BooleanType()),
    StructField('article_partial', MapType(StringType(), DoubleType()))
])

user_profile_hbase = ctr.spark.createDataFrame(user_profile_hbase, schema=_schema)

In [8]:
user_profile_hbase.show()

+-------------------+--------+------+--------------------+
|            user_id|birthday|gender|     article_partial|
+-------------------+--------+------+--------------------+
|                  1|     0.0|  null|Map(18:Animal -> ...|
|                 10|     0.0|  null|Map(18:tp2 -> 0.1...|
|                 11|     0.0|  null|               Map()|
|1102490522829717504|     0.0|  null|               Map()|
|1102561962748805120|     0.0|  null|               Map()|
|1103195673450250240|     0.0|  null|Map(18:Animal -> ...|
|1105045287866466304|     0.0|  null|Map(18:text -> 0....|
|1105093883106164736|     0.0|  null|Map(18:赋值 -> 0.16...|
|1105105185656537088|     0.0|  null|Map(18:SHOldboySt...|
|1106396183141548032|     0.0|  null|Map(18:tp2 -> 0.1...|
|1106473203766657024|     0.0|  null|Map(18:text -> 0....|
|1106476833370537984|     0.0|  null|Map(18:text -> 1....|
|1108264901190615040|    null|  null|Map(18:text -> 0....|
|1111053164624216064|     0.0|  null|               Map(

In [9]:
# 合并用户点击行为表与用户画像表，并进行相应的删除无用特征
train = news_article_basic.join(user_profile_hbase, on=['user_id'], how='left').drop('birthday').drop('channel_id').drop('gender')


In [10]:
train.show()

+-------------------+----------+-------+--------------------+
|            user_id|article_id|clicked|     article_partial|
+-------------------+----------+-------+--------------------+
|1106473203766657024|     16005|  false|Map(18:text -> 0....|
|1106473203766657024|     17665|  false|Map(18:text -> 0....|
|1106473203766657024|     44664|  false|Map(18:text -> 0....|
|1106473203766657024|     44386|  false|Map(18:text -> 0....|
|1106473203766657024|     13778|  false|Map(18:text -> 0....|
|1106473203766657024|     13039|  false|Map(18:text -> 0....|
|1106473203766657024|     13648|  false|Map(18:text -> 0....|
|1106473203766657024|     17304|  false|Map(18:text -> 0....|
|1106473203766657024|     19233|  false|Map(18:text -> 0....|
|1106473203766657024|     44466|  false|Map(18:text -> 0....|
|1106473203766657024|     18795|  false|Map(18:text -> 0....|
|1106473203766657024|    134812|  false|Map(18:text -> 0....|
|1106473203766657024|     13357|  false|Map(18:text -> 0....|
|1106473

In [11]:
# 合并文章的向量以及文章的权重特征，文章所属的真正频道ID
ctr.spark.sql('use article')
article_vector = ctr.spark.sql("select * from article_vector")

In [12]:
train_user_article = train.join(article_vector, on=['article_id'], how='left')

In [13]:
train_user_article.show()

+-------------------+-------------------+-------+--------------------+----------+--------------------+
|         article_id|            user_id|clicked|     article_partial|channel_id|       articlevector|
+-------------------+-------------------+-------+--------------------+----------+--------------------+
|              13401|                 10|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|
|              13401|1106396183141548032|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|
|              14805|1106473203766657024|  false|Map(18:text -> 0....|        18|[0.11028526511434...|
|              14805|1103195673450250240|  false|Map(18:Animal -> ...|        18|[0.11028526511434...|
|              14805|1105045287866466304|  false|Map(18:text -> 0....|        18|[0.11028526511434...|
|              14805|1111524501104885760|  false|Map(18:text -> 0....|        18|[0.11028526511434...|
|              14805|1105105185656537088|  false|Map(18:SHOldboySt...|   

In [14]:
# 读取文章画像
article_profile = ctr.spark.sql("select article_id, keywords from article_profile")

def get_article_weights(row):
    
    try:
        weights = sorted(row.keywords.values())[:10]
    except Exception as e:
        weights = [0.0] * 10
    
    return row.article_id, weights

article_profile = article_profile.rdd.map(get_article_weights).toDF(['article_id', 'article_weights'])

In [15]:
article_profile.show()

+----------+--------------------+
|article_id|     article_weights|
+----------+--------------------+
|        26|[0.19827163395829...|
|        29|[0.26031398249056...|
|       474|[0.49818598558926...|
|       964|[0.42194661121527...|
|      1677|[0.19827339246090...|
|      1697|[0.25105539265038...|
|      1806|[0.18449119772340...|
|      1950|[0.33331407122173...|
|      2040|[0.38583431341698...|
|      2214|[0.43761156267670...|
|      2250|[0.46477621366740...|
|      2453|[0.50514620188273...|
|      2509|[0.15138306650944...|
|      2529|[0.11634963900866...|
|      2927|[0.28513034617795...|
|      3091|[0.23478830492918...|
|      3506|[0.22844780420769...|
|      3764|[0.27265314149033...|
|      4590|[0.40296288036812...|
|      4823|[0.21729897161021...|
+----------+--------------------+
only showing top 20 rows



In [16]:
# 合并文章权重
train_user_article = train_user_article.join(article_profile, on=['article_id'], how='left')

In [17]:
# 保留了用户的每个频道的关键词权重，找到用户对应操作文章的所属频道的关键词权重
train_user_article = train_user_article.dropna()

In [18]:
train_user_article.show()

+----------+-------------------+-------+--------------------+----------+--------------------+--------------------+
|article_id|            user_id|clicked|     article_partial|channel_id|       articlevector|     article_weights|
+----------+-------------------+-------+--------------------+----------+--------------------+--------------------+
|     13401|                 10|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|[0.08196639249252...|
|     13401|1106396183141548032|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|[0.08196639249252...|
|     14805|1106473203766657024|  false|Map(18:text -> 0....|        18|[0.11028526511434...|[0.15069781969741...|
|     14805|1103195673450250240|  false|Map(18:Animal -> ...|        18|[0.11028526511434...|[0.15069781969741...|
|     14805|1105045287866466304|  false|Map(18:text -> 0....|        18|[0.11028526511434...|[0.15069781969741...|
|     14805|1111524501104885760|  false|Map(18:text -> 0....|        18|[0.11028

In [19]:
train_user_article

DataFrame[article_id: bigint, user_id: bigint, clicked: boolean, article_partial: map<string,double>, channel_id: int, articlevector: array<double>, article_weights: array<double>]

In [20]:
from pyspark.ml.linalg import Vectors
columns = ['article_id', 'user_id', 'channel_id', 'articlevector', 'user_weights', 'article_weights', 'clicked']
def get_user_weights(row):

    from pyspark.ml.linalg import Vectors
    try:
        # map对象的存储形式：channel_id: key_words --->weights
        user_weights = sorted([row.article_partial[key] for key in row.article_partial.keys() 
                               if key.split(':')[0] == str(row.channel_id)])[:10]
    except Exception:
        user_weights = [0.0] * 10
    # 将权重向量转换成Vector对象，方便后面模型处理
    return row.article_id, row.user_id, row.channel_id, Vectors.dense(row.articlevector), Vectors.dense(user_weights), Vectors.dense(row.article_weights), int(row.clicked)

train_vector = train_user_article.rdd.map(get_user_weights).toDF(columns)


In [21]:
# 收集所有特征到一个features列
train_res = VectorAssembler().setInputCols(columns[2:6]).setOutputCol('features').transform(train_vector)

In [22]:
train_res.show()

+----------+-------------------+----------+--------------------+--------------------+--------------------+-------+--------------------+
|article_id|            user_id|channel_id|       articlevector|        user_weights|     article_weights|clicked|            features|
+----------+-------------------+----------+--------------------+--------------------+--------------------+-------+--------------------+
|     13401|                 10|        18|[0.06157120217893...|[0.16021155576912...|[0.08196639249252...|      0|[18.0,0.061571202...|
|     13401|1106396183141548032|        18|[0.06157120217893...|[0.16147375544143...|[0.08196639249252...|      0|[18.0,0.061571202...|
|     14805|1106473203766657024|        18|[0.11028526511434...|[0.16147375544143...|[0.15069781969741...|      0|[18.0,0.110285265...|
|     14805|1103195673450250240|        18|[0.11028526511434...|[0.16034784031666...|[0.15069781969741...|      0|[18.0,0.110285265...|
|     14805|1105045287866466304|        18|[0.11

In [23]:
train = train_res.select(['article_id', 'user_id', 'clicked', 'features'])

In [24]:
arr = train.collect()

In [25]:
arr

[Row(article_id=13401, user_id=10, clicked=0, features=DenseVector([18.0, 0.0616, 0.0357, -0.0008, 0.0916, 0.0128, 0.0312, 0.01, 0.0486, -0.0301, -0.0107, -0.0806, 0.0339, -0.0161, 0.0753, -0.0265, 0.0253, 0.0032, 0.0101, -0.0164, -0.0068, -0.0297, 0.0114, -0.0295, 0.0204, -0.0644, -0.0579, 0.0539, 0.0694, 0.0305, -0.0371, -0.0005, 0.0513, 0.0726, 0.076, -0.062, 0.0006, -0.0688, -0.056, 0.0494, -0.0069, 0.0606, -0.0675, -0.0136, 0.0348, 0.0012, 0.0384, 0.1002, 0.0362, -0.0677, 0.0049, -0.0127, -0.0424, 0.0532, 0.0469, 0.0091, 0.0149, 0.0103, -0.0039, -0.0102, 0.0628, -0.0004, -0.043, -0.0063, -0.0909, 0.0228, 0.0317, -0.0361, -0.0195, 0.0156, -0.0577, -0.0216, -0.0115, -0.0083, -0.006, 0.0198, 0.0407, 0.0341, 0.0037, 0.0411, -0.012, 0.0607, -0.0582, 0.0332, -0.0119, 0.0353, 0.0342, 0.0203, -0.0416, -0.0406, 0.0761, 0.0172, 0.0546, 0.0476, 0.0052, -0.0009, -0.0017, -0.0463, -0.0645, -0.0216, 0.1021, 0.1602, 0.1602, 0.1602, 0.1602, 0.1602, 0.1602, 0.1602, 0.1602, 0.1602, 0.1602, 0.082, 0

In [26]:
import pandas as pd
df = pd.DataFrame(arr)

In [27]:
df

Unnamed: 0,0,1,2,3
0,13401,10,0,"[18.0, 0.061571202178931625, 0.035721198358704..."
1,13401,1106396183141548032,0,"[18.0, 0.061571202178931625, 0.035721198358704..."
2,14805,1106473203766657024,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
3,14805,1103195673450250240,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
4,14805,1105045287866466304,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
5,14805,1111524501104885760,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
6,14805,1105105185656537088,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
7,14805,1,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
8,14805,10,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."
9,14805,1112727762809913344,0,"[18.0, 0.11028526511434833, 0.0474637816380961..."


In [28]:
import tensorflow as tf
def write_to_tfrecords(click_batch, feature_batch):
        """
        将数据存进tfrecords，方便管理每个样本的属性
        :param image_batch: 特征值
        :param label_batch: 目标值
        :return: None
        """
        # 1、构造tfrecords的存储实例
        writer = tf.python_io.TFRecordWriter("./data/train_ctr_201909.tfrecords")
        # 2、循环将每个样本写入到文件当中
        for i in range(len(click_batch)):

            click = click_batch[i]
            feature = feature_batch[i].tostring()

            # 绑定每个样本的属性
            example = tf.train.Example(features=tf.train.Features(feature={
                "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[click])),
                "feature": tf.train.Feature(bytes_list=tf.train.BytesList(value=[feature])),
            }))
            writer.write(example.SerializeToString())

        # 文件需要关闭
        writer.close()
        return None

# 开启会话打印内容
with tf.Session() as sess:
    # 创建线程协调器
    coord = tf.train.Coordinator()

    # 开启子线程去读取数据
    # 返回子线程实例
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # 存入数据
    write_to_tfrecords(df.iloc[:, 2], df.iloc[:, 3])

    # 关闭子线程，回收
    coord.request_stop()

    coord.join(threads)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
