In [1]:
from pyspark.ml import Transformer
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, FloatType, IntegerType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Word2Vec, Tokenizer, RegexTokenizer
import math

In [2]:
from pyspark.sql import SparkSession

team = 16
nworkers = 3
cores = 1
warehouse = "project/hive/warehouse"

spark = SparkSession.builder\
        .appName("{} - spark ML".format(team))\
        .master("yarn")\
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")\
        .config("spark.sql.warehouse.dir", warehouse)\
        .config("spark.sql.avro.compression.codec", "snappy")\
        .config('spark.executor.instances', nworkers)\
        .config("spark.executor.cores", cores)\
        .config("spark.executor.cpus", cores)\
        .config("spark.executor.memory", "4g")\
        .enableHiveSupport()\
        .getOrCreate()
        # 
spark

In [3]:
spark.sql("USE team16_projectdb").show()
spark.sql("SELECT * FROM ecom_part_buck").show()

++
||
++
++

+--------+-------------------+----------+-------------------+--------------------+----------+-------+---------+--------------------+-----------+
|event_id|         event_time|product_id|        category_id|       category_code|     brand|  price|  user_id|        user_session|event_types|
+--------+-------------------+----------+-------------------+--------------------+----------+-------+---------+--------------------+-----------+
|42441406|2019-10-31 22:58:52|   4804409|2053013554658804075|electronics.audio...|     elari|  61.65|512831449|528d70c6-44ef-44e...|       cart|
|42433956|2019-10-31 22:08:14|   5100570|2053013553341792533|  electronics.clocks|     apple|  447.6|520814382|06ec3176-a5c8-427...|       cart|
|42412263|2019-10-31 20:37:34|   1005116|2053013555631882655|electronics.smart...|     apple|1013.86|515926715|f5453671-cfd2-4f7...|       cart|
|42412081|2019-10-31 20:37:05|   1004888|2053013555631882655|electronics.smart...|   samsung| 224.46|562130094|1bf38b

In [4]:
spark.sql("SELECT event_types, COUNT(*) FROM ecom_part_buck GROUP BY event_types").show()

+-----------+--------+
|event_types|count(1)|
+-----------+--------+
|       cart|   92738|
|   purchase|   74411|
|       view| 4076403|
+-----------+--------+



In [5]:
data = spark.read.format("avro").table('team16_projectdb.ecom_part_buck')
data.printSchema()

root
 |-- event_id: integer (nullable = true)
 |-- event_time: timestamp (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: float (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- event_types: string (nullable = true)



In [6]:
class CyclicTransformer(Transformer):
    def __init__(self, input_col):
        super(CyclicTransformer, self).__init__()
        self.input_col = input_col

    def _transform(self, df):
        extract_year = F.udf(lambda x: x.year)
        extract_month = F.udf(lambda x: x.month)
        extract_day = F.udf(lambda x: x.day)
        extract_hour = F.udf(lambda x: x.hour)
        extract_minute = F.udf(lambda x: x.minute)
        extract_second = F.udf(lambda x: x.second)

        return df.withColumn('year', extract_year(self.input_col))\
                 .withColumn('month', extract_month(self.input_col))\
                 .withColumn('day', extract_day(self.input_col))\
                 .withColumn('hour', extract_hour(self.input_col))\
                 .withColumn('minute', extract_minute(self.input_col))\
                 .withColumn('second', extract_second(self.input_col))\
                 .withColumn('month_sin', F.sin(F.col('month') * 2 * math.pi / 12))\
                 .withColumn('month_cos', F.cos(F.col('month') * 2 * math.pi / 12))\
                 .withColumn('day_sin', F.sin(F.col('day') * 2 * math.pi / 31))\
                 .withColumn('day_cos', F.cos(F.col('day') * 2 * math.pi / 31))\
                 .withColumn('hour_sin', F.sin(F.col('hour') * 2 * math.pi / 24))\
                 .withColumn('hour_cos', F.cos(F.col('hour') * 2 * math.pi / 24))\
                 .withColumn('minute_sin', F.sin(F.col('minute') * 2 * math.pi / 60))\
                 .withColumn('minute_cos', F.cos(F.col('minute') * 2 * math.pi / 60))\
                 .withColumn('second_sin', F.sin(F.col('second') * 2 * math.pi / 60))\
                 .withColumn('second_cos', F.cos(F.col('second') * 2 * math.pi / 60))\
                 .drop(self.input_col).drop('month').drop('day')\
                 .drop('hour').drop('minute').drop('second')

In [7]:
cyclic_trans = CyclicTransformer('event_time')
data = cyclic_trans.transform(data)
data.show(1)

+--------+----------+-------------------+--------------------+-----+-----+---------+--------------------+-----------+----+-------------------+------------------+--------------------+-------+-------------------+------------------+--------------------+------------------+------------------+------------------+
|event_id|product_id|        category_id|       category_code|brand|price|  user_id|        user_session|event_types|year|          month_sin|         month_cos|             day_sin|day_cos|           hour_sin|          hour_cos|          minute_sin|        minute_cos|        second_sin|        second_cos|
+--------+----------+-------------------+--------------------+-----+-----+---------+--------------------+-----------+----+-------------------+------------------+--------------------+-------+-------------------+------------------+--------------------+------------------+------------------+------------------+
|42441406|   4804409|2053013554658804075|electronics.audio...|elari|61.65|51

In [8]:
data = data.na.drop(subset=data.columns)
data = data.filter(data.brand != '')
data = data.filter(data.category_code != '')

In [9]:
data.count()

2654471

data.groupBy("category_code").count().withColumnRenamed("count", "num").orderBy(F.col("num").desc()).show(20)

In [10]:
event_type_to_rating = F.udf(lambda x: 1 if x == 'purchase' else 0 if x == 'cart' else -1, IntegerType())
data = data.withColumn('rating', event_type_to_rating('event_types')).drop('event_types')
data.printSchema()

root
 |-- event_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: float (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month_sin: double (nullable = true)
 |-- month_cos: double (nullable = true)
 |-- day_sin: double (nullable = true)
 |-- day_cos: double (nullable = true)
 |-- hour_sin: double (nullable = true)
 |-- hour_cos: double (nullable = true)
 |-- minute_sin: double (nullable = true)
 |-- minute_cos: double (nullable = true)
 |-- second_sin: double (nullable = true)
 |-- second_cos: double (nullable = true)
 |-- rating: integer (nullable = true)



In [11]:
# brand preprocessing: take only popular brands (> 10000 interactions)

brand_counts = data.groupBy("brand").count()
rare_brands = brand_counts.filter(F.col("count") < 10000).select("brand").rdd.flatMap(lambda x: x).collect()
data = data.withColumn("brand", F.when(F.col("brand").isin(rare_brands), "other").otherwise(F.col("brand")))

In [12]:
# one hot encoding of brand

indexer = StringIndexer(inputCol='brand', outputCol="brand_indexed")
oh_encoder = OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="brand_encoded")

brand_pipeline = Pipeline(stages=[indexer, oh_encoder]).fit(data)
data = brand_pipeline.transform(data)

In [13]:
train_data, test_data = data.randomSplit([0.6, 0.4])

In [14]:
from pyspark.ml.evaluation import RegressionEvaluator 
from pyspark.ml.recommendation import ALS 

als = ALS(maxIter=5,
          regParam=0.01,
          userCol="user_id",
          itemCol="product_id",
          ratingCol="rating",
          coldStartStrategy="drop")

als_model = als.fit(train_data)

predictions = als_model.transform(test_data)

predictions.show()

+--------+----------+-------------------+--------------------+--------+------+---------+--------------------+----+-------------------+------------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+-------------+---------------+-----------+
|event_id|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|year|          month_sin|         month_cos|             day_sin|            day_cos|           hour_sin|            hour_cos|          minute_sin|          minute_cos|          second_sin|          second_cos|rating|brand_indexed|  brand_encoded| prediction|
+--------+----------+-------------------+--------------------+--------+------+---------+--------------------+----+-------------------+------------------+--------------------+-------------------+-------------------+--------------------+--------------------+------

In [16]:
evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
print("Root-mean-square error = " + str(rmse) + '\n' + 'R2 = ' + str(r2))

Root-mean-square error = 0.8773887814449642
R2 = -5.796755412196162


In [17]:
spark.stop()