In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType
from pyspark.sql import SparkSession
import pyspark.mllib.stat as st
import pyspark.sql.types as typ
import pyspark.sql.functions as F

# Obtain the Spark entry points through getOrCreate() so that this cell can be
# re-run in the same kernel: calling SparkContext() a second time raises
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.getOrCreate()
# Expose the session's underlying context under the name the notebook uses.
sc = spark.sparkContext
# Kept so that any existing reference to `sqlContext` continues to work.
sqlContext = SQLContext(sc)

### 数据处理

In [3]:
# Read the retail CSV, using its first line as the column header.
df = spark.read.option('header', True).csv('../data/UCL_Retail_kaggle.csv')

In [4]:
# Preview the first five rows of the loaded data.
df.show(n=5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010/12/1 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010/12/1 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010/12/1 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010/12/1 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010/12/1 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows



In [5]:
# The goal is to convert the data into the following format
# (one row per transaction id, with a list of its items).
# The triple-quoted string below is only an illustration: it is a bare string
# expression, so the createDataFrame call inside it is never executed.
'''
df = spark.createDataFrame([
    (0, [1, 2, 5]),
    (1, [1, 2, 3, 5]),
    (2, [1, 2])
], ["id", "items"])
'''

'\ndf = spark.createDataFrame([\n    (0, [1, 2, 5]),\n    (1, [1, 2, 3, 5]),\n    (2, [1, 2])\n], ["id", "items"])\n'

In [6]:
# Merge the rows by InvoiceNo: each invoice becomes one row whose `item_list`
# column holds the distinct Description values of that invoice.
# Note: collect_set must be used here; with collect_list an error is raised
# (presumably because FPGrowth needs unique items per transaction -- confirm).
df = (
    df.groupBy('InvoiceNo')
      .agg(F.collect_set('Description').alias('item_list'))
)

In [8]:
# Show one grouped invoice in full, without truncating the item list.
df.show(n=1, truncate=False)

+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|InvoiceNo|item_list                                                                                                                                                                            |
+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|536596   |[WAKE UP COCKEREL TILE COASTER, PACK OF 12 SKULL TISSUES,  SET 2 TEA TOWELS I LOVE LONDON , FAUX FUR CHOCOLATE THROW, HOT WATER BOTTLE TEA AND SYMPATHY, VINTAGE UNION JACK DOORSTOP]|
+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



### FP_growth 操作

In [9]:
from pyspark.ml.fpm import FPGrowth

# Configure FP-Growth to mine the `item_list` column with a minimum support of
# 0.01 and a minimum rule confidence of 0.3, then fit it on the grouped invoices.
fpGrowth = (
    FPGrowth()
    .setItemsCol("item_list")
    .setMinSupport(0.01)
    .setMinConfidence(0.3)
)
model = fpGrowth.fit(df)

#### 频繁项目集

In [14]:
# Show the ten frequent itemsets with the highest frequency, untruncated.
model.freqItemsets.orderBy(F.col("freq").desc()).show(n=10, truncate=False)

+------------------------------------+----+
|items                               |freq|
+------------------------------------+----+
|[WHITE HANGING HEART T-LIGHT HOLDER]|2302|
|[REGENCY CAKESTAND 3 TIER]          |2169|
|[JUMBO BAG RED RETROSPOT]           |2135|
|[PARTY BUNTING]                     |1706|
|[LUNCH BAG RED RETROSPOT]           |1607|
|[ASSORTED COLOUR BIRD ORNAMENT]     |1467|
|[SET OF 3 CAKE TINS PANTRY DESIGN ] |1458|
|[PACK OF 72 RETROSPOT CAKE CASES]   |1334|
|[LUNCH BAG  BLACK SKULL.]           |1295|
|[NATURAL SLATE HEART CHALKBOARD ]   |1266|
+------------------------------------+----+
only showing top 10 rows



#### 关联规则

In [16]:
# Show the ten generated association rules with the highest lift, untruncated.
model.associationRules.orderBy(F.col("lift").desc()).show(n=10, truncate=False)

+-------------------------------+-------------------------------+------------------+------------------+
|antecedent                     |consequent                     |confidence        |lift              |
+-------------------------------+-------------------------------+------------------+------------------+
|[REGENCY TEA PLATE PINK]       |[REGENCY TEA PLATE GREEN ]     |0.8980891719745223|60.26038744595888 |
|[REGENCY TEA PLATE GREEN ]     |[REGENCY TEA PLATE PINK]       |0.7305699481865285|60.26038744595888 |
|[REGENCY TEA PLATE ROSES ]     |[REGENCY TEA PLATE PINK]       |0.5951859956236324|49.093367154942925|
|[REGENCY TEA PLATE PINK]       |[REGENCY TEA PLATE ROSES ]     |0.8662420382165605|49.093367154942925|
|[POPPY'S PLAYHOUSE LIVINGROOM ]|[POPPY'S PLAYHOUSE BEDROOM ]   |0.793939393939394 |48.270024185517144|
|[POPPY'S PLAYHOUSE BEDROOM ]   |[POPPY'S PLAYHOUSE LIVINGROOM ]|0.6150234741784038|48.270024185517144|
|[REGENCY TEA PLATE ROSES ]     |[REGENCY TEA PLATE GREEN ]     

#### 预测

关于具体的预测方式，官方文档是这么写的：  
transform: For each transaction in itemsCol, the transform method will compare its items against the antecedents of each association rule. If the record contains all the antecedents of a specific association rule, the rule will be considered as applicable and its consequents will be added to the prediction result. The transform method will summarize the consequents from all the applicable rules as prediction. The prediction column has the same data type as itemsCol and does not contain existing items in the itemsCol.  
也就是说，预测会汇总所有适用的关联规则的后件，并且推荐的商品不包含原交易（itemsCol）中已有的商品。因此无法直接拿原交易中的商品来评估预测的准确性。

In [20]:
# Apply the fitted model to the grouped invoices: transform compares each
# invoice's items with every association rule and gathers the consequents of
# the matching rules into a `prediction` column. Show one row in full.
model.transform(df).show(n=1, truncate=False)

+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|InvoiceNo|item_list                                                                                                                                                                            |prediction                                                                               |
+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|536596   |[WAKE UP COCKEREL TILE COASTER, PACK OF 12 SKULL TISSUES,  SET 2 TEA TOWELS I LOVE LONDON , FAUX FUR CHOCOLATE THROW, HOT WATER BOTTLE TE