## Pyspark - Association Rule

In [0]:


from pyspark.ml.fpm import FPGrowth

df = spark.createDataFrame([
    (0, [1, 2, 5]),
    (1, [1, 2, 3, 5]),
    (2, [1, 2])
], ["id", "items"])

fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df)

# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

# transform examines the input items against all the association rules and summarize the
# consequents as prediction
model.transform(df).show()

+---------+----+
|    items|freq|
+---------+----+
|      [1]|   3|
|      [2]|   3|
|   [2, 1]|   3|
|      [5]|   2|
|   [5, 2]|   2|
|[5, 2, 1]|   2|
|   [5, 1]|   2|
+---------+----+

+----------+----------+------------------+----+------------------+
|antecedent|consequent|        confidence|lift|           support|
+----------+----------+------------------+----+------------------+
|       [5]|       [2]|               1.0| 1.0|0.6666666666666666|
|       [5]|       [1]|               1.0| 1.0|0.6666666666666666|
|    [5, 1]|       [2]|               1.0| 1.0|0.6666666666666666|
|    [5, 2]|       [1]|               1.0| 1.0|0.6666666666666666|
|       [2]|       [1]|               1.0| 1.0|               1.0|
|       [2]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
|    [2, 1]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
|       [1]|       [2]|               1.0| 1.0|               1.0|
|       [1]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
+-------

In [0]:

df = spark.createDataFrame([
    (0, ['apple', 'orange','eggs','milk']),
    (1, ['apple', 'orange','bread','milk']),
    (2, ['apple', 'orange']),
    (3, ['apple', 'bread']),
    (4, ['eggs','milk','bread']),
    (5, ['orange', 'tomato','apple']),
    (6, ['eggs','bread','milk','mushrooms','tomato']),
    (7, ['bread','milk','mushrooms','tomato']),
    (8, ['bread','milk']),
    (9, ['apple','orange','mushroom','tomato']),
    (10,['bread','milk','mushrooms','apple']),
    (11, ['apple', 'orange','eggs','milk','bread']), 
    (12, ['bread','milk','mushrooms']),
    (13, ['apple', 'orange','bread']),
    (14, ['eggs','mushrooms','tomato']),
    (15, ['mushrooms','tomato']),
    (16, ['milk', 'orange','apple']),
    (17, ['mushrooms','tomato']),
    (18, ['tomato', 'apple','orange']),
    (19, ['eggs','tomato']),
    (20, ['bread','apple'])
    
    
], ["id", "items"])

display(df)


id,items
0,"List(apple, orange, eggs, milk)"
1,"List(apple, orange, bread, milk)"
2,"List(apple, orange)"
3,"List(apple, bread)"
4,"List(eggs, milk, bread)"
5,"List(orange, tomato, apple)"
6,"List(eggs, bread, milk, mushrooms, tomato)"
7,"List(bread, milk, mushrooms, tomato)"
8,"List(bread, milk)"
9,"List(apple, orange, mushroom, tomato)"


In [0]:
#### 1.4
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.2, minConfidence=0.3)
model = fpGrowth.fit(df)

In [0]:
# 1.5
model.freqItemsets.show()

+-------------------+----+
|              items|freq|
+-------------------+----+
|            [apple]|  12|
|            [bread]|  11|
|     [bread, apple]|   6|
|             [milk]|  10|
|      [milk, bread]|   8|
|      [milk, apple]|   5|
|           [tomato]|   9|
|           [orange]|   9|
|    [orange, apple]|   9|
|        [mushrooms]|   7|
|[mushrooms, tomato]|   5|
|             [eggs]|   6|
+-------------------+----+



In [0]:
# 1.6
model.associationRules.show()

+-----------+-----------+------------------+------------------+-------------------+
| antecedent| consequent|        confidence|              lift|            support|
+-----------+-----------+------------------+------------------+-------------------+
|     [milk]|    [bread]|               0.8|1.5272727272727273|0.38095238095238093|
|     [milk]|    [apple]|               0.5|             0.875|0.23809523809523808|
|[mushrooms]|   [tomato]|0.7142857142857143|1.6666666666666667|0.23809523809523808|
|   [tomato]|[mushrooms]|0.5555555555555556|1.6666666666666667|0.23809523809523808|
|    [apple]|    [bread]|               0.5|0.9545454545454545| 0.2857142857142857|
|    [apple]|     [milk]|0.4166666666666667|0.8750000000000001|0.23809523809523808|
|    [apple]|   [orange]|              0.75|              1.75|0.42857142857142855|
|    [bread]|    [apple]|0.5454545454545454|0.9545454545454545| 0.2857142857142857|
|    [bread]|     [milk]|0.7272727272727273|1.5272727272727273|0.38095238095

In [0]:
# 1.7
dfTest = spark.createDataFrame([
   
    (0, ['apple','bread','milk']),
    (1, ['bread']),
    (2, ['orange','milk']),
    (3, ['bread','mushrooms','milk']),
    (4, ['apple']),
    (5, ['orange','bread']),
    (6, ['mushrooms','bread'])
            
], ["id", "items"])
display(dfTest)

id,items
0,"List(apple, bread, milk)"
1,List(bread)
2,"List(orange, milk)"
3,"List(bread, mushrooms, milk)"
4,List(apple)
5,"List(orange, bread)"
6,"List(mushrooms, bread)"


In [0]:
#  1.8
model.transform(dfTest).show()

+---+--------------------+--------------------+
| id|               items|          prediction|
+---+--------------------+--------------------+
|  0|[apple, bread, milk]|            [orange]|
|  1|             [bread]|       [apple, milk]|
|  2|      [orange, milk]|      [bread, apple]|
|  3|[bread, mushrooms...|     [apple, tomato]|
|  4|             [apple]|[bread, milk, ora...|
|  5|     [orange, bread]|       [apple, milk]|
|  6|  [mushrooms, bread]|[tomato, apple, m...|
+---+--------------------+--------------------+



1.9<br>
Resources: <br>
1) Dr. Liao’s tutorials, code snippets, and programming hints in the lab
2) https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html

In [0]:
# 1.10
df1 = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/vrajuech@gmu.edu/groceries-2.csv")
df1.display()

items,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14,_c15,_c16,_c17,_c18,_c19,_c20,_c21,_c22,_c23,_c24,_c25,_c26,_c27,_c28,_c29,_c30,_c31
citrus fruit,semi-finished bread,margarine,ready soups,,,,,,,,,,,,,,,,,,,,,,,,,,,,
tropical fruit,yogurt,coffee,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
whole milk,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
pip fruit,yogurt,cream cheese,meat spreads,,,,,,,,,,,,,,,,,,,,,,,,,,,,
other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,,,,,,,,,,,,,,,,,,,,,,
whole milk,butter,yogurt,rice,abrasive cleaner,,,,,,,,,,,,,,,,,,,,,,,,,,,
rolls/buns,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
other vegetables,UHT-milk,rolls/buns,bottled beer,liquor (appetizer),,,,,,,,,,,,,,,,,,,,,,,,,,,
pot plants,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
whole milk,cereals,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:
from pyspark.sql.functions import col, concat_ws
df2n = df1.withColumn('item', concat_ws(',', col('items'), col('_c1'),col('_c2'),col('_c3')))

In [0]:
df3=df2n.drop('items','_c1','_c2','_c3')

In [0]:
for field in df3.schema.fields:
    print(field.name +" , "+str(field.dataType))

_c4 , StringType()
_c5 , StringType()
_c6 , StringType()
_c7 , StringType()
_c8 , StringType()
_c9 , StringType()
_c10 , StringType()
_c11 , StringType()
_c12 , StringType()
_c13 , StringType()
_c14 , StringType()
_c15 , StringType()
_c16 , StringType()
_c17 , StringType()
_c18 , StringType()
_c19 , StringType()
_c20 , StringType()
_c21 , StringType()
_c22 , StringType()
_c23 , StringType()
_c24 , StringType()
_c25 , StringType()
_c26 , StringType()
_c27 , StringType()
_c28 , StringType()
_c29 , StringType()
_c30 , StringType()
_c31 , StringType()
item , StringType()


In [0]:
from pyspark.sql.functions import split, col
df4 = df3.select(split(col("item"),",").alias("itemsList")) \
    .drop("items")
df4.printSchema()
df4.display()

root
 |-- itemsList: array (nullable = false)
 |    |-- element: string (containsNull = false)



itemsList
"List(citrus fruit, semi-finished bread, margarine, ready soups)"
"List(tropical fruit, yogurt, coffee)"
List(whole milk)
"List(pip fruit, yogurt, cream cheese, meat spreads)"
"List(other vegetables, whole milk, condensed milk, long life bakery product)"
"List(whole milk, butter, yogurt, rice)"
List(rolls/buns)
"List(other vegetables, UHT-milk, rolls/buns, bottled beer)"
List(pot plants)
"List(whole milk, cereals)"


In [0]:
fpGrowth = FPGrowth(itemsCol="itemsList", minSupport=0.03, minConfidence=0.03)
model1 = fpGrowth.fit(df4)

In [0]:
model1.freqItemsets.display()

items,freq
List(domestic eggs),310
List(chicken),421
List(citrus fruit),806
List(berries),298
List(frankfurter),580
List(curd),379
List(rolls/buns),1245
List(root vegetables),987
"List(root vegetables, other vegetables)",333
"List(root vegetables, whole milk)",297


In [0]:
model1.associationRules.display()


antecedent,consequent,confidence,lift,support
List(other vegetables),List(root vegetables),0.1983323406789756,1.9762903450635512,0.033858668022369
List(other vegetables),List(whole milk),0.2942227516378797,1.320109836842403,0.0502287747839349
List(root vegetables),List(other vegetables),0.3373860182370821,1.9762903450635512,0.033858668022369
List(root vegetables),List(whole milk),0.3009118541033435,1.350122301599627,0.0301982714794102
List(whole milk),List(root vegetables),0.135492700729927,1.350122301599627,0.0301982714794102
List(whole milk),List(other vegetables),0.2253649635036496,1.320109836842403,0.0502287747839349


In [0]:
model1.transform(df4).display()

itemsList,prediction
"List(citrus fruit, semi-finished bread, margarine, ready soups)",List()
"List(tropical fruit, yogurt, coffee)",List()
List(whole milk),"List(root vegetables, other vegetables)"
"List(pip fruit, yogurt, cream cheese, meat spreads)",List()
"List(other vegetables, whole milk, condensed milk, long life bakery product)",List(root vegetables)
"List(whole milk, butter, yogurt, rice)","List(root vegetables, other vegetables)"
List(rolls/buns),List()
"List(other vegetables, UHT-milk, rolls/buns, bottled beer)","List(root vegetables, whole milk)"
List(pot plants),List()
"List(whole milk, cereals)","List(root vegetables, other vegetables)"
