# Frequent Pattern Mining

### Importing libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Create Spark Session and load data

In [21]:
from pyspark.sql.functions import monotonically_increasing_id
# Create a Spark session
spark = SparkSession.builder.appName("NewsgroupsPreprocessing").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
store_data = spark.read.format('csv')\
          .option('header','true')\
          .option('inferSchema', 'true')\
          .option('timestamp', 'true')\
          .load('s3a://test234/store_data.csv')
store_data = store_data.withColumn("id", monotonically_increasing_id() )
store_data.limit(5).toPandas()

Unnamed: 0,shrimp,almonds,avocado,vegetables_mix,green_grapes,whole_weat_flour,yams,cottage_cheese,energy_drink,tomato_juice,...,green_tea,honey,salad,mineral_water,salmon,antioxydant_juice,frozen_smoothie,spinach,olive_oil,id
0,burgers,meatballs,eggs,,,,,,,,...,,,,,,,,,,0
1,chutney,,,,,,,,,,...,,,,,,,,,,1
2,turkey,avocado,,,,,,,,,...,,,,,,,,,,2
3,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,...,,,,,,,,,,3
4,low fat yogurt,,,,,,,,,,...,,,,,,,,,,4


### Data Preprocessing

In [22]:
from pyspark.sql.functions import array,array_except,array_distinct
store_data = store_data.select(array(store_data.shrimp,store_data.almonds,store_data.avocado,store_data.vegetables_mix,store_data.green_grapes,store_data.whole_weat_flour,store_data.yams,store_data.cottage_cheese,store_data.energy_drink,store_data.tomato_juice,store_data.low_fat_yogurt,store_data.green_tea,store_data.honey,store_data.salad,store_data.mineral_water,store_data.salmon,store_data.antioxydant_juice,store_data.frozen_smoothie,store_data.spinach,store_data.olive_oil).alias("basket"))
store_data.limit(5).show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|basket                                                                                                                                                  |
+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|[burgers, meatballs, eggs, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL]                        |
|[chutney, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL]                             |
|[turkey, avocado, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL]                           |
|[mineral water, milk, energy bar, whole wheat rice, green tea, NULL, 

In [23]:
store_data = store_data.select(array_distinct(store_data.basket).alias("basket"))
store_data.show()

+--------------------+
|              basket|
+--------------------+
|[burgers, meatbal...|
|     [chutney, NULL]|
|[turkey, avocado,...|
|[mineral water, m...|
|[low fat yogurt, ...|
|[whole wheat past...|
|[soup, light crea...|
|[frozen vegetable...|
|[french fries, NULL]|
|[eggs, pet food, ...|
|     [cookies, NULL]|
|[turkey, burgers,...|
|[spaghetti, champ...|
|[mineral water, s...|
|[mineral water, N...|
|[shrimp, chocolat...|
|[turkey, eggs, NULL]|
|[turkey, fresh tu...|
|[meatballs, milk,...|
|[red wine, shrimp...|
+--------------------+
only showing top 20 rows



### Initialize the FP-Growth model, fit the model to the transactions
### Display frequent itemsets
### Display association rules

In [24]:
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth
# Initialize the FP-Growth model
fp_growth = FPGrowth(itemsCol="basket", minSupport=0.05, minConfidence=0.2)

# Fit the model to the transactions
model = fp_growth.fit(store_data)

# Get frequent itemsets
frequent_itemsets = model.freqItemsets

# Show the frequent itemsets
frequent_itemsets.show()

# Get association rules
association_rules = model.associationRules

# Show the association rules
association_rules.show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|     [grated cheese]| 393|
|[grated cheese, N...| 393|
|       [ground beef]| 737|
| [ground beef, NULL]| 737|
|          [tomatoes]| 513|
|    [tomatoes, NULL]| 513|
|           [burgers]| 654|
|     [burgers, NULL]| 654|
|              [eggs]|1348|
|[eggs, mineral wa...| 382|
|[eggs, mineral wa...| 382|
|        [eggs, NULL]|1348|
|            [turkey]| 469|
|      [turkey, NULL]| 469|
|         [chocolate]|1229|
|[chocolate, miner...| 395|
|[chocolate, miner...| 395|
|   [chocolate, NULL]|1229|
|          [escalope]| 595|
|    [escalope, NULL]| 595|
+--------------------+----+
only showing top 20 rows

+--------------------+---------------+-------------------+------------------+-------------------+
|          antecedent|     consequent|         confidence|              lift|            support|
+--------------------+---------------+-------------------+------------------+-------------------+
|       