In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
sc = SparkContext(master='local', appName='Ex6')
spark = SparkSession(sc)

In [4]:
from pyspark.sql.functions import count, col, when, isnan, isnull, concat,lit

### Data Exploring

In [5]:
data = spark.read.csv('./data/75000/75000-out1.csv',inferSchema=True, header=False)

In [6]:
data.show()

+---+----+----+
|_c0| _c1| _c2|
+---+----+----+
|  1|11.0|21.0|
|  2| 7.0|11.0|
|  3| 3.0|33.0|
|  4| 5.0|12.0|
|  5| 6.0|18.0|
|  6| 2.0| 4.0|
|  7|15.0|16.0|
|  8| 2.0| 3.0|
|  9|18.0|23.0|
| 10|44.0|45.0|
| 11|17.0|38.0|
| 12| 2.0| 3.0|
| 13| 3.0|17.0|
| 14|17.0|35.0|
| 15|15.0|37.0|
| 16| 0.0| 2.0|
| 17|17.0|47.0|
| 18|14.0|null|
| 19|16.0|39.0|
| 20|13.0|42.0|
+---+----+----+
only showing top 20 rows



In [7]:
data = data.select(col('_c0').alias('Order_ID'), col('_c1').alias('Item_1'), col('_c2').alias('Item_2'))

In [8]:
data.describe().show()

+-------+-----------------+------------------+------------------+
|summary|         Order_ID|            Item_1|            Item_2|
+-------+-----------------+------------------+------------------+
|  count|            75000|             75000|             71408|
|   mean|          37500.5|10.907106666666667|21.120266636791396|
| stddev|21650.77943169714| 9.988473077663798|  11.6884778948076|
|    min|                1|               0.0|               1.0|
|    max|            75000|              49.0|              49.0|
+-------+-----------------+------------------+------------------+



**Nhận xét**:
    - Thống kê cho thấy cột Item_2 có dữ liệu thiếu

In [9]:
data.printSchema()

root
 |-- Order_ID: integer (nullable = true)
 |-- Item_1: double (nullable = true)
 |-- Item_2: double (nullable = true)



## Cleaning data

### Replace ID by food's name

In [10]:
goods = spark.read.csv('./data/75000/goods.csv',inferSchema=True,header=True)

In [11]:
goods = goods.withColumn('Flavor_Food', concat(col('Flavor'),lit('-'),col('Food')))

In [12]:
goods.show()

+---+------------+--------+-----+------+--------------------+
| Id|      Flavor|    Food|Price|  Type|         Flavor_Food|
+---+------------+--------+-----+------+--------------------+
|  0| 'Chocolate'|  'Cake'| 8.95|'Food'|  'Chocolate'-'Cake'|
|  1|     'Lemon'|  'Cake'| 8.95|'Food'|      'Lemon'-'Cake'|
|  2|    'Casino'|  'Cake'|15.95|'Food'|     'Casino'-'Cake'|
|  3|     'Opera'|  'Cake'|15.95|'Food'|      'Opera'-'Cake'|
|  4|'Strawberry'|  'Cake'|11.95|'Food'| 'Strawberry'-'Cake'|
|  5|   'Truffle'|  'Cake'|15.95|'Food'|    'Truffle'-'Cake'|
|  6| 'Chocolate'|'Eclair'| 3.25|'Food'|'Chocolate'-'Eclair'|
|  7|    'Coffee'|'Eclair'|  3.5|'Food'|   'Coffee'-'Eclair'|
|  8|   'Vanilla'|'Eclair'| 3.25|'Food'|  'Vanilla'-'Eclair'|
|  9|  'Napoleon'|  'Cake'|13.49|'Food'|   'Napoleon'-'Cake'|
| 10|    'Almond'|  'Tart'| 3.75|'Food'|     'Almond'-'Tart'|
| 11|     'Apple'|   'Pie'| 5.25|'Food'|       'Apple'-'Pie'|
| 12|     'Apple'|  'Tart'| 3.25|'Food'|      'Apple'-'Tart'|
| 13|   

In [13]:
data_temp = data.join(goods,data.Item_1 == goods.Id,how='left').select('Order_ID',col('Flavor_Food').alias('Item_1'),'Item_2')

In [14]:
data_food = data_temp.join(goods,data_temp.Item_2 == goods.Id,how='left').select('Order_ID','Item_1',col('Flavor_Food').alias('Item_2'))

In [15]:
data_food.show()

+--------+--------------------+--------------------+
|Order_ID|              Item_1|              Item_2|
+--------+--------------------+--------------------+
|       1|       'Apple'-'Pie'|  'Ganache'-'Cookie'|
|       2|   'Coffee'-'Eclair'|       'Apple'-'Pie'|
|       3|      'Opera'-'Cake'|'Cheese'-'Croissant'|
|       4|    'Truffle'-'Cake'|      'Apple'-'Tart'|
|       5|'Chocolate'-'Eclair'|     'Cherry'-'Tart'|
|       6|     'Casino'-'Cake'| 'Strawberry'-'Cake'|
|       7| 'Blackberry'-'Tart'|  'Blueberry'-'Tart'|
|       8|     'Casino'-'Cake'|      'Opera'-'Cake'|
|       9|     'Cherry'-'Tart'|'Raspberry'-'Cookie'|
|      10|   'Bottled'-'Water'|      'Hot'-'Coffee'|
|      11|  'Chocolate'-'Tart'|'Almond'-'Bear Claw'|
|      12|     'Casino'-'Cake'|      'Opera'-'Cake'|
|      13|      'Opera'-'Cake'|  'Chocolate'-'Tart'|
|      14|  'Chocolate'-'Tart'|  'Apricot'-'Danish'|
|      15| 'Blackberry'-'Tart'|    'Almond'-'Twist'|
|      16|  'Chocolate'-'Cake'|     'Casino'-'

### Check missing values

In [16]:
data_food.select([count(when(isnan(col),col)).alias(col) for col in data.columns]).toPandas()

Unnamed: 0,Order_ID,Item_1,Item_2
0,0,0,0


In [17]:
data_food.select([count(when(isnull(col),col)).alias(col) for col in data.columns]).toPandas()

Unnamed: 0,Order_ID,Item_1,Item_2
0,0,0,3592


In [18]:
null_data = data_food.where(col('Item_2').isNull())
clean_data = data_food.dropna()

### Correct data format

In [19]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import array

In [20]:
final_data = clean_data.withColumn('items',array(col('Item_1'),col('Item_2')))

In [21]:
final_data.show()

+--------+--------------------+--------------------+--------------------+
|Order_ID|              Item_1|              Item_2|               items|
+--------+--------------------+--------------------+--------------------+
|       1|       'Apple'-'Pie'|  'Ganache'-'Cookie'|['Apple'-'Pie', '...|
|       2|   'Coffee'-'Eclair'|       'Apple'-'Pie'|['Coffee'-'Eclair...|
|       3|      'Opera'-'Cake'|'Cheese'-'Croissant'|['Opera'-'Cake', ...|
|       4|    'Truffle'-'Cake'|      'Apple'-'Tart'|['Truffle'-'Cake'...|
|       5|'Chocolate'-'Eclair'|     'Cherry'-'Tart'|['Chocolate'-'Ecl...|
|       6|     'Casino'-'Cake'| 'Strawberry'-'Cake'|['Casino'-'Cake',...|
|       7| 'Blackberry'-'Tart'|  'Blueberry'-'Tart'|['Blackberry'-'Ta...|
|       8|     'Casino'-'Cake'|      'Opera'-'Cake'|['Casino'-'Cake',...|
|       9|     'Cherry'-'Tart'|'Raspberry'-'Cookie'|['Cherry'-'Tart',...|
|      10|   'Bottled'-'Water'|      'Hot'-'Coffee'|['Bottled'-'Water...|
|      11|  'Chocolate'-'Tart'|'Almond

## Training

In [22]:
from pyspark.ml.fpm import FPGrowth

In [23]:
fpgrowth = FPGrowth(minSupport=0.01,minConfidence=0.01,itemsCol='items')

In [25]:
model = fpgrowth.fit(final_data)

In [29]:
#Display the frequent itemset:
model.freqItemsets.show(truncate=False)

+----------------------------------------+----+
|items                                   |freq|
+----------------------------------------+----+
|['Chocolate'-'Eclair']                  |2868|
|['Vanilla'-'Meringue']                  |1352|
|['Napoleon'-'Cake']                     |5053|
|['Napoleon'-'Cake', 'Strawberry'-'Cake']|2504|
|['Raspberry'-'Cookie']                  |3349|
|['Almond'-'Croissant']                  |1099|
|['Apple'-'Croissant']                   |2510|
|['Apple'-'Croissant', 'Apple'-'Tart']   |1574|
|['Apple'-'Pie']                         |4900|
|['Apple'-'Pie', 'Coffee'-'Eclair']      |2452|
|['Orange'-'Juice']                      |1858|
|['Orange'-'Juice', 'Cheese'-'Croissant']|1138|
|['Apple'-'Danish']                      |744 |
|['Lemon'-'Cookie']                      |3213|
|['Lemon'-'Cookie', 'Raspberry'-'Cookie']|1809|
|['Chocolate'-'Cake']                    |6206|
|['Truffle'-'Cake']                      |5906|
|['Marzipan'-'Cookie']                  

### Recommend items

In [27]:
mostpopularItems = model.transform(final_data)

In [31]:
mostpopularItems.drop('items').show(truncate=False)

+--------+--------------------+-----------------------+-------------------------------------------+
|Order_ID|Item_1              |Item_2                 |prediction                                 |
+--------+--------------------+-----------------------+-------------------------------------------+
|1       |'Apple'-'Pie'       |'Ganache'-'Cookie'     |['Coffee'-'Eclair']                        |
|2       |'Coffee'-'Eclair'   |'Apple'-'Pie'          |['Blackberry'-'Tart']                      |
|3       |'Opera'-'Cake'      |'Cheese'-'Croissant'   |['Cherry'-'Tart', 'Orange'-'Juice']        |
|4       |'Truffle'-'Cake'    |'Apple'-'Tart'         |['Apple'-'Croissant', 'Gongolais'-'Cookie']|
|5       |'Chocolate'-'Eclair'|'Cherry'-'Tart'        |['Opera'-'Cake']                           |
|6       |'Casino'-'Cake'     |'Strawberry'-'Cake'    |['Chocolate'-'Cake', 'Napoleon'-'Cake']    |
|7       |'Blackberry'-'Tart' |'Blueberry'-'Tart'     |['Apricot'-'Croissant', 'Coffee'-'Eclair'] |


### Apply on filling Null Data

In [33]:
null_data = null_data.withColumn('items',array(col('Item_1'),col('Item_2')))

In [34]:
recommendation = model.transform(null_data)

In [37]:
recommendation.drop('item').show(truncate=False)

+--------+----------------------+------+------------------------------+----------------------+
|Order_ID|Item_1                |Item_2|items                         |prediction            |
+--------+----------------------+------+------------------------------+----------------------+
|18      |'Berry'-'Tart'        |null  |['Berry'-'Tart', null]        |['Bottled'-'Water']   |
|50      |'Cheese'-'Croissant'  |null  |['Cheese'-'Croissant', null]  |['Orange'-'Juice']    |
|66      |'Chocolate'-'Meringue'|null  |['Chocolate'-'Meringue', null]|[]                    |
|72      |'Apple'-'Croissant'   |null  |['Apple'-'Croissant', null]   |['Apple'-'Tart']      |
|79      |'Green'-'Tea'         |null  |['Green'-'Tea', null]         |[]                    |
|81      |'Chocolate'-'Cake'    |null  |['Chocolate'-'Cake', null]    |['Casino'-'Cake']     |
|98      |'Cherry'-'Tart'       |null  |['Cherry'-'Tart', null]       |['Opera'-'Cake']      |
|101     |'Napoleon'-'Cake'     |null  |['Napoleon