### D. 'Bought Together'

In [1]:
# imports
import pandas as pd
import numpy as np
import random

from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram

In [2]:
# load csv data
aisles = pd.read_csv('/Users/Nicole/Desktop/raw/aisles.csv')
dept = pd.read_csv('/Users/Nicole/Desktop/raw/departments.csv')
prod = pd.read_csv('/Users/Nicole/Desktop/raw/products.csv')
train = pd.read_csv('/Users/Nicole/Desktop/raw/order_products__prior.csv')
test = pd.read_csv('/Users/Nicole/Desktop/raw/order_products__train.csv')
orders = pd.read_csv('/Users/Nicole/Desktop/raw/orders.csv')
orders_clustered = pd.read_csv('/Users/Nicole/Desktop/raw/orders_clustered.csv')

df = [aisles, dept, prod, train, test, orders]
df_names = ['aisles','dept','prod','train','test','orders']

#### Product Bundle

**1) Data Preprocessing**

In [3]:
# merge user, order and product info into one df
order_prod = pd.merge(train,prod,on='product_id',how='left')
user_order_prod = pd.merge(order_prod,orders,on='order_id',how='left')
user_order_prod['product_name'][1]  ## seperated by whitespace

'Michigan Organic Kale'

In [4]:
# replace whitespaces with "_"
products = user_order_prod['product_name']
prod_no_space =[]
for product in products:
    product = product.replace(' ','_')
    prod_no_space.append(product)

# replace the original column with no_space one
user_order_prod.drop(['product_name'],axis=1)
user_order_prod['product_name'] = prod_no_space
user_order_prod['product_name'][1] 

'Michigan_Organic_Kale'

In [8]:
user_order_prod.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,Organic_Egg_Whites,86,16,202279,prior,3,5,9,8.0
1,2,28985,2,1,Michigan_Organic_Kale,83,4,202279,prior,3,5,9,8.0
2,2,9327,3,0,Garlic_Powder,104,13,202279,prior,3,5,9,8.0
3,2,45918,4,1,Coconut_Butter,19,13,202279,prior,3,5,9,8.0
4,2,30035,5,0,Natural_Sweetener,17,13,202279,prior,3,5,9,8.0


In [27]:
user_order_prod.to_csv('/Users/Nicole/Desktop/raw/user_order_prod.csv', index_label=False)

In [5]:
# aggregate all products by order id
all_prod = []
for p_name in user_order_prod.groupby('order_id')['product_name']:
    all_prod.append(' '.join(p_name[1]))

# put the results into new df
order_id = user_order_prod.groupby('order_id')['product_name'].agg('count').index
order_id_prod = pd.DataFrame({'order_id':order_id,'products':all_prod})
order_id_prod.head()  ## take a glimpse

Unnamed: 0,order_id,products
0,2,Organic_Egg_Whites Michigan_Organic_Kale Garli...
1,3,Total_2%_with_Strawberry_Lowfat_Greek_Strained...
2,4,Plain_Pre-Sliced_Bagels Honey/Lemon_Cough_Drop...
3,5,"Bag_of_Organic_Bananas Just_Crisp,_Parmesan Fr..."
4,6,Cleanse Dryer_Sheets_Geranium_Scent Clean_Day_...


**2)Extract Bigrams (PySpark)**

In [6]:
# prepare the df in PySpark format
df = []
index = 0
for row in order_id_prod['products']:
    prod_name = row.split(' ')
    tup = (index, prod_name)
    df.append(tup)
    index += 1

In [7]:
# randomly split data into train (70%) and test (30%)
random.shuffle(df)
train_df = df[:2250411]
test_df = df[2250411:]

In [8]:
# convert to spark df
spark = SparkSession.builder.appName("Bigram").getOrCreate()

# read 10,000 lines each time to reduce computation
N = len(train_df)//10000
mod = len(train_df) % 10000
trainDF = spark.createDataFrame(df[0:10000], ['id',"product_name"])

for i in range(1,N):
    trainDF_sub = spark.createDataFrame(train_df[10000*i:10000*(i+1)], ['id',"product_name"])
    traintDF = trainDF.union(trainDF_sub)

# combine into one spark df 
trainDF_sub = spark.createDataFrame(train_df[10000*N:len(train_df)], ['id',"product_name"])
trainDF = trainDF.union(trainDF_sub)

In [9]:
# collect bigrams
ngram = NGram(n=2, inputCol="product_name", outputCol="bigrams")
ngram_df = ngram.transform(trainDF)

In [10]:
# count frequency
bigrams = ngram_df.toPandas()['bigrams']
table = {}
total = len(bigrams)
completion = 0
for bigram in bigrams:
    for combination in bigram:
        components = combination.split(' ')
        key = components[0]
        valKey = components[1]
        if key in table:
            valueDict = table[key]
            if valKey in valueDict:
                valueDict[valKey] = valueDict[valKey] + 1
            else:
                valueDict[valKey] = 1
        else:
            # create new value for key
            valueDict =  {valKey: 1}
            table[key] = valueDict
    completion += 1

In [21]:
# product combinations appears more than 20 times
for firstWord in table:
    for secondWord in table[firstWord]:
        if table[firstWord][secondWord] > 20:
            print(firstWord, "&", secondWord, ",", table[firstWord][secondWord])

Banana & Organic_Avocado , 22
Banana & Strawberries , 21
Banana & Organic_Fuji_Apple , 24
Banana & Large_Lemon , 21
Organic_Strawberries & Bag_of_Organic_Bananas , 21
Bag_of_Organic_Bananas & Organic_Hass_Avocado , 27
Bag_of_Organic_Bananas & Organic_Strawberries , 27
Bag_of_Organic_Bananas & Organic_Raspberries , 21
Organic_Baby_Spinach & Banana , 21
Organic_Hass_Avocado & Bag_of_Organic_Bananas , 21


In [22]:
for firstWord in table:
    for secondWord in table[firstWord]:
        if table[firstWord][secondWord] > 10 and table[firstWord][secondWord] < 20:
            print(firstWord, "&", secondWord, ",", table[firstWord][secondWord])

Banana & Organic_Baby_Spinach , 17
Banana & Organic_Strawberries , 17
Banana & Cucumber_Kirby , 15
Banana & Organic_Hass_Avocado , 12
Organic_Strawberries & Organic_Baby_Spinach , 12
Organic_Strawberries & Banana , 15
Organic_Strawberries & Organic_Blueberries , 15
Organic_Strawberries & Organic_Raspberries , 16
Bag_of_Organic_Bananas & Organic_Whole_Milk , 14
Bag_of_Organic_Bananas & Organic_Baby_Spinach , 11
Organic_Baby_Spinach & Bag_of_Organic_Bananas , 15
Organic_Hass_Avocado & Organic_Strawberries , 14
Organic_Hass_Avocado & Organic_Raspberries , 14
Organic_Hass_Avocado & Organic_Baby_Spinach , 11
Organic_Hass_Avocado & Banana , 13
Organic_Raspberries & Bag_of_Organic_Bananas , 13
Organic_Raspberries & Organic_Strawberries , 17
Strawberries & Banana , 11
Strawberries & Blueberries , 14
Large_Lemon & Banana , 14
Organic_Yellow_Onion & Organic_Garlic , 14
Organic_Cucumber & Organic_Hass_Avocado , 12
Organic_Cilantro & Limes , 15
Limes & Large_Lemon , 15
Organic_Zucchini & Organic_Y

#### Aisle Bundle

**1) Data Processing**

In [3]:
# merge user, order and product info into one df
order_aisle = pd.merge(pd.merge(train,prod,how='left',on='product_id'),\
                       aisles, on="aisle_id", how="left")
user_order_aisle = pd.merge(order_aisle,orders,how='left',on='order_id')

In [4]:
# replace whitespaces with "_"
aisles_col = user_order_aisle['aisle']
aisle_no_space =[]
for aisle in aisles_col:
    aisle = aisle.replace(' ','_')
    aisle_no_space.append(aisle)

# replace the original column with no_space one
user_order_aisle.drop(['aisle'],axis=1)
user_order_aisle['aisle'] = aisle_no_space
user_order_aisle['aisle'][1] 

'fresh_vegetables'

In [6]:
# aggregate all aisles by order id
all_aisle = []
for a_name in user_order_aisle.groupby('order_id')['aisle']:
    all_aisle.append(' '.join(a_name[1]))

# put the results into new df
order_id2 = user_order_aisle.groupby('order_id')['aisle'].agg('count').index
order_id_aisle = pd.DataFrame({'order_id':order_id2,'aisles':all_aisle})
order_id_aisle.head()  ## take a glimpse

Unnamed: 0,order_id,aisles
0,2,eggs fresh_vegetables spices_seasonings oils_v...
1,3,yogurt soy_lactosefree packaged_vegetables_fru...
2,4,breakfast_bakery cold_flu_allergy energy_grano...
3,5,fresh_fruits salad_dressing_toppings prepared_...
4,6,refrigerated laundry air_fresheners_candles


**2)Extract Bigrams (PySpark)**

In [7]:
# prepare the df in PySpark format
df2 = []
index = 0
for row in order_id_aisle['aisles']:
    aisle_name = row.split(' ')
    tup = (index, aisle_name)
    df2.append(tup)
    index += 1

In [8]:
# randomly split data into train (70%) and test (30%)
random.shuffle(df2)
train_df2 = df2[:2250411]
test_df2 = df2[2250411:]

In [9]:
# convert to spark df
spark = SparkSession.builder.appName("Bigram").getOrCreate()

# read 10,000 lines each time to reduce computation
N2 = len(train_df2)//10000
mod2 = len(train_df2) % 10000
trainDF2 = spark.createDataFrame(df2[0:10000], ['id',"aisle"])

for i in range(1,N2):
    trainDF2_sub = spark.createDataFrame(train_df2[10000*i:10000*(i+1)], ['id',"aisle"])
    traintDF2 = trainDF2.union(trainDF2_sub)

# combine into one spark df 
trainDF2_sub = spark.createDataFrame(train_df2[10000*N2:len(train_df2)], ['id',"aisle"])
trainDF2 = trainDF2.union(trainDF2_sub)

In [10]:
# collect bigrams
ngram2 = NGram(n=2, inputCol="aisle", outputCol="bigrams")
ngram2_df = ngram2.transform(trainDF2)

In [11]:
# count frequency
bigrams2 = ngram2_df.toPandas()['bigrams']
table2 = {}
total2 = len(bigrams2)
completion = 0
for bigram in bigrams2:
    for combination in bigram:
        components = combination.split(' ')
        key = components[0]
        valKey = components[1]
        if key in table2:
            valueDict = table2[key]
            if valKey in valueDict:
                valueDict[valKey] = valueDict[valKey] + 1
            else:
                valueDict[valKey] = 1
        else:
            # create new value for key
            valueDict =  {valKey: 1}
            table2[key] = valueDict
    completion += 1

In [15]:
# product combinations appears more than 200 times
for firstWord in table2:
    for secondWord in table2[firstWord]:
        if table2[firstWord][secondWord] > 200:
            print(firstWord, "&", secondWord, ",", table2[firstWord][secondWord])

fresh_vegetables & fresh_vegetables , 2839
fresh_vegetables & fresh_fruits , 1183
fresh_vegetables & yogurt , 201
fresh_vegetables & packaged_vegetables_fruits , 778
fresh_vegetables & packaged_cheese , 269
fresh_vegetables & fresh_herbs , 305
fresh_fruits & packaged_cheese , 282
fresh_fruits & fresh_vegetables , 1339
fresh_fruits & fresh_fruits , 2295
fresh_fruits & water_seltzer_sparkling_water , 210
fresh_fruits & milk , 307
fresh_fruits & yogurt , 474
fresh_fruits & soy_lactosefree , 239
fresh_fruits & packaged_vegetables_fruits , 881
packaged_vegetables_fruits & packaged_vegetables_fruits , 515
packaged_vegetables_fruits & fresh_fruits , 818
packaged_vegetables_fruits & fresh_vegetables , 826
yogurt & yogurt , 1305
yogurt & fresh_fruits , 415
yogurt & fresh_vegetables , 268
ice_cream_ice & ice_cream_ice , 358
milk & fresh_fruits , 381
packaged_cheese & fresh_fruits , 276
packaged_cheese & packaged_cheese , 316
packaged_cheese & fresh_vegetables , 266
fresh_herbs & fresh_vegetables

### D. Recommendations

In [13]:
def getPureData(prodName):
    
    '''sort the bigram frequencies in descending order, 
       then return merely the corresponding product names in the same order'''
    
    if prodName not in table:
        return []
    sortedOringalList = sorted(table[prodName].items(), key=lambda x: x[1], reverse=True)
#     print(sortedOringalList)
    data = {}
    for tp in sortedOringalList:
        product = tp[0]
        number = tp[1]
        if number in data:
            productList = data[number]
            productList.append(product)
        else:
            productList = [product]
        data[number] = productList
#     print(data)
#     print("==> Get pure data name:")
    pureData = data.values()
#     print(pureData)
    return list(pureData)

def pickRecommendProds(pureData, numberOfRecommend):
    
    '''Pick certain number of products from the sorted product names'''
    
    recommendProds = []
    for prods in pureData:
        if len(prods) <= numberOfRecommend:
            recommendProds += prods
            numberOfRecommend -= len(prods)
        else:
            recommendProds += random.sample(prods, numberOfRecommend)
            numberOfRecommend = 0

        if numberOfRecommend == 0:
            break
    
    return recommendProds

# recommend products bought together with 'name'
# name: the product to start with
def getRecommend(name, numberOfRecommend):
    
    '''Recommend certain number of products bought after the given input name'''
    
    # numberOfRecommend = 10
    recommendProducts = []
    productName = name
    index = 0

    while (numberOfRecommend):
#         print("->Target: ", productName)
#         print("->numberOfRecommend: ", numberOfRecommend)
#         print("->Index: ", index)
        data = getPureData(productName)
    #     print("Pure data:", data)
        intermediate = pickRecommendProds(data, numberOfRecommend)
        recommendProducts += intermediate
#         print("Recommend: ", recommendProducts)
#         print("Recommend: ", recommendProducts)
        if len(intermediate) == 0 and index == len(recommendProducts):
            break
        numberOfRecommend -= len(intermediate)
        if numberOfRecommend > 0:
#             print("Still left: ", numberOfRecommend)
            productName = recommendProducts[index]
            index += 1

#         print("==================")

    return recommendProducts

In [19]:
print(getRecommend("Chocolate_Sandwich_Cookies", 5))

['Reduced_Fat_2%_Milk', 'Semi-Sweet_Chocolate_Morsels', 'Clementines', 'Little_Bites_Blueberry_Muffin_Pouches', 'Crunchy_Cheese_Flavored_Snacks']


In [20]:
print(getRecommend("Chocolate_Sandwich_Cookies", 15))

['Semi-Sweet_Chocolate_Morsels', 'Crunchy_Cheese_Flavored_Snacks', 'Reduced_Fat_2%_Milk', 'Clementines', 'Ritz_Crackers', 'Little_Bites_Blueberry_Muffin_Pouches', 'Raspberries', 'All-Purpose_Flour', 'Organic_Half_&_Half', 'Restaurant_Style_Medium_Salsa', 'Classic_Seasoning_with_Lemon_Skillet_Crisp_Tilapia', "Crunchy_Flamin'_Hot", 'Fat_Free_Milk', '0%_Greek_Strained_Yogurt', 'Organic_Unsweetened_Vanilla_Almond_Milk']


In [23]:
def TestScore(test_data):
    
    scores = []

    for order_info in test_data:
        this_order = order_info[1]
        order_len = len(this_order)
        #print('order:', this_order)
        #print('length of order', order_len)
        i = 0
        this_score = 0

        while (i < order_len):
            if this_order[i] in table:
                # use original order length as the num of recommendation
                recommends = getRecommend(this_order[i], order_len)
                #print('====> recommends of ', this_order[i], " : ", recommends)
                laterProds = this_order[i:]
                # check if the recommended products is included in order
                for prod in laterProds:
                    if prod in recommends:
                        #print("-->", prod)
                        this_score += 1
                i += 1
            else:
                # if the product is not trained in model, skip
                i += 1
                order_len -= 1

        #print(this_score)
        if not order_len == 0:
            scores.append(this_score/order_len)
        #print(scores)
        
    # return a list of predicted scores
    return(scores)

In [26]:
scores = TestScore(test_df)
print("Mean Test Scores: ", np.mean(scores))

Mean Test Scores:  0.18265900066835283
