# Market Basket Analysis - Recommend Items Frequently Purchased Together
This notebook demonstrates how effective it is to recommend items that are frequently purchased together. The strategy is as follows:
* recommend items previously purchased
* recommend items that are bought together with previous purchases
* recommend popular items

In [1]:
import cudf
import pandas as pd
import gc

# Report the installed cuDF version so the run can be reproduced.
print('RAPIDS version',cudf.__version__)

RAPIDS version 21.10.01


# Load Transactions, Reduce Memory


In [2]:
# Load the raw transaction log straight onto the GPU.
train = cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')

# Reduce memory: replace the customer hash string by the integer value of its
# last 16 hex characters, and store article ids as 32-bit integers.
customer_hex_tail = train['customer_id'].str[-16:]
train['customer_id'] = customer_hex_tail.str.hex_to_int().astype('int64')
train['article_id'] = train['article_id'].astype('int32')

# Parse the transaction date so it can be compared and subtracted later.
train['t_dat'] = cudf.to_datetime(train['t_dat'])

# Keep only the three columns used downstream, in a fixed order, and cache the
# compact frame as Parquet (it is read back in the popular-items cell).
train = train[['t_dat', 'customer_id', 'article_id']]
train.to_parquet('train.pqt', index=False)

# Show the size of the frame and a preview of its rows.
print(train.shape)
train.head()


(31788324, 3)


Unnamed: 0,t_dat,customer_id,article_id
0,2018-09-20,-6846340800584936,663713001
1,2018-09-20,-6846340800584936,541518023
2,2018-09-20,-8334631767138808638,505221004
3,2018-09-20,-8334631767138808638,685687003
4,2018-09-20,-8334631767138808638,685687004


In [3]:
# Most recent purchase date of every customer, stored as 'max_dat'.
last_purchase = train.groupby('customer_id')['t_dat'].max().reset_index()
last_purchase = last_purchase.rename(columns={'t_dat': 'max_dat'})

# Attach that date to each of the customer's transactions.
train = train.merge(last_purchase, on=['customer_id'], how='left')

# Number of whole days between a transaction and the customer's latest one.
train['diff_dat'] = (train['max_dat'] - train['t_dat']).dt.days

# Retain only transactions at most 6 days older than the customer's latest purchase.
is_recent = train['diff_dat'] <= 6
train = train.loc[is_recent]

# Report how many rows survive the filter.
print('Train shape:', train.shape)


Train shape: (5181535, 5)


# Apriori Transformation


In [4]:
# Keep only transactions dated on or after 2020-09-16; this is the input for Apriori.
# NOTE(review): this reassigns `train`, so the recommendation cells further down
# also see only these rows rather than every customer's own last week of
# purchases computed in the previous cell — confirm that this is intended.
train = train.loc[train.t_dat >= cudf.to_datetime('2020-09-16')]
train

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat
31520960,2020-09-16,825696797860015377,874754016,2020-09-16,0
31520961,2020-09-16,825696797860015377,874754002,2020-09-16,0
31520962,2020-09-16,825696797860015377,935694002,2020-09-16,0
31520963,2020-09-16,-2424921637038555937,751567009,2020-09-16,0
31520964,2020-09-16,825696797860015377,717490075,2020-09-16,0
...,...,...,...,...,...
31788319,2020-09-22,5776383470964371557,573085057,2020-09-22,0
31788320,2020-09-22,8705515275876785235,890074001,2020-09-22,0
31788321,2020-09-22,-731468567251345846,868879003,2020-09-22,0
31788322,2020-09-22,8705515275876785235,886566001,2020-09-22,0


In [5]:
# Apriori treats duplicates as a single occurrence, so build one row per
# customer holding the list of distinct articles that customer bought.
articles_by_customer = train.groupby('customer_id')['article_id']
train_apri = articles_by_customer.unique().reset_index()
train_apri

Unnamed: 0,customer_id,article_id
0,-9223100958908512198,"[673677024, 673677027, 785034009]"
1,-9223002534477110135,[793699001]
2,-9222810895170663723,"[892309001, 903306004]"
3,-9221924794303263774,"[708138013, 888343003, 892857002]"
4,-9221811157628158522,[891663002]
...,...,...
68979,9221607025486275188,"[699755071, 699755081, 751994003]"
68980,9221813808370389952,"[562245099, 865929002, 869331006]"
68981,9222310594107555341,"[884319001, 884319003]"
68982,9223099843213569889,[903924002]


In [6]:
from mlxtend.preprocessing import TransactionEncoder
# Encoder instance; it is fitted on and applied to the per-customer article lists in the next cell.
te = TransactionEncoder()

In [7]:
# Copy the per-customer article lists from GPU to host memory once; the same
# pandas Series feeds both the fit and the transform step.
baskets = train_apri['article_id'].to_pandas()

# Learn the set of distinct articles, then encode every basket as one boolean
# row with one column per article.
orders_1hot = te.fit(baskets).transform(baskets)

# Wrap the boolean array in a DataFrame whose columns are the article ids
# learned by the encoder.
orders_1hot = pd.DataFrame(orders_1hot, columns=te.columns_)
orders_1hot.head()

Unnamed: 0,108775044,111565001,111586001,111593001,111609001,123173001,129085001,129085027,130035001,144993001,...,948152002,949198001,949551001,949551002,949594001,952267001,952938001,953450001,953763001,956217002
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
from mlxtend.frequent_patterns import apriori

In [9]:
# Mine frequent itemsets from the one-hot basket matrix.
is_ap = apriori(
    orders_1hot,
    min_support=0.003,   # minimum support threshold passed to apriori
    max_len=2,           # consider itemsets of at most two articles
    use_colnames=True,   # report article ids instead of column positions
)

In [10]:
# Show the frequent itemsets ordered from lowest to highest support.
is_ap.sort_values('support')

Unnamed: 0,support,itemsets
58,0.003001,(914441001)
7,0.003030,(573085043)
55,0.003030,(910601002)
3,0.003059,(456163060)
21,0.003059,(781613006)
...,...,...
34,0.006973,(866731001)
68,0.007437,(923758001)
70,0.007726,(924243002)
64,0.008248,(918522001)


# Calculate Association Rules

In [11]:
from mlxtend.frequent_patterns import association_rules

In [12]:
# Display the frequent itemsets that are passed to association_rules below.
is_ap

Unnamed: 0,support,itemsets
0,0.003233,(372860001)
1,0.003131,(372860002)
2,0.005624,(448509014)
3,0.003059,(456163060)
4,0.003349,(456163086)
...,...,...
72,0.004088,(929165002)
73,0.004566,(929275001)
74,0.004247,(930380001)
75,0.004813,(934835001)


In [13]:
# Derive association rules from the frequent itemsets found by Apriori,
# scoring candidate rules by lift. 0.8 is the library's default threshold,
# spelled out here so the cut-off is visible.
# NOTE(review): the displayed result has no rows; presumably `is_ap` holds no
# two-item itemsets at the chosen min_support, so no rule can be formed —
# confirm and lower min_support if rules are wanted.
rules = association_rules(is_ap, metric="lift", min_threshold=0.8)

# Preview the rule table.
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


# Find Each Customer's Last Week of Purchases

We will keep our predictions in the same order as the rows of our dataframe. Later, we will build each customer's prediction string by concatenating the `article_id` values per customer with `train.groupby('customer_id').article_id.sum()`.

# (1) Recommend Most Often Previously Purchased Items


In [14]:
# How many times each customer bought each article ('ct').
purchase_counts = (
    train.groupby(['customer_id', 'article_id'])['t_dat']
    .count()
    .reset_index()
    .rename(columns={'t_dat': 'ct'})
)

# Attach the purchase count to every transaction row.
pair_keys = ['customer_id', 'article_id']
train = train.merge(purchase_counts, on=pair_keys, how='left')

# Rank rows: most frequently bought first, ties broken by most recent date.
rank_keys = ['ct', 't_dat']
train = train.sort_values(rank_keys, ascending=False)

# One row per (customer, article); the first occurrence is the one kept.
train = train.drop_duplicates(pair_keys)

# Sort again after de-duplicating — presumably because the row order is not
# relied upon to survive drop_duplicates; TODO confirm.
train = train.sort_values(rank_keys, ascending=False)
train.head()

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,ct
64,2020-09-16,-6341822073530655775,685814001,2020-09-16,0,30
66016,2020-09-18,954634701602309532,895555001,2020-09-18,0,26
162796,2020-09-21,-1032181102150868027,677930023,2020-09-21,0,25
77416,2020-09-18,-6439897037483747065,685813003,2020-09-19,1,25
114984,2020-09-18,6200564461742194660,715624001,2020-09-18,0,20


# (2) Recommend Items Purchased Together


In [15]:
# Switch to pandas so a column can be mapped through a Python lookup object.
import numpy as np
import pandas as pd

train = train.to_pandas()

# Load the precomputed pairing table; `.item()` unwraps the stored Python
# object from the array — presumably a dict article_id -> paired article_id.
# NOTE(review): allow_pickle=True unpickles the file's contents, which can
# execute arbitrary code; only load this file from a trusted source.
pairs = np.load(
    '../input/hmitempairs/pairs_cudf.npy',
    allow_pickle=True,
).item()

# Look up the paired article for every purchased article; articles without an
# entry in `pairs` get a missing value.
train['article_id2'] = train['article_id'].map(pairs)


In [16]:
# Build the paired-item recommendations as their own frame:
#   1. take the customer and the paired article,
#   2. keep only rows that actually have a paired article (drop missing values),
#   3. keep one row per (customer, paired article),
#   4. rename the column to 'article_id' so it lines up with the purchases frame.
train2 = (
    train[['customer_id', 'article_id2']]
    .dropna(subset=['article_id2'])
    .drop_duplicates(['customer_id', 'article_id2'])
    .rename(columns={'article_id2': 'article_id'})
)

In [17]:
# Stack the paired-item recommendations underneath the previously purchased
# items, so a customer's own purchases come first in the combined frame.
previous_purchases = train[['customer_id', 'article_id']]
train = (
    pd.concat([previous_purchases, train2], axis=0, ignore_index=True)
    # Bring article ids back to 32-bit integers.
    .astype({'article_id': 'int32'})
    # A paired item the customer already bought is kept only once (first row wins).
    .drop_duplicates(['customer_id', 'article_id'])
)

In [18]:
# Turn each customer's recommendations into one space-separated string.
# Every id is rendered as text prefixed with ' 0': the space separates ids and
# the '0' presumably restores the leading zero lost when ids were cast to int.
train['article_id'] = ' 0' + train['article_id'].astype('str')

# Summing strings within a group concatenates them, giving one string per customer.
joined_per_customer = train.groupby('customer_id')['article_id'].sum().reset_index()
joined_per_customer = joined_per_customer.rename(columns={'article_id': 'prediction'})

# Move the result back to the GPU for the merge onto the submission file.
preds = cudf.DataFrame(joined_per_customer)

# Preview the per-customer prediction strings.
preds.head()

Unnamed: 0,customer_id,prediction
0,-9223100958908512198,0673677024 0673677027 0785034009 0673677002 0...
1,-9223002534477110135,0793699001 0751542002
2,-9222810895170663723,0892309001 0903306004 0786187005
3,-9221924794303263774,0708138013 0888343003 0892857002 0741717002 0...
4,-9221811157628158522,0891663002 0706016001


# (3) Recommend Last Week's Most Popular Items


In [19]:
# Reload the full transaction table cached earlier as Parquet.
train = cudf.read_parquet('train.pqt')

# Make sure the date column is a datetime before comparing against a date.
train['t_dat'] = cudf.to_datetime(train['t_dat'])

# Restrict to transactions dated on or after 2020-09-16.
in_window = train['t_dat'] >= cudf.to_datetime('2020-09-16')
train = train.loc[in_window]

# value_counts orders articles by how often they occur; take the 12 most
# frequent ids and format them like the per-customer predictions (' 0' prefix).
popular_ids = train['article_id'].value_counts().to_pandas().index[:12].astype('str')
top12 = ' 0' + ' 0'.join(popular_ids)

# Show the fallback recommendation string.
print("Last week's top 12 popular items:")
print(top12)

Last week's top 12 popular items:
 0924243001 0924243002 0918522001 0923758001 0866731001 0909370001 0751471001 0915529003 0915529005 0448509014 0762846027 0714790020


# Write Submission CSV
We will merge our predictions onto `sample_submission.csv` and submit to Kaggle.

In [20]:
# Start from the sample submission; only its customer ids are needed.
sub = cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
sub = sub[['customer_id']]

# Derive the same integer key used for the transactions: the last 16 hex
# characters of the customer id, read as an int64.
sub['customer_id_2'] = sub['customer_id'].str[-16:].str.hex_to_int().astype('int64')

# Left-join the personalised predictions on that key; customers without any
# prediction get an empty string instead of a null.
preds_keyed = preds.rename({'customer_id':'customer_id_2'},axis=1)
sub = sub.merge(preds_keyed, on='customer_id_2', how='left').fillna('')

# The integer key is only needed for the join.
sub = sub.drop(columns=['customer_id_2'])

# Append the popular items after the personalised ones, trim the surrounding
# whitespace, and cut to 131 characters: 12 ids x 10 characters + 11 separating
# spaces, i.e. exactly 12 recommendations.
# NOTE(review): popular items already present in a customer's own list are not
# removed, so a prediction may contain the same article twice.
sub['prediction'] = (sub['prediction'] + top12).str.strip().str[:131]

# Write the submission file and preview it.
sub.to_csv(f'submission.csv',index=False)
sub.head()

Unnamed: 0,customer_id,prediction
0,031eae8665c6b7b97a6eb6376d57c46c439bb28ea7b591...,0924243001 0924243002 0918522001 0923758001 08...
1,031f5f30a4490ca8cd5c963338086d633d782b18924d31...,0924243001 0924243002 0918522001 0923758001 08...
2,031f0e83c36d2d8dd8bfd924059acb655b45ec96c9e398...,0924243001 0924243002 0918522001 0923758001 08...
3,031fc7ec750c9be3141b29b8a80e71cd064ff5c3c7699f...,0924243001 0924243002 0918522001 0923758001 08...
4,031ec24658c051297b7c53e3dc86d3a3799bfce699d790...,0924243001 0924243002 0918522001 0923758001 08...
