In [21]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from sklearn import preprocessing


In [2]:
transactions = pd.read_csv("data/transactions_train.csv")
customers = pd.read_csv("data/customers.csv")
articles = pd.read_csv("data/articles.csv")

In [3]:
# adding product code to transactions df
transactions = pd.merge(transactions, articles[['article_id', 'product_code']], left_on="article_id",right_on="article_id")
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,product_code
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,663713
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,541518
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,505221
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,685687
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,685687


In [4]:
# dropping customer fn and fn freq
customers = customers[["customer_id","age"]]

In [5]:
# handling nan in customers df
print("Total customers",customers.shape[0])
print("Customers with missing age",customers["age"].isna().sum())
print("%: ",customers["age"].isna().sum()/customers.shape[0]*100)


Total customers 1371980
Customers with missing age 15861
%:  1.1560664149623172


In [6]:
customers.dropna(subset=["age"],inplace=True)
print("Total customers after drop: ",customers.shape[0])
print("Customers with missing age",customers["age"].isna().sum())

Total customers after drop:  1356119
Customers with missing age 0


In [10]:
transactions.t_dat = pd.to_datetime(transactions.t_dat)

In [16]:
def association_rules1(transactions):
    last_month_transactions = transactions.loc[transactions.t_dat >= pd.to_datetime('2020-08-12')]
    transactions_apri = last_month_transactions.groupby('customer_id')['product_code'].unique().reset_index()
    transactions_apri
    
    te = TransactionEncoder()
    
    te.fit(transactions_apri['product_code'])
    orders_1hot = te.transform(transactions_apri['product_code'])
    
    orders_1hot = pd.DataFrame(orders_1hot, columns =te.columns_)
    orders_1hot.shape
    
    frequent_itemsets = apriori(orders_1hot, min_support=0.005, max_len=3, use_colnames=True)
    frequent_itemsets.sort_values(by=['support'], ascending=False)
    
    frequent_itemsets['itemset_size'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    
    print(frequent_itemsets.head())
    
    print(frequent_itemsets.describe())
    
    # getting association rules
    assoc_rules = association_rules(frequent_itemsets,metric='lift')
    
    assoc_rules['antecedents_length'] = assoc_rules['antecedents'].apply(lambda x: len(x))
    assoc_rules['consequents_length'] = assoc_rules['consequents'].apply(lambda x: len(x))
    assoc_rules.sort_values(by='confidence',ascending=False).reset_index()

    return assoc_rules 

In [29]:
def association_rules2(articles, transactions, date='2020-08-12'):
    
    # association rule mining : method 2
    le = preprocessing.LabelEncoder()

    # concatenating 'perceived_colour_master_name' and 'product_type_name' attributes to 'item'
    articles_copy = articles 
    # print(articles_copy['perceived_colour_master_name'].nunique())
    # print(articles_copy['perceived_colour_master_name'].unique())
    # print(articles_copy['product_type_name'].nunique())
    # print(articles_copy['product_type_name'].unique())
    
    articles_copy = articles_copy[['article_id','perceived_colour_master_name','product_type_name']]
    articles_copy['item'] = articles_copy['perceived_colour_master_name'] + ['-'] + articles['product_type_name']
    # print(articles_copy.head())
    
    articles_copy = articles_copy[['article_id','item']]
    # print(articles_copy.head())
    # print(articles_copy.shape) 

    # merging 'item' attribute from articles in transactions
    last_month_transactions_copy = transactions.loc[transactions.t_dat >= pd.to_datetime(date)]
    last_month_transactions_copy = pd.merge(last_month_transactions_copy,articles_copy,how="left",on='article_id')
    # print(last_month_transactions_copy.head())
    # print(last_month_transactions_copy.shape)

    # converting into 1 hot encoding form 
    last_month_transactions_copy['customer_id'] = le.fit_transform(last_month_transactions_copy['customer_id'])
    # print(last_month_transactions_copy.head())
    # print(last_month_transactions_copy.shape) 

    last_month_transactions_copy['customer_id'] = last_month_transactions_copy['customer_id'].astype(str)
    # print(last_month_transactions_copy.head())
    # print(last_month_transactions_copy.shape)
    
    last_month_transactions_copy['item'].nunique()
    
    last_month_transactions_copy = last_month_transactions_copy[["t_dat","customer_id","item"]].groupby(['t_dat','customer_id'])['item'].unique().reset_index()
    # print(last_month_transactions_copy.head())
    # print(last_month_transactions_copy.shape)
    
    te = TransactionEncoder()
    
    te.fit(last_month_transactions_copy['item'])
    orders_1hot = te.transform(last_month_transactions_copy['item'])
    orders_1hot = pd.DataFrame(orders_1hot, columns =te.columns_)
    
    # print(orders_1hot.head())
    # print(orders_1hot.shape)

    # applying apriori to get frequent itemsets
    frequent_itemsets = apriori(orders_1hot, min_support=0.01, max_len=3, use_colnames=True)
    frequent_itemsets.sort_values(by=['support'], ascending=False)

    # print(frequent_itemsets.head())
    # print(frequent_itemsets.shape)
    frequent_itemsets['itemset_size'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    
    # print(frequent_itemsets['itemset_size'].describe())

    # getting association rules
    assoc_rules = association_rules(frequent_itemsets,metric='confidence', min_threshold=0.01)
    
    assoc_rules['antecedents_length'] = assoc_rules['antecedents'].apply(lambda x: len(x))
    assoc_rules['consequents_length'] = assoc_rules['consequents'].apply(lambda x: len(x))
    assoc_rules.sort_values(by='confidence',ascending=False).reset_index()

    return assoc_rules 

In [30]:
association_rules2(articles, transactions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_copy['item'] = articles_copy['perceived_colour_master_name'] + ['-'] + articles['product_type_name']


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedents_length,consequents_length
0,(Black-Trousers),(Black-Sweater),0.119929,0.067071,0.012521,0.104405,1.556618,0.004477,1.041685,0.40631,1,1
1,(Black-Sweater),(Black-Trousers),0.067071,0.119929,0.012521,0.186684,1.556618,0.004477,1.082077,0.38329,1,1
2,(Black-Trousers),(Black-Top),0.119929,0.066872,0.010177,0.084862,1.269032,0.002158,1.019659,0.240887,1,1
3,(Black-Top),(Black-Trousers),0.066872,0.119929,0.010177,0.152194,1.269032,0.002158,1.038057,0.22719,1,1
4,(Black-Trousers),(Blue-Trousers),0.119929,0.106008,0.027003,0.225156,2.123943,0.014289,1.15377,0.60129,1,1
5,(Blue-Trousers),(Black-Trousers),0.106008,0.119929,0.027003,0.254723,2.123943,0.014289,1.180864,0.591927,1,1
6,(Black-Trousers),(Grey-Trousers),0.119929,0.04338,0.012285,0.102437,2.361364,0.007083,1.065796,0.655079,1,1
7,(Grey-Trousers),(Black-Trousers),0.04338,0.119929,0.012285,0.283197,2.361364,0.007083,1.227772,0.60266,1,1
8,(Blue-Trousers),(Grey-Trousers),0.106008,0.04338,0.011864,0.111912,2.57979,0.007265,1.077168,0.684986,1,1
9,(Grey-Trousers),(Blue-Trousers),0.04338,0.106008,0.011864,0.27348,2.57979,0.007265,1.230511,0.640141,1,1
