In [None]:
import pandas as pd 
import numpy as np
import sys,getopt
import requests
import csv
import Orange
from Orange.data import Table,Domain, DiscreteVariable, ContinuousVariable
from orangecontrib.associate.fpgrowth import * 

#stats
from scipy import sparse
import scipy.stats as ss

#viz
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
from PIL import Image
import matplotlib_venn as venn

%matplotlib inline

In [None]:
#Read the csv once and convert it to a one-hot DataFrame:
#one row per transaction, one 0/1 column per distinct item.
with open('groceries.csv', newline='') as data:
    # Empty fields (e.g. from trailing commas) are not items, so drop them.
    transactions = [[item for item in line if item]
                    for line in csv.reader(data, delimiter=",")]

# Every distinct item seen in any transaction.
items = set()
for basket in transactions:
    items.update(basket)

# A set has no stable iteration order; sorting fixes the column order so the
# frame looks the same on every kernel restart.
item_columns = sorted(items)

output_list = list()
for basket in transactions:
    row_value = dict.fromkeys(item_columns, 0)
    row_value.update(dict.fromkeys(basket, 1))   #if item is present in that transaction, set row_value to 1 for that item
    output_list.append(row_value)

grocery_df = pd.DataFrame(output_list, columns=item_columns)

In [None]:
# Preview the first rows of the transaction matrix (one row per transaction, one 0/1 column per item)
grocery_df.head()

In [None]:
#Shape of the DataFrame: (number of transactions, number of distinct item columns)
grocery_df.shape

In [None]:
#Statistical Description
# Columns hold 0/1 flags, so each column's mean is the fraction of transactions containing that item
grocery_df.describe()

In [None]:
#Top 20 "sold items" that occur in the dataset
# Per-item totals are computed once and reused for the grand total and the ranking.
item_totals = grocery_df.sum()
total_count_of_items = item_totals.sum()
print("Total count of items: ", total_count_of_items)

# Rank items from most to least sold and expose them as a two-column frame.
item_sort_df = item_totals.sort_values(ascending = False).reset_index()
item_sort_df.columns = ['item_name', 'item_count']
item_sort_df.head(20)

In [None]:
#Visualization of top 20 "sold items" that occur in the dataset
top_items = item_sort_df.head(20)
objects = list(top_items['item_name'])
count = list(top_items['item_count'])
y = np.arange(len(objects))

# Draw on an explicit Axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.bar(y, count, align='center', alpha=0.8)
ax.set_xticks(y)
ax.set_xticklabels(objects, rotation='vertical')
ax.set_ylabel('Item count')
ax.set_title('Sales distribution of top 20 sold items')

In [None]:
#Contribution of top 20 "sold items" to total sales
# Share of all sold items attributable to each item, then the running total down the ranking.
item_sort_df['item_perc'] = item_sort_df['item_count'].div(total_count_of_items)
item_sort_df['total_perc'] = item_sort_df['item_perc'].cumsum()

# Shape of the slice of top-ranked items whose cumulative share stays within 50%.
within_half_of_sales = item_sort_df['total_perc'].le(0.5)
print(item_sort_df[within_half_of_sales].shape)

item_sort_df.head(20)

In [None]:
#Make Orange Table
input_assoc_rules = grocery_df
# One binary ('0'/'1') variable per item column. The variable is named after the
# column (name=item, not the literal 'item') so every variable in the domain is distinct.
domain_grocery = Domain([DiscreteVariable.make(name=item, values=['0', '1']) for item in input_assoc_rules.columns])
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same ndarray.
data_gro_1 = Orange.data.Table.from_numpy(domain=domain_grocery, X=input_assoc_rules.values, Y=None)
data_gro_1

In [None]:
#Prune Dataset for frequently purchased items
def prune_dataset(input_df, length_trans, total_sales_perc, start_item = None, end_item = None):
    """Restrict a 0/1 transaction matrix to a subset of its best-selling items.

    Items (columns) are ranked by their column sum, highest first. The subset is
    chosen in one of two ways:

    * If neither ``start_item`` nor ``end_item`` is given (both falsy), keep the
      top-ranked items whose cumulative share of all sold items is strictly
      below ``total_sales_perc``.
    * Otherwise keep the items at rank positions ``start_item:end_item``
      (0-based, end exclusive). A missing ``start_item`` means "from the top",
      a missing ``end_item`` means "to the end".

    Only transactions (rows) containing at least ``length_trans`` of the
    selected items are kept.

    The caller's ``input_df`` is never modified. A pre-existing ``total_items``
    column is ignored rather than treated as an item.

    Returns
    -------
    (DataFrame, DataFrame)
        The pruned transactions limited to the selected item columns, and the
        matching rows of the item ranking (``item_name``, ``item_count`` and,
        in cumulative-share mode, ``item_perc`` and ``total_perc``).

    Raises
    ------
    ValueError
        If a rank range is requested and ``end_item`` is not greater than
        ``start_item``.
    """
    # Work on a derived frame so the caller's data is left untouched.
    if 'total_items' in input_df.columns:
        input_df = input_df.drop('total_items', axis=1)

    item_totals = input_df.sum().sort_values(ascending = False)
    total_items = item_totals.sum()
    item_count = item_totals.reset_index()
    item_count.columns = ['item_name', 'item_count']

    if not start_item and not end_item:
        item_count['item_perc'] = item_count['item_count']/total_items  #each percent
        item_count['total_perc'] = item_count['item_perc'].cumsum()     #cumulative
        selected_stats = item_count[item_count['total_perc'] < total_sales_perc]  #comparing cumulative perc
        selected_items = list(selected_stats['item_name'].sort_values())
    else:
        # Fill in the open end of a one-sided range instead of comparing with None.
        first = 0 if start_item is None else start_item
        last = len(item_count) if end_item is None else end_item
        if last <= first:
            raise ValueError(
                "end_item ({}) must be greater than start_item ({})".format(end_item, start_item))
        selected_stats = item_count[first:last]
        selected_items = list(selected_stats['item_name'])

    # transactions with at least length_trans of the selected items
    enough_items = input_df[selected_items].sum(axis = 1) >= length_trans
    return input_df.loc[enough_items, selected_items], selected_stats

In [None]:
# Keep the best-selling items whose cumulative share of sales is below 40%,
# and only the transactions that contain at least two of them.
pruned_result = prune_dataset(input_df=grocery_df, length_trans=2, total_sales_perc=0.4)
output_df, item_counts = pruned_result

selected_columns = list(output_df.columns)
print("Shape: ", output_df.shape)
print("Selected items: ", selected_columns)

In [None]:
#Association Rule Mining with FP Growth
input_assoc_rules = output_df
# One binary ('0'/'1') variable per retained item column, named after the column.
domain_grocery = Domain([DiscreteVariable.make(name=item, values=['0', '1']) for item in input_assoc_rules.columns])
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same ndarray.
data_gro_1 = Orange.data.Table.from_numpy(domain=domain_grocery, X=input_assoc_rules.values, Y=None)
# One-hot encode the table; `mapping` is needed later to decode itemset members back to variables.
data_gro_1_en, mapping = OneHot.encode(data_gro_1, include_class=False)

In [None]:
# Minimum support, expressed as a fraction of the pruned transactions.
min_support = 0.01
num_trans = min_support * input_assoc_rules.shape[0]
print("Number of required transactions = ", int(num_trans))

# Collect the mined itemsets into a dict (key:value pairs).
mined_itemsets = frequent_itemsets(data_gro_1_en, min_support=min_support)
itemsets = dict(mined_itemsets)
print(len(itemsets), " itemsets have a support of ", min_support*100, "%")

In [None]:
# Derive association rules from the frequent itemsets and tabulate the ones
# whose consequent is a purchased ("...=1") item and whose antecedent contains
# at least two purchased items.
confidence = 0.3
max_itemsets = 1000000   # rule generation is skipped at or above this many itemsets
rule_columns = ['antecedent', 'consequent', 'support', 'confidence',
                'coverage', 'strength', 'lift', 'leverage']

# Reset both result frames on every run so a later cell never hits an undefined
# (or stale, from a previous run) pruned_rules_df when no rule is produced.
rules_df = pd.DataFrame()
pruned_rules_df = pd.DataFrame(columns=rule_columns)

if len(itemsets) < max_itemsets:
    # Keep only rules with a single-item consequent.
    rules = [(P, Q, supp, conf)
             for P, Q, supp, conf in association_rules(itemsets, confidence)
             if len(Q) == 1]
    print(len(rules))

    # Readable "variable=value" label for every encoded item.
    names = {item: '{}={}'.format(var.name, val)
             for item, var, val in OneHot.decode(mapping, data_gro_1, mapping)}

    # Labels whose value part is 1, i.e. the item was bought.
    eligible_antecedent = [v for k, v in names.items() if v.endswith("1")]
    eligible_lookup = set(eligible_antecedent)   # constant-time membership tests in the loop below

    N = input_assoc_rules.shape[0]

    rule_stats = list(rules_stats(rules, itemsets, N))

    rule_list_df = []
    for ex_rule_from_rule_stat in rule_stats:
        ante, cons, supp, conf, coverage, strength, lift, leverage = ex_rule_from_rule_stat[:8]
        named_cons = names[next(iter(cons))]
        if named_cons not in eligible_lookup:
            continue
        # Drop the trailing "=1" to get the bare item name; "=0" members are left out.
        rule_lhs = [names[i][:-2] for i in ante if names[i] in eligible_lookup]
        if len(rule_lhs) > 1:
            rule_list_df.append({'support' : supp,
                                 'confidence' : conf,
                                 'coverage' : coverage,
                                 'strength' : strength,
                                 'lift' : lift,
                                 'leverage' : leverage,
                                 'antecedent': ', '.join(rule_lhs),
                                 'consequent': named_cons[:-2] })
    rules_df = pd.DataFrame(rule_list_df)
    print("Raw rules data frame of {} rules generated".format(rules_df.shape[0]))
    if not rules_df.empty:
        # Several raw rules can collapse to the same (antecedent, consequent)
        # pair once "=0" members are dropped; keep one row per pair.
        pruned_rules_df = rules_df.groupby(['antecedent','consequent']).max().reset_index()
    else:
        print("Unable to generate any rule")
else:
    print("Skipped rule generation: {} itemsets reach the limit of {}".format(len(itemsets), max_itemsets))

In [None]:
#Sorting rules in grocery data set
# Show the single strongest rule for each consequent. A column-wise
# groupby(...).max() would take each column's maximum independently and so
# pair an antecedent with statistics that belong to other rules; ranking first
# and keeping the top row per consequent returns rows that really exist.
(pruned_rules_df[['consequent','antecedent',
                  'support','confidence','lift']].sort_values(['lift', 'support','confidence'],
                                                              ascending=False)
                                                 .drop_duplicates('consequent')
                                                 .reset_index(drop=True))