# Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

# Reading the datasets

In [2]:
# Load the two source tables from CSV files in the working directory:
# the order list (one row per order) and the order lines (one row per item).
lst = pd.read_csv('list.csv')
orders = pd.read_csv('Order.csv')

In [3]:
# Preview the first rows of the order list
lst.head()

Unnamed: 0,Order ID,Order Date,CustomerName,State,City
0,B-25601,01-04-2018,Bharat,Gujarat,Ahmedabad
1,B-25602,01-04-2018,Pearl,Maharashtra,Pune
2,B-25603,03-04-2018,Jahan,Madhya Pradesh,Bhopal
3,B-25604,03-04-2018,Divsha,Rajasthan,Jaipur
4,B-25605,05-04-2018,Kasheen,West Bengal,Kolkata


In [4]:
# Preview the first rows of the order lines
orders.head()

Unnamed: 0,Order ID,Amount,Profit,Quantity,Category,Sub-Category
0,B-25601,1275.0,-1148.0,7,Furniture,Bookcases
1,B-25601,66.0,-12.0,5,Clothing,Stole
2,B-25601,8.0,-2.0,3,Clothing,Hankerchief
3,B-25601,80.0,-56.0,4,Electronics,Electronic Games
4,B-25602,168.0,-111.0,2,Electronics,Phones


In [5]:
# (rows, columns) of the order list
lst.shape

(560, 5)

In [6]:
# (rows, columns) of the order lines
orders.shape

(1500, 6)

# Merging the datasets

In [7]:
# Attach the order-level attributes to every order line by joining on the
# shared 'Order ID' key (inner join, which is pandas' default).
data_merge_df = pd.merge(orders, lst, how='inner', on='Order ID')

In [8]:
# Drop the two columns the analysis does not use: the order date and the
# customer name. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it no longer raises
# KeyError because the columns were already removed.
data_merge_df = data_merge_df.drop(columns=['Order Date', 'CustomerName'], errors='ignore')


In [9]:
# Display the merged frame (order lines joined with state and city)
data_merge_df

Unnamed: 0,Order ID,Amount,Profit,Quantity,Category,Sub-Category,State,City
0,B-25601,1275.0,-1148.0,7,Furniture,Bookcases,Gujarat,Ahmedabad
1,B-25601,66.0,-12.0,5,Clothing,Stole,Gujarat,Ahmedabad
2,B-25601,8.0,-2.0,3,Clothing,Hankerchief,Gujarat,Ahmedabad
3,B-25601,80.0,-56.0,4,Electronics,Electronic Games,Gujarat,Ahmedabad
4,B-25602,168.0,-111.0,2,Electronics,Phones,Maharashtra,Pune
...,...,...,...,...,...,...,...,...
1495,B-26099,835.0,267.0,5,Electronics,Phones,Maharashtra,Mumbai
1496,B-26099,2366.0,552.0,5,Clothing,Trousers,Maharashtra,Mumbai
1497,B-26100,828.0,230.0,2,Furniture,Chairs,Madhya Pradesh,Indore
1498,B-26100,34.0,10.0,2,Clothing,T-shirt,Madhya Pradesh,Indore


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data_merge_df.describe()

Unnamed: 0,Amount,Profit,Quantity
count,1500.0,1500.0,1500.0
mean,287.668,15.97,3.743333
std,461.050488,169.140565,2.184942
min,4.0,-1981.0,1.0
25%,45.0,-9.25,2.0
50%,118.0,9.0,3.0
75%,322.0,38.0,5.0
max,5729.0,1698.0,14.0


In [11]:
# Column names, non-null counts and dtypes of the merged frame
data_merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1500 entries, 0 to 1499
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Order ID      1500 non-null   object 
 1   Amount        1500 non-null   float64
 2   Profit        1500 non-null   float64
 3   Quantity      1500 non-null   int64  
 4   Category      1500 non-null   object 
 5   Sub-Category  1500 non-null   object 
 6   State         1500 non-null   object 
 7   City          1500 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 105.5+ KB


In [12]:
# Strip leading/trailing whitespace from the State values so that the
# equality comparison against a state name further down matches reliably
data_merge_df['State']=data_merge_df['State'].str.strip()

In [13]:
# Count the order lines per sub-category, most frequent first
# NOTE(review): the previous comment spoke of states; if a per-state count
# was intended, the column should be 'State' instead — confirm.
data_merge_df['Sub-Category'].value_counts()


Saree               210
Hankerchief         198
Stole               192
Phones               83
Electronic Games     79
Bookcases            79
T-shirt              77
Printers             74
Chairs               74
Furnishings          73
Accessories          72
Shirt                69
Skirt                64
Leggings             53
Kurti                47
Trousers             39
Tables               17
Name: Sub-Category, dtype: int64

# Using the Apriori algorithm for state-based recommendations

## Market basket analysis




In [14]:
# Collect the distinct state names, in order of first appearance
states = data_merge_df['State'].unique().tolist()
states

['Gujarat',
 'Maharashtra',
 'Madhya Pradesh',
 'Rajasthan',
 'West Bengal',
 'Karnataka',
 'Jammu and Kashmir',
 'Tamil Nadu',
 'Uttar Pradesh',
 'Bihar',
 'Kerala',
 'Punjab',
 'Haryana',
 'Himachal Pradesh',
 'Sikkim',
 'Goa',
 'Nagaland',
 'Andhra Pradesh',
 'Delhi']

In [15]:
# State whose orders are analysed in the following cells; it is checked
# against the `states` list before the basket is built
state ="Karnataka"

In [16]:
# Build the basket matrix for the selected state: one row per order, one
# column per sub-category, each cell holding the total quantity ordered
# (0 where the order does not contain that sub-category).
#
# Fail fast on an unknown state. Merely printing a message would leave
# `mybasket` undefined (or stale from an earlier run), and every later cell
# would then either crash with NameError or silently analyse the wrong state.
if state not in states:
    raise ValueError(f"State is not present: {state!r}")

mybasket = (
    data_merge_df[data_merge_df['State'] == state]
    .groupby(['Order ID', 'Sub-Category'])['Quantity']
    .sum()
    .unstack()   # sub-categories become columns, 'Order ID' stays the index
    .fillna(0)   # sub-categories absent from an order count as quantity 0
)

In [19]:
# Inspect the order x sub-category quantity matrix built for the selected state
mybasket

Sub-Category,Accessories,Bookcases,Chairs,Electronic Games,Furnishings,Hankerchief,Kurti,Leggings,Phones,Printers,Saree,Shirt,Skirt,Stole,T-shirt,Tables,Trousers
Order ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
B-25606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
B-25624,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-25642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0
B-25660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
B-25678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,0.0
B-25696,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0
B-25714,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0
B-25732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
B-25768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
B-25786,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,3.0,0.0


In [20]:
def my_encode_units(x):
    """Convert a summed quantity into a 0/1 "item present" flag.

    Parameters
    ----------
    x : number
        Total quantity of one sub-category within one order.

    Returns
    -------
    int
        1 when the quantity is strictly positive, otherwise 0.

    Notes
    -----
    The previous version tested ``x <= 0`` and ``x >= 1`` separately, so it
    fell through and returned ``None`` for values strictly between 0 and 1
    and for NaN. Every input now maps to 0 or 1; NaN compares false against
    0 and is therefore treated as "not present".
    """
    return 1 if x > 0 else 0
    
# Encode every cell of the basket as "is this item in the order?".
# Column-wise Series.map replaces DataFrame.applymap, which is deprecated
# since pandas 2.1 but whose replacement (DataFrame.map) does not exist in
# older releases; this form runs on both. The boolean cast gives mlxtend the
# dtype it recommends (it warns about non-bool frames).
my_basket_sets = mybasket.apply(lambda column: column.map(my_encode_units)).astype(bool)

In [21]:
# Run Apriori on the encoded baskets to generate the frequent itemsets,
# keeping every itemset whose support is at least 0.01.
# NOTE(review): the multi-item itemsets displayed for this run all have
# support 0.047619 (about 1/21) — presumably each occurs in a single order,
# in which case this threshold filters nothing; confirm and consider raising it.
my_frequent_itemsets =  apriori(my_basket_sets,min_support=0.01,use_colnames=True)
my_frequent_itemsets


Unnamed: 0,support,itemsets
0,0.142857,(Accessories)
1,0.095238,(Bookcases)
2,0.095238,(Chairs)
3,0.095238,(Electronic Games)
4,0.047619,(Furnishings)
...,...,...
140,0.047619,"(Kurti, T-shirt, Accessories, Shirt, Stole)"
141,0.047619,"(Saree, T-shirt, Accessories, Shirt, Stole)"
142,0.047619,"(Chairs, Hankerchief, Printers, Leggings, Elec..."
143,0.047619,"(Kurti, Saree, T-shirt, Shirt, Stole)"


In [22]:
# Generate the association rules from the frequent itemsets, keeping only
# the rules whose lift is at least 3
my_rules = association_rules(my_frequent_itemsets,metric="lift",min_threshold=3)
my_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Accessories),(Kurti),0.142857,0.095238,0.047619,0.333333,3.50,0.034014,1.357143
1,(Kurti),(Accessories),0.095238,0.142857,0.047619,0.500000,3.50,0.034014,1.714286
2,(Accessories),(T-shirt),0.142857,0.095238,0.047619,0.333333,3.50,0.034014,1.357143
3,(T-shirt),(Accessories),0.095238,0.142857,0.047619,0.500000,3.50,0.034014,1.714286
4,(Skirt),(Bookcases),0.095238,0.095238,0.047619,0.500000,5.25,0.038549,1.809524
...,...,...,...,...,...,...,...,...,...
937,(Saree),"(Kurti, T-shirt, Accessories, Shirt, Stole)",0.285714,0.047619,0.047619,0.166667,3.50,0.034014,1.142857
938,(T-shirt),"(Kurti, Saree, Accessories, Shirt, Stole)",0.095238,0.047619,0.047619,0.500000,10.50,0.043084,1.904762
939,(Accessories),"(Kurti, Saree, T-shirt, Shirt, Stole)",0.142857,0.047619,0.047619,0.333333,7.00,0.040816,1.428571
940,(Shirt),"(Kurti, Saree, T-shirt, Accessories, Stole)",0.333333,0.047619,0.047619,0.142857,3.00,0.031746,1.111111


In [23]:
# Keep the rules with lift >= 3 and confidence >= 0.7, strongest lift first
df = (
    my_rules
    .loc[lambda rules: (rules['lift'] >= 3) & (rules['confidence'] >= 0.7)]
    .sort_values('lift', ascending=False)
)
df

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
551,"(Leggings, Electronic Games)","(Hankerchief, Printers)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
683,"(Saree, Stole, Shirt)","(Accessories, Kurti)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
709,"(Shirt, T-shirt, Kurti)","(Accessories, Saree)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
707,"(Saree, Kurti, Shirt)","(Accessories, T-shirt)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
706,"(Accessories, Saree, Kurti)","(Shirt, T-shirt)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
...,...,...,...,...,...,...,...,...,...
761,"(Accessories, T-shirt, Stole, Kurti)",(Shirt),0.047619,0.333333,0.047619,1.0,3.0,0.031746,inf
392,"(Accessories, T-shirt, Kurti)",(Shirt),0.047619,0.333333,0.047619,1.0,3.0,0.031746,inf
293,"(Skirt, Phones)",(Shirt),0.047619,0.333333,0.047619,1.0,3.0,0.031746,inf
298,"(Skirt, Saree)",(Shirt),0.047619,0.333333,0.047619,1.0,3.0,0.031746,inf


In [24]:
# Print the distinct consequents of the five highest-lift rules.
dataFrame = df.head()
consequents = list(dataFrame['consequents'].unique())
for consequent in consequents:
    # Each consequent is a frozenset, whose iteration order can change
    # between kernel sessions (string hashing is randomised). Sorting makes
    # the printed lists stable across runs.
    print(sorted(consequent))

['Hankerchief', 'Printers']
['Accessories', 'Kurti']
['Accessories', 'Saree']
['Accessories', 'T-shirt']
['Shirt', 'T-shirt']


# Getting consequent values from antecedents

In [25]:
# Narrow the rule table to its first ten rows (it is already sorted by lift)
df = df.iloc[:10]
df

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
551,"(Leggings, Electronic Games)","(Hankerchief, Printers)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
683,"(Saree, Stole, Shirt)","(Accessories, Kurti)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
709,"(Shirt, T-shirt, Kurti)","(Accessories, Saree)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
707,"(Saree, Kurti, Shirt)","(Accessories, T-shirt)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
706,"(Accessories, Saree, Kurti)","(Shirt, T-shirt)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
705,"(Saree, T-shirt, Kurti)","(Accessories, Shirt)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
694,"(Shirt, Stole)","(Accessories, Saree, Kurti)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
693,"(Accessories, Stole)","(Saree, Kurti, Shirt)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
692,"(Accessories, Shirt)","(Saree, Stole, Kurti)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf
691,"(Saree, Stole)","(Accessories, Shirt, Kurti)",0.047619,0.047619,0.047619,1.0,21.0,0.045351,inf


In [27]:
#items purchased
items = ['Shirt', 'Stole']

# Map each antecedent itemset to its consequent itemset. The keys are
# frozensets, so the lookup does not depend on the order in which the
# purchased items are listed.
#
# The earlier version first converted every antecedent to a list and tested
# `items in lst`. That compared ordered lists built from unordered sets, so a
# valid match could be missed, and it rebound `lst`, overwriting the
# order-list DataFrame loaded at the top of the notebook.
data_dict = df.set_index('antecedents')['consequents'].to_dict()

purchased = frozenset(items)
if purchased in data_dict:
    print(data_dict[purchased])
else:
    print("No")


frozenset({'Accessories', 'Saree', 'Kurti'})
