# Insurance Purchase Prediction

### _Part III - Machine Learning Modelling : Market Basket Analysis_

## Group 3: Vivek | Gurunathan C | Gnana CP

In [1]:
# Importing relevant libraries
# The `as pd` / `as np` aliases give each library a short name, which makes it easier to refer to the
# library later in the notebook

import pandas as pd
import numpy as np 
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Show the result of every bare expression in a cell (not only the last one);
# later cells rely on this to display e.g. both .head() and .tail() from a single cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [2]:
# Read the cleaned training set from CSV into a DataFrame
train_csv_path = '../data/train_cleaned_1.csv'
df_train = pd.read_csv(train_csv_path)

In [3]:
# Checking loaded data: dimensions first, then a preview of the frame.
# Both expressions are rendered because ast_node_interactivity is set to "all" in the first cell.

df_train.shape

df_train

(665249, 25)

Unnamed: 0.1,Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,0,10000000.0,1.0,0.0,0,8.0,10,0,2.0,0.0,...,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,2.0,633.0
1,1,10000000.0,2.0,0.0,0,8.0,10,0,2.0,0.0,...,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,1.0,630.0
2,2,10000000.0,3.0,0.0,0,8.0,10,0,2.0,0.0,...,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,1.0,630.0
3,3,10000000.0,4.0,0.0,0,8.0,10,0,2.0,0.0,...,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,1.0,630.0
4,4,10000000.0,5.0,0.0,0,11.0,10,0,2.0,0.0,...,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,1.0,630.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665244,665244,10152724.0,2.0,0.0,3,13.0,12,203,1.0,1.0,...,1.0,4.0,1.0,0.0,2.0,3.0,0.0,2.0,2.0,677.0
665245,665245,10152724.0,3.0,0.0,3,13.0,12,203,1.0,1.0,...,1.0,4.0,1.0,0.0,2.0,3.0,0.0,2.0,2.0,677.0
665246,665246,10152724.0,4.0,0.0,3,13.0,12,203,1.0,1.0,...,1.0,4.0,1.0,0.0,2.0,3.0,0.0,2.0,2.0,677.0
665247,665247,10152724.0,5.0,0.0,3,13.0,12,203,1.0,1.0,...,1.0,4.0,1.0,0.0,2.0,3.0,0.0,2.0,2.0,685.0


In [4]:
# The leading 'Unnamed' column is redundant (carried over from cleaning, while exporting the data),
# so drop it. The name check makes this cell safe to re-run: an unconditional positional, in-place
# drop would remove a real column (customer_ID) on a second execution.

leading_col = df_train.columns[0]
if str(leading_col).startswith('Unnamed'):
    df_train = df_train.drop(columns=[leading_col])

In [5]:
# Keep only the rows that represent an actual purchase (record_type == 1)

is_purchase = df_train['record_type'].eq(1)
purchase = df_train.loc[is_purchase]

In [6]:
# Preview the purchase-only records
purchase

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
8,10000000.0,9.0,1.0,0,12.0,10,0,2.0,0.0,2.0,...,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,1.0,634.0
14,10000005.0,6.0,1.0,3,9.0,23,5,1.0,0.0,10.0,...,3.0,13.0,0.0,0.0,3.0,2.0,0.0,0.0,2.0,731.0
22,10000007.0,8.0,1.0,4,14.0,27,7,1.0,0.0,11.0,...,2.0,4.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,602.0
26,10000013.0,4.0,1.0,4,9.0,34,13,2.0,1.0,3.0,...,3.0,3.0,1.0,1.0,3.0,2.0,1.0,1.0,3.0,626.0
32,10000014.0,6.0,1.0,1,17.0,15,14,1.0,0.0,5.0,...,1.0,2.0,1.0,1.0,1.0,1.0,0.0,2.0,2.0,614.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665225,10152718.0,9.0,1.0,2,10.0,6,1013,2.0,1.0,1.0,...,4.0,15.0,1.0,1.0,4.0,3.0,1.0,2.0,4.0,628.0
665233,10152720.0,8.0,1.0,4,8.0,6,4041,1.0,0.0,13.0,...,1.0,8.0,1.0,0.0,1.0,2.0,0.0,2.0,3.0,681.0
665239,10152721.0,6.0,1.0,4,10.0,3,1878,1.0,0.0,8.0,...,4.0,5.0,1.0,0.0,3.0,3.0,1.0,0.0,2.0,716.0
665242,10152723.0,3.0,1.0,1,10.0,6,708,1.0,1.0,0.0,...,3.0,7.0,1.0,0.0,3.0,3.0,1.0,2.0,3.0,651.0


In [17]:
# Keep only customer_ID and the product options A-G: every other column (customer/quote attributes
# and cost) is not needed for the market basket analysis

non_product_columns = [
    'shopping_pt', 'record_type', 'day', 'time', 'state', 'location', 'group_size',
    'homeowner', 'car_age', 'car_value', 'age_oldest', 'age_youngest', 'married_couple',
    'C_previous', 'duration_previous', 'cost',
]
purchase_basket = purchase.drop(columns=non_product_columns)

In [18]:
# customer_ID was loaded as a float; cast it to a 64-bit integer
purchase_basket = purchase_basket.astype({'customer_ID': 'int64'})

purchase_basket

Unnamed: 0,customer_ID,A,B,C,D,E,F,G
8,10000000,1.0,0.0,2.0,2.0,1.0,2.0,1.0
14,10000005,0.0,0.0,3.0,2.0,0.0,0.0,2.0
22,10000007,0.0,0.0,1.0,2.0,0.0,0.0,1.0
26,10000013,1.0,1.0,3.0,2.0,1.0,1.0,3.0
32,10000014,1.0,1.0,1.0,1.0,0.0,2.0,2.0
...,...,...,...,...,...,...,...,...
665225,10152718,1.0,1.0,4.0,3.0,1.0,2.0,4.0
665233,10152720,1.0,0.0,1.0,2.0,0.0,2.0,3.0
665239,10152721,1.0,0.0,3.0,3.0,1.0,0.0,2.0
665242,10152723,1.0,0.0,3.0,3.0,1.0,2.0,3.0


In [None]:
# Data is already in the tabular format

# purchase_basket.fillna(0).reset_index().set_index('customer_ID')


In [19]:
# Data is already in the tabular format

# (Customer ID is set as the index column in a later cell)

# The encoding function
def encode_units(x):
    """Binarise a product value for market basket analysis.

    Parameters
    ----------
    x : number
        Value of one product column for one purchase.

    Returns
    -------
    int
        1 if ``x`` is greater than zero, otherwise 0.

    Notes
    -----
    The earlier version returned ``None`` for values strictly between 0 and 1
    and for NaN (neither ``x <= 0`` nor ``x >= 1`` is true), which leaves
    non-binary entries in the basket. Any positive value now maps to 1, and
    NaN maps to 0 because every comparison with NaN is False.

    NOTE(review): A-G look like coded option levels rather than quantities, so
    this collapses all non-zero levels into a single "bought" flag - confirm
    that this is the intended basket definition.
    """
    return 1 if x > 0 else 0

# Binarise each product column A-G with encode_units
# NOTE: a product whose value is never 0 becomes a constant column of 1s

for product in ['A', 'B', 'C', 'D', 'E', 'F', 'G']:
    purchase_basket[product] = purchase_basket[product].map(encode_units)


purchase_basket

Unnamed: 0,customer_ID,A,B,C,D,E,F,G
8,10000000,1,0,1,1,1,1,1
14,10000005,0,0,1,1,0,0,1
22,10000007,0,0,1,1,0,0,1
26,10000013,1,1,1,1,1,1,1
32,10000014,1,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...
665225,10152718,1,1,1,1,1,1,1
665233,10152720,1,0,1,1,0,1,1
665239,10152721,1,0,1,1,1,0,1
665242,10152723,1,0,1,1,1,1,1


In [21]:
# Index the basket by customer_ID so that only the product columns remain as data

basket = purchase_basket.set_index(keys='customer_ID')

basket

Unnamed: 0_level_0,A,B,C,D,E,F,G
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10000000,1,0,1,1,1,1,1
10000005,0,0,1,1,0,0,1
10000007,0,0,1,1,0,0,1
10000013,1,1,1,1,1,1,1
10000014,1,1,1,1,0,1,1
...,...,...,...,...,...,...,...
10152718,1,1,1,1,1,1,1
10152720,1,0,1,1,0,1,1
10152721,1,0,1,1,1,0,1
10152723,1,0,1,1,1,1,1


In [22]:
# Market basket analysis with the Apriori algorithm:
# 1) itemsets present in at least 1% of purchases,
# 2) association rules scored by lift (library default threshold),
# 3) rules ordered from highest to lowest confidence

frequent_itemsets = apriori(basket, min_support=0.01, use_colnames=True)

rules = (
    association_rules(frequent_itemsets, metric="lift")
    .sort_values('confidence', ascending=False)
)

# Highest- and lowest-confidence rules
rules.head(15)
rules.tail(15)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
730,"(E, G, F)",(D),0.373264,1.0,0.373264,1.0,1.0,0.0,inf
252,"(A, D, B)",(C),0.41056,1.0,0.41056,1.0,1.0,0.0,inf
1462,"(A, B, E)","(D, G, C)",0.3156,1.0,0.3156,1.0,1.0,0.0,inf
247,"(E, F)",(G),0.373264,1.0,0.373264,1.0,1.0,0.0,inf
733,"(E, F)","(D, G)",0.373264,1.0,0.373264,1.0,1.0,0.0,inf
742,"(A, B, C, E)",(D),0.3156,1.0,0.3156,1.0,1.0,0.0,inf
745,"(D, A, B, E)",(C),0.3156,1.0,0.3156,1.0,1.0,0.0,inf
243,(F),"(D, G)",0.677906,1.0,0.677906,1.0,1.0,0.0,inf
241,"(F, G)",(D),0.677906,1.0,0.677906,1.0,1.0,0.0,inf
240,"(F, D)",(G),0.677906,1.0,0.677906,1.0,1.0,0.0,inf


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1430,(C),"(A, F, B, E, D)",1.0,0.261615,0.261615,0.261615,1.0,0.0,1.0
1921,"(D, C)","(A, G, F, B, E)",1.0,0.261615,0.261615,0.261615,1.0,0.0,1.0
1928,(C),"(A, G, F, B, E, D)",1.0,0.261615,0.261615,0.261615,1.0,0.0,1.0
1424,"(D, C)","(A, B, F, E)",1.0,0.261615,0.261615,0.261615,1.0,0.0,1.0
1918,"(D, G)","(A, F, C, B, E)",1.0,0.261615,0.261615,0.261615,1.0,0.0,1.0
1896,"(D, G, C)","(A, B, F, E)",1.0,0.261615,0.261615,0.261615,1.0,0.0,1.0
1672,"(D, G)","(A, B, F, E)",1.0,0.261615,0.261615,0.261615,1.0,0.0,1.0
1915,"(G, C)","(A, F, B, E, D)",1.0,0.261615,0.261615,0.261615,1.0,0.0,1.0
1608,"(G, C)","(A, B, F, E)",1.0,0.261615,0.261615,0.261615,1.0,0.0,1.0
1616,(G),"(A, F, C, B, E)",1.0,0.261615,0.261615,0.261615,1.0,0.0,1.0


In [24]:
# Write the sorted rule table to a CSV file

rules_csv_path = "../data/MBA_rule.csv"
rules.to_csv(rules_csv_path)

In [26]:
# In the rules above, consequent support is 1 whenever 'C', 'D' or 'G' is the consequent,
# i.e. these three products are present in every purchase and carry no information.

# Re-run the analysis on a basket without them

always_bought = ['C', 'D', 'G']
basket_4 = basket.drop(columns=always_bought)


In [27]:
# Preview the reduced basket (products A, B, E and F only)
basket_4

Unnamed: 0_level_0,A,B,E,F
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000000,1,0,1,1
10000005,0,0,0,0
10000007,0,0,0,0
10000013,1,1,1,1
10000014,1,1,0,1
...,...,...,...,...
10152718,1,1,1,1
10152720,1,0,0,1
10152721,1,0,1,0
10152723,1,0,1,1


In [28]:
# Repeat the Apriori analysis on the reduced basket (same support threshold and metric as before)

frequent_itemsets_4 = apriori(basket_4, min_support=0.01, use_colnames=True)

rules_4 = (
    association_rules(frequent_itemsets_4, metric="lift")
    .sort_values('confidence', ascending=False)
)

# Highest- and lowest-confidence rules
rules_4.head(15)
rules_4.tail(15)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
26,"(F, E)",(A),0.373264,0.779629,0.369327,0.98945,1.26913,0.078319,20.889123
39,"(B, F, E)",(A),0.26445,0.779629,0.261615,0.98928,1.268912,0.055442,20.557838
14,"(B, E)",(A),0.319692,0.779629,0.3156,0.987199,1.266242,0.066359,17.215051
3,(E),(A),0.462607,0.779629,0.456607,0.987031,1.266027,0.095946,16.992445
20,"(B, F)",(A),0.360647,0.779629,0.34937,0.96873,1.242553,0.068199,7.04745
5,(F),(A),0.677906,0.779629,0.650074,0.958943,1.23,0.121559,5.36751
1,(B),(A),0.47609,0.779629,0.41056,0.862358,1.106114,0.039387,1.601046
18,"(A, B)",(F),0.41056,0.677906,0.34937,0.850959,1.255276,0.071049,2.161111
4,(A),(F),0.779629,0.677906,0.650074,0.833825,1.23,0.121559,1.938278
37,"(A, B, E)",(F),0.3156,0.677906,0.261615,0.828946,1.222803,0.047668,1.882991


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
33,(B),"(F, E)",0.47609,0.373264,0.26445,0.555462,1.488119,0.086742,1.409858
11,(F),(E),0.677906,0.462607,0.373264,0.550614,1.190242,0.05966,1.195838
47,(B),"(A, F, E)",0.47609,0.369327,0.261615,0.549507,1.487863,0.085782,1.399964
28,(F),"(A, E)",0.677906,0.456607,0.369327,0.544805,1.193159,0.05979,1.193758
19,"(A, F)",(B),0.650074,0.47609,0.34937,0.537431,1.128843,0.039876,1.132609
9,(F),(B),0.677906,0.47609,0.360647,0.532001,1.117439,0.037903,1.119469
0,(A),(B),0.779629,0.47609,0.41056,0.526609,1.106114,0.039387,1.106719
23,(F),"(A, B)",0.677906,0.41056,0.34937,0.515366,1.255276,0.071049,1.216258
27,(A),"(F, E)",0.779629,0.373264,0.369327,0.473721,1.26913,0.078319,1.190881
21,(A),"(B, F)",0.779629,0.360647,0.34937,0.448123,1.242553,0.068199,1.158507
