In [None]:
#Importing packages
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

In [None]:
#Load the processed CRM extract into a DataFrame
#(the path is relative to the notebook's working directory)
santander_crm = pd.read_csv(
    filepath_or_buffer="Data/Processed/One_but_last_month_data_processsed.csv",
)

In [None]:
#Keep only the rows whose no_of_accounts is greater than 1
santander_crm = santander_crm.loc[santander_crm['no_of_accounts'].gt(1)]

### Multi-level Rule Based Segmentation
<b>Level 1 : Gross_Income</b><br>
<b>Level 2 : Age / No of Accounts</b>

In [None]:
#Define rules to bin the segmentation variables
#  inc_bin      : gross_income split into 3 quantile bins (0=low, 1=mid, 2=high)
#  age_bin      : age split into 2 quantile bins around the median (0=young, 1=old)
#  naccount_bin : no_of_accounts cut into (1,4], (4,7], (7,inf] (0=low, 1=mid, 2=high)
#np.inf is used directly: the pandas.np alias is no longer available in current pandas releases.
santander_crm = santander_crm.assign(
    inc_bin = pd.qcut(santander_crm['gross_income'],q=3,duplicates='drop',labels=[0,1,2]),
    age_bin = pd.qcut(santander_crm['age'],q=2,duplicates='drop',labels=[0,1]),
    naccount_bin = pd.cut(santander_crm['no_of_accounts'],bins=[1,4,7,np.inf],duplicates='drop',labels=[0,1,2]),
)

In [None]:
#Income x Age segments: one frame per (inc_bin, age_bin) combination
#inc_bin: 0=low, 1=mid, 2=high; age_bin: 0=young, 1=old
inclow_ageyoung = santander_crm.loc[santander_crm['inc_bin'].eq(0) & santander_crm['age_bin'].eq(0)]
inclow_ageold = santander_crm.loc[santander_crm['inc_bin'].eq(0) & santander_crm['age_bin'].eq(1)]

incmid_ageyoung = santander_crm.loc[santander_crm['inc_bin'].eq(1) & santander_crm['age_bin'].eq(0)]
incmid_ageold = santander_crm.loc[santander_crm['inc_bin'].eq(1) & santander_crm['age_bin'].eq(1)]

inchigh_ageyoung = santander_crm.loc[santander_crm['inc_bin'].eq(2) & santander_crm['age_bin'].eq(0)]
inchigh_ageold = santander_crm.loc[santander_crm['inc_bin'].eq(2) & santander_crm['age_bin'].eq(1)]

#Income x No-of-accounts segments: one frame per (inc_bin, naccount_bin) combination
#naccount_bin: 0=low, 1=mid, 2=high
inclow_nacclow = santander_crm.loc[santander_crm['inc_bin'].eq(0) & santander_crm['naccount_bin'].eq(0)]
inclow_naccmid = santander_crm.loc[santander_crm['inc_bin'].eq(0) & santander_crm['naccount_bin'].eq(1)]
inclow_nacchigh = santander_crm.loc[santander_crm['inc_bin'].eq(0) & santander_crm['naccount_bin'].eq(2)]

incmid_nacclow = santander_crm.loc[santander_crm['inc_bin'].eq(1) & santander_crm['naccount_bin'].eq(0)]
incmid_naccmid = santander_crm.loc[santander_crm['inc_bin'].eq(1) & santander_crm['naccount_bin'].eq(1)]
incmid_nacchigh = santander_crm.loc[santander_crm['inc_bin'].eq(1) & santander_crm['naccount_bin'].eq(2)]

inchigh_nacclow = santander_crm.loc[santander_crm['inc_bin'].eq(2) & santander_crm['naccount_bin'].eq(0)]
inchigh_naccmid = santander_crm.loc[santander_crm['inc_bin'].eq(2) & santander_crm['naccount_bin'].eq(1)]
inchigh_nacchigh = santander_crm.loc[santander_crm['inc_bin'].eq(2) & santander_crm['naccount_bin'].eq(2)]

In [None]:
#Collect the account columns plus the customer identifier
#NOTE(review): positions 11..32 are assumed to be the account columns - confirm against the CSV layout
list_columns = santander_crm.columns.tolist()
list_accounts = list_columns[11:33] + ['customer_code']

In [None]:
#Build the transaction table for one segment
def maketrans(transactions):
    """Return the account columns of a segment, indexed by customer.

    Parameters
    ----------
    transactions : DataFrame
        A segment frame containing every column named in the notebook-level
        ``list_accounts`` (the account columns plus ``customer_code``).

    Returns
    -------
    DataFrame
        The ``list_accounts`` columns of ``transactions`` with
        ``customer_code`` moved into the index.
    """
    return transactions[list_accounts].set_index('customer_code')

In [None]:
#Transaction tables for the income x age segments
inclow_ageyoung_trans, inclow_ageold_trans = map(maketrans, (inclow_ageyoung, inclow_ageold))
incmid_ageyoung_trans, incmid_ageold_trans = map(maketrans, (incmid_ageyoung, incmid_ageold))
inchigh_ageyoung_trans, inchigh_ageold_trans = map(maketrans, (inchigh_ageyoung, inchigh_ageold))

#Transaction tables for the income x no-of-accounts segments
inclow_nacclow_trans, inclow_naccmid_trans, inclow_nacchigh_trans = map(
    maketrans, (inclow_nacclow, inclow_naccmid, inclow_nacchigh))
incmid_nacclow_trans, incmid_naccmid_trans, incmid_nacchigh_trans = map(
    maketrans, (incmid_nacclow, incmid_naccmid, incmid_nacchigh))
inchigh_nacclow_trans, inchigh_naccmid_trans, inchigh_nacchigh_trans = map(
    maketrans, (inchigh_nacclow, inchigh_naccmid, inchigh_nacchigh))


In [None]:
#Row counts per income bin, split by age bin
f, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 4))
axes.set_title('Customer Segments based on Income along with Age')
sns.countplot(data=santander_crm, x="inc_bin", hue="age_bin", ax=axes)

In [None]:
#Row counts per income bin, split by number-of-accounts bin
f, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 4))
axes.set_title('Customer Segments based on Income along with No. of Accounts')
sns.countplot(data=santander_crm, x="inc_bin", hue="naccount_bin", ax=axes)

### Market Basket Analysis (MBA) for Income and Age Bins

#### 1. Low Income and Young Age

In [None]:
#Low income and young age segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(inclow_ageyoung_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(inclow_ageyoung)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 2. Low Income and Old Age

In [None]:
#Low income and old age segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(inclow_ageold_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(inclow_ageold)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 3. Mid Income and Young Age

In [None]:
#Mid income and young age segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(incmid_ageyoung_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(incmid_ageyoung)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 4. Mid Income and Old Age

In [None]:
#Mid income and old age segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(incmid_ageold_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(incmid_ageold)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 5. High Income and Young Age

In [None]:
#High income and young age segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(inchigh_ageyoung_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(inchigh_ageyoung)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 6. High Income and Old Age

In [None]:
#High income and old age segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(inchigh_ageold_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(inchigh_ageold)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

### Market Basket Analysis (MBA) for Income and No of Accounts Bins

#### 1. Low Income and Less No of accounts

In [None]:
#Low income and low nacc segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(inclow_nacclow_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(inclow_nacclow)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 2. Low Income and Average No of accounts

In [None]:
#Low income and mid nacc segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(inclow_naccmid_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(inclow_naccmid)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 3. Low Income and More No of Accounts

In [None]:
#Low income and high nacc segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(inclow_nacchigh_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(inclow_nacchigh)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 4. Mid Income and Less No of Accounts

In [None]:
#Mid income and low nacc segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(incmid_nacclow_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(incmid_nacclow)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 5. Mid Income and Average No of Accounts

In [None]:
#Mid income and mid nacc segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(incmid_naccmid_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(incmid_naccmid)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 6. Mid Income and More No of Accounts

In [None]:
#Mid income and high nacc segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(incmid_nacchigh_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(incmid_nacchigh)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 7. High Income and Less No of Accounts

In [None]:
#High income and low nacc segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(inchigh_nacclow_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(inchigh_nacclow)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 8. High Income and Average No of Accounts

In [None]:
#High income and mid nacc segment
#Mine account combinations present in at least 2% of the segment's rows
frequent_itemsets = apriori(inchigh_naccmid_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(inchigh_naccmid)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])

#### 9. High Income and More No of Accounts

In [None]:
#High income and high nacc segment
#Mine account combinations present in at least 2% of the segment's rows.
#This cell must read the high-income/high-nacc tables (inchigh_nacchigh*), not the
#mid-income ones, so that it matches its "High Income and More No of Accounts" heading.
frequent_itemsets = apriori(inchigh_nacchigh_trans, min_support=0.02, use_colnames=True)
#Restrict to itemsets of at most two accounts
frequent_itemsets = frequent_itemsets[frequent_itemsets.itemsets.map(len)<3]
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

#Chi-square statistic per rule, computed for all rules at once:
#    chisq = n*(lift-1)^2 * support*confidence / ((confidence-support)*(lift-confidence))
#Whole columns are assigned rather than rules[col][i] = ..., because chained
#indexing is not guaranteed to write back to the frame.
n_customers = len(inchigh_nacchigh)
support = rules['support'].astype(float)
confidence = rules['confidence'].astype(float)
lift = rules['lift'].astype(float)
chisq = (n_customers * (lift - 1)**2 * support * confidence) / ((confidence - support) * (lift - confidence))
#Undefined when a denominator factor is zero: keep NaN so the column stays numeric
rules['chisq'] = chisq.where((support != confidence) & (lift != confidence))
#Upper-tail probability of a chi-square distribution with 1 degree of freedom
rules['pvalue'] = stats.chi2.sf(rules['chisq'], 1)

#Removing insignificant rules based on pvalue (rules with a NaN pvalue are dropped too)
rules = rules[rules['pvalue'] < 0.05].copy()
rules['lift'] = rules['lift'].round(5)
rules.sort_values(by=['lift', 'antecedent support'], ascending = [False,False])