# Hypothesis testing per cluster

In this step, we want to perform the same hypothesis test as in the previous steps in order to detect differences in average visits and average expenditures between periods when customers are subject to a marketing campaign and periods when they are not.

As we know, there is statistical evidence to claim that the marketing efforts are making a difference in the purchase behavior of customers. Now we want to take a closer look to see whether this behavior is also present within each of the macro clusters.

### Importing the libraries

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import scipy
from scipy import stats
import math as m

### Importing the datasets of interest for this analysis.

The transactions_clean dataset contains all the transaction information for each household, and the macro_cluster_df dataset contains the cluster each household belongs to. The idea is to merge them and then perform the hypothesis testing.

- H0: There are no differences in avg number of trips and avg expenditures when applying marketing campaigns.
- HA: There are differences in avg number of trips and avg expenditures when the supermarket performs marketing campaigns.

In [2]:
# Load the cleaned transactions CSV into a DataFrame.
transactions = pd.read_csv(filepath_or_buffer="../../data_final_project/transactions_clean.csv")

In [3]:
# Load the per-household macro-cluster assignments into a DataFrame.
macro_cluster_df = pd.read_csv(filepath_or_buffer="../../data_final_project/macro_cluster_df.csv")

In [4]:
# Preview the first five rows of the cluster table.
macro_cluster_df.head(n=5)

Unnamed: 0,HOUSEHOLD_KEY,baskets,avg_unit_price,avg_qty_per_basket,total_expenditure,cluster,AGE_DESC,INCOME_DESC,ADULTS_NUM,KIDS_NUM,NUMEROUS_FAM,SINGLE_GENDER,AGE_MED,INCOME_LEVEL,GENDER_CAT
0,1,14,2.782795,24.7857,805.58,3,65+,35-49K,2,0,0,,60.0,2.0,
1,7,6,2.873263,15.8333,228.18,2,45-54,50-74K,2,0,0,,50.0,3.0,
2,20,5,2.400513,7.8,75.95,2,25-34,75-99K,2,0,0,,30.0,4.0,
3,25,17,2.854803,13.4706,534.2,2,35-44,50-74K,1,0,0,,40.0,3.0,
4,27,41,2.335259,2.8293,260.8,4,45-54,25-34K,1,0,0,Single Female,50.0,2.0,1.0


### Slicing the dataframe to get only the relevant information

In [5]:
# Keep only the household key. The column is selected by name (it is the first
# column of macro_cluster_df) and explicitly copied, so that adding columns to
# the reduced frame later does not write into a slice of macro_cluster_df
# (which can trigger pandas' SettingWithCopyWarning).
macro_cluster_df_red = macro_cluster_df[["HOUSEHOLD_KEY"]].copy()

In [6]:
# Attach each household's cluster label. assign() builds a new frame instead of
# setting a column in place on what may be a slice of macro_cluster_df, which
# avoids pandas' SettingWithCopyWarning.
macro_cluster_df_red = macro_cluster_df_red.assign(cluster=macro_cluster_df["cluster"])

In [7]:
# Preview the reduced frame (household key plus cluster).
macro_cluster_df_red.head(n=5)

Unnamed: 0,HOUSEHOLD_KEY,cluster
0,1,3
1,7,2
2,20,2
3,25,2
4,27,4


#### The query below is for reference only. We ran it in Sequel Pro and saved the result into a CSV called mk_effect (loaded below).

In [None]:
# Reference only: this SQL script documents how the per-household differences
# (average weekly trips / spend from the first campaign start onwards, minus
# the averages before it) were derived. Its final table, ai.mk_effect, is the
# data used in the analysis below.
# NOTE(review): b_2 divides by first_start_week and c_2 by
# (104 - first_start_week); presumably neither can be 0 in the data -- confirm.
query_mk_effect = """
-- statistics before campaign; 

create table ai.b_1 as
select distinct a.household_key, count(distinct a.basket_id) as total_trips,
sum(a.sales_value) as total_spend
from ai.transactions_data a, ai.aa b
where a.household_key = b.household_key and a.day < b.first_start_day
group by b.household_key; 

create table ai.b_2 as
select distinct a.household_key, a.total_trips/b.first_start_week as avg_trips,
a.total_spend/b.first_start_week as avg_spend
from ai.b_1 a, ai.aa b
where a.household_key = b.household_key
order by a.household_key; 

-- statistics after campaign;
create table ai.c_1 as
select distinct a.household_key, count(distinct a.basket_id) as total_trips,
sum(a.sales_value) as total_spend
from ai.transactions_data a, ai.aa b
where a.household_key = b.household_key and a.day >= b.first_start_day
group by a.household_key;

-- 1581 households have data after campaign dates

create table ai.c_11 as
select a.household_key, a.total_trips, a.total_spend, b.campaigns, b.first_start_week 
from ai.c_1 a
join ai.aa b
USING(household_key);

create table ai.c_2 as
select distinct household_key, total_trips/(104 - first_start_week) as avg_trips,
total_spend/(104 - first_start_week) as avg_spend, campaigns
from ai.c_11
order by household_key; 

-- combining data for differences in stats;

create table ai.mk_effect as ### The result of this query is the one that we are going to use to perform the analysis.
select distinct a.household_key, b.avg_trips - a.avg_trips as diff_avg_trips,
b.avg_spend - a.avg_spend as diff_avg_spend, b.campaigns
from ai.b_2 a, ai.c_2 b
where a.household_key = b.household_key
order by a.household_key; 
"""

# The script is deliberately not executed from Python. No SQLAlchemy `engine`
# is created in this notebook, and the script is a sequence of CREATE TABLE
# statements rather than a single SELECT, so pd.read_sql could not return a
# DataFrame from it. mk_effect is loaded from the exported CSV instead.

### We then exported the result of the query (the mk_effect table) to a CSV file and imported it below.

In [16]:
# Load the exported mk_effect table (per-household differences in averages).
mk_effect = pd.read_csv(filepath_or_buffer="../../data_final_project/mk_effect.csv")

In [18]:
# Preview the first five rows of mk_effect.
mk_effect.head(n=5)

Unnamed: 0,household_key,diff_avg_trips,diff_avg_spend,campaigns
0,1,0.2749,9.829216,8
1,2,-0.1663,-2.743128,1
2,3,-0.2499,-25.708983,3
3,4,-0.2353,-12.156438,1
4,6,0.6687,0.972535,4


### Renaming the household_key column

In [19]:
# Upper-case the key column so it matches the HOUSEHOLD_KEY name used for merging.
mk_effect = mk_effect.rename({"household_key": "HOUSEHOLD_KEY"}, axis="columns")

In [20]:
# Inner join: keep only households present in both the cluster table and mk_effect.
mk_effect_df = pd.merge(
    macro_cluster_df_red,
    mk_effect,
    how="inner",
    on="HOUSEHOLD_KEY",
)

In [21]:
# Preview the merged cluster / campaign-effect frame.
mk_effect_df.head(n=5)

Unnamed: 0,HOUSEHOLD_KEY,cluster,diff_avg_trips,diff_avg_spend,campaigns
0,1,3,0.2749,9.829216,8
1,7,2,0.3289,22.730689,4
2,20,2,2.3246,49.374415,7
3,25,2,0.3377,10.511198,4
4,27,4,3.684,66.776076,7


### Slicing the dataframe into the 5 different clusters to perform an individual hypothesis test on each.

In [22]:
# One sub-frame per macro cluster (labels 0 to 4), each holding only the rows
# whose "cluster" value equals that label.
mk_effect_df_c0 = mk_effect_df.loc[mk_effect_df["cluster"].eq(0)]
mk_effect_df_c1 = mk_effect_df.loc[mk_effect_df["cluster"].eq(1)]
mk_effect_df_c2 = mk_effect_df.loc[mk_effect_df["cluster"].eq(2)]
mk_effect_df_c3 = mk_effect_df.loc[mk_effect_df["cluster"].eq(3)]
mk_effect_df_c4 = mk_effect_df.loc[mk_effect_df["cluster"].eq(4)]

In [23]:
# Preview the first five households of cluster 1.
mk_effect_df_c1.head(n=5)

Unnamed: 0,HOUSEHOLD_KEY,cluster,diff_avg_trips,diff_avg_spend,campaigns
11,67,1,1.6441,57.986248,8
13,77,1,-0.1123,6.973535,2
16,97,1,0.9582,11.828601,4
37,219,1,1.2708,60.281285,7
40,242,1,0.1281,13.148032,5


### Creating lists with the relevant information:

- Lists per cluster with the differences in avg. trips and avg. expenditures

#### Differences between the number of trips during campaigns and off campaigns.

In [24]:
# Cluster 0: per-household differences in average trips, as a plain list.
diff_avg_trips_c0 = mk_effect_df_c0["diff_avg_trips"].tolist()

In [25]:
# Cluster 1: per-household differences in average trips, as a plain list.
diff_avg_trips_c1 = mk_effect_df_c1["diff_avg_trips"].tolist()

In [26]:
# Cluster 2: per-household differences in average trips, as a plain list.
diff_avg_trips_c2 = mk_effect_df_c2["diff_avg_trips"].tolist()

In [27]:
# Cluster 3: per-household differences in average trips, as a plain list.
diff_avg_trips_c3 = mk_effect_df_c3["diff_avg_trips"].tolist()

In [28]:
# Cluster 4: per-household differences in average trips, as a plain list.
diff_avg_trips_c4 = mk_effect_df_c4["diff_avg_trips"].tolist()

#### Differences between the total expenditures during campaigns and off campaigns.

In [29]:
# Cluster 0: per-household differences in average spend, as a plain list.
diff_avg_spend_c0 = mk_effect_df_c0["diff_avg_spend"].tolist()

In [30]:
# Cluster 1: per-household differences in average spend, as a plain list.
diff_avg_spend_c1 = mk_effect_df_c1["diff_avg_spend"].tolist()

In [31]:
# Cluster 2: per-household differences in average spend, as a plain list.
diff_avg_spend_c2 = mk_effect_df_c2["diff_avg_spend"].tolist()

In [32]:
# Cluster 3: per-household differences in average spend, as a plain list.
diff_avg_spend_c3 = mk_effect_df_c3["diff_avg_spend"].tolist()

In [33]:
# Cluster 4: per-household differences in average spend, as a plain list.
diff_avg_spend_c4 = mk_effect_df_c4["diff_avg_spend"].tolist()

### Performing the hypothesis contrasts per cluster and per variable.

#### Cluster 0: Trips

In [34]:
# t-based confidence interval for the mean per-household difference in average
# trips (diff_avg_trips) in cluster 0.
n = len(diff_avg_trips_c0)
ci = 0.95
# ddof=1 yields the (n-1)-corrected sample standard deviation directly, instead
# of rescaling the population value by sqrt(n/(n-1)).
quasi_std = np.std(diff_avg_trips_c0, ddof=1)
sample_mean = np.mean(diff_avg_trips_c0)

# Computed once and reused; scale is the standard error of the mean.
interval = scipy.stats.t.interval(ci, n - 1, loc=sample_mean, scale=quasi_std / m.sqrt(n))

# The confidence level in the message follows ci instead of being hard-coded.
print(f"We are {ci:.0%} confident that the population mean lies between {interval}")

We are 95% confident that the population mean lies between (0.34749054438373084, 0.654241323748137)


#### Cluster 0: Total expenditures

In [35]:
# t-based confidence interval for the mean per-household difference in average
# spend (diff_avg_spend) in cluster 0.
n = len(diff_avg_spend_c0)
ci = 0.95
# ddof=1 yields the (n-1)-corrected sample standard deviation directly, instead
# of rescaling the population value by sqrt(n/(n-1)).
quasi_std = np.std(diff_avg_spend_c0, ddof=1)
sample_mean = np.mean(diff_avg_spend_c0)

# Computed once and reused; scale is the standard error of the mean.
interval = scipy.stats.t.interval(ci, n - 1, loc=sample_mean, scale=quasi_std / m.sqrt(n))

# The confidence level in the message follows ci instead of being hard-coded.
print(f"We are {ci:.0%} confident that the population mean lies between {interval}")

We are 95% confident that the population mean lies between (13.590162192905268, 23.712769413153346)


#### Cluster 1: Trips

In [36]:
# t-based confidence interval for the mean per-household difference in average
# trips (diff_avg_trips) in cluster 1.
n = len(diff_avg_trips_c1)
ci = 0.95
# ddof=1 yields the (n-1)-corrected sample standard deviation directly, instead
# of rescaling the population value by sqrt(n/(n-1)).
quasi_std = np.std(diff_avg_trips_c1, ddof=1)
sample_mean = np.mean(diff_avg_trips_c1)

# Computed once and reused; scale is the standard error of the mean.
interval = scipy.stats.t.interval(ci, n - 1, loc=sample_mean, scale=quasi_std / m.sqrt(n))

# The confidence level in the message follows ci instead of being hard-coded.
print(f"We are {ci:.0%} confident that the population mean lies between {interval}")

We are 95% confident that the population mean lies between (0.1987147123025118, 0.8393411016509765)


#### Cluster 1: Total expenditures

In [37]:
# t-based confidence interval for the mean per-household difference in average
# spend (diff_avg_spend) in cluster 1.
n = len(diff_avg_spend_c1)
ci = 0.95
# ddof=1 yields the (n-1)-corrected sample standard deviation directly, instead
# of rescaling the population value by sqrt(n/(n-1)).
quasi_std = np.std(diff_avg_spend_c1, ddof=1)
sample_mean = np.mean(diff_avg_spend_c1)

# Computed once and reused; scale is the standard error of the mean.
interval = scipy.stats.t.interval(ci, n - 1, loc=sample_mean, scale=quasi_std / m.sqrt(n))

# The confidence level in the message follows ci instead of being hard-coded.
print(f"We are {ci:.0%} confident that the population mean lies between {interval}")

We are 95% confident that the population mean lies between (5.6346048401624795, 23.068980249463856)


#### Cluster 2: Trips

In [38]:
# t-based confidence interval for the mean per-household difference in average
# trips (diff_avg_trips) in cluster 2.
n = len(diff_avg_trips_c2)
ci = 0.95
# ddof=1 yields the (n-1)-corrected sample standard deviation directly, instead
# of rescaling the population value by sqrt(n/(n-1)).
quasi_std = np.std(diff_avg_trips_c2, ddof=1)
sample_mean = np.mean(diff_avg_trips_c2)

# Computed once and reused; scale is the standard error of the mean.
interval = scipy.stats.t.interval(ci, n - 1, loc=sample_mean, scale=quasi_std / m.sqrt(n))

# The confidence level in the message follows ci instead of being hard-coded.
print(f"We are {ci:.0%} confident that the population mean lies between {interval}")

We are 95% confident that the population mean lies between (0.37138978552384994, 0.6399983723708869)


#### Cluster 2: Total expenditures

In [39]:
# t-based confidence interval for the mean per-household difference in average
# spend (diff_avg_spend) in cluster 2.
n = len(diff_avg_spend_c2)
ci = 0.95
# ddof=1 yields the (n-1)-corrected sample standard deviation directly, instead
# of rescaling the population value by sqrt(n/(n-1)).
quasi_std = np.std(diff_avg_spend_c2, ddof=1)
sample_mean = np.mean(diff_avg_spend_c2)

# Computed once and reused; scale is the standard error of the mean.
interval = scipy.stats.t.interval(ci, n - 1, loc=sample_mean, scale=quasi_std / m.sqrt(n))

# The confidence level in the message follows ci instead of being hard-coded.
print(f"We are {ci:.0%} confident that the population mean lies between {interval}")

We are 95% confident that the population mean lies between (9.749711799082826, 15.995067479807668)


#### Cluster 3: Trips

In [40]:
# t-based confidence interval for the mean per-household difference in average
# trips (diff_avg_trips) in cluster 3.
n = len(diff_avg_trips_c3)
ci = 0.95
# ddof=1 yields the (n-1)-corrected sample standard deviation directly, instead
# of rescaling the population value by sqrt(n/(n-1)).
quasi_std = np.std(diff_avg_trips_c3, ddof=1)
sample_mean = np.mean(diff_avg_trips_c3)

# Computed once and reused; scale is the standard error of the mean.
interval = scipy.stats.t.interval(ci, n - 1, loc=sample_mean, scale=quasi_std / m.sqrt(n))

# The confidence level in the message follows ci instead of being hard-coded.
print(f"We are {ci:.0%} confident that the population mean lies between {interval}")

We are 95% confident that the population mean lies between (0.20085059193627056, 0.6455298958686075)


#### Cluster 3: Total expenditures

In [41]:
# t-based confidence interval for the mean per-household difference in average
# spend (diff_avg_spend) in cluster 3.
n = len(diff_avg_spend_c3)
ci = 0.95
# ddof=1 yields the (n-1)-corrected sample standard deviation directly, instead
# of rescaling the population value by sqrt(n/(n-1)).
quasi_std = np.std(diff_avg_spend_c3, ddof=1)
sample_mean = np.mean(diff_avg_spend_c3)

# Computed once and reused; scale is the standard error of the mean.
interval = scipy.stats.t.interval(ci, n - 1, loc=sample_mean, scale=quasi_std / m.sqrt(n))

# The confidence level in the message follows ci instead of being hard-coded.
print(f"We are {ci:.0%} confident that the population mean lies between {interval}")

We are 95% confident that the population mean lies between (13.326024627285538, 28.656609569187804)


#### Cluster 4: Trips

In [42]:
# t-based confidence interval for the mean per-household difference in average
# trips (diff_avg_trips) in cluster 4.
n = len(diff_avg_trips_c4)
ci = 0.95
# ddof=1 yields the (n-1)-corrected sample standard deviation directly, instead
# of rescaling the population value by sqrt(n/(n-1)).
quasi_std = np.std(diff_avg_trips_c4, ddof=1)
sample_mean = np.mean(diff_avg_trips_c4)

# Computed once and reused; scale is the standard error of the mean.
interval = scipy.stats.t.interval(ci, n - 1, loc=sample_mean, scale=quasi_std / m.sqrt(n))

# The confidence level in the message follows ci instead of being hard-coded.
print(f"We are {ci:.0%} confident that the population mean lies between {interval}")

We are 95% confident that the population mean lies between (-0.29236246662659987, 0.3456568062492414)


#### Cluster 4: Total expenditures

In [43]:
# t-based confidence interval for the mean per-household difference in average
# spend (diff_avg_spend) in cluster 4.
n = len(diff_avg_spend_c4)
ci = 0.95
# ddof=1 yields the (n-1)-corrected sample standard deviation directly, instead
# of rescaling the population value by sqrt(n/(n-1)).
quasi_std = np.std(diff_avg_spend_c4, ddof=1)
sample_mean = np.mean(diff_avg_spend_c4)

# Computed once and reused; scale is the standard error of the mean.
interval = scipy.stats.t.interval(ci, n - 1, loc=sample_mean, scale=quasi_std / m.sqrt(n))

# The confidence level in the message follows ci instead of being hard-coded.
print(f"We are {ci:.0%} confident that the population mean lies between {interval}")

We are 95% confident that the population mean lies between (2.7896424204597023, 15.156276788045599)


### Commenting the results:

Judging from the results, some clusters have confidence intervals that lie further from 0 than others, which tells us how sensitive each cluster is to the campaigns. In almost every case we can reject the null hypothesis and claim that there is evidence that the marketing efforts have an effect on the avg number of trips and the avg expenditure. The only case where we could not reject the null hypothesis was the number of trips in cluster 4, whose confidence interval contains 0. We have seen that this group does not visit the supermarket very often (we could assume that this is because they are not loyal yet), but when they do, they tend to buy many products.

### For further steps, we want to see the effects of discounts per quarter

### Performing an inner join between left: transactions and right the info from the macro_cluster data.

In [8]:
# Tag every transaction with its household's cluster; the inner join drops
# transactions from households that have no cluster assignment.
mk = pd.merge(
    transactions,
    macro_cluster_df_red,
    how="inner",
    on="HOUSEHOLD_KEY",
)

In [9]:
# Row and column counts of the merged frame.
mk.shape

(614066, 19)

In [10]:
# Preview the first two merged rows.
mk.head(n=2)

Unnamed: 0.1,Unnamed: 0,HOUSEHOLD_KEY,BASKET_ID,WEEK_NO,DAY,TRANS_TIME,STORE_ID,PRODUCT_ID,QUANTITY,SALES_VALUE,RETAIL_DISC,COUPON_DISC,COUPON_MATCH_DISC,SHELF_PRICE,CLIENT_PRICE,%_TOTAL_DISCOUNT,%_LOYALTY_DISCOUNT,%_COUPON_DISCOUNT,cluster
0,11,1364,26984896261,1,1,1520,31742,842930,1,2.19,0.0,0.0,0.0,2.19,2.19,0.0,0.0,0.0,4
1,12,1364,26984896261,1,1,1520,31742,897044,1,2.99,-0.4,0.0,0.0,3.39,2.99,0.117994,0.117994,0.0,4


In [12]:
# Remove the leftover "Unnamed: 0" column that came in with the CSV.
mk = mk.drop(columns=["Unnamed: 0"])

In [13]:
# Flag rows whose COUPON_DISC is negative (1) versus zero, positive or missing
# (0). The vectorized comparison replaces a per-row Python lambda, which is slow
# on a frame of this size; NaN compares False and therefore still maps to 0.
mk["DISCOUNT_APPLIED"] = (mk["COUPON_DISC"] < 0).astype("int64")

In [14]:
# Bucket WEEK_NO into consecutive 13-week periods numbered 1 to 8: weeks up to
# 13 map to 1, each following 13-week window to the next number, and anything
# past week 91 to 8. Rows matching no condition get NaN.
col = 'WEEK_NO'
week_no = mk[col]
edges = [13, 26, 39, 52, 65, 78, 91]
conditions = (
    [week_no <= edges[0]]
    + [(week_no > lo) & (week_no <= hi) for lo, hi in zip(edges, edges[1:])]
    + [week_no > edges[-1]]
)
choices = list(range(1, len(conditions) + 1))

mk["QUARTER"] = np.select(conditions, choices, default=np.nan)