## Hypothesis testing and affinity analysis

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
df = pd.read_csv('QVI_data_joined.csv')
df['PREMIUM_CUSTOMER'] = df['PREMIUM_CUSTOMER'].str.strip()
df['BRAND'] = df['BRAND'].str.strip()
df['LIFESTAGE'] = df['LIFESTAGE'].str.strip()
df.head()



Unnamed: 0,ADATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES,WEIGHT_IN_GRAMS,PACK_SIZE,BRAND,LIFESTAGE,PREMIUM_CUSTOMER
0,17-10-2018,1,1000,1,5,Natural Chip Compny SeaSalt175g,2,6.0,175,Medium,Natural,YOUNG SINGLES/COUPLES,Premium
1,14-05-2019,1,1307,348,66,CCs Nacho Cheese 175g,3,6.3,175,Medium,CCs,MIDAGE SINGLES/COUPLES,Budget
2,20-05-2019,1,1343,383,61,Smiths Crinkle Cut Chips Chicken 170g,2,2.9,170,Medium,Smiths,MIDAGE SINGLES/COUPLES,Budget
3,17-08-2018,2,2373,974,69,Smiths Chip Thinly S/Cream&Onion 175g,5,15.0,175,Medium,Smiths,MIDAGE SINGLES/COUPLES,Budget
4,18-08-2018,2,2426,1038,108,Kettle Tortilla ChpsHny&Jlpno Chili 150g,3,13.8,150,Medium,Kettle,MIDAGE SINGLES/COUPLES,Budget


#### Independent t-test to check statistical significance
#### Mainstream vs Budget & Premium customer segments for these life-stages: Midage singles / couples , Young singles / couples

In [None]:
group1 = df[(df['PREMIUM_CUSTOMER'] == 'Mainstream') & (df['LIFESTAGE'].isin(['MIDAGE SINGLES/COUPLES', 'YOUNG SINGLES/COUPLES']))].groupby(['PREMIUM_CUSTOMER','LIFESTAGE'], as_index=False).agg({'TOT_SALES':'sum','PROD_QTY':'sum'})
group1['AVG_PRICE_PER_UNIT'] = group1['TOT_SALES'] / group1['PROD_QTY']
print(group1)
grp1ppu = group1['AVG_PRICE_PER_UNIT']

group2 = df[(df['PREMIUM_CUSTOMER'].isin(['Budget','Premium'])) & (df['LIFESTAGE'].isin(['MIDAGE SINGLES/COUPLES', 'YOUNG SINGLES/COUPLES']))].groupby(['PREMIUM_CUSTOMER','LIFESTAGE'], as_index=False).agg({'TOT_SALES':'sum','PROD_QTY':'sum'})
group2['AVG_PRICE_PER_UNIT'] = group2['TOT_SALES'] / group2['PROD_QTY']
print(group2)
grp2ppu = group2['AVG_PRICE_PER_UNIT']

stats.ttest_ind(grp1ppu,grp2ppu,equal_var=False)

  PREMIUM_CUSTOMER               LIFESTAGE  TOT_SALES  PROD_QTY  \
0       Mainstream  MIDAGE SINGLES/COUPLES   90803.85     22699   
1       Mainstream   YOUNG SINGLES/COUPLES  157621.60     38632   

   AVG_PRICE_PER_UNIT  
0            4.000346  
1            4.080079  
  PREMIUM_CUSTOMER               LIFESTAGE  TOT_SALES  PROD_QTY  \
0           Budget  MIDAGE SINGLES/COUPLES   35514.80      9496   
1           Budget   YOUNG SINGLES/COUPLES   61141.60     16671   
2          Premium  MIDAGE SINGLES/COUPLES   58432.65     15526   
3          Premium   YOUNG SINGLES/COUPLES   41642.10     11331   

   AVG_PRICE_PER_UNIT  
0            3.739975  
1            3.667542  
2            3.763535  
3            3.675060  


TtestResult(statistic=np.float64(7.081639382806015), pvalue=np.float64(0.02681531869636337), df=np.float64(1.7629408734710912))

We got a p value < 0.05 So, we can conclude that the avg. price per unit for mainstream segment is significantly higher than budget and premium segments for the following life-stages: midage singles/couples & young singles/couples.

#### We can target groups that are a significant contributor to sales like Mainstream- young singles/couples in order to retain them. Let’s find out if this group tend to buy a particular brand of chips using affinity analysis.

In [3]:

# Filter the data for the target segment
segment1 = df[(df["LIFESTAGE"] == "YOUNG SINGLES/COUPLES") & (df["PREMIUM_CUSTOMER"] == "Mainstream")]
other = df[~((df["LIFESTAGE"] == "YOUNG SINGLES/COUPLES") & (df["PREMIUM_CUSTOMER"] == "Mainstream"))]

# Check unique values in both columns

# Calculate total quantity for each segment
quantity_segment1 = segment1["PROD_QTY"].sum()
quantity_other = other["PROD_QTY"].sum()

# Calculate brand proportions for both groups
quantity_segment1_by_brand = segment1.groupby("BRAND")["PROD_QTY"].sum().reset_index()
quantity_segment1_by_brand["targetSegment"] = quantity_segment1_by_brand["PROD_QTY"] / quantity_segment1
quantity_segment1_by_brand.drop(columns=["PROD_QTY"], inplace=True)

quantity_other_by_brand = other.groupby("BRAND")["PROD_QTY"].sum().reset_index()
quantity_other_by_brand["other"] = quantity_other_by_brand["PROD_QTY"] / quantity_other
quantity_other_by_brand.drop(columns=["PROD_QTY"], inplace=True)

# Merge the two dataframes and calculate brand affinity
brand_proportions = pd.merge(quantity_segment1_by_brand, quantity_other_by_brand, on="BRAND")
brand_proportions["affinityToBrand"] = brand_proportions["targetSegment"] / brand_proportions["other"]

# Sort by affinity in descending order
brand_proportions = brand_proportions.sort_values(by="affinityToBrand", ascending=False)

print(brand_proportions)

           BRAND  targetSegment     other  affinityToBrand
20      Tyrrells       0.029587  0.023933         1.236230
19      Twisties       0.043306  0.035283         1.227396
9         Kettle       0.185649  0.154217         1.203818
18      Tostitos       0.042581  0.035377         1.203633
12   Old El Paso       0.041598  0.034753         1.196953
13      Pringles       0.111980  0.093744         1.194531
5        Doritos       0.122877  0.105278         1.167171
4           Cobs       0.041856  0.036375         1.150696
8      Infuzions       0.060649  0.053157         1.140942
17         Thins       0.056611  0.053084         1.066440
7    Grain Waves       0.030674  0.029052         1.055821
3       Cheezels       0.016851  0.017370         0.970137
15        Smiths       0.093420  0.121710         0.767559
6         French       0.003702  0.005364         0.690110
2        Cheetos       0.007533  0.011240         0.670143
14           RRD       0.045377  0.068427         0.6631

We can see that :

Mainstream young singles/couples are `23% more likely` to purchase `Tyrrells` chips compared to the
rest of the population

And they are `56% less likely` to purchase `Burger Rings` compared to the rest
of the population

#### Let’s also find out if this group tends to buy larger packs of chips

In [4]:
# Calculate PACK_SIZE proportions for both groups
quantity_segment1_by_packsize = segment1.groupby("PACK_SIZE")["PROD_QTY"].sum().reset_index()
quantity_segment1_by_packsize["targetSegment"] = quantity_segment1_by_packsize["PROD_QTY"] / quantity_segment1
quantity_segment1_by_packsize.drop(columns=["PROD_QTY"], inplace=True)

quantity_other_by_packsize = other.groupby("PACK_SIZE")["PROD_QTY"].sum().reset_index()
quantity_other_by_packsize["other"] = quantity_other_by_packsize["PROD_QTY"] / quantity_other
quantity_other_by_packsize.drop(columns=["PROD_QTY"], inplace=True)

# Merge the two dataframes and calculate affinity to PACK_SIZE
packsize_proportions = pd.merge(quantity_segment1_by_packsize, quantity_other_by_packsize, on="PACK_SIZE")
packsize_proportions["affinityToPackSize"] = packsize_proportions["targetSegment"] / packsize_proportions["other"]

# Sort by affinity in descending order
packsize_proportions = packsize_proportions.sort_values(by="affinityToPackSize", ascending=False)

print(packsize_proportions)

     PACK_SIZE  targetSegment     other  affinityToPackSize
0  Extra-Large       0.087622  0.070559            1.241815
1        Large       0.128313  0.122092            1.050953
2       Medium       0.775264  0.789740            0.981670
3        Small       0.008801  0.017608            0.499824


This group is `24% more likely` to purchase chips that have `extra-large` pack size.