In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')

# Record the versions of the main libraries, one per line (pandas, matplotlib, seaborn).
for lib in (pd, matplotlib, sns):
    print(lib.__version__)

# Objective
To analyse the data to understand the current purchasing trends and behaviours, in particular to customer segments and chip purchasing behaviour.

Tasks:
1. Create and Interpret High level summaries
2. Outlier detection and removal
3. Checking data format and correction
4. Feature engineering
    1. extra features such as pack size and brand name
5. Metrics: consider what metrics will help describe customers' purchasing behaviour
    1. who spends on chips
    2. what drives spends for each customer segment
    
    You will also want to derive extra features such as pack size and brand name from the data and define metrics of interest to enable you to draw insights on who spends on chips and what drives spends for each customer segment. Remember our end goal is to form a strategy based on the findings to provide a clear recommendation to Julia the Category Manager so make sure your insights can have a commercial application.

# Table of Contents
1. [Examining Transaction Data](#tranxn)
2. [Examining Purchase Behaviour Data](#pb)
3. [Adding new Features](#features)
4. [Data Analysis](#ana)

---
## Examining Transaction Data - Observation Summary <a id='tranxn'></a>
1. [Dates in DATE column were transformed. Found a missing date  ✔](#standard)
2. It is seen that people can buy multiple products together 🤔
3. No missing values were observed.
4. [Removed the chip bulk buyer outliers.](#tranoutliers)
5. [Removed non-chip transactions from transactions data.](#nonchip)


In [None]:
# Read the raw transaction records from the Kaggle dataset and show (rows, columns).
transactions_csv = '/kaggle/input/quantium-data-analytics-virtual-experience-program/Transactions.csv'
trandata = pd.read_csv(transactions_csv)
trandata.shape

In [None]:
# Column labels of the transactions frame.
trandata.columns

In [None]:
# Per-column dtype and non-null counts, used to check for missing values.
trandata.info()

### Standardising dates values to YYYY-MM-DD format <a id='standard'></a>

In [None]:
from datetime import datetime, timedelta

def from_excel_ordinal(ordinal, _epoch0=datetime(1899, 12, 31)):
    """Convert an Excel serial day number into a ``datetime``.

    Args:
        ordinal: Excel serial date (day 1 is 1900-01-01). A fractional part is
            treated as a fraction of a day.
        _epoch0: Day zero of the serial scale; not meant to be overridden.

    Returns:
        The matching ``datetime`` with the microsecond component set to zero.
    """
    # Excel counts a non-existent 29 Feb 1900 as serial 60, so every serial from
    # 60 onwards is one day ahead of the real calendar.
    days = ordinal - 1 if ordinal >= 60 else ordinal
    converted = _epoch0 + timedelta(days=days)
    return converted.replace(microsecond=0)

# Convert the Excel serial numbers in DATE to datetimes.
# The dtype guard keeps the cell idempotent: once DATE is already datetime,
# re-running the conversion would fail inside from_excel_ordinal
# (a timestamp cannot be compared with the integer 60).
if not pd.api.types.is_datetime64_any_dtype(trandata.DATE):
    trandata.DATE = trandata.DATE.apply(from_excel_ordinal)

In [None]:
# First rows of the transactions frame, to eyeball the converted DATE column.
trandata.head()

In [None]:
# Summary statistics of the transactions frame.
trandata.describe()

In [None]:
# Number of distinct values in each identifier / amount column, one per line.
for col in ('STORE_NBR', 'TXN_ID', 'PROD_NBR', 'TOT_SALES', 'LYLTY_CARD_NBR'):
    print(trandata[col].nunique())

In [None]:
# Rows per TXN_ID; a count above 1 means several rows share one transaction id.
trandata.TXN_ID.value_counts()

It is seen that people have bought multiple products together, which is expected and normal. See the below records, for example.

In [None]:
# Show every row of one multi-product transaction.
# Compare on the string form of TXN_ID: the original isin(['102237']) checked a
# string against the raw column, which matches nothing when TXN_ID is numeric.
trandata[trandata.TXN_ID.astype(str) == '102237']

Now checking for outliers in transaction data <a id='tranoutliers'></a>

In [None]:
# Column labels again, as a reference for the outlier checks below.
trandata.columns

In [None]:
# Earliest and latest transaction dates in the data.
earliest = trandata.DATE.min()
latest = trandata.DATE.max()
print(earliest, latest)

In [None]:
# Histogram of transaction dates (pandas default binning).
# sns.distplot(trandata.DATE, kde=True)
trandata.DATE.hist()
# The distribution looks fairly balanced.
# The transactions cover one year, from July 2018 to June 2019.

In [None]:
# Count the distinct dates to check whether any calendar day is missing.
# `describe(datetime_is_numeric=...)` was removed in pandas 2.0, and the default
# datetime summary no longer reports a unique count, so ask for the figures explicitly.

trandata.DATE.agg(['count', 'nunique', 'min', 'max'])

In [None]:
# Only 364 distinct dates are present, so one calendar day has no transactions.
# Plot the number of transaction rows per day to try to spot it.

graph = trandata.groupby('DATE')[['TXN_ID']].count().sort_values(by='DATE')

# Plotted through pandas/matplotlib rather than seaborn: per the author's note,
# seaborn's lineplot does not break the line at a date that is absent from the index.
ax = graph.plot(figsize=(20,6))
plt.show()

In [None]:
# The line chart above shows no visible gap, so count the distinct dates per
# month instead to find the month that is short by one day.

# Build `dates` as a new frame in one chain. The original dropped duplicates
# and added a column in place on a slice of `trandata`, which triggers
# SettingWithCopy warnings and is unreliable under copy-on-write.
dates = (
    trandata[['DATE']]
    .drop_duplicates('DATE')
    .assign(month=lambda d: d.DATE.dt.month_name())
)
grp = dates.groupby('month').count()
grp

In [None]:
# December is the month that is short by one date; list its dates in order
# to see which day is absent.

in_december = dates.month == 'December'
dates[in_december].sort_values('DATE')

In [None]:
# missing 25 dec. its a holiday 
# so no sales on that day

In [None]:
# Histogram of STORE_NBR values (pandas default binning).
trandata.STORE_NBR.hist()
plt.show()

In [None]:
# Histogram of PROD_NBR values (pandas default binning).
trandata.PROD_NBR.hist()

In [None]:
# Summary statistics of PROD_QTY, used to look for unusually large (bulk) purchases
# that were overlooked in an earlier pass.

trandata.PROD_QTY.describe()

In [None]:
# At least one transaction buys 200 packets, which is not normal retail
# behaviour. Inspect those transactions.
# The filter must match the 200-unit purchases described here; the previous
# `== 10` selected a different set of rows.

trandata[trandata.PROD_QTY == 200]

In [None]:
# There are in fact two such bulk transactions, which should be left out of the analysis.
# Check whether the same loyalty card has any other transactions.

trandata.query('LYLTY_CARD_NBR == 226000')

In [None]:
# The bulk buyer has no other transactions, so drop every row for that card.
# The shape is printed before and after to confirm how many rows were removed.

print(trandata.shape)
keep_rows = trandata.LYLTY_CARD_NBR.ne(226000)
trandata = trandata[keep_rows]
print(trandata.shape)

removing non-chip data <a id='nonchip'></a>  
text analysis of prod names column


In [None]:
# Word-frequency analysis of PROD_NAME, used to spot non-chip products.

from nltk.corpus import stopwords
import re

# Keep the stop-word set under its own name. Rebinding `stopwords` would shadow
# the NLTK corpus reader and make this cell fail when it is run a second time.
stop_words = set(stopwords.words('english'))

prod_words = []

for doc in trandata.PROD_NAME:
    # Remove the pack weight ("175g", "150G") so it is not counted as a word;
    # matching case-insensitively avoids leaving a stray "G" behind.
    docx = re.sub(r'\d+\s*g\b', '', doc, flags=re.IGNORECASE)
    # Keep only letters, underscores and spaces.
    clean_doc = re.sub(r'[^A-Za-z_ ]+', '', docx)
    # NLTK stop words are lower-case while product names are capitalised,
    # so compare on the lower-cased word.
    prod_words.extend(w for w in clean_doc.split() if w.lower() not in stop_words)

prod_words = pd.Series(prod_words)
prod_words.value_counts()


In [None]:
# The 20 most frequent words across product names.
prod_words.value_counts().head(20)

In [None]:
# Remove salsa products (not chips); the shape is printed before and after.
print(trandata.shape)

# Vectorised, case-insensitive match instead of a Python-level regex loop, so
# the cell no longer depends on `re` being imported by an earlier cell.
is_salsa = trandata.PROD_NAME.str.contains('salsa', case=False, regex=False)
# .copy() makes the result an independent frame, so the feature columns added
# to `trandata` later do not raise SettingWithCopy warnings.
trandata = trandata[~is_salsa].copy()

print(trandata.shape)

---
## Examining Purchase Behaviour Data - Observation Summary <a id='pb'></a>
1. This data contains information with unique loyalty card numbers.
2. There are 7 clusters of buyers based on Lifestage.
3. Based on the buying choices and prices, buyers are identified in 3 classes as in `PREMIUM_CUSTOMER`
4. No missing/null values or outliers were found.
5. *All the purchase behaviour data we have is based on chip purchases*

In [None]:
# Load the customer purchase-behaviour records and show (rows, columns).
# Uses the same absolute /kaggle/input path as the transactions load, so the
# read does not depend on the notebook's current working directory.
pbdata = pd.read_csv('/kaggle/input/quantium-data-analytics-virtual-experience-program/PurchaseBehaviour.csv')
pbdata.shape

In [None]:
# First rows of the purchase-behaviour frame.
pbdata.head()

In [None]:
# Per-column dtype and non-null counts of the purchase-behaviour frame.
pbdata.info()

In [None]:
# Number of distinct loyalty card numbers in the purchase-behaviour data.
pbdata.LYLTY_CARD_NBR.nunique()

> Since the number of unique loyalty cards is same in both the purchase behavior data as well as transaction data, I think we have the purchase behaviour details of chips-buying customers only.  
> To validate this assumption, let's check if both the data have same set of loyalty cards or not

In [None]:
# True only when both frames hold exactly the same set of loyalty card numbers.
# NOTE(review): `trandata` has already been filtered by this point (bulk buyer
# and salsa rows removed), so any card that appears only in removed rows would
# make this False — confirm the result against the conclusion drawn below.
set(pbdata.LYLTY_CARD_NBR.unique()) == set(trandata.LYLTY_CARD_NBR.unique())

> Both datasets contain the same set of loyalty card numbers. Hence, we conclude that the transaction data and the purchase behaviour data both belong to our target, chip-buying customers. Yay! 🙌

In [None]:
# Bar chart of how many loyalty cards fall into each LIFESTAGE value.
lifestage_counts = pbdata['LIFESTAGE'].value_counts()
lifestage_counts.plot.bar()
plt.show()

In [None]:
# Bar chart of how many loyalty cards fall into each PREMIUM_CUSTOMER class.
premium_counts = pbdata['PREMIUM_CUSTOMER'].value_counts()
premium_counts.plot.bar()
plt.show()

---
## Features: <a id='features'></a>
1. The `PROD_NAME` contains brand name, product variant and weight as well. We can separate this information for better analysis.
2. We can derive `PROD_UPRICE` - product's unit price from `PROD_QTY` and `TOT_SALES`
3. Similarly, we can get product brand name under `PROD_BRAND`

1. Adding column `PROD_WTT` - product weight

In [None]:
# Pack weight in grams, parsed from the digits directly in front of a "g"/"G"
# in the product name. Unlike slicing the last characters, this also handles
# names where the weight is not at the end (e.g. 'Kettle 135g Swt Pot Sea Salt').
# A name with no weight yields NaN, and the integer cast then raises, so the
# gap is noticed rather than carried along silently.
trandata['PROD_WTT'] = (
    trandata.PROD_NAME
    .str.extract(r'(?i)(\d+)\s*g\b', expand=False)
    .astype('int')
)

In [None]:
# Frequency of each extracted PROD_WTT value, to spot values that are not weights.
trandata.PROD_WTT.value_counts()

In [None]:
# The product 'Kettle 135g Swt Pot Sea Salt' does not end with its weight,
# so its PROD_WTT value has to be corrected by hand.
# List the product names that end in 'Salt' to confirm which ones are affected.

ends_with_salt = trandata.PROD_NAME.str.endswith('Salt')
trandata.loc[ends_with_salt, 'PROD_NAME'].value_counts()

In [None]:
# Rows whose extracted weight is the text 'Sal' get the real weight, 135.
needs_fix = trandata.PROD_WTT == 'Sal'
indexes = trandata.index[needs_fix]
trandata.loc[indexes, 'PROD_WTT'] = 135

trandata.PROD_WTT.value_counts()

In [None]:
# Cast PROD_WTT to integer; per the check below it was an object column before the cast.
# trandata.PROD_WTT.dtype # this results in dtype('O')
trandata.PROD_WTT=trandata.PROD_WTT.astype('int')

2. Adding column `PROD_UPRICE` - product unit price

In [None]:
# Unit price of the product: total sale value of the row divided by the quantity bought.

trandata['PROD_UPRICE'] = trandata['TOT_SALES'].div(trandata['PROD_QTY'])

3. Adding column `PROD_BRAND` - product brand name

In [None]:
# Sorted array of distinct product names; the first ten are shown to see how
# the brand appears in the name.
products = np.sort(trandata['PROD_NAME'].unique())
products[:10]

In [None]:
# Count, number of distinct names, most frequent name and its frequency.
trandata['PROD_NAME'].describe()

In [None]:
# The brand is taken to be the first whitespace-separated word of the product name.
trandata['PROD_BRAND'] = trandata['PROD_NAME'].str.split().str[0]

In [None]:
# Pairs such as Dorito/Doritos, Smith/Smiths and Grain/GrnWves are the same brand
# spelled differently and need to be merged.
# Some first words, like Burger, need the complete brand name to be understandable.

trandata.PROD_BRAND.value_counts().sort_index()

In [None]:
# Map abbreviated or alternative first words onto a single brand label.
replacements = {
    'Dorito': 'Doritos', 'Grain': 'GrnWves', 'Infzns': 'Infuzions', 'Smiths': 'Smith', 'Snbts': 'Sunbites',
    'Burger': 'Burger Rings', 'French': 'French Fries', 'Natural': 'Natural Chip Co', 'Old': 'Old El Paso',
    'Red': 'Red Rock Deli', 'RRD': 'Red Rock Deli', 'NCC': 'Natural Chip Co', 'WW': 'Woolworths',
}
# Assign the result back to the column. An in-place replace on `trandata.PROD_BRAND`
# acts on an intermediate Series and is not guaranteed to update the frame
# (pandas warns about this chained in-place pattern).
trandata['PROD_BRAND'] = trandata['PROD_BRAND'].replace(replacements)
trandata.PROD_BRAND.value_counts().sort_index()

___
___

Data analysis and customer segments – in your analysis make sure you define the metrics – look at total sales, drivers of sales, where the highest sales are coming from etc. Explore the data, create charts and graphs as well as noting any interesting trends and/or insights you find. These will all form part of our report to Julia.



Metrics: consider what metrics will help describe customers' purchasing behaviour

who spends on chips

what drives spends for each customer segment

You will also want to derive extra features such as pack size and brand name from the data and define metrics of interest to enable you to draw insights on who spends on chips and what drives spends for each customer segment. Remember our end goal is to form a strategy based on the findings to provide a clear recommendation to Julia the Category Manager so make sure your insights can have a commercial application.

## Analysis <a id='ana'></a>

> First, we will combine both of our data sources, and then answer the questions below:
1. Discover who are our primary shoppers
2. Find out how customer segments like to spend their money
3. High level summary - Top selling brands, products, weight categories
4. 

In [None]:
# Attach customer attributes to each transaction row; the left join keeps every
# transaction even if its card has no purchase-behaviour record.
masterdf = pd.merge(trandata, pbdata, how='left', on='LYLTY_CARD_NBR')
masterdf.shape

In [None]:
# Who are the primary customers?
# Left panel: number of transaction rows per LIFESTAGE.
# Right panel: total sales per LIFESTAGE, highest first.

fig, ax = plt.subplots(1, 2, figsize=(8.0, 4.5))
ax_rows, ax_sales = ax

rows_by_lifestage = masterdf.LIFESTAGE.value_counts()
rows_by_lifestage.plot(kind='bar', ax=ax_rows)

sales_by_lifestage = (
    masterdf.groupby('LIFESTAGE')[['TOT_SALES']]
    .sum()
    .sort_values(by='TOT_SALES', ascending=False)
)
sales_by_lifestage.plot(kind='bar', ax=ax_sales)


> It is observed from the graphs that, 
1. Older singles/couples, retirees, and older and young families buy a lot of chips. This observation is consistent with their spending on chips.
2. Young and older families tend to buy more chips than new families. This makes sense, as new families have younger children who do not yet eat a lot of chips, and they are also more careful with their expenses.

In [None]:
# (Added after consulting the model answer.)
# The previous figure does not show how premium the target customers are, so
# break total sales down by LIFESTAGE x PREMIUM_CUSTOMER, as a percentage of all sales.
total = masterdf.TOT_SALES.sum()
pvt = pd.pivot_table(masterdf, index=['LIFESTAGE'], columns=['PREMIUM_CUSTOMER'], values=['TOT_SALES'], aggfunc='sum')
# Vectorised arithmetic instead of DataFrame.applymap, which is deprecated in
# recent pandas and applies a Python function cell by cell.
pvt = (pvt * 100 / total).round(2)
pvt.plot(kind='bar', stacked=False)

In [None]:
# Show the same percentages as an annotated heatmap for easier comparison.
sns.heatmap(pvt, annot=True)

In [None]:
# Most sales come from budget older families, mainstream retirees and
# mainstream young singles/couples.

# Check whether that is simply because those segments have more shoppers:
# share of distinct loyalty cards per LIFESTAGE x PREMIUM_CUSTOMER, in percent.
total_customers = masterdf.LYLTY_CARD_NBR.nunique()
npo = pd.pivot_table(masterdf, index=['LIFESTAGE'], columns=['PREMIUM_CUSTOMER'], values=['LYLTY_CARD_NBR'], aggfunc=pd.Series.nunique)
# Vectorised arithmetic instead of the deprecated DataFrame.applymap.
npo = (npo * 100 / total_customers).round(1)
sns.heatmap(npo, annot=True)


In [None]:
# it is observed that while old-budget families contribute highest to sales, but the customer group is not large. Means they tend to buy in larger quantities.
# converse, there are a roughly double mainstream-young/single couples customers, but they don't buy loads of chips.

# lets try to validate this by calculating the average number of chips units bought in a transaction per customer 

In [None]:
# Column labels of the merged frame.
masterdf.columns

In [None]:
# Average number of chip units bought per customer in each
# LIFESTAGE x PREMIUM_CUSTOMER segment.
# The original left `aucpc` as the total units and never used `lyl`, so no
# average was produced; divide total units by distinct loyalty cards.

segments = masterdf.groupby(['LIFESTAGE', 'PREMIUM_CUSTOMER'])
lyl = segments.LYLTY_CARD_NBR.nunique().unstack('PREMIUM_CUSTOMER')   # distinct customers
units = segments.PROD_QTY.sum().unstack('PREMIUM_CUSTOMER')           # total units bought
aucpc = (units / lyl).round(3)
aucpc

In [None]:
# Spot check for one segment (Budget / NEW FAMILIES): total units bought
# divided by the number of distinct loyalty cards.
is_budget_new_family = (masterdf.LIFESTAGE == 'NEW FAMILIES') & (masterdf.PREMIUM_CUSTOMER == 'Budget')
temp = masterdf.loc[is_budget_new_family, ['LYLTY_CARD_NBR', 'PROD_QTY']]
temp.PROD_QTY.sum() / temp.LYLTY_CARD_NBR.nunique()

In [None]:
# Average units per customer for every (LIFESTAGE, PREMIUM_CUSTOMER) segment.
# The original cell could not run: `cols=` is not a DataFrame keyword and the
# loop had no body. Collect one value per group, then build the frame once.
groupdf = masterdf[['LIFESTAGE','PREMIUM_CUSTOMER','PROD_QTY','LYLTY_CARD_NBR']].groupby(['LIFESTAGE','PREMIUM_CUSTOMER'])

avg_qty = {}
for name, df in groupdf:
    avg_qty[name] = df.PROD_QTY.sum() / df.LYLTY_CARD_NBR.nunique()

testdf = (
    pd.Series(avg_qty, name='AVG_PROD_QTY')
    .rename_axis(['LIFESTAGE', 'PREMIUM_CUSTOMER'])
    .to_frame()
)
testdf

In [None]:
# Empty frame with a single AVG_PROD_QTY column.
testdf = pd.DataFrame(columns=['AVG_PROD_QTY'])

In [None]:
# Show `testdf` with one extra row labelled by `name`.
# DataFrame.append was removed in pandas 2.0 and never accepted an `index`
# keyword; pd.concat with a one-row frame does what the call was trying to do.
# As before, the result is only displayed and `testdf` itself is not changed.
pd.concat([testdf, pd.DataFrame({'AVG_PROD_QTY': ['1']}, index=[name])])

In [None]:
# Take the first (LIFESTAGE, PREMIUM_CUSTOMER) group for inspection.
# The original iterated over `x`, which is not defined anywhere in the notebook,
# and did not select PROD_QTY, which the average computed from `ser` requires.
name, ser = next(iter(
    masterdf.groupby(['LIFESTAGE', 'PREMIUM_CUSTOMER'])[['LYLTY_CARD_NBR', 'PROD_QTY']]
))

In [None]:
# Display the current value of `name`.
name

In [None]:
# Display the current value of `ser`.
ser

In [None]:
# Total units in `ser` divided by its number of distinct loyalty cards.
ser.PROD_QTY.sum()/ser.LYLTY_CARD_NBR.nunique()

In [None]:
# How do the customer segments like to spend their money?
# Count loyalty cards per LIFESTAGE x PREMIUM_CUSTOMER and plot them as grouped
# bars, ordered by the size of the Premium group.

pvt = pbdata.pivot_table(
    index=['LIFESTAGE'],
    columns=['PREMIUM_CUSTOMER'],
    values=['LYLTY_CARD_NBR'],
    aggfunc='count',
)
pvt.columns = ['Budget', 'Mainstream', 'Premium']
pvt = pvt.sort_values(['Premium'])
pvt.plot(kind='bar')
plt.show()

> It is observed from the graph that, 
1. Young singles/couples and Retirees tend to buy the mainstream or popular products.
2. All of the families prefer budget buying over other options.
3. Old singles/ couples are equally likely to spend in any category.

___
* define metrics of interest to enable you to draw insights on who spends on chips and what drives spends for each customer segment. 

In [None]:
# Overall monthly chip consumption in grams per lifestage.
masterdf['PUR_MONTH'] = masterdf.DATE.dt.month_name()

# Months in the order covered by the data (July through June).
cols = ['July', 'August', 'September', 'October', 'November', 'December', 'January', 'February',
        'March', 'April', 'May', 'June']

pvt = (
    masterdf
    # Grams bought on a row = pack weight x number of packs; summing PROD_WTT
    # alone would ignore the quantity purchased.
    .assign(GRAMS=masterdf.PROD_WTT * masterdf.PROD_QTY)
    .pivot_table(index='LIFESTAGE', columns='PUR_MONTH', values='GRAMS', aggfunc='sum')
    # pivot_table returns the month columns in alphabetical order. Reorder them
    # by label; assigning `cols` to pvt.columns would only rename them by
    # position and attach the wrong month to every series.
    .reindex(columns=cols)
)
pvt = pvt.transpose()
pvt.plot(kind='line', figsize=(12,8))


In [None]:
# average transactions size


In [None]:
# average brand price for 150 gm pack

In [None]:
# adding graphs from reference

# total sales by 

### High level Summaries

In [None]:
# Ten product names that appear on the most transaction rows.
masterdf['PROD_NAME'].value_counts().head(10)

In [None]:
# Ten brands that appear on the most transaction rows.
masterdf['PROD_BRAND'].value_counts().head(10)

In [None]:
# Ten stores with the highest total sales.
store_sales = masterdf.groupby('STORE_NBR')[['TOT_SALES']].sum()
store_sales.sort_values(by='TOT_SALES', ascending=False).head(10)

In [None]:
# Histogram of pack weights (PROD_WTT) across transaction rows, in 10 bins.
masterdf.PROD_WTT.hist(bins=10)

> It is observed from the graph that, 
1. The weight category of 150 to 200 gm pack is the most sold.
2. Other large or smaller packings are not so popular.

In [None]:
# Histogram of the unit price per pack (PROD_UPRICE), pandas default binning.
masterdf.PROD_UPRICE.hist()

> It is observed from the graph that, 
1. Product packs falling in price range of 2.5 to 4.5 approx. are most bought.