# Quantium Data Analysis

In [1]:
#Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#Load datasets
# Both files are referenced by bare file name, so they are resolved relative to
# the notebook's current working directory.
transaction_data = pd.read_excel("QVI_transaction_data.xlsx")
# NOTE(review): despite the name `purchase_date`, this frame is read from the
# purchase-behaviour file, not a table of dates. Consider renaming it
# consistently across the whole notebook.
purchase_date = pd.read_csv("QVI_purchase_behaviour.csv")

## 1. Data Cleaning

### 1.1. Examining transaction data 

In [3]:
# Preview the first rows of the transaction table to see its columns and sample values
transaction_data.head()

Unnamed: 0,DATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES
0,43390,1,1000,1,5,Natural Chip Compny SeaSalt175g,2,6.0
1,43599,1,1307,348,66,CCs Nacho Cheese 175g,3,6.3
2,43605,1,1343,383,61,Smiths Crinkle Cut Chips Chicken 170g,2,2.9
3,43329,2,2373,974,69,Smiths Chip Thinly S/Cream&Onion 175g,5,15.0
4,43330,2,2426,1038,108,Kettle Tortilla ChpsHny&Jlpno Chili 150g,3,13.8


In [4]:
# Summarise column dtypes, non-null counts and memory usage of the raw table
transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264836 entries, 0 to 264835
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   DATE            264836 non-null  int64  
 1   STORE_NBR       264836 non-null  int64  
 2   LYLTY_CARD_NBR  264836 non-null  int64  
 3   TXN_ID          264836 non-null  int64  
 4   PROD_NBR        264836 non-null  int64  
 5   PROD_NAME       264836 non-null  object 
 6   PROD_QTY        264836 non-null  int64  
 7   TOT_SALES       264836 non-null  float64
dtypes: float64(1), int64(6), object(1)
memory usage: 16.2+ MB


In [5]:
# Shrink the frame's memory footprint by converting every column to a narrower dtype.
# Collect the names of all 64-bit integer columns so they can be narrowed together.
int64_columns = transaction_data.select_dtypes(include=['int64']).columns.tolist()

# Build a single column -> dtype mapping:
#   * every int64 column becomes int32
#   * TOT_SALES (float64) becomes float32
#   * PROD_NAME (object) becomes category
dtype_map = {column: 'int32' for column in int64_columns}
dtype_map['TOT_SALES'] = 'float32'
dtype_map['PROD_NAME'] = 'category'

# Apply all conversions in one pass
transaction_data = transaction_data.astype(dtype_map)

In [6]:
# Re-check dtypes and memory usage after the dtype conversion
transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264836 entries, 0 to 264835
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   DATE            264836 non-null  int32   
 1   STORE_NBR       264836 non-null  int32   
 2   LYLTY_CARD_NBR  264836 non-null  int32   
 3   TXN_ID          264836 non-null  int32   
 4   PROD_NBR        264836 non-null  int32   
 5   PROD_NAME       264836 non-null  category
 6   PROD_QTY        264836 non-null  int32   
 7   TOT_SALES       264836 non-null  float32 
dtypes: category(1), float32(1), int32(6)
memory usage: 7.3 MB


Memory usage dropped from 16.2+ MB to 7.3 MB.

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
transaction_data.describe()

Unnamed: 0,DATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_QTY,TOT_SALES
count,264836.0,264836.0,264836.0,264836.0,264836.0,264836.0,264836.0
mean,43464.03626,135.08011,135549.5,135158.3,56.583157,1.907309,7.302778
std,105.389282,76.78418,80579.98,78133.03,32.826638,0.643654,3.083583
min,43282.0,1.0,1000.0,1.0,1.0,1.0,1.5
25%,43373.0,70.0,70021.0,67601.5,28.0,2.0,5.4
50%,43464.0,130.0,130357.5,135137.5,56.0,2.0,7.4
75%,43555.0,203.0,203094.2,202701.2,85.0,2.0,9.2
max,43646.0,272.0,2373711.0,2415841.0,114.0,200.0,650.0


The maximum values of PROD_QTY (200) and TOT_SALES (650.0) are far above their 75th percentiles (2 and 9.2), so they look like outliers. Let's take a look.

In [8]:
# Flag the transactions whose quantity equals 200, then display those rows
is_bulk_purchase = transaction_data['PROD_QTY'].eq(200)
transaction_data[is_bulk_purchase]

Unnamed: 0,DATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES
69762,43331,226,226000,226201,4,Dorito Corn Chp Supreme 380g,200,650.0
69763,43605,226,226000,226210,4,Dorito Corn Chp Supreme 380g,200,650.0


There are 2 transactions in which 200 packets of chips were purchased at once, both by the same customer. Let's examine this customer's other transactions.

In [9]:
# Display every transaction recorded against loyalty card 226000
transaction_data.query("LYLTY_CARD_NBR == 226000")

Unnamed: 0,DATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES
69762,43331,226,226000,226201,4,Dorito Corn Chp Supreme 380g,200,650.0
69763,43605,226,226000,226210,4,Dorito Corn Chp Supreme 380g,200,650.0


There are no other transactions by this customer, so this does not look like an ordinary retail customer. I will remove this customer from further analysis.

In [10]:
# Index labels of every row that belongs to loyalty card 226000
bulk_buyer_rows = transaction_data.loc[transaction_data['LYLTY_CARD_NBR'] == 226000].index

# Remove those rows, then recompute the summary statistics on what is left
transaction_data = transaction_data.drop(index=bulk_buyer_rows)
transaction_data.describe()

Unnamed: 0,DATE,STORE_NBR,LYLTY_CARD_NBR,TXN_ID,PROD_NBR,PROD_QTY,TOT_SALES
count,264834.0,264834.0,264834.0,264834.0,264834.0,264834.0,264834.0
mean,43464.03623,135.079423,135548.8,135157.6,56.583554,1.905813,7.297929
std,105.389007,76.784063,80579.9,78132.92,32.826444,0.343436,2.527142
min,43282.0,1.0,1000.0,1.0,1.0,1.0,1.5
25%,43373.0,70.0,70021.0,67600.5,28.0,2.0,5.4
50%,43464.0,130.0,130357.0,135136.5,56.0,2.0,7.4
75%,43555.0,203.0,203094.0,202699.8,85.0,2.0,9.2
max,43646.0,272.0,2373711.0,2415841.0,114.0,5.0,29.5


Now check the number of unique values in each column.

In [11]:
# Print the number of distinct values in every column.
for column in transaction_data.columns:
    # nunique() counts distinct values inside pandas, so the column does not have
    # to be converted to a NumPy array and sorted (as np.unique does) just to be
    # counted. dropna=False keeps a missing value, if any, counted as a value.
    n_unique = transaction_data[column].nunique(dropna=False)
    print(f"No of unique values in column {column} are: {n_unique}")

No of unique values in column DATE are: 364
No of unique values in column STORE_NBR are: 272
No of unique values in column LYLTY_CARD_NBR are: 72636
No of unique values in column TXN_ID are: 263125
No of unique values in column PROD_NBR are: 114
No of unique values in column PROD_NAME are: 114
No of unique values in column PROD_QTY are: 5
No of unique values in column TOT_SALES are: 111


In [12]:
# Count the missing cells in each column, then add those counts into one grand total
nulls_per_column = transaction_data.isna().sum()
nulls_per_column.sum()

0

No null values! Now check whether I am looking at the right products by examining PROD_NAME.

In [13]:
# List the distinct product names; np.unique returns the unique values in sorted order
np.unique(transaction_data['PROD_NAME'])

array(['Burger Rings 220g', 'CCs Nacho Cheese    175g',
       'CCs Original 175g', 'CCs Tasty Cheese    175g',
       'Cheetos Chs & Bacon Balls 190g', 'Cheetos Puffs 165g',
       'Cheezels Cheese 330g', 'Cheezels Cheese Box 125g',
       'Cobs Popd Sea Salt  Chips 110g',
       'Cobs Popd Sour Crm  &Chives Chips 110g',
       'Cobs Popd Swt/Chlli &Sr/Cream Chips 110g',
       'Dorito Corn Chp     Supreme 380g',
       'Doritos Cheese      Supreme 330g',
       'Doritos Corn Chip Mexican Jalapeno 150g',
       'Doritos Corn Chip Southern Chicken 150g',
       'Doritos Corn Chips  Cheese Supreme 170g',
       'Doritos Corn Chips  Nacho Cheese 170g',
       'Doritos Corn Chips  Original 170g', 'Doritos Mexicana    170g',
       'Doritos Salsa       Medium 300g', 'Doritos Salsa Mild  300g',
       'French Fries Potato Chips 175g',
       'Grain Waves         Sweet Chilli 210g',
       'Grain Waves Sour    Cream&Chives 210G',
       'GrnWves Plus Btroot & Chilli Jam 180g',
       'Infuzi

As the client is the Category Manager for Chips, I should focus only on chip products. The salsa products should be removed.

In [14]:
# Keywords that mark a product as out of scope; add more entries to exclude more products
discard = ["Salsa"]

# Flag every row whose PROD_NAME contains any of the discard keywords.
# regex=False makes this a literal, case-sensitive substring test (the same test
# as `keyword in name`); na=False treats a missing name as "no match".
is_discarded = pd.Series(False, index=transaction_data.index)
for keyword in discard:
    is_discarded = is_discarded | transaction_data['PROD_NAME'].str.contains(keyword, regex=False, na=False)

# Keep only the rows that matched none of the keywords, then confirm the new row count
transaction_data = transaction_data[~is_discarded]
transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246740 entries, 0 to 264835
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   DATE            246740 non-null  int32   
 1   STORE_NBR       246740 non-null  int32   
 2   LYLTY_CARD_NBR  246740 non-null  int32   
 3   TXN_ID          246740 non-null  int32   
 4   PROD_NBR        246740 non-null  int32   
 5   PROD_NAME       246740 non-null  category
 6   PROD_QTY        246740 non-null  int32   
 7   TOT_SALES       246740 non-null  float32 
dtypes: category(1), float32(1), int32(6)
memory usage: 8.7 MB


Now let's create another attribute that extracts the pack size from PROD_NAME.