In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [30]:
# Parse DATE as datetime64 at load time so that later `.dt` accessors work
# when the notebook is executed top-to-bottom on a fresh kernel.
df = pd.read_csv("QVI_data.csv", parse_dates=["DATE"])

In [31]:
# Peek at the first two rows to check which columns were loaded.
df.head(2)

Unnamed: 0,LYLTY_CARD_NBR,DATE,STORE_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES,PACK_SIZE,BRAND,LIFESTAGE,PREMIUM_CUSTOMER
0,1000,2018-10-17,1,1,5,Natural Chip Compny SeaSalt175g,2,6.0,175,NATURAL,YOUNG SINGLES/COUPLES,Premium
1,1002,2018-09-16,1,2,58,Red Rock Deli Chikn&Garlic Aioli 150g,1,2.7,150,RRD,YOUNG SINGLES/COUPLES,Mainstream


In [68]:
# Summarise the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264834 entries, 0 to 264833
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   LYLTY_CARD_NBR    264834 non-null  int64         
 1   DATE              264834 non-null  datetime64[ns]
 2   STORE_NBR         264834 non-null  int64         
 3   TXN_ID            264834 non-null  int64         
 4   PROD_NBR          264834 non-null  int64         
 5   PROD_NAME         264834 non-null  object        
 6   PROD_QTY          264834 non-null  int64         
 7   TOT_SALES         264834 non-null  float64       
 8   PACK_SIZE         264834 non-null  int64         
 9   BRAND             264834 non-null  object        
 10  LIFESTAGE         264834 non-null  object        
 11  PREMIUM_CUSTOMER  264834 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(6), object(4)
memory usage: 24.2+ MB


# Find the monthly sales for all the shops

In [69]:
# Derive a monthly period key used by every aggregation below.
# pd.to_datetime leaves an already-datetime column unchanged and converts a
# text column, so this cell does not depend on DATE having been converted
# somewhere else first.
df['YEAR_MONTH'] = pd.to_datetime(df['DATE']).dt.to_period('M')

In [70]:
# Total sales for every (store, month) pair, with the group keys flattened
# back into ordinary columns.
monthly_sales = (
    df.groupby(by=['STORE_NBR', 'YEAR_MONTH'])
    .agg(TOT_SALES=('TOT_SALES', 'sum'))
    .reset_index()
)

In [71]:
# Print the per-store, per-month sales totals.
print(monthly_sales)

      STORE_NBR YEAR_MONTH  TOT_SALES
0             1    2018-07      206.9
1             1    2018-08      176.1
2             1    2018-09      278.8
3             1    2018-10      188.1
4             1    2018-11      192.6
...         ...        ...        ...
3164        272    2019-02      395.5
3165        272    2019-03      442.3
3166        272    2019-04      445.1
3167        272    2019-05      314.6
3168        272    2019-06      312.1

[3169 rows x 3 columns]


# Find the monthly number of customers 

In [74]:
# Number of distinct loyalty cards seen in each store in each month.
monthly_customers = (
    df.groupby(by=['STORE_NBR', 'YEAR_MONTH'])
    .agg(LYLTY_CARD_NBR=('LYLTY_CARD_NBR', 'nunique'))
    .reset_index()
)

In [75]:
# The aggregated column holds a customer count, so label it accordingly.
monthly_customers = monthly_customers.rename(
    columns={'LYLTY_CARD_NBR': 'NUM_CUSTOMERS'}
)

In [76]:
# Print the per-store, per-month count of unique loyalty cards.
print(monthly_customers)

      STORE_NBR YEAR_MONTH  NUM_CUSTOMERS
0             1    2018-07             49
1             1    2018-08             42
2             1    2018-09             59
3             1    2018-10             44
4             1    2018-11             46
...         ...        ...            ...
3164        272    2019-02             45
3165        272    2019-03             50
3166        272    2019-04             54
3167        272    2019-05             34
3168        272    2019-06             34

[3169 rows x 3 columns]


# Find the monthly number of transactions per customer

In [77]:
# Distinct transaction ids recorded for each loyalty card in each month.
monthly_transactions_per_customer = (
    df.groupby(by=['LYLTY_CARD_NBR', 'YEAR_MONTH'])
    .agg(TXN_ID=('TXN_ID', 'nunique'))
    .reset_index()
)

In [78]:
# The aggregated column holds a transaction count, so label it accordingly.
monthly_transactions_per_customer = monthly_transactions_per_customer.rename(
    columns={'TXN_ID': 'NUM_TRANSACTIONS'}
)

In [79]:
# Print the per-customer, per-month count of unique transactions.
print(monthly_transactions_per_customer)

        LYLTY_CARD_NBR YEAR_MONTH  NUM_TRANSACTIONS
0                 1000    2018-10                 1
1                 1002    2018-09                 1
2                 1003    2019-03                 2
3                 1004    2018-11                 1
4                 1005    2018-12                 1
...                ...        ...               ...
221274         2370651    2018-08                 1
221275         2370701    2018-12                 1
221276         2370751    2018-10                 1
221277         2370961    2018-10                 2
221278         2373711    2018-12                 1

[221279 rows x 3 columns]


# Find the closest stores based on monthly sales

In [87]:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

In [88]:
# Reshape to a month-by-store matrix of total sales
# (rows: YEAR_MONTH, columns: STORE_NBR).
# Store/month combinations that are absent from monthly_sales come out of the
# reshape as NaN and are replaced by 0.
monthly_sales_pivot = (
    monthly_sales
    .set_index(['YEAR_MONTH', 'STORE_NBR'])['TOT_SALES']
    .unstack('STORE_NBR')
    .fillna(0)
)

In [89]:
def find_closest_stores(target_store, store_pivot, n_closest=3):
    """Return the stores whose columns are nearest to ``target_store``.

    Closeness is the Euclidean distance between the target store's column and
    every other column of ``store_pivot``, taken over all rows.

    Parameters
    ----------
    target_store : hashable
        Column label of the store to compare against.
    store_pivot : pandas.DataFrame
        One column per store, one row per observation (months in this
        notebook). Values must be numeric.
    n_closest : int or None, default 3
        How many stores to return. ``None`` returns every other store,
        ordered from nearest to farthest.

    Returns
    -------
    list
        Column labels of the nearest stores, nearest first. Stores at equal
        distance keep their column order in ``store_pivot``. The list is
        shorter than ``n_closest`` when fewer other stores exist.

    Raises
    ------
    KeyError
        If ``target_store`` is not a column of ``store_pivot``.
    ValueError
        If ``n_closest`` is negative.
    """
    # A negative value would slice from the end and silently return the
    # wrong number of stores.
    if n_closest is not None and n_closest < 0:
        raise ValueError(f"n_closest must be non-negative, got {n_closest}")

    target_series = store_pivot[target_store].to_numpy(dtype=float)
    candidates = store_pivot.drop(columns=[target_store])

    # Broadcast the target column against every candidate column and take the
    # column-wise Euclidean norm of the differences.
    gaps = candidates.to_numpy(dtype=float) - target_series[:, np.newaxis]
    distances = np.linalg.norm(gaps, axis=0)

    # A stable sort makes the ordering of equidistant stores deterministic.
    nearest_idx = np.argsort(distances, kind="stable")[:n_closest]

    return candidates.columns[nearest_idx].tolist()

In [90]:
# Look up the three nearest stores (the function default) for stores 77, 86 and 88.
closest_stores_77, closest_stores_86, closest_stores_88 = (
    find_closest_stores(store_nbr, monthly_sales_pivot)
    for store_nbr in (77, 86, 88)
)

In [91]:
# Report the nearest stores for each of the three target stores, one per line.
print(
    f"3 closest stores to store 77: {closest_stores_77}",
    f"3 closest stores to store 86: {closest_stores_86}",
    f"3 closest stores to store 88: {closest_stores_88}",
    sep="\n",
)

3 closest stores to store 77: [46, 185, 188]
3 closest stores to store 86: [229, 109, 155]
3 closest stores to store 88: [40, 165, 237]


# Find how many bags of chips each customer has bought per month

In [92]:
# Total product quantity bought by each loyalty card in each month.
monthly_purchase_per_customer = (
    df.groupby(by=['LYLTY_CARD_NBR', 'YEAR_MONTH'])
    .agg(PROD_QTY=('PROD_QTY', 'sum'))
    .reset_index()
)

In [93]:
# Print the per-customer, per-month total quantity purchased.
print(monthly_purchase_per_customer)

        LYLTY_CARD_NBR YEAR_MONTH  PROD_QTY
0                 1000    2018-10         2
1                 1002    2018-09         1
2                 1003    2019-03         2
3                 1004    2018-11         1
4                 1005    2018-12         1
...                ...        ...       ...
221274         2370651    2018-08         2
221275         2370701    2018-12         2
221276         2370751    2018-10         2
221277         2370961    2018-10         4
221278         2373711    2018-12         2

[221279 rows x 3 columns]
