In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine

### Importing transactions table

In [6]:
driver = 'mysql+pymysql:'
user = 'lordchava'
password = '19693903'
ip = '104.155.143.225'
database = 'final_project'

In [7]:
connection_string = f'{driver}//{user}:{password}@{ip}/{database}'
engine = create_engine(connection_string)

In [4]:
transactions_df = pd.read_csv("../../data_final_project/transactions_clean.csv")

### Reordering columns

In [3]:
transactions_df.columns

Index(['HOUSEHOLD_KEY', 'BASKET_ID', 'WEEK_NO', 'DAY', 'TRANS_TIME',
       'STORE_ID', 'PRODUCT_ID', 'QUANTITY', 'SALES_VALUE', 'RETAIL_DISC',
       'COUPON_DISC', 'COUPON_MATCH_DISC', 'SHELF_PRICE', 'CLIENT_PRICE',
       '%_TOTAL_DISCOUNT', '%_LOYALTY_DISCOUNT', '%_COUPON_DISCOUNT'],
      dtype='object')

In [4]:
transactions_columns = ['HOUSEHOLD_KEY', 'BASKET_ID', 'WEEK_NO', 'DAY', 'TRANS_TIME',
       'STORE_ID', 'PRODUCT_ID', 'QUANTITY', 'SHELF_PRICE', 'SALES_VALUE', 'RETAIL_DISC',
       'COUPON_DISC', 'COUPON_MATCH_DISC', 'CLIENT_PRICE',
       '%_TOTAL_DISCOUNT', '%_LOYALTY_DISCOUNT', '%_COUPON_DISCOUNT']

In [5]:
transactions_df = transactions_df[transactions_columns]

### Checking column types

In [6]:
transactions_df.dtypes

HOUSEHOLD_KEY           int64
BASKET_ID               int64
WEEK_NO                 int64
DAY                     int64
TRANS_TIME              int64
STORE_ID                int64
PRODUCT_ID              int64
QUANTITY                int64
SHELF_PRICE           float64
SALES_VALUE           float64
RETAIL_DISC           float64
COUPON_DISC           float64
COUPON_MATCH_DISC     float64
CLIENT_PRICE          float64
%_TOTAL_DISCOUNT      float64
%_LOYALTY_DISCOUNT    float64
%_COUPON_DISCOUNT     float64
dtype: object

### Creating a synthetic basket identifier called "PURCHASE_ID" to complement the existing identifier

In [7]:
transactions_df["PURCHASE_ID"] = transactions_df[["DAY", "TRANS_TIME", "HOUSEHOLD_KEY"]].apply(lambda x: int("".join(map(str, x))), axis=1)

In [8]:
transactions_df.head()

Unnamed: 0,HOUSEHOLD_KEY,BASKET_ID,WEEK_NO,DAY,TRANS_TIME,STORE_ID,PRODUCT_ID,QUANTITY,SHELF_PRICE,SALES_VALUE,RETAIL_DISC,COUPON_DISC,COUPON_MATCH_DISC,CLIENT_PRICE,%_TOTAL_DISCOUNT,%_LOYALTY_DISCOUNT,%_COUPON_DISCOUNT,PURCHASE_ID
0,2375,26984851472,1,1,1631,364,1004906,1,1.99,1.39,-0.6,0.0,0.0,1.39,0.301508,0.301508,0.0,116312375
1,2375,26984851472,1,1,1631,364,1033142,1,0.82,0.82,0.0,0.0,0.0,0.82,0.0,0.0,0.0,116312375
2,2375,26984851472,1,1,1631,364,1036325,1,1.29,0.99,-0.3,0.0,0.0,0.99,0.232558,0.232558,0.0,116312375
3,2375,26984851472,1,1,1631,364,1082185,1,1.21,1.21,0.0,0.0,0.0,1.21,0.0,0.0,0.0,116312375
4,2375,26984851472,1,1,1631,364,8160430,1,1.89,1.5,-0.39,0.0,0.0,1.5,0.206349,0.206349,0.0,116312375


### Importing demographics table

It is needed because some transactions are carried out by customers for whom we do not have demographics data. Since demographics will be used to interpret the clusters, we decide to remove all those transactions and just focus on the customers for whom we have demographic data about.

In [8]:
demo = pd.read_sql("demographics", engine)

### Creating and populating clustering dataframe

In [9]:
clusters_df_base = demo.merge(transactions_df, how="left", on="HOUSEHOLD_KEY")

With a left join we get to just focus on the customer's transactions for whom we have demographics data about

In [10]:
clusters_df_base = clusters_df_base.drop(['AGE_DESC', 'INCOME_DESC', 'ADULTS_NUM', 'KIDS_NUM','NUMEROUS_FAM', 'SINGLE_GENDER'], axis=1)

Creating again the synthetic identifier that will later be used to calculate some columns on

In [11]:
clusters_df_base["PURCHASE_ID"] = clusters_df_base[["DAY", "TRANS_TIME", "HOUSEHOLD_KEY"]].apply(lambda x: int("".join(map(str, x))), axis=1)

### Checking in an exploratory manner for potential outliers

This table below shows the top 10 customers with the greater number of unique products per basket

In [12]:
clusters_df_base.groupby(["PURCHASE_ID"])["PRODUCT_ID"].nunique().sort_values(ascending=False).to_frame().head(10)

Unnamed: 0_level_0,PRODUCT_ID
PURCHASE_ID,Unnamed: 1_level_1
26717552294,167
43211191899,161
28719552208,153
6562047248,147
691017300,144
5672050248,138
4012102973,135
26013012294,134
64911011899,134
30711472489,133


We decide to get rid of the articles lower than 1$ since we observed that they are creating noise in our calculated fields

In [13]:
clusters_df_base=clusters_df_base[clusters_df_base["SHELF_PRICE"]>=1]

In [16]:
clusters_df_base.shape

(1271178, 18)

In [None]:
clusters_df_base.to_csv("../../data_final_project/pre_macro_cluster.csv", index=False)

### Uploading the data to our database

Since we were having issues to upload it to the Google database, we performed this operation in our localhost

In [25]:
driver = 'mysql+pymysql:'
user = 'root'
password = 'xxxxxx'
ip = '127.0.0.1'
database = 'proyecto_final'

In [None]:
connection_string = f'{driver}//{user}:{password}@{ip}/{database}'
engine = create_engine(connection_string)

In [27]:
clusters_df_base.to_sql('cluster_df_based',engine, index=False)