# PA005: High Value Customer Identification (Insiders)

# 0.0 Imports

In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import scipy.stats as st

from umap.umap_ import UMAP

from plotly import express as px


from sklearn.cluster import KMeans, DBSCAN

from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler

from sklearn.decomposition import PCA

from sklearn.manifold import TSNE

from sklearn.ensemble import RandomForestRegressor

from sklearn.mixture import GaussianMixture

from sklearn.metrics import silhouette_score, silhouette_samples
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

from scipy.cluster.hierarchy import  linkage,dendrogram, fcluster

from sklearn.neighbors import NearestNeighbors

pd.set_option('display.float_format', lambda x: '%.4f' % x)
import warnings
warnings.filterwarnings("ignore")

# 0.2 Load Data

In [122]:
# Folder holding the raw extract, relative to the notebook. Forward slashes
# are accepted on Windows as well as on POSIX systems, unlike the original
# backslash-separated path, and they avoid the invalid '\E' escape sequence
# that the file-name literal contained.
caminho = '../data/raw'
df_raw = pd.read_csv(caminho + '/Ecommerce.csv', encoding = 'iso 8859-1')

# 'Unnamed: 8' is discarded right after loading
# (presumably an empty trailing column of the CSV - TODO confirm)
df_raw = df_raw.drop(['Unnamed: 8'], axis = 1)
df_raw.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,17850.0,United Kingdom


# 1.0 Descrição dos dados

In [123]:
# Work on a copy so the frame loaded into df_raw stays untouched by this section
df1 = df_raw.copy()

## 1.1 Rename columns

In [124]:
# snake_case names, assigned positionally to the eight columns of df1
cols_new = [
    'invoice_no',
    'stock_code',
    'description',
    'quantity',
    'invoice_date',
    'unit_price',
    'customer_id',
    'country',
]
df1.columns = cols_new

df1.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,17850.0,United Kingdom


## 1.2 Data dimension

In [125]:
# df1.shape is (rows, columns); unpack it straight into the two placeholders
print("Number of rows: {} \nNumber of columns: {}".format(*df1.shape))

Number of rows: 541909 
Number of columns: 8


## 1.3 Data types

In [126]:
df1.dtypes

invoice_no       object
stock_code       object
description      object
quantity          int64
invoice_date     object
unit_price      float64
customer_id     float64
country          object
dtype: object

## 1.4 Check NA

In [127]:
df1.isna().sum()

invoice_no           0
stock_code           0
description       1454
quantity             0
invoice_date         0
unit_price           0
customer_id     135080
country              0
dtype: int64

## 1.5 Replace NA

In [128]:
# Split the lines by whether a customer_id is present
has_no_customer = df1['customer_id'].isna()
df_missing = df1.loc[has_no_customer]
df_not_missing = df1.loc[~has_no_customer]

df_not_missing.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,17850.0,United Kingdom


In [129]:
# Row counts of the two partitions
n_missing, n_not_missing = len(df_missing), len(df_not_missing)
print('Missing => {}  and NOT missing => {}'.format(n_missing, n_not_missing))

Missing => 135080  and NOT missing => 406829


In [130]:
# Distinct invoice numbers among the lines without customer_id,
# in order of first appearance
missing_invoice = df_missing['invoice_no'].unique().tolist()
missing_invoice[:9]

['536414',
 '536544',
 '536545',
 '536546',
 '536547',
 '536549',
 '536550',
 '536552',
 '536553']

In [131]:
# Sanity check: lines that do have a customer_id but share an invoice number
# with the lines that lack one. The result is empty, i.e. no invoice from the
# missing set appears in the not-missing set.
shares_missing_invoice = df_not_missing['invoice_no'].isin( missing_invoice )
df_not_missing.loc[shares_missing_invoice]

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,customer_id,country


In [132]:
# atribuição dos novos "customer_id", apenas para nao perder os dados dos 25%
#df_not_missing['customer_id'].max()  # 18287

In [133]:
# reference table: one surrogate customer_id per invoice that has none,
# numbered sequentially from 19000
df_backup = df_missing[['invoice_no']].drop_duplicates()
first_new_id = 19000
df_backup['customer_id'] = np.arange(first_new_id, first_new_id + len(df_backup))

# bring the surrogate ids next to the original ones
# (the merge suffixes them as customer_id_x = original, customer_id_y = surrogate)
df1 = df1.merge(df_backup, how = 'left', on = 'invoice_no')

# coalesce: keep the original id where present, otherwise the surrogate
df1['customer_id'] = df1['customer_id_x'].combine_first( df1['customer_id_y'] )
df1 = df1.drop(columns = ['customer_id_y','customer_id_x'])

df1.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,country,customer_id
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,United Kingdom,17850.0
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,United Kingdom,17850.0
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,United Kingdom,17850.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,United Kingdom,17850.0
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,United Kingdom,17850.0


In [134]:
df1.isna().sum()

invoice_no         0
stock_code         0
description     1454
quantity           0
invoice_date       0
unit_price         0
country            0
customer_id        0
dtype: int64

In [135]:
# com a mudança dos missings acima, rodar novamente as analises descritivas abaixo para verificar mudanças de comportamento

In [136]:
# # remove
# df1 = df1.dropna(subset = ['description','customer_id'])

# print('Removed data: {:.2f}%'.format(100*(1 - (df1.shape[0]/df_raw.shape[0]))))

In [137]:
# df1.isna().sum()

## 1.6 Change dtypes

In [138]:
# invoice_date: text such as '29-Nov-16' -> datetime
# customer_id: float -> int64
df1 = df1.assign(
    invoice_date = pd.to_datetime(df1['invoice_date'], format='%d-%b-%y'),
    customer_id = df1['customer_id'].astype('int64'),
)

df1.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,country,customer_id
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2016-11-29,2.55,United Kingdom,17850
1,536365,71053,WHITE METAL LANTERN,6,2016-11-29,3.39,United Kingdom,17850
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2016-11-29,2.75,United Kingdom,17850
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2016-11-29,3.39,United Kingdom,17850
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2016-11-29,3.39,United Kingdom,17850


In [139]:
df1.dtypes

invoice_no              object
stock_code              object
description             object
quantity                 int64
invoice_date    datetime64[ns]
unit_price             float64
country                 object
customer_id              int64
dtype: object

## 1.7 Descriptive Statistics

In [140]:
# Numeric columns go to num_attributes; everything that is neither numeric
# nor datetime goes to cat_attributes
numeric_dtypes = ['int64','float64']
num_attributes = df1.select_dtypes(include = numeric_dtypes)
cat_attributes = df1.select_dtypes(exclude = numeric_dtypes + ['datetime64[ns]'])

### 1.7.1 Numerical Attributes

In [141]:
# central tendency - mean / median (each is a one-row frame, one column per attribute)
ct1 = num_attributes.apply(np.mean).to_frame().T
ct2 = num_attributes.apply(np.median).to_frame().T

# dispersion - std / min / max / range / skew / kurtosis
d1 = num_attributes.apply(np.std).to_frame().T
d2 = num_attributes.apply(np.min).to_frame().T
d3 = num_attributes.apply(np.max).to_frame().T
d4 = num_attributes.apply(lambda col: col.max() - col.min()).to_frame().T
d5 = num_attributes.apply(lambda col: col.skew()).to_frame().T
d6 = num_attributes.apply(lambda col: col.kurtosis()).to_frame().T

# stack the statistics, then transpose so each attribute becomes a row
stat_rows = [d2, d3, d4, ct1, ct2, d1, d5, d6]
stat_names = ['min','max','range','mean','median','std', 'skew','kurtosis']
m = pd.concat(stat_rows).T.reset_index()
m.columns = ['attributes'] + stat_names
# m

In [142]:
# quantidade com valor negativo ? (Devolução)
# preço unitario de zero ? Promoção ?


### 1.7.2 Categorical Attributes

In [143]:
# invoice_no: select the lines whose invoice number has any non-digit character
non_digit = re.compile('[^0-9]+')
has_non_digit = df1['invoice_no'].apply( lambda invoice: bool(non_digit.search(invoice)) )
df_letter_invoices = df1.loc[has_non_digit]

# how many such lines exist, and how many of them carry a negative quantity
n_letter = df_letter_invoices.shape[0]
n_letter_negative = df_letter_invoices.loc[df_letter_invoices['quantity'] < 0].shape[0]

print('Total of invoices with letter: ', n_letter)
print('Total of invoices with letter and quantity < 0: ', n_letter_negative)

Total of invoices with letter:  9291
Total of invoices with letter and quantity < 0:  9288


In [144]:
# stock_code: distinct codes made up of letters only
letters_only = re.compile('^[a-zA-Z]+$')
is_letters_only = df1['stock_code'].apply( lambda code: bool(letters_only.search(code)) )
df1.loc[is_letters_only, 'stock_code'].unique()

array(['POST', 'D', 'DOT', 'M', 'S', 'AMAZONFEE', 'm', 'DCGSSBOY',
       'DCGSSGIRL', 'PADS', 'B', 'CRUK'], dtype=object)

In [145]:
# acao:  remove stock_code in ['POST', 'D', 'M', 'PADS', 'DOT', 'CRUK']


In [146]:
# description
# df1.head()

# delete description

In [147]:
# country
df1['country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [148]:
df1['country'].value_counts( normalize = True).head()

United Kingdom   0.9143
Germany          0.0175
France           0.0158
EIRE             0.0151
Spain            0.0047
Name: country, dtype: float64

In [149]:
# Distinct customers per country, largest first
customers_per_country = (
    df1[['country','customer_id']]
    .drop_duplicates()
    .groupby('country')
    .count()
    .sort_values(by = 'customer_id', ascending = False)
)
customers_per_country.head()

Unnamed: 0_level_0,customer_id
country,Unnamed: 1_level_1
United Kingdom,7587
Germany,95
France,90
EIRE,44
Spain,31


# 2.0 Filtragem das variáveis

In [150]:
# Start the filtering stage from a copy of the described data
df2 = df1.copy()

In [151]:
# Peek at the first lines ordered by customer, invoice and description
df2.sort_values(['customer_id','invoice_no','description']).head(10) 
# the first customer shows a return: the "C" prefix on invoice_no appears to be the return flag

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,country,customer_id
61619,541431,23166,MEDIUM CERAMIC TOP STORAGE JAR,74215,2017-01-16,1.04,United Kingdom,12346
61624,C541433,23166,MEDIUM CERAMIC TOP STORAGE JAR,-74215,2017-01-16,1.04,United Kingdom,12346
14966,537626,84558A,3D DOG PICTURE PLAYING CARDS,24,2016-12-05,2.95,Iceland,12347
14939,537626,22375,AIRLINE BAG VINTAGE JET SET BROWN,4,2016-12-05,4.25,Iceland,12347
14948,537626,22725,ALARM CLOCK BAKELIKE CHOCOLATE,4,2016-12-05,3.75,Iceland,12347
14949,537626,22726,ALARM CLOCK BAKELIKE GREEN,4,2016-12-05,3.75,Iceland,12347
14952,537626,22729,ALARM CLOCK BAKELIKE ORANGE,4,2016-12-05,3.75,Iceland,12347
14951,537626,22728,ALARM CLOCK BAKELIKE PINK,4,2016-12-05,3.75,Iceland,12347
14950,537626,22727,ALARM CLOCK BAKELIKE RED,4,2016-12-05,3.75,Iceland,12347
14955,537626,21171,BATHROOM METAL SIGN,12,2016-12-05,1.45,Iceland,12347


In [152]:
# quantity (negative)
# df2.loc[df2['quantity'] < 0 ].sort_values('invoice_no').head()

In [153]:
#====== NUMERICAL ===========
# stock codes made up of letters only (the list found in section 1.7.2)
stock_codes_to_drop = ['POST', 'D', 'DOT', 'M', 'S', 'AMAZONFEE', 'm', 'DCGSSBOY',
                       'DCGSSGIRL', 'PADS', 'B', 'CRUK']
countries_to_drop = ['Unspecified','European Community']

# this customer showed very large purchases and refunds when outliers were
# inspected with pandas profiling
customers_to_drop = [16446]

keep_row = (
    (df2['unit_price'] > 0.040)                      # unit price above 0.04
    & ~df2['stock_code'].isin( stock_codes_to_drop )
    & ~df2['country'].isin( countries_to_drop )
    & ~df2['customer_id'].isin( customers_to_drop )
)

# apply every row filter at once and discard the description column
df2 = df2.loc[keep_row].drop(columns = ['description'])


# quantity: negative lines are returns, the remaining lines are purchases
is_return = df2['quantity'] < 0
df2_returns = df2.loc[is_return]
df2_purchases = df2.loc[~is_return]

# 3.0 Feature Engineering

In [154]:
# Start feature engineering from a copy of the filtered data
df3 = df2.copy()

## 3.1 Feature creation

In [155]:
# data reference: one row per customer_id, the key every feature is merged onto
# (customer_id is the only column left once the six transaction columns are removed)
df_ref = df3[['customer_id']].drop_duplicates().reset_index(drop = True)

# df_ref.head()

### 3.1.1 Gross revenue

In [156]:
# gross revenue = quantity * unit price, per purchase line.
# df2_purchases is a filtered slice of df2, so the column is added with
# assign(), which returns a new frame, rather than by item assignment on the
# slice (the pattern pandas reports as SettingWithCopyWarning, which the
# notebook-wide warnings filter was hiding).
df2_purchases = df2_purchases.assign(
    gross_revenue = df2_purchases['quantity'] * df2_purchases['unit_price'])

In [157]:
# monetary: total gross revenue of each customer's purchase lines
df_monetary = (
    df2_purchases
    .groupby('customer_id')['gross_revenue']
    .sum()
    .reset_index()
)

In [158]:
df_ref = df_ref.merge(df_monetary, how = 'left', on = 'customer_id')

# NaN appears for customers that have no line in df2_purchases
# (only non-negative quantities were summed)
df_ref.isna().sum()

customer_id       0
gross_revenue    91
dtype: int64

### 3.1.2 Recency - day from last purchase

In [159]:
# recency - date of each customer's last purchase
last_purchase = df2_purchases.groupby('customer_id')['invoice_date'].max().reset_index()

# the latest purchase date in the data plays the role of "today":
# recency is the number of days between it and the customer's last purchase
reference_date = df2_purchases['invoice_date'].max()
last_purchase['recency_days'] = ( reference_date - last_purchase['invoice_date'] ).dt.days

df_recency = last_purchase[['customer_id','recency_days']].copy()
df_ref = df_ref.merge(df_recency, how = 'left', on = 'customer_id')
# df_ref.isna().sum()

### Qtde invoices

In [160]:
# number of distinct invoices per customer
df_frequency = (
    df2_purchases
    .groupby('customer_id')['invoice_no']
    .nunique()
    .rename('qtde_invoices')
    .reset_index()
)
df_ref = df_ref.merge(df_frequency, how = 'left', on = 'customer_id')
# df_ref.isna().sum()

### qtde_items

In [161]:
# total quantity of items bought by each customer
df_frequency = (
    df2_purchases
    .groupby('customer_id')['quantity']
    .sum()
    .rename('qtde_items')
    .reset_index()
)
df_ref = df_ref.merge(df_frequency, how = 'left', on = 'customer_id')
# df_ref.isna().sum()

### 3.1.5 Quantity of products purchased (purchase lines, not unique products)

In [162]:
# number of purchase lines per customer: stock_code is counted once per line,
# so a product bought on several lines is counted several times
df_frequency = (
    df2_purchases
    .groupby('customer_id')['stock_code']
    .count()
    .rename('qtde_products')
    .reset_index()
)
df_ref = df_ref.merge(df_frequency, how = 'left', on = 'customer_id')
# df_ref.isna().sum()

### avg ticket

In [163]:
# average ticket: mean gross revenue per purchase line of each customer
df_avg_ticket = (
    df2_purchases
    .groupby('customer_id')['gross_revenue']
    .mean()
    .rename('avg_ticket')
    .reset_index()
)
df_ref = df_ref.merge(df_avg_ticket, how = 'left', on = 'customer_id')
# df_ref.isna().sum()

### avg recency days

In [164]:
# Average number of days between consecutive activity dates of each customer.
# Built from df2, so it uses every line that passed the filters (returns as
# well as purchases); each (customer, date) pair is considered once.
df_aux = df2[['customer_id','invoice_date']].drop_duplicates().sort_values(
            by=['customer_id','invoice_date'])

# gap, in days, to the same customer's previous date. The per-customer diff
# replaces the row-wise apply over manually shifted columns; a customer's
# first date has no predecessor and yields NaN.
df_aux['avg_recency_days'] = df_aux.groupby('customer_id')['invoice_date'].diff().dt.days

# keep only the rows that hold a gap
df_aux = df_aux.drop( ['invoice_date'], axis = 1).dropna()

# average gap per customer (customers with a single date do not appear)
df_avg_recency_days = df_aux.groupby('customer_id').mean().reset_index()

# merge
df_ref = pd.merge(df_ref, df_avg_recency_days, on = 'customer_id', how = 'left')

# df_ref.isna().sum()

### 3.1.7 Frequency purchase

In [165]:
# df2[['invoice_no','customer_id','invoice_date']].drop_duplicates().head()

In [166]:
# Per customer, over distinct (invoice, date) purchase rows:
#   max_ / min_ : last and first purchase date
#   days_       : length of that span in days, counting both ends (hence "+ 1")
#   buy_        : number of purchases
df_aux = df2_purchases[['invoice_no','customer_id','invoice_date']].drop_duplicates().groupby('customer_id').agg(
                    max_ = ('invoice_date','max'),
                    min_ = ('invoice_date','min'),
                    days_ = ('invoice_date',lambda x: (x.max() - x.min()).days + 1),
                    buy_ = ('invoice_date','count')).reset_index()

# frequency = purchases per day of the span.
# days_ is at least 1 because of the "+ 1" above, so the former zero-division
# guard could never trigger and a plain column division is sufficient.
df_aux['frequency'] = df_aux['buy_'] / df_aux['days_']

# merge
df_ref = pd.merge(df_ref, df_aux[['customer_id','frequency']], on = 'customer_id', how = 'left')
# df_ref

In [167]:
# df_ref.isna().sum()

### 3.1.8 Number of returns

In [168]:
# total returned quantity per customer; return lines carry negative
# quantities, so the sum is flipped to a positive count
df_returns = (
    df2_returns
    .groupby('customer_id')['quantity']
    .sum()
    .mul(-1)
    .rename('qtde_returns')
    .reset_index()
)

df_ref = df_ref.merge(df_returns, how = 'left', on = 'customer_id')

# customers without any return line get 0
df_ref['qtde_returns'] = df_ref['qtde_returns'].fillna(0)

# df_ref.isna().sum()

# 4.0 EDA (exploratory data analysis)

In [169]:
# Keep only the customers that have a value for every feature built so far.
# NOTE(review): the basket-size features further down are merged into df_ref
# after this snapshot is taken, so df4 does not contain them.
df4 = df_ref.dropna().copy()
print(df4.shape)
# df4.isna().sum()

(2968, 10)


### basket size

In [170]:
# per customer: number of distinct invoices and total quantity bought
# NOTE(review): df4 was already derived from df_ref before this cell runs,
# so this feature does not reach df4.
df_aux = (
    df2_purchases
    .groupby('customer_id')
    .agg(n_purchase = ('invoice_no','nunique'),
         n_products = ('quantity','sum'))
    .reset_index()
)

# average quantity per invoice
df_aux['avg_basket_size'] = df_aux['n_products'].div(df_aux['n_purchase'])

# merge
df_ref = df_ref.merge(df_aux[['customer_id','avg_basket_size']], how = 'left', on = 'customer_id')

# df_ref.isna().sum()

### unique basket size

In [171]:
# per customer: number of distinct invoices and number of distinct stock codes
# NOTE(review): df4 was already derived from df_ref before this cell runs,
# so this feature does not reach df4.
df_aux = (
    df2_purchases
    .groupby('customer_id')
    .agg(n_purchase = ('invoice_no','nunique'),
         n_products = ('stock_code','nunique'))
    .reset_index()
)

# distinct products per invoice
df_aux['avg_unique_basket_size'] = df_aux['n_products'].div(df_aux['n_purchase'])

# merge
df_ref = df_ref.merge(df_aux[['customer_id','avg_unique_basket_size']], how = 'left', on = 'customer_id')

# df_ref.isna().sum()

## 4.3 Space Study

In [172]:
# original dataset
# df43 = df4.drop(['customer_id'], axis = 1).copy()

# selected dataset: the customer key plus the five features used from here on
cols_selected = [
    'customer_id',
    'gross_revenue',
    'recency_days',
    'qtde_products',
    'frequency',
    'qtde_returns',
]
df43 = df4.loc[:, cols_selected].copy()
# df43.head()

In [173]:
# rescaling / standardization
mm = MinMaxScaler()
# ss = StandardScaler()
# rs = RobustScaler()

# each feature is min-max scaled on its own: the same scaler object is
# re-fitted on one column at a time
features_to_scale = ['gross_revenue','recency_days','qtde_products','frequency','qtde_returns']
for feature in features_to_scale:
    df43[feature] = mm.fit_transform( df43[[feature]] )



### 4.3.4 Tree-Based Embedding

In [174]:
# training dataset: gross_revenue is the regression target, the other
# selected features (without the customer key) are the predictors
Y = df43['gross_revenue']
X = df43.drop(columns = ['customer_id','gross_revenue'])

# model definition and training
rf_model = RandomForestRegressor( n_estimators=100, random_state = 42)
rf_model.fit(X, Y)

# leaf: rf_model.apply gives one column per tree, holding the leaf each
# sample falls into
df_leaf = pd.DataFrame( rf_model.apply(X) )
df_leaf.shape

# dataframe leaf

(2968, 100)

In [175]:
# project the leaf-index table with UMAP (fixed seed)
reducer = UMAP( random_state = 42 )
embedding = reducer.fit_transform( df_leaf )

# embedding: the first two output columns become the x / y coordinates
df_tree = pd.DataFrame({
    'embedding_x': embedding[:,0],
    'embedding_y': embedding[:,1],
})

# plot UMAP
# sns.scatterplot( x = 'embedding_x', y = 'embedding_y',data = df_tree)

# 7.0 Hyperparameter Fine-Tuning

In [176]:
# The clustering input is the 2-D tree-based embedding (df_tree),
# not the feature table
# X = df6.drop(['customer_id'], axis = 1)
X = df_tree.copy()
# X.head()

In [177]:
# clusters = [2,3,4,5,6,7]
# clusters = np.arange(2, 26, 1)

# 8.0 Model Training

## 8.2 Final model

In [178]:
# number of mixture components (clusters)
k = 8

# model definition and training (fit returns the fitted model itself)
gmm_model = GaussianMixture(n_components = k, n_init = 300, random_state = 32)

# model prediction: component assigned to each point of X
labels = gmm_model.fit( X ).predict( X )

# model evaluation: silhouette score of the assignment
sil = silhouette_score( X, labels, metric = 'euclidean')
#     print(sil)

### 8.2.2 Cluster Organization (avoid change numbers)

In [179]:
# gmm_model.means_

# gmm_model.means_.sum( axis = 1)

# # index of new cluster name
# idx = np.argsort(gmm_model.means_.sum( axis = 1 ) )

# # new cluster label
# new_cluster_label = np.zeros_like( idx )

# # cluster name
# cluster_name = [10,11,12,13,14,15,16,17]

# # assign new cluster names
# new_cluster_label[idx] = cluster_name


# # labels_gmm = new_cluster_label[ labels_gmm ]

## 9.2 Cluster Profile

In [180]:
# selected features on their original scale, with the cluster label of each
# customer attached positionally
# df92['cluster'] = labels_kmeans
df92 = df4[cols_selected].assign(cluster = labels)
df92.head()

Unnamed: 0,customer_id,gross_revenue,recency_days,qtde_products,frequency,qtde_returns,cluster
0,17850,5391.21,372.0,297.0,17.0,40.0,2
1,13047,3232.59,56.0,171.0,0.0283,35.0,2
2,12583,6705.38,2.0,232.0,0.0403,50.0,2
3,13748,948.25,95.0,28.0,0.0179,0.0,0
4,15100,876.0,333.0,3.0,0.0732,22.0,3


In [181]:
# Cluster profile: one row per cluster with the number of customers and the
# mean of every profiled feature. A single named aggregation replaces the
# repeated groupby + merge pairs and yields the same table.
df_cluster = (
    df92
    .groupby('cluster')
    .agg(customer_id   = ('customer_id','count'),      # number of customers
         gross_revenue = ('gross_revenue','mean'),     # average gross revenue
         recency_days  = ('recency_days','mean'),      # average recency days
         qtde_products = ('qtde_products','mean'),     # average qtd products
         frequency     = ('frequency','mean'),         # average frequency
         qtde_returns  = ('qtde_returns','mean'))      # average returns
    .reset_index()
)

# share of all customers in each cluster, placed right after the count column
df_cluster.insert(2, 'perc_cluster',
                  100 * df_cluster['customer_id'] / df_cluster['customer_id'].sum())


df_cluster.sort_values(by=['gross_revenue'], ascending = False)

Unnamed: 0,cluster,customer_id,perc_cluster,gross_revenue,recency_days,qtde_products,frequency,qtde_returns
2,2,613,20.6536,7494.2848,24.5106,364.894,0.0865,119.5171
7,7,130,4.3801,4689.8686,47.4077,103.8846,0.0573,20.7538
6,6,360,12.1294,2395.1195,45.4278,127.0667,0.0448,26.8667
1,1,404,13.6119,1689.0735,54.8094,89.1757,0.05,11.5421
5,5,415,13.9825,1221.4729,61.3711,53.2193,0.042,9.1229
4,4,279,9.4003,954.047,77.362,36.5305,0.0818,5.7097
0,0,401,13.5108,679.6177,119.0574,22.2294,0.1799,11.611
3,3,366,12.3315,546.5967,99.4344,11.3689,0.3514,8.7896


In [182]:
# 2 Cluster Insiders
# 7 Cluster More Products
# 6 Cluster Spend Money
# 1 Cluster Even More Products
# 5 Cluster Less Days
# 4 Cluster Less 1k
# 0 Cluster Stop Returners
# 3 Cluster More Buy

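> **Note (review):** the figures in the summary below do not match the 8-cluster profile table above (2,968 customers in total) and appear to come from a different run; they need to be refreshed.

- Cluster 01 (Insiders):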
    - Número de clientes: 468 (15 % do total)
    - Recência média de 21 dias
    - Compras em média de 424 compras
    - Receita média de $ 8835,90
    

- Cluster 02:
    - Número de clientes: 31 (0,71 % do total)
    - Recência média de 13 dias
    - Compras em média de 53 compras
    - Receita média de $ 40.543,52


- Cluster 03:
    - Número de clientes: 4335 (99,15 % do total)
    - Recência média de 92 dias
    - Compras em média de 4 compras
    - Receita média de $ 1372,58

# 11.0 Deploy to Production

In [191]:
import sqlite3
from sqlalchemy import create_engine

In [192]:
# store the count-like columns as integers
int_columns = ['recency_days','qtde_returns','qtde_products']
df92 = df92.astype({column: 'int64' for column in int_columns})

In [199]:
print(df92.shape)
df92.dtypes

(2968, 7)


customer_id        int64
gross_revenue    float64
recency_days       int64
qtde_products      int64
frequency        float64
qtde_returns       int64
cluster            int64
dtype: object

In [194]:
## Insert into SQLite

In [203]:
# # create table
# query_create_insiders = """
#     CREATE TABLE insiders (
#     customer_id      INTEGER,
#     gross_revenue    REAL,
#     recency_days     INTEGER,
#     qtde_products    INTEGER,
#     frequency        REAL,
#     qtde_returns     INTEGER,
#     cluster          INTEGER
#     )
#     """

# conn = sqlite3.connect('insiders_db.sqlite')
# conn.execute( query_create_insiders )
# conn.commit()
# conn.close()

# insert data
# df92 holds the complete customer table of this run, so the stored table is
# replaced instead of appended to: with 'append', every re-run of this cell
# wrote all customers again and duplicated the rows of `insiders`.
# 'replace' drops an existing table and lets pandas recreate it from the
# frame's columns.
conn = create_engine( 'sqlite:///insiders_db.sqlite')
df92.to_sql('insiders',con = conn, if_exists = 'replace', index = False)


# select data: read the table back to check what was stored
query = """ SELECT * FROM insiders"""
a = pd.read_sql( query, conn)
a

Unnamed: 0,customer_id,gross_revenue,recency_days,qtde_products,frequency,qtde_returns,cluster
0,17850,5391.2100,372,297,17.0000,40,2
1,13047,3232.5900,56,171,0.0283,35,2
2,12583,6705.3800,2,232,0.0403,50,2
3,13748,948.2500,95,28,0.0179,0,0
4,15100,876.0000,333,3,0.0732,22,3
...,...,...,...,...,...,...,...
2963,12479,473.2000,11,30,1.0000,34,0
2964,14126,706.1300,7,15,0.7500,50,3
2965,13521,1092.3900,1,435,0.3000,0,2
2966,15060,301.8400,8,120,2.0000,0,6
