# PA005: High Value Customer Identification ( Insiders )

# 0.0 Imports

In [1]:
import re
import sqlite3

import pandas                                                                             as pd
import numpy                                                                              as np
import seaborn                                                                            as sns
import umap.umap_                                                                         as umap


from sqlalchemy                    import create_engine
from datetime                      import datetime

from matplotlib                    import pyplot                                          as plt

from sklearn                       import cluster                                         as c
from sklearn                       import metrics                                         as met
from sklearn                       import decomposition                                   as dd
from sklearn                       import ensemble                                        as en
from sklearn                       import mixture                                         as mx
from plotly                        import express                                         as px
from sklearn                       import preprocessing                                   as pp


import warnings
warnings.filterwarnings("ignore")

##  0.2 Load Dataset


In [2]:
# load data
df_raw = pd.read_csv(r'../data/raw/Ecommerce.csv', encoding='cp1252')

# # drop extra column
df_raw = df_raw.drop(columns=['Unnamed: 8'] , axis=1)

# <font color ='red'> 1.0 Descrição dos dados </font>

In [3]:
df1 = df_raw.copy()

## 1.1 Rename Columns

In [4]:
cols_new = ['invoice_no', 'stock_code', 'description', 'quantity', 'invoice_date','unit_price', 'customer_id', 'country']

df1.columns = cols_new

## 1.2 Data dimensios

In [5]:
print('Number of rows: {}'.format(df1.shape[0]))
print('Number of columns: {}'.format(df1.shape[1]))

Number of rows: 541909
Number of columns: 8


## 1.3 Data types

In [6]:
df1.dtypes

invoice_no       object
stock_code       object
description      object
quantity          int64
invoice_date     object
unit_price      float64
customer_id     float64
country          object
dtype: object

## 1.4 Check NA

In [7]:
df1.isna().sum()

invoice_no           0
stock_code           0
description       1454
quantity             0
invoice_date         0
unit_price           0
customer_id     135080
country              0
dtype: int64

## 1.5 Replace NA


In [8]:
df_missing = df1.loc[df1['customer_id'].isna(), :]
df_not_missing = df1.loc[~df1['customer_id'].isna(), :]



In [9]:
# create referance 

df_backup = pd.DataFrame(df_missing['invoice_no'].drop_duplicates())
df_backup['customer_id'] = np.arange(19000 , 19000+len(df_backup), 1)
df_backup.head()

# merge original with reference dataframe
df1 = pd.merge(df1, df_backup , on='invoice_no' , how='left')

#coalesce
df1['customer_id'] = df1['customer_id_x'].combine_first(df1['customer_id_y'])

# drop extra columns

df1 = df1.drop(columns=['customer_id_x' , 'customer_id_y' ] , axis = 1)

df1.head()

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,country,customer_id
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,29-Nov-16,2.55,United Kingdom,17850.0
1,536365,71053,WHITE METAL LANTERN,6,29-Nov-16,3.39,United Kingdom,17850.0
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,29-Nov-16,2.75,United Kingdom,17850.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,29-Nov-16,3.39,United Kingdom,17850.0
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,29-Nov-16,3.39,United Kingdom,17850.0


## 1.6 Changes dtypes


In [10]:
# invoice date

df1['invoice_date'] = pd.to_datetime(df1['invoice_date'] , format='%d-%b-%y')

# customer_id

df1['customer_id'] = df1['customer_id'].astype('int64')


## 1.7 Descriptive Statistics

In [11]:
num_attributes = df1.select_dtypes(include=['int64' , 'float64'])
cat_attributes = df1.select_dtypes(exclude=['int64' , 'float64','datetime64[ns]'])


### 1.7.1 Numerical Attributs

In [12]:
# central tendency - mean, median
mean = pd.DataFrame(num_attributes.apply(np.mean)).T
median = pd.DataFrame(num_attributes.apply(np.median)).T

# dispersion - std , min , max , range , skew , kurtosis
std = pd.DataFrame(num_attributes.apply(np.std)).T
mi = pd.DataFrame(num_attributes.apply(np.min)).T
ma = pd.DataFrame(num_attributes.apply(np.max)).T
ran = pd.DataFrame(num_attributes.apply(lambda x: x.max() - x.min())).T
skew = pd.DataFrame(num_attributes.apply(lambda x: x.skew())).T
kurtosis = pd.DataFrame(num_attributes.apply(lambda x: x.kurtosis())).T

# concatenate

m = pd.concat([mi , ma ,ran, mean , median  , std,  skew ,  kurtosis]).T.reset_index()
m.columns = ['Attributes' , 'min' , 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtoses']
m

Unnamed: 0,Attributes,min,max,range,mean,median,std,skew,kurtoses
0,quantity,-80995.0,80995.0,161990.0,9.55225,3.0,218.080957,-0.264076,119769.160031
1,unit_price,-11062.06,38970.0,50032.06,4.611114,2.08,96.759764,186.506972,59005.719097
2,customer_id,12346.0,22709.0,10363.0,16688.840453,16249.0,2911.408666,0.487449,-0.804287


### 1.7.2 Categorical Attributs

#### Invoice No

In [13]:
# identificação:
df_letter_invoices = df1.loc[df1['invoice_no'].apply(lambda x: bool(re.search('[^0-9]+', x))) , :]
df_letter_invoices.head()

print('Total number of invoices: {}'.format(len(df_letter_invoices)))
print('Total number of negative quantity: {}'.format(len(df_letter_invoices['quantity'] < 0)))

Total number of invoices: 9291
Total number of negative quantity: 9291


#### Stock Code

In [14]:
# check stock code only characters
df1.loc[cat_attributes['stock_code'].apply(lambda x: bool(re.search('^[a-zA-Z]+$', x))),'stock_code'].unique()

# Ação:
## 1. Remove stock_code in ['POST', 'D', 'DOT', 'M', 'S', 'AMAZONFEE', 'm', 'DCGSSBOY','DCGSSGIRL', 'PADS', 'B', 'CRUK']


array(['POST', 'D', 'DOT', 'M', 'S', 'AMAZONFEE', 'm', 'DCGSSBOY',
       'DCGSSGIRL', 'PADS', 'B', 'CRUK'], dtype=object)

#### Description

In [15]:
df1.head()

# Ação: Delete Description

Unnamed: 0,invoice_no,stock_code,description,quantity,invoice_date,unit_price,country,customer_id
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2016-11-29,2.55,United Kingdom,17850
1,536365,71053,WHITE METAL LANTERN,6,2016-11-29,3.39,United Kingdom,17850
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2016-11-29,2.75,United Kingdom,17850
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2016-11-29,3.39,United Kingdom,17850
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2016-11-29,3.39,United Kingdom,17850


#### Country

In [16]:
len(df1['country'].unique())

38

In [17]:
df1['country'].value_counts(normalize=True)

United Kingdom          0.914320
Germany                 0.017521
France                  0.015790
EIRE                    0.015124
Spain                   0.004674
Netherlands             0.004375
Belgium                 0.003818
Switzerland             0.003694
Portugal                0.002803
Australia               0.002323
Norway                  0.002004
Italy                   0.001482
Channel Islands         0.001399
Finland                 0.001283
Cyprus                  0.001148
Sweden                  0.000853
Unspecified             0.000823
Austria                 0.000740
Denmark                 0.000718
Japan                   0.000661
Poland                  0.000629
Israel                  0.000548
USA                     0.000537
Hong Kong               0.000531
Singapore               0.000423
Iceland                 0.000336
Canada                  0.000279
Greece                  0.000269
Malta                   0.000234
United Arab Emirates    0.000125
European C

In [18]:
df1[['customer_id' , 'country']].drop_duplicates().groupby('country').count().reset_index().sort_values('customer_id', ascending= False)

Unnamed: 0,country,customer_id
36,United Kingdom,7587
14,Germany,95
13,France,90
10,EIRE,44
31,Spain,31
3,Belgium,25
33,Switzerland,24
27,Portugal,20
19,Italy,15
16,Hong Kong,15


# <font color ='red'> 2.0 Filtragem de variaveis </font>

In [19]:
df2 = df1.copy()

In [20]:
# ==== Numerical Attributes ====

# unit price > 0.0
df2 = df2.loc[df2['unit_price'] > 0.04 , :]


# ==== Categorical Attributes ====


df2 = df2[~df2['stock_code'].isin(['POST', 'D', 'DOT', 'M', 'S', 'AMAZONFEE', 'm', 'DCGSSBOY','DCGSSGIRL', 'PADS', 'B', 'CRUK'])]

# description
df2 = df2.drop('description', axis =1)

# map
df2 = df2[~df2['country'].isin (['European Community' , 'Unspecified'])]

# bad users
df2 = df2[~df2['customer_id'].isin( [16446]) ]


## quantity 
df2_returns = df2.loc[df2['quantity'] < 0, :]
df2_purchase = df2.loc[df2['quantity'] >= 0, :]


# <font color ='red'> 3.0 Feature Engeneering </font>

In [21]:
df3 = df2.copy()

## 3.1 Feature Creation

In [22]:
# data reference

df_ref = df3.drop(['invoice_no','stock_code','quantity', 'invoice_date','unit_price','country'], axis = 1 ).drop_duplicates(ignore_index = True)


### 3.1.1. Gross Revenue

In [23]:
# Gross Revenue ( Faturamento = quantity * price)
df2_purchase.loc[: , 'gross_revenue'] = df2_purchase.loc[: , 'quantity'] * df2_purchase.loc[: , 'unit_price']

df_monetary = df2_purchase.loc[: , ['customer_id' , 'gross_revenue']].groupby('customer_id').sum().reset_index()
df_ref = pd.merge(df_ref, df_monetary,on='customer_id',how='left')

df_ref.isna().sum()

customer_id       0
gross_revenue    91
dtype: int64

### 3.1.2. Recency - Day from last purchase

In [24]:
# Recency - Last day purchase
df_recency = df2_purchase.loc[: ,[ 'customer_id', 'invoice_date']].groupby('customer_id').max().reset_index()
df_recency['recency_days'] = (df2['invoice_date'].max() - df_recency['invoice_date']).dt.days
df_recency = df_recency[['customer_id','recency_days']].copy()
df_ref = pd.merge(df_ref , df_recency , on='customer_id', how='left')
df_ref.isna().sum()

customer_id       0
gross_revenue    91
recency_days     91
dtype: int64

### 3.1.3. Quantity of purchased

In [25]:
# Numero de produtos
df_freq = (df2_purchase.loc[: ,['customer_id','invoice_no']].drop_duplicates()
                                                            .groupby('customer_id')
                                                            .count()
                                                            .reset_index()
                                                            .rename(columns={'invoice_no' : 'qtde_invoices'}))

df_ref = pd.merge(df_ref , df_freq , on='customer_id', how='left')
df_ref.isna().sum()

customer_id       0
gross_revenue    91
recency_days     91
qtde_invoices    91
dtype: int64

### 3.1.4. Quantity total of items purchased


In [26]:
df_freq = df2_purchase.loc[: ,['customer_id','quantity']].groupby('customer_id').sum().reset_index().rename(columns = {'quantity' : 'qtde_items'})
df_ref = pd.merge(df_ref , df_freq , on='customer_id', how='left')
df_ref.isna().sum()

customer_id       0
gross_revenue    91
recency_days     91
qtde_invoices    91
qtde_items       91
dtype: int64

### 3.1.5. Quantity of products purchased


In [27]:
df_freq = df2_purchase.loc[: ,['customer_id','stock_code']].groupby('customer_id').count().reset_index().rename(columns = {'stock_code' : 'qtde_products'})
df_ref = pd.merge(df_ref , df_freq , on='customer_id', how='left')
df_ref.isna().sum()

customer_id       0
gross_revenue    91
recency_days     91
qtde_invoices    91
qtde_items       91
qtde_products    91
dtype: int64

### 3.1.6. Avg Ticket Value

In [28]:
# Avg ticket
df_avg_ticket = df2_purchase.loc[: ,['customer_id' , 'gross_revenue']].groupby('customer_id').mean().reset_index().rename(columns={'gross_revenue':'avg_ticket'})
#df_avg_ticket['avg_ticket'] = np.round(df_avg_ticket['avg_ticket'] , 2)
df_ref = pd.merge(df_ref, df_avg_ticket,on='customer_id',how='left')
df_ref.isna().sum()

customer_id       0
gross_revenue    91
recency_days     91
qtde_invoices    91
qtde_items       91
qtde_products    91
avg_ticket       91
dtype: int64

### 3.1.7. Avg Recency Days

In [29]:
# Avarage recency days

df_aux = df3[['customer_id', 'invoice_date']].drop_duplicates().sort_values(['customer_id', 'invoice_date'], ascending=['False' , 'False'])
df_aux['next_customer_id'] = df_aux['customer_id'].shift() # next customer
df_aux['previous_date'] = df_aux['invoice_date'].shift() # next invoice date


df_aux['avg_recency_days'] = df_aux.apply(lambda x: (x['invoice_date'] - x['previous_date']).days if x['customer_id'] == x['next_customer_id'] else np.nan, axis =1)

df_aux = df_aux.drop(['invoice_date' , 'next_customer_id' , 'previous_date'], axis = 1).dropna()

# average rencecy
df_avg_recency_days = df_aux.groupby('customer_id').mean().reset_index()

# merge
df_ref = pd.merge(df_ref , df_avg_recency_days, on='customer_id', how='left')

df_ref.isna().sum()

customer_id            0
gross_revenue         91
recency_days          91
qtde_invoices         91
qtde_items            91
qtde_products         91
avg_ticket            91
avg_recency_days    2816
dtype: int64

### 3.1.8. Frequency Purchase

In [30]:
df3_purchase = df2_purchase.copy()

In [31]:
df_aux = df3_purchase[['invoice_no','customer_id', 'invoice_date']].drop_duplicates().groupby('customer_id').agg(max_  = ('invoice_date', 'max'), 
                                                                                               min_  = ('invoice_date', 'min'), 
                                                                                               days_ = ('invoice_date', lambda x: (( x.max() - x.min() ).days)+ 1),
                                                                                               buy_  = ('invoice_date', 'count')).reset_index()

# Frequency 
df_aux['frequency'] = df_aux[['buy_', 'days_']].apply( lambda x: x['buy_'] / x['days_'] if x['days_'] != 0 else 0, axis = 1)

# Merge
df_ref = pd.merge(df_ref , df_aux[['customer_id', 'frequency']], on='customer_id', how = 'left')

df_ref.isna().sum()

customer_id            0
gross_revenue         91
recency_days          91
qtde_invoices         91
qtde_items            91
qtde_products         91
avg_ticket            91
avg_recency_days    2816
frequency             91
dtype: int64

### 3.1.9. Numbers of Returns

In [32]:
df_returns = df2_returns[['customer_id' , 'quantity']].groupby('customer_id').sum().reset_index().rename( columns={'quantity' : 'qtde_returns'})
df_returns['qtde_returns'] = df_returns['qtde_returns'] * -1
df_ref = pd.merge(df_ref , df_returns, on ='customer_id' ,  how='left')

df_ref.loc[df_ref['qtde_returns'].isna() , 'qtde_returns'] = 0
df_ref.isna().sum()

customer_id            0
gross_revenue         91
recency_days          91
qtde_invoices         91
qtde_items            91
qtde_products         91
avg_ticket            91
avg_recency_days    2816
frequency             91
qtde_returns           0
dtype: int64

### 3.2.0.  Basket Size - Quantidade de items por cesta ( Quantity)

In [33]:
df_aux = df3_purchase.loc[: , ['customer_id' , 'invoice_no', 'quantity']].groupby('customer_id').agg(n_purchase =('invoice_no' , 'nunique'),
                                                                                                     n_products=('quantity', 'sum')).reset_index()
# Calculation
df_aux['avg_basket_size'] = df_aux['n_products'] / df_aux['n_purchase']

# merge
df_ref = pd.merge(df_ref , df_aux[['customer_id' , 'avg_basket_size' ]] , on='customer_id', how='left')

df_ref.isna().sum()

customer_id            0
gross_revenue         91
recency_days          91
qtde_invoices         91
qtde_items            91
qtde_products         91
avg_ticket            91
avg_recency_days    2816
frequency             91
qtde_returns           0
avg_basket_size       91
dtype: int64

### 3.2.1. Unique Basket Size - Quantidade de produtos distintos por compra


In [34]:
df_aux = df3_purchase.loc[: , ['customer_id' , 'invoice_no', 'stock_code']].groupby('customer_id').agg(n_purchase =('invoice_no' , 'nunique'),
                                                                                                       n_products=('stock_code', 'nunique')).reset_index()


# Calculation
df_aux['avg_unique_basket_size'] = df_aux['n_products'] / df_aux['n_purchase']

# merge
df_ref = pd.merge(df_ref , df_aux[['customer_id' , 'avg_unique_basket_size' ]] , on='customer_id', how='left')

df_ref.isna().sum()

customer_id                  0
gross_revenue               91
recency_days                91
qtde_invoices               91
qtde_items                  91
qtde_products               91
avg_ticket                  91
avg_recency_days          2816
frequency                   91
qtde_returns                 0
avg_basket_size             91
avg_unique_basket_size      91
dtype: int64

# <font color ='red'> 4.0 EDA (Exploratory Data Analysis) </font>

In [35]:
df4 = df_ref.dropna().copy()

## 4.3 Estudo do espaço

In [36]:
# Selected Dataset
cols_selected = ['customer_id', 'gross_revenue','recency_days', 'qtde_products', 'frequency', 'qtde_returns' ]
df43 = df4[cols_selected].copy()


In [37]:
mm = pp.MinMaxScaler()

df43['gross_revenue']          = mm.fit_transform(df43[['gross_revenue']])
df43['recency_days']           = mm.fit_transform(df43[['recency_days']])
df43['qtde_products']          = mm.fit_transform(df43[['qtde_products']])
df43['frequency']              = mm.fit_transform(df43[['frequency']])
df43['qtde_returns']           = mm.fit_transform(df43[['qtde_returns']])


X = df43.copy()


In [38]:
X.shape

(2968, 6)

### 4.3.4 Tree-Based Embedding

não precisar fazer o minmaxscaler() para trabalhar com uma arvore

In [39]:
# training dataset 
X = df43.drop(columns=['customer_id' , 'gross_revenue'], axis = 1)
Y = df43['gross_revenue']

# model definition
rf_model = en.RandomForestRegressor(n_estimators = 100 , random_state = 42 )

# model training
rf_model.fit(X,Y)

# dataframe Leaf
df_leaf = pd.DataFrame(rf_model.apply(X))

In [40]:
# reduzer dimensionality
reducer = umap.UMAP(random_state = 42)
embedding = reducer.fit_transform(df_leaf)


# embedding
df_tree = pd.DataFrame()
df_tree['embedding_x'] = embedding[:,0]
df_tree['embedding_y'] = embedding[:,1]



In [41]:
df5 = df_tree.copy()

In [42]:
df6 = df_tree.copy()

# <font color ='red'> 5.0 Hyperparameter Fine-Tunning </font>

In [43]:
x = df_tree.copy()

# <font color ='red'> 6.0 Model Training </font>

In [44]:
df8 = x.copy()

## 6.1 GMM

In [45]:
k = 9

# model definition
gmm_model = mx.GaussianMixture(n_components = k , random_state = 42)

# model training
gmm_model.fit(df8)

# model predict
labels = gmm_model.predict(df8)


## 6.2 Cluster Validation

In [46]:
print('SS value: {}'.format(met.silhouette_score(df8, labels, metric='euclidean')))

SS value: 0.6219987273216248


# <font color ='red'> 7.0 Cluster Analysis </font>

In [85]:
df92 = df4[cols_selected].copy()
df92['cluster'] = labels

# change dtypes
df92['recency_days'] = df92['recency_days'].astype('int64')
df92['qtde_products'] = df92['qtde_products'].astype('int64')
df92['qtde_returns'] = df92['qtde_returns'].astype('int64')

df92['last_training_timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [86]:
df92.head()

Unnamed: 0,customer_id,gross_revenue,recency_days,qtde_products,frequency,qtde_returns,cluster,last_training_timestamp
0,17850,5391.21,372,297,17.0,40,4,2021-12-04 01:55:08
1,13047,3232.59,56,171,0.028302,35,5,2021-12-04 01:55:08
2,12583,6705.38,2,232,0.040323,50,4,2021-12-04 01:55:08
3,13748,948.25,95,28,0.017921,0,1,2021-12-04 01:55:08
4,15100,876.0,333,3,0.073171,22,7,2021-12-04 01:55:08


## 7.2 Cluster Profile


In [49]:
# Number of customer
df_cluster = df92[['customer_id','cluster']].groupby('cluster').count().reset_index()
df_cluster['perc_customers'] = 100*(df_cluster['customer_id'] / df_cluster['customer_id'].sum())

# Avg Gross revenue
df_avg_gross_revenue = df92[['gross_revenue','cluster']].groupby('cluster').mean().reset_index()
df_cluster = pd.merge(df_cluster , df_avg_gross_revenue , how='inner' , on='cluster')

# Avg recency days
df_avg_recency_days = df92[['recency_days','cluster']].groupby('cluster').mean().reset_index()
df_cluster = pd.merge(df_cluster , df_avg_recency_days , how='inner' , on='cluster')

# Avg invoice_no
df_qtde_products = df92[['qtde_products','cluster']].groupby('cluster').mean().reset_index()
df_cluster = pd.merge(df_cluster , df_qtde_products , how='inner' , on='cluster')

# Frequency
df_frequency = df92[['frequency','cluster']].groupby('cluster').mean().reset_index()
df_cluster = pd.merge(df_cluster , df_frequency , how='inner' , on='cluster')

# Returns
df_qtde_returns = df92[['qtde_returns','cluster']].groupby('cluster').mean().reset_index()
df_cluster = pd.merge(df_cluster , df_qtde_returns , how='inner' , on='cluster')


df_cluster

Unnamed: 0,cluster,customer_id,perc_customers,gross_revenue,recency_days,qtde_products,frequency,qtde_returns
0,0,360,12.12938,2395.1195,45.427778,127.066667,0.044792,26.866667
1,1,454,15.296496,905.233062,83.013216,32.784141,0.11271,12.323789
2,2,226,7.614555,565.176947,139.99115,18.681416,0.193812,2.893805
3,3,415,13.98248,1221.472892,61.371084,53.219277,0.04204,9.122892
4,4,468,15.768194,8835.897073,21.438034,424.087607,0.094571,149.271368
5,5,145,4.885445,3164.115448,34.427586,173.841379,0.060415,23.482759
6,6,404,13.61186,1689.07349,54.809406,89.175743,0.049992,11.542079
7,7,366,12.331536,546.596749,99.434426,11.368852,0.351427,8.789617
8,8,130,4.380054,4689.868615,47.407692,103.884615,0.057342,20.753846


In [50]:
# 4 cluster Insiders
# 8 cluster more products
# 5 cluster spend money
# 0 cluster even more products
# 6 cluster less days
# 3 cluster less 1k
# 1 cluster Stop Returners
# 2 cluster more buy
# 7 cluster even more bu6

### Cluster 01: ( Candidato a Insiders)

    - Número de customers: 468 (16% do customers)
    - Faturamento médio: 8836
    - Recência média: 21 dias 
    - Média de produtos comprados: 424 produtos
    - Frequência de produtos comprados: 0.09 produtos/dia
    - Receita em média: $ 8835.90,00 dólares
    
### Cluster 02: 

    - Número de customers: 31 (0.71% do customers)
    - Recência em média: 14 dias 
    - Compras em média: 53 compras
    - Receita em média: $ 40.543,52 dólares

### Cluster 03: 

    - Número de customers: 4.335 (99.15% do customers)
    - Recência em média: 92 dias 
    - Compras em média: 5 compras
    - Receita em média: $ 1.372,57 dólares

# <font color ='red'> 8.0 Deploy to production </font>

In [59]:
df92.dtypes

customer_id        int64
gross_revenue    float64
recency_days       int64
qtde_products      int64
frequency        float64
qtde_returns       int64
cluster            int64
dtype: object

## 8.1. Insert into SQLITE

In [79]:
# create Table
# query_create_table_insiders = """
#     CREATE TABLE Insiders (
#         customer_id        INTEGER,
#         gross_revenue      REAL,
#         recency_days       INTEGER,
#         qtde_products      INTEGER,
#         frequency          REAL,
#         qtde_returns       INTEGER,
#         cluster            INTEGER
#      )

# """

# conn = sqlite3.connect('insiders_db.sqlite')
# conn.execute(query_create_table_insiders)
# conn.commit()
# conn.close()

# insert data
conn = create_engine('sqlite:///insiders_db.sqlite')
# df92.to_sql('insiders' , con = conn, if_exists='append', index=False)

# select data 

In [80]:
# consulting database
query = """
    SELECT * FROM Insiders
"""


df = pd.read_sql_query(query , conn)

In [81]:
df.head()

Unnamed: 0,customer_id,gross_revenue,recency_days,qtde_products,frequency,qtde_returns,cluster
0,17850,5391.21,372,297,17.0,40,4
1,13047,3232.59,56,171,0.028302,35,5
2,12583,6705.38,2,232,0.040323,50,4
3,13748,948.25,95,28,0.017921,0,1
4,15100,876.0,333,3,0.073171,22,7


In [83]:
df.shape

(2968, 7)