# 02. Clustering Analysis

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.cluster import AgglomerativeClustering

## I want to prepare demographics variables first, so at the end we can see how segrents are different from demographic point of view.

In [2]:
df_dem = pd.read_excel('../../data/raw_data/data.xlsx', sheet_name = "DurData_June 24, 2011")

In [3]:
#pd.set_option('display.max_columns', 500)
df_dem.head()

Unnamed: 0,Household_ID,Transaction_NBR,Transaction_Total,Transaction_Date,Transaction_Location,Online_Transaction,ORIGINAL_TICKET_NBR,Transaction_type,PRODUCT_ID,Category_Description,...,FEMALE_CHID_AGE_0-2,FEMALE_CHID_AGE_3-5,FEMALE_CHID_AGE_6-10,FEMALE_CHID_AGE_11-15,FEMALE_CHID_AGE_16-17,UNKNOWN_CHID_AGE_0-2,UNKNOWN_CHID_AGE_3-5,UNKNOWN_CHID_AGE_6-10,UNKNOWN_CHID_AGE_11-15,UNKNOWN_CHID_AGE_16-17
0,100003544,1,1,06JUN2003:00:00:00,537,0,53703232083,1,879852.0,VIDEO HDWR,...,0,0,0,0,0,0,0,0,0,0
1,100012312,1,2,09JAN2001:00:00:00,841,0,84102381410,1,580994.0,TELEVISION,...,0,0,0,0,0,0,0,0,0,0
2,100012312,2,2,09JAN2001:00:00:00,841,0,84102381410,1,720025.0,P*S*T,...,0,0,0,0,0,0,0,0,0,0
3,100016237,1,1,22JAN2001:00:00:00,949,0,94904161842,1,575635.0,P*S*T,...,0,0,0,0,0,0,0,0,0,0
4,100022945,1,11,30JAN2001:00:00:00,521,0,52101921224,1,545443.0,MOBILE,...,0,0,0,0,1,0,0,0,0,0


Columns for childrens are separated by sex and age, which are too many columns. So I simply add them to get the total number of kids.

In [4]:
df_dem['n_kids'] = df_dem.loc[:,df_dem.columns.str.contains("CHID")].sum(axis = 1)

I want to keep only demographics' variables.

In [5]:
df_dem = df_dem[['Household_ID', 'Age_H.Head', 'Income', 'GENDER_H.Head', 'n_kids']]

In [6]:
df_dem.isna().sum()

Household_ID         0
Age_H.Head       20971
Income           20455
GENDER_H.Head        0
n_kids               0
dtype: int64

I drop rows where "Age_H.Head" is NA.

In [7]:
df_dem = df_dem[df_dem['Age_H.Head'].notna()]

In [8]:
df_dem.isna().sum()

Household_ID        0
Age_H.Head          0
Income           8371
GENDER_H.Head       0
n_kids              0
dtype: int64

In [9]:
df_dem.Income.describe()

count    143920.000000
mean          5.998263
std           2.258421
min           1.000000
25%           5.000000
50%           6.000000
75%           8.000000
max           9.000000
Name: Income, dtype: float64

Mean is close to 6, median is 6, so I will fill it with this value.

In [10]:
df_dem.Income.fillna(6, inplace = True)

In [11]:
df_dem = pd.get_dummies(df_dem, columns = ['GENDER_H.Head'])

We have duplicate rows, we want to leave only unique rows.

In [12]:
df_dem.Household_ID.nunique() # same as the length of the dataframe after dropping duplicating rows

16384

In [13]:
df_dem.drop_duplicates(inplace = True)

In [14]:
len(df_dem) # check

16384

In [15]:
df_dem.set_index('Household_ID', inplace = True)

In [16]:
df_dem.head()

Unnamed: 0_level_0,Age_H.Head,Income,n_kids,GENDER_H.Head_F,GENDER_H.Head_M,GENDER_H.Head_U
Household_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100003544,28.0,6.0,0,0,1,0
100012312,24.0,1.0,0,0,1,0
100022945,44.0,5.0,1,0,1,0
100022976,54.0,7.0,0,0,1,0
100024091,44.0,5.0,0,0,0,1


## Now I want to prepare behavioral data for clustering.

In [17]:
df = pd.read_csv('../../data/processed_data/behavioral_data.csv')

In [18]:
df.set_index('Household_ID', inplace = True)

In [19]:
df.head()

Unnamed: 0_level_0,AUDIO_QNT,DVS_QNT,EXPRESS_QNT,GIFT CARDS_QNT,HOME INS_QNT,IMAGING_QNT,INTABGIBLE_QNT,MAJORS_QNT,MOBILE_QNT,MUSIC_QNT,...,INTABGIBLE_EXP,MAJORS_EXP,MOBILE_EXP,MUSIC_EXP,OTHER_EXP,P*S*T_EXP,PC HDWR_EXP,TELEVISION_EXP,VIDEO HDWR_EXP,WIRELESS_EXP
Household_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100003544,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.97,0.0
100012312,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,19.99,0.0,9.99,0.0,0.0
100016237,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,89.99,0.0,0.0,0.0,0.0
100022945,0,0,0,0,0,1,0,0,2,0,...,0.0,0.0,24.98,0.0,0.0,193.95,149.99,0.0,0.0,0.0
100022976,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,249.99,0.0


I divide data to 2 sets, so I can use them as 2 sets of basis variables. One set is quantity of items bought in each particular category. Second set is expenditures per category.

In [20]:
df_q = df.iloc[:,:16]

In [21]:
df_q.head()

Unnamed: 0_level_0,AUDIO_QNT,DVS_QNT,EXPRESS_QNT,GIFT CARDS_QNT,HOME INS_QNT,IMAGING_QNT,INTABGIBLE_QNT,MAJORS_QNT,MOBILE_QNT,MUSIC_QNT,OTHER_QNT,P*S*T_QNT,PC HDWR_QNT,TELEVISION_QNT,VIDEO HDWR_QNT,WIRELESS_QNT
Household_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
100003544,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
100012312,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
100016237,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
100022945,0,0,0,0,0,1,0,0,2,0,0,5,1,0,0,0
100022976,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [22]:
df_e = df.iloc[:,16:]

In [23]:
df_e.head()

Unnamed: 0_level_0,AUDIO_EXP,DVS_EXP,EXPRESS_EXP,GIFT CARDS_EXP,HOME INS_EXP,IMAGING_EXP,INTABGIBLE_EXP,MAJORS_EXP,MOBILE_EXP,MUSIC_EXP,OTHER_EXP,P*S*T_EXP,PC HDWR_EXP,TELEVISION_EXP,VIDEO HDWR_EXP,WIRELESS_EXP
Household_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
100003544,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.97,0.0
100012312,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.99,0.0,9.99,0.0,0.0
100016237,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,89.99,0.0,0.0,0.0,0.0
100022945,0.0,0.0,0.0,0,0.0,259.99,0.0,0.0,24.98,0.0,0.0,193.95,149.99,0.0,0.0,0.0
100022976,599.99,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,249.99,0.0


Let's check the variance of basis variables, to see whether we should drop any columns.

In [24]:
df_q.var()

AUDIO_QNT         28.524774
DVS_QNT            0.393566
EXPRESS_QNT        0.000050
GIFT CARDS_QNT     0.005093
HOME INS_QNT       0.020416
IMAGING_QNT        1.372556
INTABGIBLE_QNT     0.007952
MAJORS_QNT         0.628519
MOBILE_QNT         4.127627
MUSIC_QNT          7.921250
OTHER_QNT          0.101378
P*S*T_QNT         11.195786
PC HDWR_QNT        1.654860
TELEVISION_QNT     1.304974
VIDEO HDWR_QNT     0.938595
WIRELESS_QNT       0.436556
dtype: float64

In [25]:
df_e.var()

AUDIO_EXP          53561.946966
DVS_EXP             9173.959286
EXPRESS_EXP            0.002452
GIFT CARDS_EXP         0.000000
HOME INS_EXP        4272.063539
IMAGING_EXP        64874.396736
INTABGIBLE_EXP         4.709039
MAJORS_EXP         39688.873668
MOBILE_EXP         22267.333947
MUSIC_EXP           7871.469586
OTHER_EXP             45.977035
P*S*T_EXP          14961.995540
PC HDWR_EXP       431757.614766
TELEVISION_EXP    387973.599713
VIDEO HDWR_EXP     13382.184235
WIRELESS_EXP        5730.101950
dtype: float64

"GIFT CARDS_EXP" column variance is equal to zero. So I drop this column from both data frames.

In [26]:
df_e.drop(columns = ['GIFT CARDS_EXP'], inplace = True)

In [27]:
df_q.drop(columns = ['GIFT CARDS_QNT'], inplace = True)

Now let's check summation results by columns, to see whether there are some other columns we might consider to drop.

In [28]:
df_q.sum()

AUDIO_QNT         20055
DVS_QNT            2464
EXPRESS_QNT           1
HOME INS_QNT        320
IMAGING_QNT        9181
INTABGIBLE_QNT      103
MAJORS_QNT         4143
MOBILE_QNT        12549
MUSIC_QNT         13184
OTHER_QNT          1034
P*S*T_QNT         21154
PC HDWR_QNT       11136
TELEVISION_QNT    11793
VIDEO HDWR_QNT     8897
WIRELESS_QNT       3449
dtype: int64

In [29]:
df_e.sum()

AUDIO_EXP         1736057.29
DVS_EXP            336950.60
EXPRESS_EXP             6.99
HOME INS_EXP        92313.65
IMAGING_EXP       1724541.20
INTABGIBLE_EXP       1021.99
MAJORS_EXP         772050.63
MOBILE_EXP         896364.91
MUSIC_EXP          512958.67
OTHER_EXP            6604.05
P*S*T_EXP         1042523.09
PC HDWR_EXP       5109919.40
TELEVISION_EXP    4503511.48
VIDEO HDWR_EXP     951183.67
WIRELESS_EXP       348179.16
dtype: float64

Among expenditure columns "EXPRESS_EXP", "INTABGIBLE_EXP" and "OTHER_EXP" provide too little information. Among quantity columns: "EXPRESS_QNT", "HOME INS_QNT"(quantity is small, though it is much money, so I keep it), "INTABGIBLE_QNT" and "OTHER_QNT" provide too little information. So I disided to drop them.

In [30]:
df_e.drop(columns = ['EXPRESS_EXP', 'INTABGIBLE_EXP', 'OTHER_EXP'], inplace = True)

In [31]:
df_q.drop(columns = ['EXPRESS_QNT', 'INTABGIBLE_QNT', 'OTHER_QNT'], inplace = True)

## Data normalization

I normalize variables, using formula below:

$$\frac{x - µ_x}{σ_x}$$

In [32]:
scaler = preprocessing.StandardScaler()

In [33]:
df_e = pd.DataFrame(scaler.fit_transform(df_e),columns = df_e.columns, index = df_e.index)

In [34]:
df_q = pd.DataFrame(scaler.fit_transform(df_q),columns = df_q.columns, index = df_q.index)

In [35]:
#df_q = (df_q - df_q.mean())/df_q.std()
#df_e = (df_e - df_e.mean())/df_e.std()s

## Verification:

After normalization variance should be 1, and mean - 0.

In [36]:
print("Variances:")
df_e.var()

Variances:


AUDIO_EXP         1.00005
DVS_EXP           1.00005
HOME INS_EXP      1.00005
IMAGING_EXP       1.00005
MAJORS_EXP        1.00005
MOBILE_EXP        1.00005
MUSIC_EXP         1.00005
P*S*T_EXP         1.00005
PC HDWR_EXP       1.00005
TELEVISION_EXP    1.00005
VIDEO HDWR_EXP    1.00005
WIRELESS_EXP      1.00005
dtype: float64

In [37]:
print("Means:")
df_e.mean()

Means:


AUDIO_EXP        -2.509371e-15
DVS_EXP          -5.857749e-16
HOME INS_EXP     -6.398127e-16
IMAGING_EXP       1.238709e-15
MAJORS_EXP       -2.732931e-15
MOBILE_EXP        6.785490e-15
MUSIC_EXP        -3.536832e-15
P*S*T_EXP         2.568386e-16
PC HDWR_EXP       3.696046e-16
TELEVISION_EXP   -3.437575e-15
VIDEO HDWR_EXP   -1.992803e-15
WIRELESS_EXP     -1.936868e-15
dtype: float64

In [38]:
print("Variances:")
df_q.var()

Variances:


AUDIO_QNT         1.00005
DVS_QNT           1.00005
HOME INS_QNT      1.00005
IMAGING_QNT       1.00005
MAJORS_QNT        1.00005
MOBILE_QNT        1.00005
MUSIC_QNT         1.00005
P*S*T_QNT         1.00005
PC HDWR_QNT       1.00005
TELEVISION_QNT    1.00005
VIDEO HDWR_QNT    1.00005
WIRELESS_QNT      1.00005
dtype: float64

In [39]:
print("Means:")
df_q.mean()

Means:


AUDIO_QNT         7.195838e-17
DVS_QNT          -3.420384e-16
HOME INS_QNT      6.134762e-16
IMAGING_QNT       2.667566e-15
MAJORS_QNT       -5.873152e-16
MOBILE_QNT        7.340563e-15
MUSIC_QNT        -3.069889e-15
P*S*T_QNT         6.312952e-16
PC HDWR_QNT       1.440844e-15
TELEVISION_QNT    2.461052e-15
VIDEO HDWR_QNT   -4.303866e-15
WIRELESS_QNT      2.910779e-15
dtype: float64

It does not show means as 0s, this is specifics of computer calculations. Those numbers are very close to 0s. You can see the same specifics of computer calculations at variances. I.e. they are 1.00005.

However, **data was normalized.**

## Clustering

Let's cluster data, using both sets of basis variables. I will try different clustering algorithms with different number of clusters.

In [40]:
def find_best_alg(df, from_, to_):
    bestSil = -1
    for k in range(from_,to_+1):
        print ('k= %d ' % k)
        clus = [KMeans(n_clusters=k), Birch(n_clusters=k), AgglomerativeClustering(n_clusters=k)]
        for cl in clus:
            res = cl.fit(df)
            sil = metrics.silhouette_score(df, res.labels_)
            print (str(cl)[:6] + ' with k=' +str(k) + ": " + str(round(sil,4)))
            if (sil > bestSil):
                bestSil = sil
                bestCl = cl
                bestK = k
    print('***********************************************')
    print ('Best algorithm is... ' + str(bestCl)[:6] + '  with k=' +str(bestK) )
    print('**********************')
    print ('With Silhouette Score ' + str(bestSil))

In [41]:
find_best_alg(df_q, 2, 5)

k= 2 
KMeans with k=2: 0.5785
Birch( with k=2: 0.6739
Agglom with k=2: 0.6379
k= 3 
KMeans with k=3: 0.5633
Birch( with k=3: 0.6409
Agglom with k=3: 0.3754
k= 4 
KMeans with k=4: 0.5217
Birch( with k=4: 0.645
Agglom with k=4: 0.3874
k= 5 
KMeans with k=5: 0.5004
Birch( with k=5: 0.5106
Agglom with k=5: 0.3876
***********************************************
Best algorithm is... Birch(  with k=2
**********************
With Silhouette Score 0.6738746416070719


In [42]:
find_best_alg(df_e, 2, 5)

k= 2 
KMeans with k=2: 0.5968
Birch( with k=2: 0.9486
Agglom with k=2: 0.3682
k= 3 
KMeans with k=3: 0.5676
Birch( with k=3: 0.6766
Agglom with k=3: 0.3064
k= 4 
KMeans with k=4: 0.5077
Birch( with k=4: 0.387
Agglom with k=4: 0.3195
k= 5 
KMeans with k=5: 0.4393
Birch( with k=5: 0.397
Agglom with k=5: 0.3208
***********************************************
Best algorithm is... Birch(  with k=2
**********************
With Silhouette Score 0.9486433774181372


In [43]:
clu = Birch(n_clusters = 2)

In [44]:
clu.fit(df_e)

Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=2,
      threshold=0.5)

In [45]:
df_e['segment'] = clu.labels_

In [46]:
df_e.groupby('segment').AUDIO_EXP.count()

segment
0    19928
1        2
Name: AUDIO_EXP, dtype: int64

It has enormous silhouette score (~0.9486). But this segmentation does not make sense, because there is just 2 people in the segment.

In [47]:
df_e.drop(columns = ['segment'], inplace = True)

### Let's try KMeans based on quantity basis set.

In [48]:
clu = KMeans(n_clusters = 2)

In [49]:
clu.fit(df_q)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [50]:
df_q['segment'] = clu.labels_

In [51]:
df_q.groupby('segment').AUDIO_QNT.count()

segment
0    17993
1     1937
Name: AUDIO_QNT, dtype: int64

In [52]:
df_q.groupby('segment').mean()

Unnamed: 0_level_0,AUDIO_QNT,DVS_QNT,HOME INS_QNT,IMAGING_QNT,MAJORS_QNT,MOBILE_QNT,MUSIC_QNT,P*S*T_QNT,PC HDWR_QNT,TELEVISION_QNT,VIDEO HDWR_QNT,WIRELESS_QNT
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,-0.067067,-0.063302,-0.050528,-0.166452,-0.123198,-0.077811,-0.107518,-0.088202,-0.08808,-0.128776,-0.148256,-0.073669
1,0.622993,0.588016,0.469361,1.546188,1.1444,0.722796,0.998744,0.819317,0.818186,1.19621,1.377161,0.684324


In [53]:
df_q.drop(columns = ['segment'], inplace = True)

Here we can see that using 2 segments does not really help. It simply devides customers to those who buy below the average and those who buy above the average.

### Let's try 3 segments, Birch algorithm, expenditures basis variables.

In [54]:
clu = Birch(n_clusters = 3)

In [55]:
clu.fit(df_e)

Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=3,
      threshold=0.5)

In [56]:
df_e['segment'] = clu.labels_

In [57]:
df_e.groupby('segment').AUDIO_EXP.count()

segment
0    19829
1        2
2       99
Name: AUDIO_EXP, dtype: int64

In [58]:
df_e.groupby('segment').mean()

Unnamed: 0_level_0,AUDIO_EXP,DVS_EXP,HOME INS_EXP,IMAGING_EXP,MAJORS_EXP,MOBILE_EXP,MUSIC_EXP,P*S*T_EXP,PC HDWR_EXP,TELEVISION_EXP,VIDEO HDWR_EXP,WIRELESS_EXP
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,-0.000763,-0.000884,-0.006806,-0.000408,-0.048176,-0.000451,-0.000469,-0.001471,-0.001443,-0.000882,-0.001983,-0.002859
1,-0.376391,-0.176519,70.98664,-0.339735,-0.194453,-0.301408,0.245235,0.128199,6.875337,-0.362789,-0.412576,-0.230794
2,0.160388,0.180661,-0.070868,0.088585,9.65321,0.096335,0.088942,0.292011,0.150094,0.18396,0.405487,0.577374


In [59]:
df_e.drop(columns = ['segment'], inplace = True)

2 out of 3 segments are too small.

### I went through the similar process with the rest of the algorithms and number of clusters. In my opinion this is the most optimal in terms of silhouette score and meaningful in terms of interpritation and segments' sizes result.

In [60]:
clu = KMeans(n_clusters = 3)

In [61]:
clu.fit(df_e)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [62]:
df_e['segment'] = clu.labels_

In [63]:
df_e.groupby('segment').AUDIO_EXP.count()

segment
0    17849
1      431
2     1650
Name: AUDIO_EXP, dtype: int64

In [64]:
df_e.groupby('segment').mean()

Unnamed: 0_level_0,AUDIO_EXP,DVS_EXP,HOME INS_EXP,IMAGING_EXP,MAJORS_EXP,MOBILE_EXP,MUSIC_EXP,P*S*T_EXP,PC HDWR_EXP,TELEVISION_EXP,VIDEO HDWR_EXP,WIRELESS_EXP
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,-0.123251,-0.133195,-0.029548,-0.129651,-0.078704,-0.076596,-0.120138,-0.131122,-0.094014,-0.109995,-0.122836,-0.05538
1,0.510998,5.677993,0.12763,0.091906,0.204904,0.140062,0.136515,0.138728,0.14189,0.5508,0.305941,-0.05001
2,1.1998,-0.04231,0.286302,1.378502,0.797862,0.791995,1.263946,1.382186,0.979945,1.045999,1.248873,0.612139


**We can see here, that the biggest segment buy below the mean in every category. 2 other segments are different.**

## Merge segmentation results with demographic data.

In [65]:
df_dem = df_dem.merge(df_e[['segment']], left_index = True, right_index = True)

In [66]:
df_dem.groupby('segment').mean()

Unnamed: 0_level_0,Age_H.Head,Income,n_kids,GENDER_H.Head_F,GENDER_H.Head_M,GENDER_H.Head_U
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,48.404836,5.712608,0.454231,0.324905,0.531054,0.144041
1,49.115869,6.314861,0.523929,0.216625,0.664987,0.118388
2,48.324056,6.422134,0.546057,0.25116,0.595096,0.153744


Our segments are similar in demographic variables.