## K-means應用：信用卡客戶分群

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score

# Load the customer table and encode the gender column as numbers so the
# frame can be passed to the clustering estimator used in the next cell.
df = pd.read_csv('customer.csv')
dict1 = {'男':1, '女':2}
# Assign the result back instead of calling replace(..., inplace=True) on
# df['性別']: that chained in-place form acts on a temporary object under
# pandas Copy-on-Write and would leave df unchanged.
df['性別'] = df['性別'].replace(dict1)
df

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100)
0,1,34,78,1
1,1,20,61,49
2,1,29,28,82
3,2,31,81,93
4,1,48,60,49
...,...,...,...,...
195,2,33,86,95
196,1,35,28,61
197,2,68,48,48
198,1,38,71,75


In [None]:
# Partition the customers into three groups. fit() returns the estimator
# itself, so construction and fitting are chained into one statement.
# NOTE(review): no random_state is passed, so the label numbering may differ
# from run to run -- confirm whether reproducibility matters here.
km = KMeans(n_clusters=3).fit(df)
km.labels_

array([1, 2, 2, 0, 2, 2, 2, 1, 2, 0, 0, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 0,
       1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 0, 0, 2, 2, 1, 2, 0, 2,
       2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 2, 2, 0, 0,
       2, 0, 2, 1, 1, 0, 2, 2, 2, 2, 0, 1, 2, 2, 1, 2, 2, 1, 1, 0, 2, 2,
       2, 2, 2, 2, 0, 0, 2, 2, 1, 0, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 0, 0, 0, 2, 2, 2, 0, 0, 2, 1, 0, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 0, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 0,
       2, 2, 0, 2, 1, 2, 0, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0,
       2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 1, 0, 2, 1, 1, 2, 0, 1, 2, 0, 2, 2,
       0, 0], dtype=int32)

In [None]:
# Store each row's K-means cluster id in a new column and preview the
# first five rows (5 is head()'s default).
cluster_ids = km.labels_
df['類別'] = cluster_ids
df.head(5)

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
0,1,34,78,1,1
1,1,20,61,49,2
2,1,29,28,82,2
3,2,31,81,93,0
4,1,48,60,49,2


In [None]:
# Show up to the first 30 rows assigned to cluster 0.
in_cluster = df['類別'] == 0
df2 = df.loc[in_cluster]
df3 = df2.head(30)
df3

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
3,2,31,81,93,0
9,2,32,103,69,0
10,2,29,98,88,0
21,1,32,126,74,0
36,1,28,77,97,0
37,1,32,73,73,0
42,1,39,78,88,0
45,1,30,137,83,0
51,1,28,87,75,0
58,1,40,71,95,0


In [None]:
# Show up to the first 30 rows assigned to cluster 1.
in_cluster = df['類別'] == 1
df2 = df.loc[in_cluster]
df3 = df2.head(30)
df3

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
0,1,34,78,1,1
7,1,43,78,17,1
14,2,47,78,16,1
18,1,42,86,20,1
20,2,45,126,28,1
22,2,47,120,16,1
24,2,34,103,23,1
32,1,19,74,10,1
33,1,20,73,5,1
34,1,37,78,1,1


In [None]:
# Show up to the first 30 rows assigned to cluster 2.
in_cluster = df['類別'] == 2
df2 = df.loc[in_cluster]
df3 = df2.head(30)
df3

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
1,1,20,61,49,2
2,1,29,28,82,2
4,1,48,60,49,2
5,2,42,34,17,2
6,1,26,54,54,2
8,1,48,39,36,2
11,2,21,33,81,2
12,1,59,54,47,2
13,2,23,62,41,2
15,1,68,63,43,2


In [None]:
# Compare candidate cluster counts (2..14) using the Calinski-Harabasz score.
#
# Fit and score on the feature columns only. An earlier cell writes the
# cluster ids into df['類別']; leaving that column in would feed a previous
# clustering result back into both the model and the metric.
# errors='ignore' keeps this cell runnable when the column is absent.
features = df.drop(columns=['類別'], errors='ignore')
for n in range(2, 15):
    km = KMeans(n_clusters=n)
    km.fit(features)
    metric = calinski_harabasz_score(features, km.labels_)
    print(f'群組數量：{n}，評分：{metric}')

群組數量：2，評分：89.22993794799956
群組數量：3，評分：113.75369164959781
群組數量：4，評分：128.00411558215293
群組數量：5，評分：151.0230825445021
群組數量：6，評分：166.65094265754053
群組數量：7，評分：161.92934525368761
群組數量：8，評分：163.48821945396105
群組數量：9，評分：156.4899313941271
群組數量：10，評分：154.57746679789577
群組數量：11，評分：150.16177142332705
群組數量：12，評分：147.13569835673192
群組數量：13，評分：140.98611696860985
群組數量：14，評分：140.82479218893056


## DBSCAN應用：信用卡客戶分群

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Reload the customer table from disk and encode the gender column as
# numbers before scaling and clustering.
df = pd.read_csv('customer.csv')
dict1 = {'男':1, '女':2}
# Assign the result back instead of calling replace(..., inplace=True) on
# df['性別']: that chained in-place form acts on a temporary object under
# pandas Copy-on-Write and would leave df unchanged.
df['性別'] = df['性別'].replace(dict1)
df

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100)
0,1,34,78,1
1,1,20,61,49
2,1,29,28,82
3,2,31,81,93
4,1,48,60,49
...,...,...,...,...
195,2,33,86,95
196,1,35,28,61
197,2,68,48,48
198,1,38,71,75


In [None]:
# Standardize every column, then run DBSCAN on the scaled matrix.
dbscan = DBSCAN(eps=0.82, min_samples=5)
scaler = StandardScaler()
# fit_transform() learns the scaling parameters and applies them in one call.
dfScaled = scaler.fit_transform(df)
dbs = dbscan.fit(dfScaled)
dbs.labels_

array([ 0,  1,  2,  3,  0,  3,  1,  0,  0,  3,  3,  3,  0,  3,  4,  0,  0,
        3,  0,  3, -1, -1,  4,  0,  4, -1,  0,  3,  1, -1,  2,  3,  0,  0,
        0,  1,  1,  1,  3,  3,  3,  0,  1,  3,  1, -1,  3,  3,  3,  0,  3,
        1,  3,  3, -1,  3,  0, -1,  1,  3,  0, -1,  3,  3,  3,  3,  3,  3,
       -1, -1,  4,  1,  1,  3,  3,  2,  3,  4,  1,  3,  3,  0,  3,  0,  0,
        1,  3,  3,  0,  3,  3,  3,  3,  1,  3,  3,  3,  3,  4,  3,  0,  2,
        3, -1,  3,  0,  3,  2,  1,  0,  3,  1,  3,  1,  0,  0,  3,  1,  3,
        3, -1,  3,  3,  0,  3,  3,  3,  0,  3,  2,  2,  0,  3,  3,  0,  3,
        0, -1,  1,  4,  3,  3,  1,  3,  4,  2,  3, -1,  3,  4,  0,  4,  3,
        3,  3,  3,  3,  3,  0,  3,  3,  3,  0,  3, -1,  0,  3,  1,  0,  3,
        3,  3,  3,  3,  0,  3,  3,  1,  0,  3,  0, -1, -1,  4,  3,  3, -1,
        1,  3,  0,  4,  3,  3,  4,  3,  3,  2,  3,  1,  1])

In [None]:
# Store each row's DBSCAN label in a new column and preview the first
# five rows (5 is head()'s default).
cluster_ids = dbs.labels_
df['類別'] = cluster_ids
df.head(5)

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
0,1,34,78,1,0
1,1,20,61,49,1
2,1,29,28,82,2
3,2,31,81,93,3
4,1,48,60,49,0


In [None]:
# Show up to the first 30 rows that DBSCAN labelled 0.
in_cluster = df['類別'] == 0
df2 = df.loc[in_cluster]
df3 = df2.head(30)
df3

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
0,1,34,78,1,0
4,1,48,60,49,0
7,1,43,78,17,0
8,1,48,39,36,0
12,1,59,54,47,0
15,1,68,63,43,0
16,1,57,54,51,0
18,1,42,86,20,0
23,1,49,62,56,0
26,1,53,46,46,0


In [None]:
# Show up to the first 30 rows that DBSCAN labelled 1.
in_cluster = df['類別'] == 1
df2 = df.loc[in_cluster]
df3 = df2.head(30)
df3

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
1,1,20,61,49,1
6,1,26,54,54,1
28,1,26,62,55,1
35,1,18,59,41,1
36,1,28,77,97,1
37,1,32,73,73,1
42,1,39,78,88,1
44,1,24,60,52,1
51,1,28,87,75,1
58,1,40,71,95,1


In [None]:
# Show up to the first 30 rows labelled -1, the label scikit-learn's DBSCAN
# gives to samples it treats as noise.
is_noise = df['類別'] == -1
df2 = df.loc[is_noise]
df3 = df2.head(30)
df3

Unnamed: 0,性別,年齡,收入(千),消費指數(1-100),類別
20,2,45,126,28,-1
21,1,32,126,74,-1
25,2,20,16,6,-1
29,1,19,15,39,-1
45,1,30,137,83,-1
54,1,64,19,3,-1
57,1,59,71,11,-1
61,2,35,19,99,-1
68,1,67,19,14,-1
69,1,33,113,8,-1


## PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Toy example: three samples with six features each.
samples = [
    [2, 8, 4, 5, 9, 3],
    [6, 3, 0, 8, 7, 1],
    [5, 4, 9, 1, 8, 2],
]
# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction of the variance (per the scikit-learn documentation).
pca = PCA(n_components=0.95)
data = pca.fit_transform(samples)
print(data)

[[-0.27506139  4.0732061 ]
 [ 5.92362693 -1.89137796]
 [-5.64856554 -2.18182813]]


In [None]:
import pandas as pd
# Load the four order-related tables and join them into one wide frame `mt`.
prior = pd.read_csv('order_products__prior.csv')
products = pd.read_csv('products.csv')
orders = pd.read_csv('orders.csv')
aisles = pd.read_csv('aisles.csv')
# Each join uses a single key column, so the key is named once rather than
# repeated in a list.
t1 = pd.merge(prior, products, on='product_id')
t1 = pd.merge(t1, orders, on='order_id')
mt = pd.merge(t1, aisles, on='aisle_id')
print(mt.shape)
mt

(32434489, 14)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle
0,2,33120,1,1,Organic Egg Whites,86,16,202279,prior,3,5,9,8.0,eggs
1,26,33120,5,0,Organic Egg Whites,86,16,153404,prior,2,0,16,7.0,eggs
2,120,33120,13,0,Organic Egg Whites,86,16,23750,prior,11,6,8,10.0,eggs
3,327,33120,5,1,Organic Egg Whites,86,16,58707,prior,21,6,9,8.0,eggs
4,390,33120,28,1,Organic Egg Whites,86,16,166654,prior,48,0,12,9.0,eggs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32434484,3243156,20731,1,0,Straight Sherry,134,5,166400,prior,3,1,13,12.0,specialty wines champagnes
32434485,860862,30582,1,0,Natural Champagne,134,5,104017,prior,13,5,14,18.0,specialty wines champagnes
32434486,1333472,27906,1,0,Imperial Champagne,134,5,62079,prior,10,3,10,10.0,specialty wines champagnes
32434487,2122701,26086,1,1,La Grand Dame Brut Champagne,134,5,77799,prior,2,3,14,3.0,specialty wines champagnes


In [None]:
# Count, for every user, how many merged rows fall in each aisle:
# one row per user_id, one column per aisle.
cross = pd.crosstab(index=mt['user_id'], columns=mt['aisle'])
print(cross.shape)
cross

(206209, 134)


aisle,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,baking supplies decor,beauty,beers coolers,body lotions soap,bread,breakfast bakery,breakfast bars pastries,bulk dried fruits vegetables,bulk grains rice dried goods,buns rolls,butter,candy chocolate,canned fruit applesauce,canned jarred vegetables,canned meals beans,canned meat seafood,cat food care,cereal,chips pretzels,cleaning products,cocoa drink mixes,coffee,cold flu allergy,condiments,cookies cakes,crackers,cream,deodorants,diapers wipes,digestion,dish detergents,dog food care,doughs gelatins bake mixes,...,packaged meat,packaged poultry,packaged produce,packaged seafood,packaged vegetables fruits,paper goods,pasta sauce,pickled goods olives,plates bowls cups flatware,popcorn jerky,poultry counter,prepared meals,prepared soups salads,preserved dips spreads,protein meal replacements,red wines,refrigerated,refrigerated pudding desserts,salad dressing toppings,seafood counter,shave needs,skin care,soap,soft drinks,soup broth bouillon,soy lactosefree,specialty cheeses,specialty wines champagnes,spices seasonings,spirits,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,1
2,0,3,0,0,0,0,2,0,0,0,0,1,1,2,0,0,0,0,0,0,0,0,1,0,0,12,0,0,0,0,5,0,11,0,0,0,0,0,0,0,...,0,1,0,0,3,0,0,1,0,10,0,0,1,0,0,0,5,0,0,0,0,0,0,0,0,1,0,0,0,0,3,1,1,0,0,0,0,2,0,42
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,...,0,0,0,0,14,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,4,1,0,0,0,0,0,2,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
5,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206205,0,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5
206206,0,4,0,0,0,0,4,1,0,0,0,1,1,0,0,0,0,1,5,0,2,0,0,0,1,11,0,0,0,0,2,0,1,16,0,0,0,2,0,0,...,0,0,0,0,6,3,0,0,5,0,0,0,0,1,0,0,19,0,2,0,0,0,8,8,7,1,3,0,3,0,1,0,0,0,0,1,0,1,0,0
206207,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,5,4,0,0,1,8,0,0,0,0,5,0,5,0,0,0,0,0,0,2,...,2,0,0,0,15,0,6,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,2,0,0,0,4,0,3,4,0,2,1,0,0,11,0,15
206208,0,3,0,0,3,0,4,0,0,0,0,55,0,0,0,0,0,22,2,0,3,6,0,0,13,12,0,0,7,0,6,0,2,16,0,0,0,4,0,5,...,1,4,3,5,71,0,2,0,2,0,1,0,0,0,0,0,10,0,1,0,0,0,5,0,2,24,0,0,7,0,5,0,0,7,0,0,0,0,0,33


In [None]:
# Reduce the user-by-aisle count table with PCA, keeping enough components
# to explain 95% of the variance (per the scikit-learn documentation).
# Alternative setting left disabled: PCA(n_components=0.9)
variance_to_keep = 0.95
pca = PCA(n_components=variance_to_keep)
data = pca.fit_transform(cross)
print(data.shape)

(206209, 44)
