In [2]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv("NHANES_sample_data.csv")
del df["RIDRETH3"], df["WTINT2YR"], df["WTMEC2YR"], df["DR1TCARB"]
del df["DR1TTFAT"], df["LBXGLU"], df["BMXHT"]
df.columns = ["ID", "Age", "Gender", "Body_weight", "BMI", "HbA1c", "Glucose"]
df = df.dropna()

In [7]:
# 条件式
age_if = df["Age"]>=20
#bmi_if = 18.0<=df["BMI"]<=50 ...これできない
bmi_if = (18.0<=df["BMI"]) & (df["BMI"]<=50)
Glucose_if = 4.0<=df["Glucose"]
HbA1c_if = (4.0<=df["HbA1c"]) & (df["HbA1c"]<=10)
df_lim_if = age_if & bmi_if & Glucose_if & HbA1c_if

# filter
df_lim = df[df_lim_if]

In [10]:
# Q1
df_lim_corr = df_lim.corr()
df_lim_corr

Unnamed: 0,ID,Age,Gender,Body_weight,BMI,HbA1c,Glucose
ID,1.0,0.009234,-0.027366,-0.055117,-0.066573,0.029348,0.021683
Age,0.009234,1.0,-0.011334,-0.047174,0.036309,0.402987,0.288616
Gender,-0.027366,-0.011334,1.0,-0.265481,0.082842,-0.049861,-0.131898
Body_weight,-0.055117,-0.047174,-0.265481,1.0,0.865279,0.200441,0.21193
BMI,-0.066573,0.036309,0.082842,0.865279,1.0,0.245395,0.217056
HbA1c,0.029348,0.402987,-0.049861,0.200441,0.245395,1.0,0.717413
Glucose,0.021683,0.288616,-0.131898,0.21193,0.217056,0.717413,1.0


In [30]:
# Q2
pd.concat([df_lim.mean(), df_lim.min(), df_lim.max()
            , df_lim.median(), df_lim.mode().iloc[0,:]
            , df_lim.quantile(.25), df_lim.quantile(.75)
            , df_lim.std()], axis=1
            , keys=["mean", "min", "max", "median", "mode", "quantile25", "quantile75", "std"])

Unnamed: 0,mean,min,max,median,mode,quantile25,quantile75,std
ID,78688.239518,73559.0,83727.0,78723.0,73559.0,76192.0,81204.0,2920.558198
Age,49.775425,20.0,80.0,49.0,80.0,35.0,64.0,17.473633
Gender,1.513076,1.0,2.0,2.0,2.0,1.0,2.0,0.499933
Body_weight,80.394396,40.2,179.4,77.8,71.8,66.2,91.4,19.911631
BMI,28.647655,18.1,50.0,27.6,23.9,23.9,32.0,6.249964
HbA1c,5.632752,4.2,9.8,5.5,5.4,5.2,5.9,0.716073
Glucose,5.805453,4.052,20.428,5.495,5.218,5.107,5.995,1.315721


In [126]:
#Q4
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
df_clus = df_lim[["BMI", "Glucose"]]

#標準化　標準偏差1
scaler = StandardScaler()
scaler.fit(df_clus)

df_clus = pd.DataFrame(scaler.transform(df_clus))
pred = KMeans(n_clusters=4).fit_predict(df_clus)

#結果結合
df_clus = pd.concat([df_clus, pd.DataFrame(pred)]
                    , axis=1, ignore_index=True)
df_clus.columns = ["BMI", "Glucose", "cluster"]

#集計
grouped = df_clus.groupby("cluster")
result = grouped.mean()
result

Unnamed: 0_level_0,BMI,Glucose
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.214587,-0.010171
1,1.806723,-0.027777
2,-0.843313,-0.408885
3,0.482872,3.143665


In [125]:
#距離算出
df_clus_0 = df_clus.where(df_clus["cluster"]==0).dropna()
df_clus_0["dist"] = np.square(df_clus_0["BMI"] - df_clus_0["BMI"].mean()) \
                                    + np.square(df_clus_0["Glucose"] - df_clus_0["Glucose"].mean())
df_clus_1 = df_clus.where(df_clus["cluster"]==1).dropna()
df_clus_1["dist"] = np.square(df_clus_1["BMI"] - df_clus_1["BMI"].mean()) \
                                    + np.square(df_clus_1["Glucose"] - df_clus_1["Glucose"].mean())
df_clus_2 = df_clus.where(df_clus["cluster"]==2).dropna()
df_clus_2["dist"] = np.square(df_clus_2["BMI"] - df_clus_2["BMI"].mean()) \
                                    + np.square(df_clus_2["Glucose"] - df_clus_2["Glucose"].mean())
df_clus_3 = df_clus.where(df_clus["cluster"]==3).dropna()
df_clus_3["dist"] = np.square(df_clus_3["BMI"] - df_clus_3["BMI"].mean()) \
                                    + np.square(df_clus_3["Glucose"] - df_clus_3["Glucose"].mean())

#整形、描画
result = pd.concat([df_clus_0.loc[df_clus_0["dist"].idxmin(), :], df_clus_1.loc[df_clus_1["dist"].idxmin(), :]
          ,df_clus_2.loc[df_clus_2["dist"].idxmin(), :],df_clus_3.loc[df_clus_3["dist"].idxmin(), :]], axis=1)
result.columns = ["cluster 1", "cluster 2", "cluster 3", "cluster 4"]
result.loc["BMI":"Glucose", :]

Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4
BMI,0.200418,-0.839804,1.84877,0.392459
Glucose,0.0179,-0.404769,-0.024671,3.055649
