# 統合された病院までの平均距離

- こちらも、病院数（グループ数）を決定するうえでの参照データとして作成しています
- 各人口データ点から全ての統合後病院までの距離を総当たりで計算しているため、効率的ではありません

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

# Prefecture key; it selects both the dataset sub-folder and the file-name suffix.
prefecture = 'ibaraki'

# Hospital table (first column used as the index) and its coordinates as an array
# with columns ordered (latitude, longitude), which is what the clustering is fit on.
df = pd.read_excel(f'dataset/{prefecture}/hospitals_{prefecture}.xlsx', index_col=0)
X = df.loc[:, ['latitude', 'longitude']].to_numpy()

# Density table (first column used as the index); used as weights for the mean distance.
df_density = pd.read_excel(f'dataset/{prefecture}/human_density_{prefecture}.xlsx', index_col=0)

# Candidate numbers of clusters: 10, 14, ..., 98.
n_clusters_list = np.arange(start=10, stop=100, step=4, dtype=int)

In [None]:
def min_distance(Y_CODE, X_CODE, hospitals=None):
    """Return the smallest Euclidean distance from a point to any merged hospital.

    The distance is the plain Euclidean norm of the coordinate differences
    (``Y_CODE`` against ``latitude``, ``X_CODE`` against ``longitude``); no
    geodesic correction is applied, so the result is in the same units as
    the coordinates.

    Args:
        Y_CODE: Y coordinate of the query point.
        X_CODE: X coordinate of the query point.
        hospitals: Optional DataFrame with ``latitude`` and ``longitude``
            columns. When omitted, the module-level ``df_hospitals_merged``
            is used, which is how the clustering loop calls this function.

    Returns:
        The minimum distance, never larger than 1000000. 1000000 is also
        returned when there is no hospital row with a comparable distance.
    """
    if hospitals is None:
        hospitals = df_hospitals_merged

    # The merged table may carry object-dtype columns, so coerce to float
    # before doing array arithmetic.
    lat = hospitals['latitude'].to_numpy(dtype=float)
    lon = hospitals['longitude'].to_numpy(dtype=float)

    # One vectorized pass instead of a Python-level loop over every row.
    distances = np.sqrt((Y_CODE - lat) ** 2 + (X_CODE - lon) ** 2)

    # Rows with missing coordinates never counted as "closer" in the
    # row-by-row comparison; drop them so they cannot poison the minimum.
    distances = distances[~np.isnan(distances)]

    d_min = 1000000  # sentinel / upper bound kept for backward compatibility
    if distances.size:
        d_min = min(distances.min(), d_min)
    return d_min


In [None]:
def _select_representatives(labeled, label_column):
    """Return one row per cluster: the hospital with the most doctors.

    Ties are resolved by keeping the first matching row of the group. A group
    whose ``num_doctors`` maximum matches no row (e.g. all values missing)
    contributes nothing.
    """
    picks = []
    for _, group in labeled.groupby(label_column):
        top = group[group['num_doctors'] == group['num_doctors'].max()].iloc[0:1]
        if not top.empty:
            picks.append(top)
    if not picks:
        return pd.DataFrame(columns=['name', 'postal_code', 'latitude', 'longitude', 'num_doctors'])
    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    return pd.concat(picks, ignore_index=True)


def _weighted_mean_distance(density):
    """Return the POP_DENSITY-weighted mean of ``min_distance`` over all rows.

    ``min_distance`` is called without an explicit hospital table, so it reads
    the global ``df_hospitals_merged`` that the loop below assigns beforehand.
    """
    total = 0
    for weight, y_code, x_code in zip(density['POP_DENSITY'], density['Y_CODE'], density['X_CODE']):
        total += weight * min_distance(y_code, x_code)
    return total / density['POP_DENSITY'].sum()


# One record per tested cluster count; turned into DataFrames after the loop.
kmeans_records = []
ward_records = []

for n_clusters in n_clusters_list:
    print(n_clusters)

    # k-means
    df_ = df.copy()
    # n_init is pinned to 10 so results do not depend on the scikit-learn
    # version's default (it changed from 10 to 'auto').
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(X)
    labels = kmeans.labels_
    df_['kmeans_label'] = [str(label) for label in labels]

    ## Merged hospitals (must be the global name: min_distance reads it)
    df_hospitals_merged = _select_representatives(df_, 'kmeans_label')

    ## mean_distance
    mean_distance = _weighted_mean_distance(df_density)
    kmeans_records.append({'n_clusters': n_clusters, 'mean_distance': mean_distance})

    #----------------------------------

    # Ward algorithm (AgglomerativeClustering with its default settings)
    df_ = df.copy()
    clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(X)
    labels = clustering.labels_
    df_['Ward_label'] = [str(label) for label in labels]

    ## Merged hospitals (must be the global name: min_distance reads it)
    df_hospitals_merged = _select_representatives(df_, 'Ward_label')

    ## mean_distance
    mean_distance = _weighted_mean_distance(df_density)
    ward_records.append({'n_clusters': n_clusters, 'mean_distance': mean_distance})

df_kmeans = pd.DataFrame(kmeans_records, columns=['n_clusters', 'mean_distance'])
df_ward = pd.DataFrame(ward_records, columns=['n_clusters', 'mean_distance'])
        


In [None]:
# A font with Japanese glyphs is selected because the title and axis labels are Japanese.
plt.rcParams['font.family'] = 'IPAGothic'
fig, ax = plt.subplots(figsize=(3, 2))

# (data, colour, marker, legend label) for each clustering method.
series_specs = (
    (df_kmeans, "tomato", 'o', "K-means"),
    (df_ward, "royalblue", 'X', "Ward"),
)
for frame, colour, marker_style, legend_label in series_specs:
    sns.scatterplot(
        data=frame,
        x="n_clusters",
        y="mean_distance",
        color=colour,
        marker=marker_style,
        label=legend_label,
        ax=ax,
    )

# NOTE: the title is hard-coded for Ibaraki; update it if `prefecture` changes.
ax.set_title("茨城県")
ax.legend(bbox_to_anchor=(1, 1), loc='upper right', fontsize=9)
ax.set_xlabel('病院数')
ax.set_ylabel('平均距離')
fig.tight_layout()

output_path = 'results/' + prefecture + '/' + prefecture + '_mean_distance.pdf'
fig.savefig(output_path)