In [1]:
import numpy as np
import pandas as pd
pd.options.plotting.backend = "plotly"  # pandas 버전 1.0 이상만 가능할 것

# import matplotlib.pyplot as plt
# plt.rcParams['axes.unicode_minus'] = False
# plt.rcParams["font.family"] = 'malgun gothic'

import pyproj
from pyproj import Proj, transform
import folium
from folium import plugins

import glob
import tqdm
import gc


def katec_to_wgs84(x, y):
    inProj  = Proj('+proj=tmerc +lat_0=38 +lon_0=128 +k=0.9999 +x_0=400000 +y_0=600000 +ellps=bessel +units=m +no_defs +towgs84=-115.80,474.99,674.11,1.16,-2.31,-1.63,6.43')
    outProj = Proj({ 'proj':'latlong', 'datum':'WGS84', 'ellps':'WGS84' })
    return transform( inProj, outProj, x, y )


def show_table(df, sample_num=2):
    print('>>> shape :', df.shape)
    print('>>> number of NA :', df.isna().sum().sum())
    
    if df.shape[0] <= sample_num*2:
        display(df)
    else:
        display(df.head(sample_num))
        display(df.tail(sample_num))


print('>>> 본 소스 작성 시 사용된 라이브러리 버전에 유의하세요!')
f'pd : {pd.__version__}  |  np : {np.__version__}  |  pyproj : {pyproj.__version__}  |  folium : {folium.__version__}  |  tqdm : {tqdm.__version__}'

>>> 본 소스 작성 시 사용된 라이브러리 버전에 유의하세요!


'pd : 1.0.5  |  np : 1.16.5  |  pyproj : 2.4.2.post1  |  folium : 0.11.0  |  tqdm : 4.47.0'

## 데이터 로드 (행정동코드-행정동명 테이블)

In [2]:
dong_nm_df = pd.read_excel('data/전국 누적 행정동코드 정의서_200910.xlsx').drop(columns=['sido_cd','sido_nm', 'sgg_cd'])
dong_nm_df = dong_nm_df[(dong_nm_df.sgg_nm.str.find('전주')!=-1) & (dong_nm_df.sgg_nm.notna())].reset_index(drop=True)
dong_nm_df['admdong_cd'] = dong_nm_df['admdong_cd'].astype(int)
dong_nm_df = dong_nm_df.replace('동산동', '여의동')  # 행정동명 변경내용 반영

show_table(dong_nm_df)

>>> shape : (36, 3)
>>> number of NA : 0


Unnamed: 0,sgg_nm,admdong_cd,admdong_nm
0,전주시 완산구,45111660,서서학동
1,전주시 완산구,45111714,효자4동


Unnamed: 0,sgg_nm,admdong_cd,admdong_nm
34,전주시 완산구,45111650,동서학동
35,전주시 덕진구,45113600,팔복동


## 데이터 로드 (50셀 정보 테이블)

In [3]:
cell_df = pd.read_csv('data/50cell_info_df_JEONJU.csv').rename(columns={'admi_cd':'admdong_cd'})
cell_df = cell_df.merge(dong_nm_df, 'left', 'admdong_cd')

print(cell_df.shape)
print(cell_df.isna().sum().sum())

print('\n\n>>> 첫 행 - id 최솟값 행 (덕진구 조촌동 - 전주 최서단) ↓')
display(cell_df.sort_values(['id']).head(1))

print('\n\n>>> 마지막 행 - id 최댓값 행 (완산구 동서학동 - 전주 최동남단) ↓')
display(cell_df.sort_values(['id']).tail(1))

(37906, 8)
0


>>> 첫 행 - id 최솟값 행 (덕진구 조촌동 - 전주 최서단) ↓


Unnamed: 0,id,x,y,admdong_cd,lat,lng,sgg_nm,admdong_nm
35631,50614860,309802,365343,45113650,35.883924,126.998829,전주시 덕진구,조촌동




>>> 마지막 행 - id 최댓값 행 (완산구 동서학동 - 전주 최동남단) ↓


Unnamed: 0,id,x,y,admdong_cd,lat,lng,sgg_nm,admdong_nm
36247,55380574,329352,356093,45111650,35.802164,127.216155,전주시 완산구,동서학동


## 50셀 x, y 좌표 확인

In [4]:
# # plotly로 확인

# cell_df.merge(dong_nm_df, how='left', on='admdong_cd').\
#         plot.scatter('x', 'y', width=1000, height=800, hover_data=['id', 'sgg_nm', 'admdong_nm'],
#                      title='전주시 전체 50셀', color='admdong_nm')

In [5]:
# # folium으로 확인

# def plot_row_folium(row):
#     folium.Circle([row['lat'], row['lng']],
#                   tooltip=f"> id : {row['id']}<br>" +
#                           f"> region : {row['sgg_nm']} {row['admdong_nm']}<br>" +
#                           f"> dong_cd : {row['admdong_cd']}",
# #                           f"> center_latlng : ({row['center_lat']:.2f}, {row['center_lng']:.2f})<br>" +
# #                           f"> subCELL_count : {row['subCELL_count']}",
#                   radius=25, fill=True, color='red' ).add_to(map1)


# start_point = [cell_df.lat.mean(), cell_df.lng.mean()]
# dong_nm_list = ['조촌동', '삼천3동', '평화2동', '동서학동']
# dong_cd_list = dong_nm_df.loc[dong_nm_df.admdong_nm.isin(dong_nm_list), 'admdong_cd'].tolist()

# map1 = folium.Map(location=start_point, zoom_start=12)
# cell_df[cell_df.admdong_cd.isin(dong_cd_list)] \
#     .merge(dong_nm_df, how='left', on='admdong_cd') \
#     .apply(plot_row_folium, axis=1)


# map1.save(f'source_output/50CELL-Map_JEONJU.html')
# map1

## 50셀 정보 테이블 수정 (군집셀 생성)

In [6]:
# # x-y 좌표별 최대/최소값 확인
# display(cell_df.x.agg(['min', 'max']))
# display(cell_df.y.agg(['min', 'max']))

In [7]:
cell_size = 150
x_min, x_max, y_min, y_max = cell_df.x.min(), cell_df.x.max(), cell_df.y.min(), cell_df.y.max()

def cv_cell(x,y):
    a = ((x - x_min)//cell_size)*cell_size + (cell_size//2 -25) + x_min
    b = ((y - y_min)//cell_size)*cell_size + (cell_size//2 -25) + y_min
    return(a,b)

cell_df['bigcell_x'], cell_df['bigcell_y'] = cv_cell(cell_df['x'], cell_df['y'])
cell_df['bigcell_id'] = (cell_df['bigcell_x']//100).multiply(10000).add(cell_df['bigcell_y']//100)

latlng_tup = katec_to_wgs84(cell_df.bigcell_x.to_list(), cell_df.bigcell_y.to_list())
cell_df['bigcell_lat'] = latlng_tup[1]
cell_df['bigcell_lng'] = latlng_tup[0]

print(cell_df.shape, cell_df.bigcell_id.nunique())
display(cell_df.sort_values('bigcell_id').head(1))
display(cell_df.sort_values('bigcell_id').tail(1))

(37906, 13) 5627


Unnamed: 0,id,x,y,admdong_cd,lat,lng,sgg_nm,admdong_nm,bigcell_x,bigcell_y,bigcell_id,bigcell_lat,bigcell_lng
35663,50639235,309902,365193,45113650,35.882581,126.999953,전주시 덕진구,조촌동,309852,365243,30983652,35.883027,126.999394


Unnamed: 0,id,x,y,admdong_cd,lat,lng,sgg_nm,admdong_nm,bigcell_x,bigcell_y,bigcell_id,bigcell_lat,bigcell_lng
36634,55380573,329352,356043,45111650,35.801714,127.216159,전주시 완산구,동서학동,329352,356093,32933560,35.802164,127.216155


In [8]:
# folium으로 확인

tmp_df = cell_df.groupby(['bigcell_id', 'bigcell_lat', 'bigcell_lng', 'sgg_nm', 'admdong_nm', 'admdong_cd'])['id'].count() \
            .reset_index().drop_duplicates('bigcell_id').rename(columns={'id':'num_50cell'})


def plot_row_folium(row):
    folium.Circle([row['bigcell_lat'], row['bigcell_lng']],
                  tooltip=f"> bigcell_id : {row['bigcell_id']}<br>" +
                          f"> region : {row['sgg_nm']} {row['admdong_nm']}<br>" +
                          f"> dong_cd : {row['admdong_cd']}<br>" +
                          f"> num_50cell : {row['num_50cell']}<br>",
#                           f"> center_latlng : ({row['center_lat']:.2f}, {row['center_lng']:.2f})<br>" +
                  radius=cell_size/2, color='red', weight=1, fill=True, fill_color='magenta', fill_opacity=0.3 ).add_to(map1)


start_point = [tmp_df.bigcell_lat.mean(), tmp_df.bigcell_lng.mean()]
# dong_nm_list = ['조촌동', '삼천3동', '평화2동', '동서학동']
# dong_cd_list = dong_nm_df.loc[dong_nm_df.admdong_nm.isin(dong_nm_list), 'admdong_cd'].tolist()

map1 = folium.Map(location=start_point, zoom_start=12)

# cell_df[cell_df.admdong_cd.isin(dong_cd_list)] \
tmp_df \
    .apply(plot_row_folium, axis=1)
map1.save(f'source_output/{cell_size}CELL-Map_JEONJU.html')
map1

## 군집셀 정보 테이블 생성 (bigcell_df)

In [9]:
bigcell_df = cell_df.groupby(['bigcell_id', 'bigcell_x', 'bigcell_y', 'bigcell_lat', 'bigcell_lng'])['id'].count() \
                .reset_index().rename(columns={'id':'num_50cell'})
bigcell_dong_df = cell_df.groupby(['bigcell_id'])['admdong_nm'].agg(lambda x:x.value_counts().index[0]).reset_index()


bigcell_df = bigcell_df.merge(bigcell_dong_df, 'left', 'bigcell_id') \
                .merge(dong_nm_df[['admdong_nm', 'sgg_nm']], 'left', 'admdong_nm') \
                .drop_duplicates([c for c in bigcell_df])
show_table(bigcell_df)

>>> shape : (5627, 8)
>>> number of NA : 0


Unnamed: 0,bigcell_id,bigcell_x,bigcell_y,bigcell_lat,bigcell_lng,num_50cell,admdong_nm,sgg_nm
0,30983652,309852,365243,35.883027,126.999394,3,조촌동,전주시 덕진구
1,30983653,309852,365393,35.884379,126.999377,2,조촌동,전주시 덕진구


Unnamed: 0,bigcell_id,bigcell_x,bigcell_y,bigcell_lat,bigcell_lng,num_50cell,admdong_nm,sgg_nm
6192,32933559,329352,355943,35.800813,127.216168,3,동서학동,전주시 완산구
6193,32933560,329352,356093,35.802164,127.216155,4,동서학동,전주시 완산구


## 결과 검증 - 군집셀별 통계량 확인
- 50셀 갯수 : 최소 1개에서 최대 (cell_size/50)^2 사이가 정상
- 거리 : 최소 50에서 최대 cell_szie 사이가 정상
- 중간 지점 x,y 좌표 : id와 대조해 정상인지 확인

In [10]:
print('>>> 군집셀별 50셀 갯수')
display(cell_df.groupby('bigcell_id')['bigcell_x'].count().sort_values().to_frame().T)
display(bigcell_df.set_index('bigcell_id')[['num_50cell']].sort_values('num_50cell').T)

print('\n\n>>> 군집셀별 가로*세로 거리 (미터)')
display((cell_df.groupby('bigcell_id')[['x', 'y']].max() - cell_df.groupby('bigcell_id')[['x', 'y']].min()).add(50).sort_values(['x', 'y']).T)

print('\n\n>>> 군집셀별 중간 지점 x, y 좌표')
display(cell_df.groupby('bigcell_id')[['x', 'y']].agg(['mean', 'median']).astype(int).sort_index().T)
display(bigcell_df.set_index('bigcell_id')[['bigcell_x', 'bigcell_y']].sort_index().T)

>>> 군집셀별 50셀 개수


bigcell_id,32603548,32513550,31973505,31963659,32513553,32513559,31963652,31963649,32513571,31963533,...,32093593,32093595,32093596,32093598,32093599,32093601,32093602,32093604,31573593,32023584
bigcell_x,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9


bigcell_id,32603548,32513550,31973505,31963659,32513553,32513559,31963652,31963649,32513571,31963533,...,32093593,32093595,32093596,32093598,32093599,32093601,32093602,32093604,31573593,32023584
num_50cell,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9




>>> 군집셀별 가로*세로 거리 (미터)


bigcell_id,30983655,31003652,31003661,31013649,31013650,31033656,31033659,31033661,31063635,31103664,...,32893559,32893560,32893562,32903556,32903557,32903559,32903560,32923557,32923559,32923560
x,50,50,50,50,50,50,50,50,50,50,...,150,150,150,150,150,150,150,150,150,150
y,50,50,50,50,50,50,50,50,50,50,...,150,150,150,150,150,150,150,150,150,150




>>> 군집셀별 중간 지점 x, y 좌표


Unnamed: 0,bigcell_id,30983652,30983653,30983655,31003650,31003652,31003655,31003656,31003658,31003661,31013649,...,32903557,32903559,32903560,32903562,32923556,32923557,32923559,32923560,32933559,32933560
x,mean,309902,309827,309802,310002,310052,310027,310002,310027,310052,310102,...,329052,329052,329052,329039,329177,329202,329202,329202,329302,329327
x,median,309902,309827,309802,310002,310052,310027,310002,310027,310052,310102,...,329052,329052,329052,329027,329177,329202,329202,329202,329302,329327
y,mean,365243,365343,365493,365093,365293,365493,365743,365843,366093,364993,...,355793,355943,356093,356205,355668,355793,355943,356093,355943,356068
y,median,365243,365343,365493,365093,365293,365493,365743,365843,366093,364993,...,355793,355943,356093,356193,355668,355793,355943,356093,355943,356068


bigcell_id,30983652,30983653,30983655,31003650,31003652,31003655,31003656,31003658,31003661,31013649,...,32903557,32903559,32903560,32903562,32923556,32923557,32923559,32923560,32933559,32933560
bigcell_x,309852,309852,309852,310002,310002,310002,310002,310002,310002,310152,...,329052,329052,329052,329052,329202,329202,329202,329202,329352,329352
bigcell_y,365243,365393,365543,365093,365243,365543,365693,365843,366143,364943,...,355793,355943,356093,356243,355643,355793,355943,356093,355943,356093


## 최종 df 저장 및 확인

In [11]:
# 50셀별 정보 테이블
outfile_dir = f'source_output/50CELL-{cell_size}ELL_Jeonju.csv'

cell_df = cell_df[['id', 'x', 'y', 'lat', 'lng', 'sgg_nm', 'admdong_nm', 'admdong_cd', 'bigcell_id', 'bigcell_x', 'bigcell_y', 'bigcell_lat', 'bigcell_lng']] \
            .reset_index(drop=True)
cell_df.to_csv(outfile_dir, index=False)

# 결과 검증
drop_cols = [c for c in cell_df if 'lat' in c or 'lng' in c]
print(pd.read_csv(outfile_dir).drop(columns=drop_cols).equals(cell_df.drop(columns=drop_cols)))  # lat, lng 값은 I/O시 미세하게 손실
show_table(pd.read_csv(outfile_dir))

True
>>> shape : (37906, 13)
>>> number of NA : 0


Unnamed: 0,id,x,y,lat,lng,sgg_nm,admdong_nm,admdong_cd,bigcell_id,bigcell_x,bigcell_y,bigcell_lat,bigcell_lng
0,51004909,311402,365393,35.884521,127.016542,전주시 덕진구,조촌동,45113650,31133653,311352,365393,35.884516,127.015988
1,51370552,312902,364043,35.872489,127.033301,전주시 덕진구,조촌동,45113650,31283640,312852,364043,35.872485,127.032747


Unnamed: 0,id,x,y,lat,lng,sgg_nm,admdong_nm,admdong_cd,bigcell_id,bigcell_x,bigcell_y,bigcell_lat,bigcell_lng
37904,54649291,326352,358943,35.82763,127.182702,전주시 덕진구,우아2동,45113612,32633589,326352,358943,35.82763,127.182702
37905,53320601,320902,354493,35.7871,127.122829,전주시 완산구,평화2동,45111692,32093544,320952,354443,35.786653,127.123387


In [12]:
# 군집셀별 정보 테이블
outfile_dir = f'source_output/{cell_size}ELL_info_Jeonju.csv'

bigcell_df = bigcell_df[['bigcell_id', 'bigcell_x', 'bigcell_y', 'bigcell_lat', 'bigcell_lng', 'num_50cell', 'sgg_nm', 'admdong_nm']] \
                .reset_index(drop=True)
bigcell_df.to_csv(outfile_dir, index=False)

# 결과 검증
drop_cols = [c for c in bigcell_df if 'lat' in c or 'lng' in c]
print(pd.read_csv(outfile_dir).drop(columns=drop_cols).equals(bigcell_df.drop(columns=drop_cols)))  # lat, lng 값은 I/O시 미세하게 손실
show_table(bigcell_df)

True
>>> shape : (5627, 8)
>>> number of NA : 0


Unnamed: 0,bigcell_id,bigcell_x,bigcell_y,bigcell_lat,bigcell_lng,num_50cell,sgg_nm,admdong_nm
0,30983652,309852,365243,35.883027,126.999394,3,전주시 덕진구,조촌동
1,30983653,309852,365393,35.884379,126.999377,2,전주시 덕진구,조촌동


Unnamed: 0,bigcell_id,bigcell_x,bigcell_y,bigcell_lat,bigcell_lng,num_50cell,sgg_nm,admdong_nm
5625,32933559,329352,355943,35.800813,127.216168,3,전주시 완산구,동서학동
5626,32933560,329352,356093,35.802164,127.216155,4,전주시 완산구,동서학동
