In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from shapely.geometry import Point

import folium
from folium.plugins import MarkerCluster, MiniMap

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

### 2022 하남 유동인구 구하기

In [4]:
# Load the Hanam-si floating-population table broken down by sex and age band.
df_pop = pd.read_csv(
    filepath_or_buffer='../data/2-8.하남시_성연령별_유동인구.csv',
)
df_pop

Unnamed: 0,STD_YM,m_10g_pop,m_20g_pop,m_30g_pop,m_40g_pop,m_50g_pop,m_60g_pop,w_10g_pop,w_20g_pop,w_30g_pop,w_40g_pop,w_50g_pop,w_60g_pop,lon,lat
0,202001,0.09,0.21,0.67,0.97,1.07,0.99,0.09,0.14,0.35,0.51,0.52,0.42,127.140578,37.508042
1,202001,0.20,0.44,1.34,1.96,2.28,2.16,0.22,0.31,0.70,1.04,1.15,0.96,127.140576,37.508492
2,202001,0.22,0.42,1.26,1.84,2.08,1.96,0.22,0.31,0.67,0.97,1.06,0.92,127.140573,37.508943
3,202001,0.45,0.85,2.42,3.44,3.97,3.99,0.46,0.67,1.36,1.85,2.06,1.92,127.140571,37.509394
4,202001,0.23,0.51,1.53,2.24,2.59,2.43,0.25,0.37,0.81,1.21,1.34,1.17,127.140569,37.509844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718315,202212,0.09,0.07,0.18,0.37,0.23,0.21,0.09,0.07,0.18,0.30,0.16,0.16,127.153180,37.474278
718316,202212,0.21,0.21,0.55,0.88,0.72,0.74,0.21,0.26,0.51,0.78,0.51,0.51,127.153748,37.473829
718317,202212,0.18,0.16,0.44,0.74,0.64,0.72,0.18,0.21,0.39,0.60,0.42,0.44,127.153745,37.474280
718318,202212,0.21,0.19,0.44,0.65,0.62,0.71,0.25,0.23,0.37,0.57,0.44,0.46,127.154313,37.473831


In [5]:
# Keep only the rows whose STD_YM falls in 202201..202212 (both ends inclusive).
is_2022 = df_pop['STD_YM'].between(202201, 202212)
df_pop = df_pop[is_2022]
df_pop

Unnamed: 0,STD_YM,m_10g_pop,m_20g_pop,m_30g_pop,m_40g_pop,m_50g_pop,m_60g_pop,w_10g_pop,w_20g_pop,w_30g_pop,w_40g_pop,w_50g_pop,w_60g_pop,lon,lat
454437,202201,7.84,21.75,47.01,77.75,82.85,79.44,8.37,15.27,23.82,34.63,34.13,34.59,127.222581,37.515474
454438,202201,2.95,7.95,18.12,31.40,34.73,33.41,2.83,5.26,8.48,13.08,13.79,14.13,127.222580,37.515924
454439,202201,0.16,0.37,0.79,1.36,1.61,1.70,0.14,0.25,0.41,0.62,0.69,0.74,127.222578,37.516375
454440,202201,1.13,2.65,5.79,9.69,11.51,12.01,1.27,1.89,3.02,4.71,5.12,5.37,127.222576,37.516826
454441,202201,4.22,7.86,16.44,29.07,34.89,36.45,4.17,6.02,8.65,13.01,14.36,16.40,127.223151,37.514574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718315,202212,0.09,0.07,0.18,0.37,0.23,0.21,0.09,0.07,0.18,0.30,0.16,0.16,127.153180,37.474278
718316,202212,0.21,0.21,0.55,0.88,0.72,0.74,0.21,0.26,0.51,0.78,0.51,0.51,127.153748,37.473829
718317,202212,0.18,0.16,0.44,0.74,0.64,0.72,0.18,0.21,0.39,0.60,0.42,0.44,127.153745,37.474280
718318,202212,0.21,0.19,0.44,0.65,0.62,0.71,0.25,0.23,0.37,0.57,0.44,0.46,127.154313,37.473831


In [6]:
# Sex/age-band population columns are the ones prefixed with "m_" or "w_".
filtered_columns = [
    name for name in df_pop.columns
    if name.startswith(("m_", "w_"))
]

# Month and coordinates first, followed by the population columns.
selected_columns = ["STD_YM", "lon", "lat", *filtered_columns]
df_pop_gen = df_pop[selected_columns]
df_pop_gen

Unnamed: 0,STD_YM,lon,lat,m_10g_pop,m_20g_pop,m_30g_pop,m_40g_pop,m_50g_pop,m_60g_pop,w_10g_pop,w_20g_pop,w_30g_pop,w_40g_pop,w_50g_pop,w_60g_pop
454437,202201,127.222581,37.515474,7.84,21.75,47.01,77.75,82.85,79.44,8.37,15.27,23.82,34.63,34.13,34.59
454438,202201,127.222580,37.515924,2.95,7.95,18.12,31.40,34.73,33.41,2.83,5.26,8.48,13.08,13.79,14.13
454439,202201,127.222578,37.516375,0.16,0.37,0.79,1.36,1.61,1.70,0.14,0.25,0.41,0.62,0.69,0.74
454440,202201,127.222576,37.516826,1.13,2.65,5.79,9.69,11.51,12.01,1.27,1.89,3.02,4.71,5.12,5.37
454441,202201,127.223151,37.514574,4.22,7.86,16.44,29.07,34.89,36.45,4.17,6.02,8.65,13.01,14.36,16.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718315,202212,127.153180,37.474278,0.09,0.07,0.18,0.37,0.23,0.21,0.09,0.07,0.18,0.30,0.16,0.16
718316,202212,127.153748,37.473829,0.21,0.21,0.55,0.88,0.72,0.74,0.21,0.26,0.51,0.78,0.51,0.51
718317,202212,127.153745,37.474280,0.18,0.16,0.44,0.74,0.64,0.72,0.18,0.21,0.39,0.60,0.42,0.44
718318,202212,127.154313,37.473831,0.21,0.19,0.44,0.65,0.62,0.71,0.25,0.23,0.37,0.57,0.44,0.46


In [7]:
# Total floating population per row: sum only the sex/age-band columns
# (prefixed "m_" / "w_"), i.e. everything except STD_YM, lon and lat.
# Selecting by name instead of by position keeps this cell idempotent:
# re-running it will not fold an already-present pop_sum back into the sum.
pop_columns = [col for col in df_pop_gen.columns if col.startswith(("m_", "w_"))]

# assign() builds a new frame, so nothing is written into a slice of df_pop
# (the in-place column assignment raised SettingWithCopyWarning).
df_pop_gen = df_pop_gen.assign(pop_sum=df_pop_gen[pop_columns].sum(axis=1))

# Keep only the coordinates and the row total (lat, lon, pop_sum).
df_pop_gen_1 = df_pop_gen[['lat', 'lon', 'pop_sum']]
df_pop_gen_1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,lat,lon,pop_sum
454437,37.515474,127.222581,467.45
454438,37.515924,127.222580,186.13
454439,37.516375,127.222578,8.84
454440,37.516826,127.222576,64.16
454441,37.514574,127.223151,191.54
...,...,...,...
718315,37.474278,127.153180,2.11
718316,37.473829,127.153748,6.09
718317,37.474280,127.153745,5.12
718318,37.473831,127.154313,5.14


In [8]:
# Average pop_sum over all rows that share the same (lat, lon) point;
# as_index=False keeps lat/lon as ordinary columns with a fresh RangeIndex.
df_pop_gen_2 = df_pop_gen_1.groupby(['lat', 'lon'], as_index=False).mean()
df_pop_gen_2

Unnamed: 0,lat,lon,pop_sum
0,37.468911,127.167341,11.884167
1,37.468912,127.167906,18.308333
2,37.468914,127.168472,10.152500
3,37.468915,127.169037,16.033333
4,37.469360,127.166773,19.390833
...,...,...,...
23642,37.591117,127.193983,4.648333
23643,37.591539,127.183220,23.318333
23644,37.591544,127.184919,1.115000
23645,37.591550,127.187185,2.361111


In [9]:
# The column now holds a per-point mean, so name it pop_avg.
df_pop_gen_2 = df_pop_gen_2.rename(columns={'pop_sum': 'pop_avg'})

In [10]:
# Print the row count, column dtypes and non-null counts of df_pop_gen_2.
df_pop_gen_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23647 entries, 0 to 23646
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   lat      23647 non-null  float64
 1   lon      23647 non-null  float64
 2   pop_avg  23647 non-null  float64
dtypes: float64(3)
memory usage: 554.4 KB


In [13]:
# Load the Hanam-si grid layer from GeoJSON. geopandas is already imported as
# gpd in the notebook's first cell, so it is not re-imported here.
df_map = gpd.read_file('../data/2-14.하남시_격자.geojson')

In [14]:
# Attach a Point geometry (lon = x, lat = y) to every row and bind the result
# back to df_pop_gen_2. Later cells read df_pop_gen_2['geometry'], so the
# GeoDataFrame must be assigned explicitly rather than relying on the
# constructor sharing data with its input frame, which depends on the pandas
# version. points_from_xy builds all points in one vectorized call instead of
# a row-wise apply.
df_pop_gen_2 = gpd.GeoDataFrame(
    df_pop_gen_2,
    geometry=gpd.points_from_xy(df_pop_gen_2['lon'], df_pop_gen_2['lat']),
    crs="EPSG:4326",
)
df_pop_gen_2


Unnamed: 0,lat,lon,pop_avg,geometry
0,37.468911,127.167341,11.884167,POINT (127.16734 37.46891)
1,37.468912,127.167906,18.308333,POINT (127.16791 37.46891)
2,37.468914,127.168472,10.152500,POINT (127.16847 37.46891)
3,37.468915,127.169037,16.033333,POINT (127.16904 37.46892)
4,37.469360,127.166773,19.390833,POINT (127.16677 37.46936)
...,...,...,...,...
23642,37.591117,127.193983,4.648333,POINT (127.19398 37.59112)
23643,37.591539,127.183220,23.318333,POINT (127.18322 37.59154)
23644,37.591544,127.184919,1.115000,POINT (127.18492 37.59154)
23645,37.591550,127.187185,2.361111,POINT (127.18718 37.59155)


In [15]:
# Display df_pop_gen_2 again to inspect its current columns.
df_pop_gen_2

Unnamed: 0,lat,lon,pop_avg,geometry
0,37.468911,127.167341,11.884167,POINT (127.16734 37.46891)
1,37.468912,127.167906,18.308333,POINT (127.16791 37.46891)
2,37.468914,127.168472,10.152500,POINT (127.16847 37.46891)
3,37.468915,127.169037,16.033333,POINT (127.16904 37.46892)
4,37.469360,127.166773,19.390833,POINT (127.16677 37.46936)
...,...,...,...,...
23642,37.591117,127.193983,4.648333,POINT (127.19398 37.59112)
23643,37.591539,127.183220,23.318333,POINT (127.18322 37.59154)
23644,37.591544,127.184919,1.115000,POINT (127.18492 37.59154)
23645,37.591550,127.187185,2.361111,POINT (127.18718 37.59155)


### 격자와 유동인구 매핑

In [16]:
# Point layer: df_pop_gen_2 already carries a 'geometry' column, so wrap it
# directly as a GeoDataFrame in EPSG:4326.
gdf_points = gpd.GeoDataFrame(
    df_pop_gen_2,
    geometry=df_pop_gen_2['geometry'],
    crs="EPSG:4326",
)

# Grid layer: wrap df_map the same way, using its own 'geometry' column.
# The spatial join in the next step expects these geometries to be polygons.
# NOTE(review): this labels the grid as EPSG:4326 rather than reprojecting it;
# confirm the GeoJSON really is in that CRS.
gdf_grid = gpd.GeoDataFrame(
    df_map,
    geometry=df_map['geometry'],
    crs="EPSG:4326",
)

In [17]:
# Spatial join: keep every point that lies within a grid polygon and attach
# that polygon's columns to it. how="inner" drops points that fall in no
# grid cell.
# `predicate` replaces the deprecated `op` keyword, which newer geopandas
# releases no longer accept.
result = gpd.sjoin(gdf_points, gdf_grid, how="inner", predicate="within")
result

Unnamed: 0,lat,lon,pop_avg,geometry,index_right,gid
0,37.468911,127.167341,11.884167,POINT (127.16734 37.46891),8271,다사705411
4,37.469360,127.166773,19.390833,POINT (127.16677 37.46936),8271,다사705411
5,37.469361,127.167339,8.491667,POINT (127.16734 37.46936),8271,다사705411
1,37.468912,127.167906,18.308333,POINT (127.16791 37.46891),3457,다사706411
2,37.468914,127.168472,10.152500,POINT (127.16847 37.46891),3457,다사706411
...,...,...,...,...,...,...
23642,37.591117,127.193983,4.648333,POINT (127.19398 37.59112),5239,다사729546
23643,37.591539,127.183220,23.318333,POINT (127.18322 37.59154),3611,다사720547
23644,37.591544,127.184919,1.115000,POINT (127.18492 37.59154),5842,다사721547
23645,37.591550,127.187185,2.361111,POINT (127.18718 37.59155),3111,다사723547


In [18]:
# Count missing values per column of the joined table.
result.isna().sum()

lat            0
lon            0
pop_avg        0
geometry       0
index_right    0
gid            0
dtype: int64

In [19]:
# Collapse the joined points to one row per grid cell (gid), taking the mean
# of the per-point average population and the mean lon/lat of the points
# that fell inside the cell.
grid_value_columns = ["pop_avg", "lon", "lat"]
result_4 = result.groupby("gid", as_index=False)[grid_value_columns].mean()

result_4

Unnamed: 0,gid,pop_avg,lon,lat
0,다사682452,27.059583,127.141153,37.506015
1,다사682454,9.099167,127.140956,37.507893
2,다사682455,7.572500,127.140857,37.508718
3,다사682456,16.805208,127.140853,37.509620
4,다사682457,30.170000,127.141133,37.510297
...,...,...,...,...
7391,다사809463,7.475000,127.284250,37.516504
7392,다사809464,9.106667,127.284531,37.517180
7393,다사809465,4.658611,127.284434,37.518006
7394,다사809466,2.459167,127.284242,37.519208


In [20]:
# Write the per-grid averages to CSV without the row index.
result_4.to_csv(
    path_or_buf='../data/2022년 하남시 격자별 유동인구의 평균.csv',
    index=False,
)