## 데이터세팅

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the grid cells that already carry a mapped road ID.
df_road = pd.read_csv('../data/화성격자_도로아이디.csv')
# NOTE(review): the source column key below contains a U+FFFD replacement
# character (the header looks truncated or mis-decoded). It is kept exactly
# as-is because it has to match the CSV header; confirm against the file.
df_road.rename(columns={'상권밀�' : '상권밀집도'}, inplace=True)
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form operates on an intermediate object, is
# deprecated in pandas 2.x, and leaves df_road untouched under copy-on-write.
df_road['상권밀집도'] = df_road['상권밀집도'].fillna(0)

# Mean commercial density per road ID.
df_traffic = df_road.groupby('id')['상권밀집도'].mean().reset_index()
df_traffic

Unnamed: 0,id,상권밀집도
0,4.764420e+10,0.00
1,4.764420e+10,0.00
2,4.764420e+10,0.00
3,4.764420e+10,0.00
4,4.771409e+10,0.00
...,...,...
1052,5.723553e+10,2.00
1053,5.723560e+10,2.00
1054,9.213476e+10,2.00
1055,9.282407e+10,1.75


In [3]:
# Load the estimated traffic volume for every road link.
link_traffic_csv = '../data/화성시_링크_교통량.csv'
df_link = pd.read_csv(link_traffic_csv)
df_link

Unnamed: 0,id,max_speed,road_rank,road_type,up_lanes,length,ALL_AADT,PSCR_AADT,BUS_AADT,geometry
0,47824846201,0,107,0,1,0.033,8421.0,7573.0,231.0,LINESTRING (126.91694367972913 37.141367468759...
1,47836069701,0,107,0,1,0.039,10721.0,9444.0,86.0,LINESTRING (126.87905297649803 37.204045610071...
2,57235033901,0,107,0,1,0.472,3750.0,2087.0,188.0,LINESTRING (127.12825042559183 37.171141356574...
3,57138961501,0,107,0,3,0.043,10743.0,9523.0,502.0,LINESTRING (126.99836427352366 37.204803205988...
4,47815022501,0,107,0,1,0.024,916.0,551.0,11.0,LINESTRING (126.96120856323112 37.076709889890...
...,...,...,...,...,...,...,...,...,...,...
12121,57135738801,0,107,0,1,0.036,1813.0,1700.0,29.0,LINESTRING (127.06761732904454 37.191063789472...
12122,47834097401,50,107,0,1,0.057,3667.0,3200.0,93.0,LINESTRING (126.99193232143968 37.206334594009...
12123,47825277001,50,107,0,2,0.012,7857.0,5760.0,294.0,LINESTRING (126.90150845820992 37.104084732301...
12124,47735894501,0,106,0,3,0.024,25772.0,21176.0,477.0,LINESTRING (126.85997922404704 37.229820126709...


## 필요한 변수 전처리
- 상권밀집도 : 도로가 지나는 격자의 평균 상권밀집 강도 측정
- 유동 인구 강도 : 격자별 유동인구의 강도를 가져와서 피처로 사용
- 주차장과의 거리?
- 인구 정보 : 격자별 인구가 추정되면 이를 사용 (유동인구만으로 충분하면 없어도 됨)

In [5]:
# Reload the estimated traffic volume per road link.
df_link = pd.read_csv('../data/화성시_링크_교통량.csv')

# Combine commercial density with road rank and traffic volume per road ID.
# merge() defaults to an inner join, so IDs present on only one side are dropped.
link_columns = ['id','road_rank', 'ALL_AADT']
link_subset = df_link[link_columns]
df_merged = df_traffic.merge(link_subset, on='id')
df_merged

Unnamed: 0,id,상권밀집도,road_rank,ALL_AADT
0,4.764420e+10,0.0,101,37756.0
1,4.764420e+10,0.0,101,36326.0
2,4.764420e+10,0.0,101,37641.0
3,4.764420e+10,0.0,101,39440.0
4,4.771409e+10,0.0,103,13360.0
...,...,...,...,...
815,5.723553e+10,2.0,101,25442.0
816,5.723553e+10,2.0,101,32396.0
817,5.723553e+10,2.0,101,27140.0
818,5.723560e+10,2.0,101,26934.0


## 상권 데이터 변수 추가
- 도로의 반경 100m, 500m, 1km 내에 위치한 격자의 상권밀집도 평균을 매핑

In [7]:
import geopandas as gpd

# Load the grid polygons.
gdf = gpd.read_file("../data/1-14.화성시_격자.geojson")

# Centre point of every grid cell, split into lon / lat columns.
# NOTE(review): the centroid is computed in whatever CRS the file declares;
# if that CRS is geographic, geopandas warns about accuracy - confirm this is
# acceptable for cells of this size.
gdf['centroid'] = gdf.geometry.centroid
gdf['lon'] = gdf['centroid'].x
gdf['lat'] = gdf['centroid'].y

# Keep only the columns needed downstream.
gdf = gdf[['gid', 'lon', 'lat']]

# Load the per-grid commercial data (file name indicates year 2022).
df_grid = pd.read_csv('../data/화성격자_상권_2022.csv')
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form operates on an intermediate object, is
# deprecated in pandas 2.x, and leaves df_grid untouched under copy-on-write.
df_grid['상권밀집도'] = df_grid['상권밀집도'].fillna(0)
df_grid = df_grid[['gid', '상권밀집도']]
# Attach the grid-centre coordinates (inner join on gid).
df_grid = df_grid.merge(gdf, on='gid')

# Load the link traffic estimates, keep road ranks 101-103 only, and trim
# to the columns used for feature building.
df_link = pd.read_csv('../data/화성시_링크_교통량.csv')
df_link = df_link[df_link['road_rank'].isin([101,102,103])]
df_link = df_link[['id', 'ALL_AADT', 'geometry']]

In [8]:
import geopandas as gpd
from shapely.geometry import Point, LineString
import pandas as pd
from shapely.wkt import loads

# Parse the WKT strings of df_link into shapely geometry objects.
link_wkt = df_link['geometry']
df_link['geometry'] = link_wkt.apply(loads)

# Road links as a GeoDataFrame declared in EPSG:4326.
gdf_link_fac = gpd.GeoDataFrame(df_link, geometry='geometry', crs="EPSG:4326")


# Build one point per grid row from its lon / lat columns.
def _grid_row_to_point(row):
    return Point(row['lon'], row['lat'])


df_grid['geometry'] = df_grid.apply(_grid_row_to_point, axis=1)
gdf_grid = gpd.GeoDataFrame(df_grid, geometry='geometry', crs="EPSG:4326")

# Reproject both layers to a projected CRS so buffers use projected units.
# NOTE(review): EPSG:3857 is Web Mercator; its planar distances are stretched
# relative to ground distance away from the equator, so the buffer radii used
# on these layers are nominal. Later cells also assume EPSG:3857 - confirm
# before switching to a local metric CRS.
gdf_grid = gdf_grid.to_crs("EPSG:3857")
gdf_link_fac = gdf_link_fac.to_crs("EPSG:3857")

# First feature: mean commercial density of the grid rows that fall within
# `radius` of a road link (default 100).
def compute_grid_density_all_intersections(link_geometry, grid, radius=100):
    """Return the mean '상권밀집도' of the grid rows intersecting the buffered link.

    Parameters:
        link_geometry: object exposing ``buffer(distance)``.
        grid: table exposing ``intersects(geometry)`` (a boolean row mask)
            and a '상권밀집도' column.
        radius: buffer distance, in the units of the geometry's CRS.

    Returns:
        The column mean over the selected rows, or None when no row is selected.
    """
    search_area = link_geometry.buffer(radius)
    hit_mask = grid.intersects(search_area)
    hits = grid[hit_mask]
    if hits.empty:
        return None
    return hits['상권밀집도'].mean()

# Evaluate the 100-unit buffer feature for every link; the grid layer and the
# radius are forwarded to the helper as extra positional arguments.
link_geometries = gdf_link_fac['geometry']
gdf_link_fac['grid_density_mean'] = link_geometries.apply(
    compute_grid_density_all_intersections, args=(gdf_grid, 100)
)

# Second and third features: mean commercial density of the grid rows within
# a given buffer distance of a road link (used with 500 and 1000).
def compute_buffer_density(link_geometry, grid, buffer_distance):
    """Return the mean '상권밀집도' of grid rows intersecting the link's buffer.

    Parameters:
        link_geometry: object exposing ``buffer(distance)``.
        grid: table exposing ``intersects(geometry)`` (a boolean row mask)
            and a '상권밀집도' column.
        buffer_distance: buffer distance, in the units of the geometry's CRS.

    Returns:
        The column mean over the selected rows, or None when no row is selected.
    """
    zone = link_geometry.buffer(buffer_distance)
    selected = grid[grid.intersects(zone)]
    return None if selected.empty else selected['상권밀집도'].mean()

# Add the 500 and 1000 buffer features, one column per distance.
for density_column, reach in (
    ('buffer_500m_density_mean', 500),
    ('buffer_1km_density_mean', 1000),
):
    gdf_link_fac[density_column] = gdf_link_fac['geometry'].apply(
        compute_buffer_density, args=(gdf_grid, reach)
    )

# Preview the three density features.
density_columns = ['grid_density_mean', 'buffer_500m_density_mean', 'buffer_1km_density_mean']
print(gdf_link_fac[density_columns].head())

    grid_density_mean  buffer_500m_density_mean  buffer_1km_density_mean
6                 1.0                  1.000000                 0.972350
26                1.0                  0.714286                 0.716814
35                1.0                  1.142857                 1.371094
69                0.0                  0.000000                 0.000000
92                1.0                  0.971014                 1.042194


In [9]:
# Inspect the distinct values of the 100-unit density feature.
density_100m = gdf_link_fac['grid_density_mean']
density_100m.unique()

array([1.        , 0.        , 2.        , 1.66666667, 3.        ,
       1.94117647, 0.5       , 2.125     , 1.81818182, 1.42857143,
       0.75      , 1.25      , 1.7       , 1.33333333, 0.76595745,
              nan, 1.11111111, 2.85714286, 1.4       , 2.33333333,
       1.31578947, 0.83333333, 1.28571429, 1.03125   , 0.06666667,
       0.45454545, 2.92857143, 1.2       , 2.77777778, 2.5       ,
       1.75      , 0.46153846, 0.10526316, 0.57142857, 1.6       ,
       1.78571429, 1.42622951, 2.8       , 1.92857143, 1.13333333,
       0.9       , 0.90625   , 0.35294118, 1.85714286, 2.16666667,
       1.71428571, 1.16666667, 2.66666667, 0.87755102, 0.7       ,
       2.25      , 1.375     , 2.83333333, 0.96153846, 0.28125   ,
       0.7254902 , 1.55555556, 0.16666667, 1.77777778, 0.66666667,
       2.9       , 2.90909091, 2.75      , 1.57142857, 0.33333333,
       0.25      , 0.2       , 1.5       , 1.25925926, 1.6122449 ,
       0.71052632, 2.05555556, 0.55172414, 0.84615385, 0.6    

In [10]:
# Count missing values per column. With the 100 radius, links that have no
# grid point in range come back as NaN (13 links in the recorded output).
missing_per_column = gdf_link_fac.isna().sum()
print(missing_per_column)
# Treat "no grid point in range" as zero density.
filled_density = gdf_link_fac['grid_density_mean'].fillna(0)
gdf_link_fac['grid_density_mean'] = filled_density
gdf_link_fac.head()

id                           0
ALL_AADT                     0
geometry                     0
grid_density_mean           13
buffer_500m_density_mean     0
buffer_1km_density_mean      0
dtype: int64


Unnamed: 0,id,ALL_AADT,geometry,grid_density_mean,buffer_500m_density_mean,buffer_1km_density_mean
6,47834686201,27724.0,"LINESTRING (14124074.485 4466841.932, 14124074...",1.0,1.0,0.97235
26,47735667801,11939.0,"LINESTRING (14123448.865 4465801.237, 14123427...",1.0,0.714286,0.716814
35,47825587301,10733.0,"LINESTRING (14129045.411 4460150.713, 14129018...",1.0,1.142857,1.371094
69,47735208501,28034.0,"LINESTRING (14116849.436 4474371.724, 14116846...",0.0,0.0,0.0
92,47825020201,6165.0,"LINESTRING (14129443.377 4454981.973, 14129418...",1.0,0.971014,1.042194


## 유동인구 데이터도 위와 같이 매핑

In [11]:
# Load the grid data that carries the floating-population averages.
df_move = pd.read_csv('../data/화성시_2021년, 2022년 격자별 유동인구의 평균.csv')

# Load the link traffic estimates, restricted to road ranks 101-103 and to
# the columns used for feature building.
df_link = pd.read_csv('../data/화성시_링크_교통량.csv')
rank_mask = df_link['road_rank'].isin([101,102,103])
df_link = df_link[rank_mask]
kept_columns = ['id', 'ALL_AADT', 'geometry']
df_link = df_link[kept_columns]

In [14]:
import geopandas as gpd
from shapely.geometry import Point, LineString
import pandas as pd
from shapely.wkt import loads

# Parse the WKT strings of df_link into shapely geometry objects.
link_wkt = df_link['geometry']
df_link['geometry'] = link_wkt.apply(loads)

# Road links as a GeoDataFrame declared in EPSG:4326.
gdf_link_mov = gpd.GeoDataFrame(df_link, geometry='geometry', crs="EPSG:4326")


# Build one point per floating-population row from its lon / lat columns.
def _move_row_to_point(row):
    return Point(row['lon'], row['lat'])


df_move['geometry'] = df_move.apply(_move_row_to_point, axis=1)
gdf_move = gpd.GeoDataFrame(df_move, geometry='geometry', crs="EPSG:4326")

# Reproject both layers to a projected CRS so buffers use projected units.
# NOTE(review): EPSG:3857 is Web Mercator; its planar distances are stretched
# relative to ground distance away from the equator, so the buffer radii used
# on these layers are nominal - confirm whether that is acceptable.
gdf_move = gdf_move.to_crs("EPSG:3857")
gdf_link_mov = gdf_link_mov.to_crs("EPSG:3857")

# Mean floating population of the grid rows within a buffer around a road link.
def compute_population_density(link_geometry, grid, buffer_distance):
    """Return the mean 'pop_avg' of the grid rows intersecting the link's buffer.

    Parameters:
        link_geometry: object exposing ``buffer(distance)``.
        grid: table exposing ``intersects(geometry)`` (a boolean row mask)
            and a 'pop_avg' column.
        buffer_distance: buffer distance, in the units of the geometry's CRS.

    Returns:
        The column mean over the selected rows, or None when no row is selected.
    """
    catchment = link_geometry.buffer(buffer_distance)
    in_range = grid.intersects(catchment)
    nearby = grid[in_range]
    if nearby.empty:
        return None
    return nearby['pop_avg'].mean()

# Add the mean floating population around each link for the three buffer
# distances (100, 500 and 1000), one column per distance.
for population_column, reach in (
    ('pop_avg_100m', 100),
    ('pop_avg_500m', 500),
    ('pop_avg_1km', 1000),
):
    gdf_link_mov[population_column] = gdf_link_mov['geometry'].apply(
        compute_population_density, args=(gdf_move, reach)
    )

# Preview the three population features.
population_preview = gdf_link_mov[['pop_avg_100m', 'pop_avg_500m', 'pop_avg_1km']]
print(population_preview.head())

    pop_avg_100m  pop_avg_500m  pop_avg_1km
6      58.996484     88.680012    48.326760
26     37.545854     47.595515    28.651125
35     19.688529     15.010157    12.941629
69     18.698021     25.582787    11.286719
92     22.964521     67.429104    36.040223


In [15]:
# Count missing values per column. With the 100 radius, links that have no
# floating-population point in range come back as NaN (12 links in the
# recorded output).
missing_per_column = gdf_link_mov.isna().sum()
print(missing_per_column)
# Treat "no point in range" as zero.
filled_population = gdf_link_mov['pop_avg_100m'].fillna(0)
gdf_link_mov['pop_avg_100m'] = filled_population
gdf_link_mov.head()

id               0
ALL_AADT         0
geometry         0
pop_avg_100m    12
pop_avg_500m     0
pop_avg_1km      0
dtype: int64


Unnamed: 0,id,ALL_AADT,geometry,pop_avg_100m,pop_avg_500m,pop_avg_1km
6,47834686201,27724.0,"LINESTRING (14124074.485 4466841.932, 14124074...",58.996484,88.680012,48.32676
26,47735667801,11939.0,"LINESTRING (14123448.865 4465801.237, 14123427...",37.545854,47.595515,28.651125
35,47825587301,10733.0,"LINESTRING (14129045.411 4460150.713, 14129018...",19.688529,15.010157,12.941629
69,47735208501,28034.0,"LINESTRING (14116849.436 4474371.724, 14116846...",18.698021,25.582787,11.286719
92,47825020201,6165.0,"LINESTRING (14129443.377 4454981.973, 14129418...",22.964521,67.429104,36.040223


In [16]:
# Display the link table that carries the commercial-density features.
gdf_link_fac

Unnamed: 0,id,ALL_AADT,geometry,grid_density_mean,buffer_500m_density_mean,buffer_1km_density_mean
6,47834686201,27724.0,"LINESTRING (14124074.485 4466841.932, 14124074...",1.0,1.000000,0.972350
26,47735667801,11939.0,"LINESTRING (14123448.865 4465801.237, 14123427...",1.0,0.714286,0.716814
35,47825587301,10733.0,"LINESTRING (14129045.411 4460150.713, 14129018...",1.0,1.142857,1.371094
69,47735208501,28034.0,"LINESTRING (14116849.436 4474371.724, 14116846...",0.0,0.000000,0.000000
92,47825020201,6165.0,"LINESTRING (14129443.377 4454981.973, 14129418...",1.0,0.971014,1.042194
...,...,...,...,...,...,...
12082,47835413601,10641.0,"LINESTRING (14123941.773 4471424.285, 14124028...",2.0,1.574074,1.355769
12083,47824327901,13332.0,"LINESTRING (14128645.114 4460077.521, 14128704...",1.0,1.093750,1.356522
12090,47714507401,7604.0,"LINESTRING (14115361.548 4445011.897, 14115455...",0.0,0.000000,0.000000
12113,47835608401,15383.0,"LINESTRING (14130198.871 4472817.029, 14130183...",0.2,0.371134,0.422145


In [17]:
# Join the floating-population features onto the commercial-density features
# by link ID (inner join, the merge default).
# NOTE(review): if `id` is not unique within these frames the join multiplies
# rows - verify uniqueness.
population_columns = ['id','pop_avg_100m', 'pop_avg_500m', 'pop_avg_1km']
population_features = gdf_link_mov[population_columns]
gdf_link = gdf_link_fac.merge(population_features, on='id')
gdf_link

Unnamed: 0,id,ALL_AADT,geometry,grid_density_mean,buffer_500m_density_mean,buffer_1km_density_mean,pop_avg_100m,pop_avg_500m,pop_avg_1km
0,47834686201,27724.0,"LINESTRING (14124074.485 4466841.932, 14124074...",1.0,1.000000,0.972350,58.996484,88.680012,48.326760
1,47735667801,11939.0,"LINESTRING (14123448.865 4465801.237, 14123427...",1.0,0.714286,0.716814,37.545854,47.595515,28.651125
2,47825587301,10733.0,"LINESTRING (14129045.411 4460150.713, 14129018...",1.0,1.142857,1.371094,19.688529,15.010157,12.941629
3,47735208501,28034.0,"LINESTRING (14116849.436 4474371.724, 14116846...",0.0,0.000000,0.000000,18.698021,25.582787,11.286719
4,47825020201,6165.0,"LINESTRING (14129443.377 4454981.973, 14129418...",1.0,0.971014,1.042194,22.964521,67.429104,36.040223
...,...,...,...,...,...,...,...,...,...
1137,47835413601,10641.0,"LINESTRING (14123941.773 4471424.285, 14124028...",2.0,1.574074,1.355769,19.400462,20.470623,39.068485
1138,47824327901,13332.0,"LINESTRING (14128645.114 4460077.521, 14128704...",1.0,1.093750,1.356522,13.302683,10.422081,11.435672
1139,47714507401,7604.0,"LINESTRING (14115361.548 4445011.897, 14115455...",0.0,0.000000,0.000000,18.202419,18.830942,12.786421
1140,47835608401,15383.0,"LINESTRING (14130198.871 4472817.029, 14130183...",0.2,0.371134,0.422145,0.559468,8.979918,16.118917


In [18]:
# Export the combined link features, without the index column.
combined_features_csv = '../data/고속도로_상권_유동.csv'
gdf_link.to_csv(combined_features_csv, index=False)

## 휴게소와의 거리 변수 생성

In [19]:
# Load the rest-area dataset (read as CP949) and keep only the name,
# latitude and longitude columns.
rest_area_columns = ['휴게소명', '위도', '경도']
df_rest = pd.read_csv('../data/전국휴게소정보표준데이터.csv', encoding='cp949')
df_rest = df_rest[rest_area_columns]
df_rest

Unnamed: 0,휴게소명,위도,경도
0,황간(서울),36.249437,127.854835
1,황전(순천),35.148905,127.454145
2,황전(완주),35.153306,127.454683
3,횡성(강릉),37.462755,128.133969
4,횡성(인천),37.464983,128.135449
...,...,...,...
200,마장,37.263849,127.407043
201,망향(부산),36.855650,127.180929
202,매송(목포),37.265084,126.888544
203,매송(서울),37.264769,126.891795


In [20]:
from shapely.geometry import Point
import geopandas as gpd

# Centre point of every road link (the geometry column holds LineStrings).
# NOTE(review): only the active geometry column is reprojected by to_crs()
# below, so 'centroid' keeps the coordinates the links have at this point;
# the displayed link coordinates suggest they are already EPSG:3857 - confirm.
gdf_link['centroid'] = gdf_link['geometry'].centroid


# Build one point per rest area from its longitude / latitude columns.
def _rest_row_to_point(row):
    return Point(row['경도'], row['위도'])


df_rest['geometry'] = df_rest.apply(_rest_row_to_point, axis=1)
gdf_rest = gpd.GeoDataFrame(df_rest, geometry='geometry', crs="EPSG:4326")

# Put both layers in the projected CRS used for the distance calculation.
gdf_link = gdf_link.to_crs("EPSG:3857")
gdf_rest = gdf_rest.to_crs("EPSG:3857")

# Distance from a road centre point to its nearest rest area.
def calculate_min_distance(centroid, rest_areas):
    """Return the smallest of the distances from `rest_areas` to `centroid`.

    `rest_areas` must expose ``distance(geometry)`` returning a collection
    with a ``min()`` method.
    """
    return rest_areas.distance(centroid).min()

# Nearest rest-area distance for every link centre; the rest-area geometries
# are forwarded to the helper as an extra positional argument.
link_centroids = gdf_link['centroid']
gdf_link['min_dist_to_rest'] = link_centroids.apply(
    calculate_min_distance, args=(gdf_rest['geometry'],)
)

# Show the new column.
gdf_link[['min_dist_to_rest']]

Unnamed: 0,min_dist_to_rest
0,7772.517840
1,6670.115741
2,4571.063813
3,8499.691329
4,6598.762094
...,...
1137,4840.615240
1138,4465.706961
1139,13147.172380
1140,5554.617507


In [21]:
# Display the link table, now including the centroid and rest-area distance.
gdf_link

Unnamed: 0,id,ALL_AADT,geometry,grid_density_mean,buffer_500m_density_mean,buffer_1km_density_mean,pop_avg_100m,pop_avg_500m,pop_avg_1km,centroid,min_dist_to_rest
0,47834686201,27724.0,"LINESTRING (14124074.485 4466841.932, 14124074...",1.0,1.000000,0.972350,58.996484,88.680012,48.326760,POINT (14124073.026 4466915.804),7772.517840
1,47735667801,11939.0,"LINESTRING (14123448.865 4465801.237, 14123427...",1.0,0.714286,0.716814,37.545854,47.595515,28.651125,POINT (14123350.490 4465769.603),6670.115741
2,47825587301,10733.0,"LINESTRING (14129045.411 4460150.713, 14129018...",1.0,1.142857,1.371094,19.688529,15.010157,12.941629,POINT (14128830.586 4460082.745),4571.063813
3,47735208501,28034.0,"LINESTRING (14116849.436 4474371.724, 14116846...",0.0,0.000000,0.000000,18.698021,25.582787,11.286719,POINT (14116848.130 4474382.163),8499.691329
4,47825020201,6165.0,"LINESTRING (14129443.377 4454981.973, 14129418...",1.0,0.971014,1.042194,22.964521,67.429104,36.040223,POINT (14129385.576 4454850.411),6598.762094
...,...,...,...,...,...,...,...,...,...,...,...
1137,47835413601,10641.0,"LINESTRING (14123941.773 4471424.285, 14124028...",2.0,1.574074,1.355769,19.400462,20.470623,39.068485,POINT (14123985.001 4471426.964),4840.615240
1138,47824327901,13332.0,"LINESTRING (14128645.114 4460077.521, 14128704...",1.0,1.093750,1.356522,13.302683,10.422081,11.435672,POINT (14128739.399 4460003.483),4465.706961
1139,47714507401,7604.0,"LINESTRING (14115361.548 4445011.897, 14115455...",0.0,0.000000,0.000000,18.202419,18.830942,12.786421,POINT (14115680.527 4444556.875),13147.172380
1140,47835608401,15383.0,"LINESTRING (14130198.871 4472817.029, 14130183...",0.2,0.371134,0.422145,0.559468,8.979918,16.118917,POINT (14129891.897 4472637.570),5554.617507


## 지하철과의 거리 변수 생성

In [23]:
# Load the subway-station list.
subway_csv = '../data/1-6.화성시_지하철역.csv'
df_sub = pd.read_csv(subway_csv)
df_sub

Unnamed: 0,stn_nm,stn_ln_info,lon,lat
0,병점역,경부선,127.033268,37.206821
1,어천역,수인선,126.908805,37.250076
2,야목역,수인선,126.88429,37.261011
3,서동탄,경부선,127.051672,37.195504
4,동탄,수도권 광역급행철도,127.09569,37.20034
5,서화성,서해선,126.787141,37.248709
6,화성시청,서해선,126.821003,37.189305
7,향남,서해선,126.913247,37.123283


In [24]:
from shapely.geometry import Point
import geopandas as gpd

# Build one point per station from its lon / lat columns.
def _station_row_to_point(row):
    return Point(row['lon'], row['lat'])


df_sub['geometry'] = df_sub.apply(_station_row_to_point, axis=1)

# Stations as a GeoDataFrame declared in EPSG:4326, then reprojected to the
# CRS used for the distance calculation.
gdf_sub = gpd.GeoDataFrame(df_sub, geometry='geometry', crs="EPSG:4326").to_crs("EPSG:3857")
# Can be skipped when gdf_link is already in EPSG:3857.
gdf_link = gdf_link.to_crs("EPSG:3857")

# Distance from a road centre point to its nearest subway station.
def calculate_min_distance_to_sub(centroid, subway_stations):
    """Return the smallest of the distances from `subway_stations` to `centroid`.

    `subway_stations` must expose ``distance(geometry)`` returning a
    collection with a ``min()`` method.
    """
    station_distances = subway_stations.distance(centroid)
    nearest = station_distances.min()
    return nearest

# Nearest-station distance for every link centre; the station geometries are
# forwarded to the helper as an extra positional argument.
link_centroids = gdf_link['centroid']
gdf_link['min_dist_to_sub'] = link_centroids.apply(
    calculate_min_distance_to_sub, args=(gdf_sub['geometry'],)
)

# Preview the new column.
subway_preview = gdf_link[['min_dist_to_sub']]
print(subway_preview.head())

   min_dist_to_sub
0      6572.074344
1      5706.176951
2      3886.564032
3      3018.999790
4      2066.172807


In [25]:
# List the columns available on the link table before selecting features.
gdf_link.columns

Index(['id', 'ALL_AADT', 'geometry', 'grid_density_mean',
       'buffer_500m_density_mean', 'buffer_1km_density_mean', 'pop_avg_100m',
       'pop_avg_500m', 'pop_avg_1km', 'centroid', 'min_dist_to_rest',
       'min_dist_to_sub'],
      dtype='object')

In [26]:
# Final feature table: link ID, traffic volume and the engineered features
# (the geometry and centroid columns are left out).
feature_columns = [
    'id', 'ALL_AADT',
    'grid_density_mean', 'buffer_500m_density_mean', 'buffer_1km_density_mean',
    'pop_avg_100m', 'pop_avg_500m', 'pop_avg_1km',
    'min_dist_to_rest', 'min_dist_to_sub',
]
road_df = gdf_link[feature_columns]
road_df

Unnamed: 0,id,ALL_AADT,grid_density_mean,buffer_500m_density_mean,buffer_1km_density_mean,pop_avg_100m,pop_avg_500m,pop_avg_1km,min_dist_to_rest,min_dist_to_sub
0,47834686201,27724.0,1.0,1.000000,0.972350,58.996484,88.680012,48.326760,7772.517840,6572.074344
1,47735667801,11939.0,1.0,0.714286,0.716814,37.545854,47.595515,28.651125,6670.115741,5706.176951
2,47825587301,10733.0,1.0,1.142857,1.371094,19.688529,15.010157,12.941629,4571.063813,3886.564032
3,47735208501,28034.0,0.0,0.000000,0.000000,18.698021,25.582787,11.286719,8499.691329,3018.999790
4,47825020201,6165.0,1.0,0.971014,1.042194,22.964521,67.429104,36.040223,6598.762094,2066.172807
...,...,...,...,...,...,...,...,...,...,...
1137,47835413601,10641.0,2.0,1.574074,1.355769,19.400462,20.470623,39.068485,4840.615240,4184.699840
1138,47824327901,13332.0,1.0,1.093750,1.356522,13.302683,10.422081,11.435672,4465.706961,3788.754576
1139,47714507401,7604.0,0.0,0.000000,0.000000,18.202419,18.830942,12.786421,13147.172380,16963.808053
1140,47835608401,15383.0,0.2,0.371134,0.422145,0.559468,8.979918,16.118917,5554.617507,2829.946555


In [27]:
# Write both variants without the index column: the table that still holds
# the geometry columns, and the plain feature table.
for frame, csv_path in (
    (gdf_link, '../data/화성_도로예측_위치포함.csv'),
    (road_df, '../data/화성_도로예측.csv'),
):
    frame.to_csv(csv_path, index=False)

## 격자별 도로정보

In [28]:
import pandas as pd
# Load the per-grid traffic table and keep the grid ID, block code and
# traffic column.
grid_traffic_columns = ['gid', 'blck_cd', 'ALL_AADT_s']
df_road_grid = pd.read_csv('../data/격자_교통량.csv')
df_road_grid = df_road_grid[grid_traffic_columns]
df_road_grid

Unnamed: 0,gid,blck_cd,ALL_AADT_s
0,다사385011,,4513.0
1,다사312110,,
2,다사473033,,
3,다사602117,,
4,다사476205,,24781.0
...,...,...,...
71379,다사513058,,
71380,다사553001,,
71381,다사517088,,
71382,다사456150,,


In [31]:
# Save the trimmed per-grid traffic table, without the index column.
grid_traffic_csv = '../data/격자별_교통량합계.csv'
df_road_grid.to_csv(grid_traffic_csv, index=False)

In [32]:
# Read the per-grid traffic table back in.
# NOTE(review): this rebinds `df_traffic`, which earlier in the notebook held
# the per-road mean commercial density - confirm the reuse is intended.
grid_traffic_csv = '../data/격자별_교통량합계.csv'
df_traffic = pd.read_csv(grid_traffic_csv)
df_traffic

Unnamed: 0,gid,blck_cd,ALL_AADT_s
0,다사385011,,4513.0
1,다사312110,,
2,다사473033,,
3,다사602117,,
4,다사476205,,24781.0
...,...,...,...
71379,다사513058,,
71380,다사553001,,
71381,다사517088,,
71382,다사456150,,


## 하남 예측 데이터셋 준비
- 실제 예측에 적용하는 교산일대의 도로 데이터와 위와 동일한 형태로 피처를 세팅
- 같은 폴더 내에 **"전처리_타겟데이터정리"**가 먼저 실행되어야 합니다.

In [34]:
# Load the target road links.
target_road_csv = '../data/타겟/교산_도로.csv'
target_link = pd.read_csv(target_road_csv)

# Load the earlier predictions (commercial density, commercial type and
# floating population).
prediction_csv = '../data/타겟/하남시_상권밀집도_상권유형_유동인구_예측.csv'
df_pred = pd.read_csv(prediction_csv)
df_pred.head()

Unnamed: 0.1,Unnamed: 0,gid,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance,nearest_parking_distance,weighted_avg_households,nearest_bus_distance,nearest_fac_distance,상권밀집도_pred,상권유형_pred,유동인구_pred
0,0,다사710491,POLYGON ((127.17173076821828 37.54076919185162...,POINT (127.17229476868658 37.54122143617736),127.172295,37.541221,3014.305191,387.107017,798.802334,179.354672,111.434837,3.0,2,3.131951
1,1,다사732511,POLYGON ((127.19656054667902 37.55886272066008...,POINT (127.19712483359214 37.55931484626683),127.197125,37.559315,570.356206,914.268874,806.334538,188.655467,681.144494,3.0,2,1.797619
2,2,다사703464,POLYGON ((127.16391635798603 37.51641053705482...,POINT (127.16448012873086 37.516862819457096),127.16448,37.516863,1345.885688,1245.146052,786.601796,145.86203,103.762801,1.0,1,3.131798
3,3,다사734504,POLYGON ((127.19885038306606 37.55255909902908...,POINT (127.19941463661783 37.553011214029446),127.199415,37.553011,407.719269,802.063949,808.689244,62.881655,420.929866,3.0,2,1.827171
4,4,다사731458,POLYGON ((127.19562254449121 37.51108833165109...,POINT (127.19618646783618 37.51154046380315),127.196186,37.51154,141.476686,411.598808,767.835408,8.560694,47.395822,3.0,2,3.17873


In [35]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, LineString, Polygon
from shapely.wkt import loads
from tqdm import tqdm

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Parse the target link geometries: cast to str first, then read as WKT.
print("Checking and converting target_link geometry...")
target_link_wkt = target_link['geometry'].astype(str)
target_link['geometry'] = target_link_wkt
target_link['geometry'] = target_link['geometry'].progress_apply(loads)

# Wrap the links in a GeoDataFrame declared in EPSG:4326.
print("Converting target_link to GeoDataFrame...")
target_link = gpd.GeoDataFrame(target_link, geometry='geometry', crs="EPSG:4326")

# Same conversion for the prediction grid.
print("Checking and converting df_pred geometry...")
prediction_wkt = df_pred['geometry'].astype(str)
df_pred['geometry'] = prediction_wkt
df_pred['geometry'] = df_pred['geometry'].progress_apply(loads)

print("Converting df_pred to GeoDataFrame...")
df_pred = gpd.GeoDataFrame(df_pred, geometry='geometry', crs="EPSG:4326")

# Reproject both layers to the CRS used for the buffer / distance features.
# NOTE(review): EPSG:3857 is Web Mercator; its planar distances are stretched
# relative to ground distance away from the equator - confirm this matches
# how the training features were built.
print("Reprojecting to EPSG:3857 for distance calculations...")
target_link = target_link.to_crs("EPSG:3857")
df_pred = df_pred.to_crs("EPSG:3857")

# Show the converted tables.
print("Conversion complete!")
for converted in (target_link, df_pred):
    print(converted.head())

100%|██████████| 204/204 [00:00<00:00, 17900.38it/s]
 39%|███▉      | 3757/9634 [00:00<00:00, 37563.77it/s]

Checking and converting target_link geometry...
Converting target_link to GeoDataFrame...
Checking and converting df_pred geometry...


100%|██████████| 9634/9634 [00:00<00:00, 37703.28it/s]


Converting df_pred to GeoDataFrame...
Reprojecting to EPSG:3857 for distance calculations...
Conversion complete!
    id                                           geometry
0  1.0  LINESTRING (14158515.413 4510208.104, 14158764...
1  2.0  LINESTRING (14158767.132 4510149.059, 14159028...
2  3.0  LINESTRING (14159034.390 4510032.004, 14159390...
3  4.0  LINESTRING (14159390.733 4509754.388, 14159531...
4  5.0  LINESTRING (14159535.756 4509480.915, 14159684...
   Unnamed: 0       gid                                           geometry  \
0           0  다사710491  POLYGON ((14156692.312 4514753.491, 14156691.8...   
1           1  다사732511  POLYGON ((14159456.351 4517293.983, 14159455.9...   
2           2  다사703464  POLYGON ((14155822.416 4511334.293, 14155821.9...   
3           3  다사734504  POLYGON ((14159711.254 4516408.829, 14159710.8...   
4           4  다사731458  POLYGON ((14159351.933 4510587.370, 14159351.5...   

                                        centroid  centroid_lon  centr

In [36]:
# Mean of an arbitrary column over the grid rows inside a link's buffer.
def compute_average_within_buffer(link_geometry, grid, column_name, buffer_distance):
    """Return the mean of `column_name` over grid rows intersecting the buffer.

    Parameters:
        link_geometry: object exposing ``buffer(distance)``.
        grid: table exposing ``intersects(geometry)`` (a boolean row mask)
            and the column named by `column_name`.
        column_name: name of the column to average.
        buffer_distance: buffer distance, in the units of the geometry's CRS.

    Returns:
        The column mean over the selected rows, or None when no row is selected.
    """
    zone = link_geometry.buffer(buffer_distance)
    inside = grid[grid.intersects(zone)]
    if inside.empty:
        return None
    return inside[column_name].mean()

# Buffer-based averages around every target link, for each buffer distance.
# NOTE(review): the density columns here are named sector_density_avg_*,
# while the Hwaseong table uses grid_density_mean / buffer_*_density_mean -
# confirm the downstream model maps them correctly.
print("Calculating buffer-based averages...")
buffer_specs = ((100, "100m"), (500, "500m"), (1000, "1km"))
for distance, suffix in buffer_specs:
    # Mean predicted commercial density.
    print(f"Processing sector density averages for {suffix}...")
    sector_column = f'sector_density_avg_{suffix}'
    target_link[sector_column] = target_link['geometry'].progress_apply(
        lambda geom: compute_average_within_buffer(geom, df_pred, '상권밀집도_pred', distance)
    )

    # Mean predicted floating population.
    print(f"Processing population averages for {suffix}...")
    population_column = f'pop_avg_{suffix}'
    target_link[population_column] = target_link['geometry'].progress_apply(
        lambda geom: compute_average_within_buffer(geom, df_pred, '유동인구_pred', distance)
    )

# Show the result.
print("Calculation complete. Here's the resulting DataFrame:")
print(target_link.head())

  4%|▍         | 8/204 [00:00<00:02, 71.63it/s]

Calculating buffer-based averages...
Processing sector density averages for 100m...


100%|██████████| 204/204 [00:03<00:00, 64.48it/s]
  3%|▎         | 7/204 [00:00<00:02, 68.95it/s]

Processing population averages for 100m...


100%|██████████| 204/204 [00:03<00:00, 64.54it/s]
  3%|▎         | 6/204 [00:00<00:03, 59.50it/s]

Processing sector density averages for 500m...


100%|██████████| 204/204 [00:03<00:00, 59.14it/s]
  3%|▎         | 7/204 [00:00<00:02, 69.04it/s]

Processing population averages for 500m...


100%|██████████| 204/204 [00:03<00:00, 59.33it/s]
  3%|▎         | 6/204 [00:00<00:03, 53.48it/s]

Processing sector density averages for 1km...


100%|██████████| 204/204 [00:04<00:00, 44.35it/s]
  3%|▎         | 6/204 [00:00<00:03, 52.57it/s]

Processing population averages for 1km...


100%|██████████| 204/204 [00:04<00:00, 44.88it/s]

Calculation complete. Here's the resulting DataFrame:
    id                                           geometry  \
0  1.0  LINESTRING (14158515.413 4510208.104, 14158764...   
1  2.0  LINESTRING (14158767.132 4510149.059, 14159028...   
2  3.0  LINESTRING (14159034.390 4510032.004, 14159390...   
3  4.0  LINESTRING (14159390.733 4509754.388, 14159531...   
4  5.0  LINESTRING (14159535.756 4509480.915, 14159684...   

   sector_density_avg_100m  pop_avg_100m  sector_density_avg_500m  \
0                      3.0      1.802093                      3.0   
1                      3.0      1.812659                      3.0   
2                      3.0      1.810339                      3.0   
3                      3.0      1.831350                      3.0   
4                      3.0      1.858071                      3.0   

   pop_avg_500m  sector_density_avg_1km  pop_avg_1km  
0      1.886649                2.969349     2.082284  
1      1.938314                3.000000     2.145296  




In [38]:
# Load the rest-area dataset (read as CP949); keep name, latitude, longitude.
rest_area_columns = ['휴게소명', '위도', '경도']
df_rest = pd.read_csv('../data/전국휴게소정보표준데이터.csv', encoding='cp949')
df_rest = df_rest[rest_area_columns]

# Centre point of every target link (the geometry column holds LineStrings).
target_link['centroid'] = target_link['geometry'].centroid


# Build one point per rest area from its longitude / latitude columns.
def _target_rest_row_to_point(row):
    return Point(row['경도'], row['위도'])


df_rest['geometry'] = df_rest.apply(_target_rest_row_to_point, axis=1)

# Rest areas as a GeoDataFrame declared in EPSG:4326, then reprojected to the
# CRS used for the distance calculation.
gdf_rest = gpd.GeoDataFrame(df_rest, geometry='geometry', crs="EPSG:4326").to_crs("EPSG:3857")

# Distance from a road centre point to its nearest rest area.
def calculate_min_distance(centroid, rest_areas):
    """Return the smallest of the distances from `rest_areas` to `centroid`.

    `rest_areas` must expose ``distance(geometry)`` returning a collection
    with a ``min()`` method.
    """
    all_distances = rest_areas.distance(centroid)
    closest = all_distances.min()
    return closest

# Nearest rest-area distance for every target link centre; the rest-area
# geometries are forwarded to the helper as an extra positional argument.
target_centroids = target_link['centroid']
target_link['min_dist_to_rest'] = target_centroids.apply(
    calculate_min_distance, args=(gdf_rest['geometry'],)
)

# Show the new column.
target_link[['min_dist_to_rest']]

Unnamed: 0,min_dist_to_rest
0,3625.595068
1,3575.486811
2,3630.206464
3,3808.477294
4,4005.582209
...,...
199,230.788185
200,1077.321523
201,1226.089488
202,313.854041


In [39]:
# Load the subway-station list.
# NOTE(review): this is the Hwaseong station file, while target_link holds
# the Gyosan (Hanam) roads; the recorded nearest-station distances are about
# 44,000 units - confirm whether a station list covering the target area
# should be used instead.
subway_csv = '../data/1-6.화성시_지하철역.csv'
df_sub = pd.read_csv(subway_csv)

from shapely.geometry import Point
import geopandas as gpd


# Build one point per station from its lon / lat columns.
def _target_station_row_to_point(row):
    return Point(row['lon'], row['lat'])


df_sub['geometry'] = df_sub.apply(_target_station_row_to_point, axis=1)

# Stations as a GeoDataFrame declared in EPSG:4326, then reprojected to the
# CRS used for the distance calculation.
gdf_sub = gpd.GeoDataFrame(df_sub, geometry='geometry', crs="EPSG:4326").to_crs("EPSG:3857")

# Distance from a road centre point to its nearest subway station.
def calculate_min_distance_to_sub(centroid, subway_stations):
    """Return the smallest of the distances from `subway_stations` to `centroid`.

    `subway_stations` must expose ``distance(geometry)`` returning a
    collection with a ``min()`` method.
    """
    return subway_stations.distance(centroid).min()

# Nearest-station distance for every target link centre; the station
# geometries are forwarded to the helper as an extra positional argument.
target_centroids = target_link['centroid']
target_link['min_dist_to_sub'] = target_centroids.apply(
    calculate_min_distance_to_sub, args=(gdf_sub['geometry'],)
)

# Preview the new column.
subway_preview = target_link[['min_dist_to_sub']]
print(subway_preview.head())

   min_dist_to_sub
0     44350.166824
1     44327.975663
2     44211.145679
3     44011.107495
4     43815.299464


In [40]:
# Export the prepared target features, without the index column.
target_features_csv = '../data/타겟/타겟_교통량_추정준비.csv'
target_link.to_csv(target_features_csv, index=False)