## 데이터 준비

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import geopandas as gpd
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import haversine_distances
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from catboost import CatBoostRegressor

# 화성 도로 예측 데이터 불러오기
df_train = pd.read_csv('../data/화성_도로예측.csv')
df_train

# 도로 예측 데이터 불러오기
X_test = pd.read_csv('../data/타겟/타겟_교통량_추정준비.csv')

# # 컬럼명 일치
X_test = X_test.rename(columns={'sector_density_avg_100m' : 'grid_density_mean',
                                'sector_density_avg_500m' : 'buffer_500m_density_mean',
                                'sector_density_avg_1km' : 'buffer_1km_density_mean'
                                 })
X_test = X_test[['id','grid_density_mean', 'buffer_500m_density_mean',
       'buffer_1km_density_mean', 'pop_avg_100m', 'pop_avg_500m',
       'pop_avg_1km', 'min_dist_to_rest', 'min_dist_to_sub']]

X_test

Unnamed: 0,id,grid_density_mean,buffer_500m_density_mean,buffer_1km_density_mean,pop_avg_100m,pop_avg_500m,pop_avg_1km,min_dist_to_rest,min_dist_to_sub
0,1.000000e+00,3.0,3.0,2.969349,1.802093,1.886649,2.082284,3625.595068,44350.166824
1,2.000000e+00,3.0,3.0,3.000000,1.812659,1.938314,2.145296,3575.486811,44327.975663
2,3.000000e+00,3.0,3.0,3.000000,1.810339,1.972431,2.144845,3630.206464,44211.145679
3,4.000000e+00,3.0,3.0,2.988971,1.831350,1.973710,2.166023,3808.477294,44011.107495
4,5.000000e+00,3.0,3.0,2.838346,1.858071,1.969211,2.116309,4005.582209,43815.299464
...,...,...,...,...,...,...,...,...,...
199,9.227728e+10,3.0,3.0,3.000000,1.869585,2.037343,2.409201,230.788185,48045.619291
200,5.727468e+10,3.0,3.0,3.000000,2.529144,2.570086,2.314994,1077.321523,48863.393955
201,5.727104e+10,3.0,3.0,2.828125,2.333402,2.615216,2.473982,1226.089488,48416.148259
202,9.227728e+10,3.0,3.0,2.996350,1.890895,2.203072,2.400773,313.854041,48046.025382


In [15]:
# 종속변수 설정
X_train = df_train.drop(['id', 'ALL_AADT'], axis=1)
X_test = X_test.drop('id', axis=1)
y_train = df_train['ALL_AADT']

# 스케일링
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 예측에 사용한 모델
파라미터 {'depth': 12, 'iterations': 400, 'learning_rate': 0.2}

In [16]:
# 예측 수행
cb = CatBoostRegressor(depth=12, iterations=400, learning_rate=0.2, random_state=42)
cb.fit(X_train, y_train)
y_pred = cb.predict(X_test)
y_pred

0:	learn: 15237.0543510	total: 194ms	remaining: 1m 17s
1:	learn: 14376.0390540	total: 393ms	remaining: 1m 18s
2:	learn: 13627.9817680	total: 592ms	remaining: 1m 18s
3:	learn: 12889.6697165	total: 791ms	remaining: 1m 18s
4:	learn: 12187.4530329	total: 990ms	remaining: 1m 18s
5:	learn: 11839.6138699	total: 1.19s	remaining: 1m 17s
6:	learn: 11332.4844024	total: 1.39s	remaining: 1m 18s
7:	learn: 10882.0501715	total: 1.59s	remaining: 1m 17s
8:	learn: 10347.0309070	total: 1.79s	remaining: 1m 17s
9:	learn: 10038.1923232	total: 2.08s	remaining: 1m 21s
10:	learn: 9667.6198614	total: 2.28s	remaining: 1m 20s
11:	learn: 9391.0425814	total: 2.48s	remaining: 1m 20s
12:	learn: 9185.4528443	total: 2.69s	remaining: 1m 20s
13:	learn: 9026.1217591	total: 2.89s	remaining: 1m 19s
14:	learn: 8850.0150333	total: 3.09s	remaining: 1m 19s
15:	learn: 8522.2733092	total: 3.38s	remaining: 1m 21s
16:	learn: 8272.8163594	total: 3.58s	remaining: 1m 20s
17:	learn: 8040.3975755	total: 3.78s	remaining: 1m 20s
18:	learn:

array([24862.49060799, 24862.49060799, 24862.49060799, 24862.49060799,
       25858.56987956, 25912.97528556, 21487.20052756, 21737.89815203,
       21977.80512044, 22273.16650775, 24172.70734676, 25069.6028153 ,
       25558.5053857 , 25507.72239247, 25942.5476359 , 26000.90620936,
       24877.0471345 , 24862.49060799, 24862.49060799, 24963.71510163,
       24996.7037845 , 24972.79580019, 24622.71734796, 24673.50034119,
       24356.51744155, 23140.38994048, 23237.39640618, 23253.50913508,
       24766.92061173, 24780.69924699, 24674.78529353, 24674.78529353,
       24941.92306921, 24887.15528041, 26032.67197906, 25796.71666929,
       24280.17374043, 26077.74239839, 19339.28882438, 21636.21355483,
       21645.23786209, 22380.30191588, 22301.310636  , 22306.53876412,
       24940.95544497, 25615.20392026, 25564.42092703, 25916.39625684,
       25908.51467691, 25938.40736357, 25980.34357364, 26000.90620936,
       25218.08245379, 25168.18100952, 24887.15528041, 24871.13468422,
      

In [19]:
# 도로 예측 데이터 불러오기
X_test = pd.read_csv('../data/타겟/타겟_교통량_추정준비.csv')
traffic_pred = pd.DataFrame({'id' : X_test['id'], 'traffic' : y_pred.tolist(), 'geometry' : X_test['geometry']})
traffic_pred 

Unnamed: 0,id,traffic,geometry
0,1.000000e+00,24862.490608,LINESTRING (14158515.413246289 4510208.1038329...
1,2.000000e+00,24862.490608,LINESTRING (14158767.132375112 4510149.0586052...
2,3.000000e+00,24862.490608,LINESTRING (14159034.389721759 4510032.0040309...
3,4.000000e+00,24862.490608,LINESTRING (14159390.732850628 4509754.3878724...
4,5.000000e+00,25858.569880,LINESTRING (14159535.756217027 4509480.9152386...
...,...,...,...
199,9.227728e+10,24174.019961,LINESTRING (14160634.365459003 4513497.4605573...
200,5.727468e+10,26063.854445,LINESTRING (14161122.960064624 4514237.0444874...
201,5.727104e+10,26069.619245,LINESTRING (14161658.090451784 4513658.4795839...
202,9.227728e+10,24188.576488,LINESTRING (14160656.938633423 4513377.6478747...


## 예측 결과 격자에 매핑

In [18]:
target_grid = pd.read_csv('../data/타겟/하남시_상권밀집도_상권유형_유동인구_예측.csv')
target_grid = target_grid[['gid', 'geometry']]
target_grid

Unnamed: 0,gid,geometry
0,다사710491,POLYGON ((127.17173076821828 37.54076919185162...
1,다사732511,POLYGON ((127.19656054667902 37.55886272066008...
2,다사703464,POLYGON ((127.16391635798603 37.51641053705482...
3,다사734504,POLYGON ((127.19885038306606 37.55255909902908...
4,다사731458,POLYGON ((127.19562254449121 37.51108833165109...
...,...,...
9629,다사755436,POLYGON ((127.22285189661623 37.49132535732914...
9630,다사766500,POLYGON ((127.23509129032205 37.54904054422564...
9631,다사744460,"POLYGON ((127.210325092193 37.51292802664879, ..."
9632,다사771452,POLYGON ((127.24090136510351 37.50578815250274...


In [6]:
import geopandas as gpd
from shapely import wkt
from tqdm import tqdm

tqdm.pandas()  # tqdm 활성화

# 문자열을 지오메트리 객체로 변환
target_grid['geometry'] = target_grid['geometry'].astype(str).apply(wkt.loads)
traffic_pred['geometry'] = traffic_pred['geometry'].astype(str).apply(wkt.loads)

# GeoDataFrame 생성
target_grid_gdf = gpd.GeoDataFrame(target_grid, geometry='geometry')
traffic_pred_gdf = gpd.GeoDataFrame(traffic_pred, geometry='geometry')

# 좌표계 설정 (격자는 WGS84, 도로는 다른 좌표계로 가정)
target_grid_gdf.set_crs("EPSG:4326", inplace=True)  # WGS84
traffic_pred_gdf.set_crs("EPSG:3857", inplace=True)  # Web Mercator

# target_grid_gdf를 EPSG:3857로 변환
target_grid_gdf = target_grid_gdf.to_crs("EPSG:3857")

# 빈 리스트 생성
traffic_sum_list = []

# tqdm을 사용한 반복문으로 진행률 표시
for grid_index, grid_row in tqdm(target_grid_gdf.iterrows(), total=target_grid_gdf.shape[0], desc="Processing grids"):
    # 현재 격자와 교차하는 도로 찾기
    intersecting_traffic = traffic_pred_gdf[traffic_pred_gdf['geometry'].intersects(grid_row['geometry'])]
    
    # 교차하는 도로의 교통량 합산
    total_traffic = intersecting_traffic['traffic'].sum()
    
    # 결과 저장
    traffic_sum_list.append({'gid': grid_row['gid'], 'traffic_sum': total_traffic})

# 결과를 데이터프레임으로 변환
traffic_sum_df = gpd.GeoDataFrame(traffic_sum_list)

# 원본 데이터프레임에 합계 교통량 추가
target_grid = target_grid.merge(traffic_sum_df, on='gid', how='left')

# 결과 출력
print(target_grid)

Processing grids: 100%|██████████| 9634/9634 [02:22<00:00, 67.79it/s]

           gid                                           geometry  traffic_sum
0     다사710491  POLYGON ((127.17173 37.54077, 127.17173 37.541...          0.0
1     다사732511  POLYGON ((127.19656 37.55886, 127.19656 37.559...          0.0
2     다사703464  POLYGON ((127.16392 37.51641, 127.16391 37.517...          0.0
3     다사734504  POLYGON ((127.19885 37.55256, 127.19885 37.553...          0.0
4     다사731458  POLYGON ((127.19562 37.51109, 127.19562 37.511...          0.0
...        ...                                                ...          ...
9629  다사755436  POLYGON ((127.22285 37.49133, 127.22285 37.492...          0.0
9630  다사766500  POLYGON ((127.23509 37.54904, 127.23509 37.549...          0.0
9631  다사744460  POLYGON ((127.21033 37.51293, 127.21032 37.513...          0.0
9632  다사771452  POLYGON ((127.24090 37.50579, 127.24090 37.506...          0.0
9633  다사702430  POLYGON ((127.16292 37.48576, 127.16292 37.486...          0.0

[9634 rows x 3 columns]





## 블록에 매핑

In [7]:
import geopandas as gpd
gdf = gpd.read_file('../data/3.대상구역도(하남교산).geojson')

# 타겟 데이터로부터 공동 주택 정보만 불러오기
gdf_house = gdf[gdf['blockType']=='공동주택']

# 특수기호 제거 (정규표현식 사용)
gdf_house['blockName'] = gdf_house['blockName'].str.replace(r'[^\w\s]', '', regex=True)

# 관찰할 컬럼만 필터링
gdf_house = gdf_house[['blockName', 'geometry']]

# 중심점 구하기
gdf_house['centroid'] = gdf_house.geometry.centroid

# 중심점에서 위도(lat)와 경도(lon) 추출
gdf_house['lon'] = gdf_house['centroid'].x  # 경도
gdf_house['lat'] = gdf_house['centroid'].y  # 위도
gdf_house.head()

Unnamed: 0,blockName,geometry,centroid,lon,lat
48,A11,"POLYGON ((127.19541 37.51630, 127.19538 37.516...",POINT (127.19406 37.51571),127.194059,37.515711
52,S1,"POLYGON ((127.22083 37.53749, 127.22135 37.536...",POINT (127.22085 37.53654),127.220855,37.536542
67,B8,"POLYGON ((127.18989 37.51374, 127.18990 37.513...",POINT (127.18957 37.51298),127.18957,37.512978
72,B9,"POLYGON ((127.19111 37.50762, 127.19120 37.507...",POINT (127.19119 37.50644),127.191186,37.506442
73,C2,"POLYGON ((127.19925 37.50853, 127.19865 37.507...",POINT (127.19725 37.50826),127.19725,37.508256


In [8]:
import geopandas as gpd
from shapely.geometry import Point
from shapely.wkt import loads

# gdf_house의 geometry 열을 shapely 객체로 변환
gdf_house['geometry'] = gdf_house['geometry'].astype(str) 
gdf_house['geometry'] = gdf_house['geometry'].apply(loads)

target_grid['geometry'] = target_grid['geometry'].astype(str) 
target_grid['geometry'] = target_grid['geometry'].apply(loads)

# GeoDataFrame으로 변환
gdf_house = gpd.GeoDataFrame(gdf_house, geometry='geometry', crs="EPSG:4326")
target_grid = gpd.GeoDataFrame(target_grid, geometry='geometry', crs="EPSG:4326")

# df_house의 lon, lat을 사용해 포인트 생성
gdf_house['geometry'] = gdf_house.apply(lambda row: Point(row['lon'], row['lat']), axis=1)

# df_house를 GeoDataFrame으로 변환 (crs는 필요에 따라 설정)
gdf_house = gpd.GeoDataFrame(gdf_house, geometry='geometry', crs="EPSG:4326")

# 각 df_house 지점에 대해 500m 버퍼 생성
gdf_house['buffer'] = gdf_house.geometry.buffer(0.005)

# df_traffic을 GeoDataFrame으로 변환 
target_grid = gpd.GeoDataFrame(target_grid, geometry='geometry', crs="EPSG:4326")

# 각 df_house의 버퍼와 df_traffic의 도로 정보가 겹치는 부분을 찾고, ALL_AADT_s 합계 계산
def get_traffic_sum_for_buffer(house_idx):
    buffer_geom = gdf_house.loc[house_idx, 'buffer']
    
    # 버퍼와 겹치는 도로 찾기
    overlapping_roads = target_grid[target_grid.geometry.intersects(buffer_geom)]
    
    # 겹치는 도로의 ALL_AADT_s 합계 계산
    traffic_sum = overlapping_roads['traffic_sum'].sum()
    
    return traffic_sum

# 각 df_house의 버퍼에 대해 교차하는 도로의 ALL_AADT_s 합계를 구함
gdf_house['traffic_sum'] = gdf_house.index.to_series().apply(get_traffic_sum_for_buffer)

# 결과 출력
gdf_house[['lon', 'lat', 'traffic_sum']].head()

Unnamed: 0,lon,lat,traffic_sum
48,127.194059,37.515711,441134.3
52,127.220855,37.536542,922094.4
67,127.18957,37.512978,223500.1
72,127.191186,37.506442,1773257.0
73,127.19725,37.508256,1665686.0


In [9]:
# 필요한 데이터만 추출
gdf_traffic_sum = gdf_house[['blockName', 'centroid', 'traffic_sum']]
gdf_traffic_sum

Unnamed: 0,blockName,centroid,traffic_sum
48,A11,POINT (127.19406 37.51571),441134.3
52,S1,POINT (127.22085 37.53654),922094.4
67,B8,POINT (127.18957 37.51298),223500.1
72,B9,POINT (127.19119 37.50644),1773257.0
73,C2,POINT (127.19725 37.50826),1665686.0
75,C3,POINT (127.19386 37.49738),1205541.0
95,A5,POINT (127.22399 37.53289),1271488.0
96,A7,POINT (127.20791 37.52356),914313.0
97,A8,POINT (127.20836 37.52127),735644.2
98,B1,POINT (127.21674 37.53551),2103805.0


In [10]:
gdf_traffic_sum.to_csv('../data/타겟/교산_블록별_교통량.csv', index=False)

In [11]:
traffic_house = pd.read_csv('../data/타겟/교산_블록별_교통량.csv')
traffic_house.head()

Unnamed: 0,blockName,centroid,traffic_sum
0,A11,POINT (127.19405888260044 37.515711397601265),441134.3
1,S1,POINT (127.22085463797028 37.53654240083496),922094.4
2,B8,POINT (127.18957037420951 37.51297849300711),223500.1
3,B9,POINT (127.19118576824998 37.50644181369634),1773257.0
4,C2,POINT (127.19725030250127 37.50825574055258),1665686.0


In [13]:
gdf_traffic_sum.traffic_sum.to_numpy()

array([ 441134.32283283,  922094.43245474,  223500.07494304,
       1773257.21340514, 1665685.62053343, 1205540.79350679,
       1271488.36631279,  914312.98828786,  735644.1766248 ,
       2103804.79668342, 1106190.19661459,  123875.21882515,
        497524.32433697, 2389136.78505953, 1910109.12744289,
       1786471.89096204, 1206137.88531431, 1039732.98158808,
       1581282.69529845, 1063531.49090389,  246414.80216744,
       1264925.29442275,  990833.74651239, 1516366.26589462,
       1147555.75173743, 1227971.859136  , 1069132.23709774,
        896564.90532857, 1782584.42162284, 1765984.91486374,
       1947968.96093149,  398996.3465109 , 1374885.67009983,
       1164933.54025457,  562298.75682421,  980388.02335828,
        980388.02335828, 2499372.24669258])