In [110]:
import pandas as pd 
import geopandas as gpd
import numpy as np
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import haversine_distances
from math import radians
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

### 화성

### 화성_상권밀집도 학습모델

In [111]:
gdf_hwa = pd.read_csv('../data/상권정보예측.csv')

X_hwa = gdf_hwa[['nearest_subway_distance','nearest_parking_distance', 'weighted_avg_households']]
y_hwa = gdf_hwa['상권밀집도']
X_train_hwa, X_test_hwa, y_train_hwa, y_test_hwa = train_test_split(X_hwa, y_hwa, test_size=0.3, random_state=42)

param_grid = {
    'n_estimators': [100],
    'max_depth': [10,15],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

rf_hwa = RandomForestClassifier(random_state=42)
grid_search_hwa = GridSearchCV(estimator=rf_hwa, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search_hwa.fit(X_train_hwa, y_train_hwa)

best_rf_hwa = grid_search_hwa.best_estimator_
y_pred_hwa = best_rf_hwa.predict(X_test_hwa)
print("Random Forest Best Parameters:", grid_search_hwa.best_params_)
print("Random Forest Classification Report:\n", classification_report(y_test_hwa, y_pred_hwa))

train_accuracy_hwa = best_rf_hwa.score(X_train_hwa, y_train_hwa)
test_accuracy_hwa = best_rf_hwa.score(X_test_hwa, y_test_hwa)

print(f"Train Accuracy: {train_accuracy_hwa:.4f}")
print(f"Test Accuracy: {test_accuracy_hwa:.4f}")

if train_accuracy_hwa - test_accuracy_hwa > 0.1:
    print("Warning: The model may be overfitting.")
else:
    print("The model does not show significant signs of overfitting.")

y_train_pred_hwa = best_rf_hwa.predict(X_train_hwa)
print("Random Forest Classification Report:\n", classification_report(y_train_hwa, y_train_pred_hwa))

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.5s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.6s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.6s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.5s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.5s
[CV] END max_depth=15, min_samp

In [112]:
gdf_hwa = gdf_hwa.drop(columns=['parking_1km','parking_500m'])

In [113]:
gdf_hwa.head(2)

Unnamed: 0,gid,상권밀집도,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance,nearest_parking_distance,weighted_avg_households
0,다사385011,0.0,POLYGON ((126.80784063508459 37.10653637928979...,POINT (126.80839923101641 37.10699034270337),126.808399,37.10699,9203.670759,2616.915044,966.231874
1,다사312110,1.0,"POLYGON ((126.7247779576752 37.19526074709075,...",POINT (126.7253366940065 37.19571509699195),126.725337,37.195715,8042.302201,2206.415809,918.546292


### 화성_상권유형 학습모델

In [114]:
df_type = pd.read_csv('../data/화성_상권유형.csv')

In [115]:
df_type.cluster.value_counts()

일반상권    13797
주택상권    10143
복합상권    10036
Name: cluster, dtype: int64

In [116]:
from sklearn.preprocessing import LabelEncoder

# NaN을 새로운 카테고리로 대체
df_type['cluster'] = df_type['cluster'].fillna('미확인')
# 수동 매핑 설정
mapping = {'일반상권': 1, '주택상권': 2, '복합상권': 3}
# cluster 열 매핑 적용 및 NaN 처리
df_type['cluster_encoded'] = df_type['cluster'].map(mapping).fillna(0).astype(int)

In [117]:
df_type.head(2)

Unnamed: 0,gid,year,m_20g_pop,w_20g_pop,m_30g_pop,w_30g_pop,m_40g_pop,w_40g_pop,m_50g_pop,w_50g_pop,...,w_100g_pop,total_pop,youth_pop,23_20_total,23_20_youth,blck_cd,cluster,geometry,상권밀집도,cluster_encoded
0,다사385011,2023,,,,0.0,,,,,...,,0.0,0.0,0.0,0.0,,미확인,POLYGON ((126.80784063508459 37.10653637928979...,0.0,0
1,다사312110,2023,,0.0,0.0,0.0,0.0,,0.0,0.0,...,,0.0,0.0,0.0,0.0,,주택상권,"POLYGON ((126.7247779576752 37.19526074709075,...",1.0,2


In [118]:
gdf_hwa.shape

(71384, 9)

In [119]:
df_type.shape

(71384, 29)

In [120]:
df_type.columns

Index(['gid', 'year', 'm_20g_pop', 'w_20g_pop', 'm_30g_pop', 'w_30g_pop',
       'm_40g_pop', 'w_40g_pop', 'm_50g_pop', 'w_50g_pop', 'm_60g_pop',
       'w_60g_pop', 'm_70g_pop', 'w_70g_pop', 'm_80g_pop', 'w_80g_pop',
       'm_90g_pop', 'w_90g_pop', 'm_100g_pop', 'w_100g_pop', 'total_pop',
       'youth_pop', '23_20_total', '23_20_youth', 'blck_cd', 'cluster',
       'geometry', '상권밀집도', 'cluster_encoded'],
      dtype='object')

In [121]:
df_type = df_type.drop(columns=['year', 'm_20g_pop', 'w_20g_pop', 'm_30g_pop', 'w_30g_pop',
       'm_40g_pop', 'w_40g_pop', 'm_50g_pop', 'w_50g_pop', 'm_60g_pop',
       'w_60g_pop', 'm_70g_pop', 'w_70g_pop', 'm_80g_pop', 'w_80g_pop',
       'm_90g_pop', 'w_90g_pop', 'm_100g_pop', 'w_100g_pop', 'total_pop',
       'youth_pop', '23_20_total', '23_20_youth', 'blck_cd', 'cluster',
       'geometry', '상권밀집도'])

In [122]:
df_type.head(2)

Unnamed: 0,gid,cluster_encoded
0,다사385011,0
1,다사312110,2


In [123]:
gdf_hwa = pd.merge(gdf_hwa,df_type,on='gid')

In [124]:
gdf_hwa.head(2)

Unnamed: 0,gid,상권밀집도,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance,nearest_parking_distance,weighted_avg_households,cluster_encoded
0,다사385011,0.0,POLYGON ((126.80784063508459 37.10653637928979...,POINT (126.80839923101641 37.10699034270337),126.808399,37.10699,9203.670759,2616.915044,966.231874,0
1,다사312110,1.0,"POLYGON ((126.7247779576752 37.19526074709075,...",POINT (126.7253366940065 37.19571509699195),126.725337,37.195715,8042.302201,2206.415809,918.546292,2


In [125]:
# 데이터 준비
X_hwa_type = gdf_hwa[['nearest_subway_distance', 'nearest_parking_distance', 'weighted_avg_households', '상권밀집도']]
y_hwa_type = gdf_hwa['cluster_encoded']
X_hwa_type_train, X_hwa_type_test, y_hwa_type_train, y_hwa_type_test = train_test_split(X_hwa_type, y_hwa_type, test_size=0.3, random_state=42)

# Random Forest 하이퍼파라미터 튜닝
param_grid_hwa_type = {
    'n_estimators': [100],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_hwa_type = RandomForestClassifier(random_state=42)
grid_search_hwa_type = GridSearchCV(estimator=rf_hwa_type, param_grid=param_grid_hwa_type, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search_hwa_type.fit(X_hwa_type_train, y_hwa_type_train)

# 최적의 모델로 예측 및 평가
best_rf_hwa_type = grid_search_hwa_type.best_estimator_
y_hwa_type_pred = best_rf_hwa_type.predict(X_hwa_type_test)
print("Random Forest Best Parameters:", grid_search_hwa_type.best_params_)
print("Random Forest Classification Report:\n", classification_report(y_hwa_type_test, y_hwa_type_pred))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   3.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.0s
[CV] END max_depth=10, min_sam

### 화성_유동인구 학습모델

In [126]:
df_hwa_p = pd.read_csv('../data/타겟/화성시_유동인구예측.csv')
df_hwa_p = df_hwa_p.drop(columns='Unnamed: 0')

In [127]:
X_hwa_p = df_hwa_p.drop(columns=['pop_avg'])  # 특성 (입력 변수)
y_hwa_p = df_hwa_p['pop_avg']  # 타겟 (유동인구 지수)

# Box-Cox 변환 (타겟 변수에 대해 적용)
y_hwa_p_positive = y_hwa_p + 1  # Box-Cox 변환을 위해 0 이하 값 처리 (필요한 경우)
y_hwa_p_boxcox, lambda_hwa_p_boxcox = stats.boxcox(y_hwa_p_positive)

# 데이터 분할 (학습용, 테스트용 데이터)
X_hwa_p_train, X_hwa_p_test, y_hwa_p_train_boxcox, y_hwa_p_test_boxcox = train_test_split(X_hwa_p, y_hwa_p_boxcox, test_size=0.2, random_state=42)

# 특성 스케일링 (StandardScaler)
scaler_hwa_p = StandardScaler()
X_hwa_p_train_scaled = scaler_hwa_p.fit_transform(X_hwa_p_train)
X_hwa_p_test_scaled = scaler_hwa_p.transform(X_hwa_p_test)

# 최적 하이퍼파라미터 설정
best_rf_model_hwa_p = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42
)

# 모델 학습
best_rf_model_hwa_p.fit(X_hwa_p_train_scaled, y_hwa_p_train_boxcox)

# Box-Cox 변환된 예측값을 얻기
y_hwa_p_pred_boxcox = best_rf_model_hwa_p.predict(X_hwa_p_test_scaled)

# Box-Cox 예측값을 원래 스케일로 복원 (Inverse Box-Cox 변환)
y_hwa_p_pred_boxcox_original = np.exp(y_hwa_p_pred_boxcox) - 1  # Box-Cox의 역변환
y_hwa_p_test_boxcox_original = np.exp(y_hwa_p_test_boxcox) - 1  # 실제값의 역변환

# 성능 평가 - Box-Cox 변환
mse_hwa_p_boxcox = mean_squared_error(y_hwa_p_test_boxcox_original, y_hwa_p_pred_boxcox_original)
r2_hwa_p_boxcox = r2_score(y_hwa_p_test_boxcox_original, y_hwa_p_pred_boxcox_original)

# 결과 출력
print("Box-Cox 변환 결과:")
print(f"Mean Squared Error: {mse_hwa_p_boxcox:.2f}")
print(f"R^2 Score: {r2_hwa_p_boxcox:.2f}\n")

Box-Cox 변환 결과:
Mean Squared Error: 2.69
R^2 Score: 0.68



### 하남

### 하남_상권밀집도_예측

In [128]:
df1_ha = pd.read_csv('../data/타겟/교산_신설공동주택정보.csv')
df2_ha = pd.read_csv('../data/타겟/하남시_격자별_블록코드.csv')
df3_ha = pd.read_csv('../data/타겟/하남_지하철.csv')
df4_ha = pd.read_csv('../data/타겟/하남_학교.csv')
df5_ha = pd.read_csv('../data/타겟/하남시_공장등록_현황.csv', encoding='cp949')
df6_ha = pd.read_csv('../data/타겟/2-5.하남시_버스정류장.csv')

In [129]:
df7_ha_1 = pd.read_csv('../data/타겟/2-7.하남시_공영주차장.csv')
df7_ha_2 = pd.read_csv('../data/하남시_교산지구_주차장_입지선정_3차.csv')

In [130]:
df7_ha_1 = df7_ha_1.drop(columns=['carpark_type','slots'])

In [131]:
df7_ha_1.head(2)

Unnamed: 0,carpark_nm,lon,lat
0,하남프라자 주변,127.208296,37.551446
1,에코1단지 주변,127.208497,37.541546


In [132]:
df7_ha_2.head(2)

Unnamed: 0,index,gid,nearest_subway_distance,nearest_bus_distance,nearest_fac_distance
0,0,다사737475,100.039167,440.146793,61.652351
1,1,다사732450,728.29584,417.271102,81.456037


In [133]:
df1_ha = df1_ha.drop(columns='Unnamed: 0')
df2_ha = df2_ha.drop(columns='blck_cd')
df5_ha = df5_ha.drop(columns=['행정기관명', '설립구분명', '용지면적(㎡)', '건축면적(㎡)', '종업원수', '공장규모구분명',
       '공장등록일', '용도지역명', '지목명', '업종명', '업종코드', '생산품정보', '전화번호', '정제우편번호',
       '정제도로명주소', '정제지번주소'])
df6_ha = df6_ha.drop(columns='bis_id')
df3_ha.columns= ['역이름','경도','위도']
df4_ha.columns= ['학교이름','경도','위도']
df5_ha.columns = ['회사이름', '위도', '경도'] 
df6_ha.columns= ['역이름','경도','위도']

In [134]:
df1_ha.head(2)

Unnamed: 0,blockName,geometry,세대수,통합공임 포함,10평이상,20평이상,30평이상,area
0,A11,POLYGON ((127.19541408002533 37.51630069427863...,715,1,0,1,0,3e-06
1,S1,POLYGON ((127.22083392223836 37.53748968517276...,333,0,0,1,1,2e-06


In [135]:
df2_ha.head(2)

Unnamed: 0,gid,geometry
0,다사710491,POLYGON ((127.17173076821828 37.54076919185162...
1,다사732511,POLYGON ((127.19656054667902 37.55886272066008...


In [136]:
df3_ha.head(2)

Unnamed: 0,역이름,경도,위도
0,미사,127.192979,37.563254
1,하남풍산,127.203871,37.552058


In [137]:
df4_ha.head(2)

Unnamed: 0,학교이름,경도,위도
0,중학교,127.148312,37.477211
1,중학교,127.202479,37.535497


In [138]:
df5_ha.head(2)

Unnamed: 0,회사이름,위도,경도
0,(주)신영정보기술,37.553504,127.19468
1,(주)신화메디,37.553504,127.19468


In [139]:
df6_ha.head(2)

Unnamed: 0,역이름,경도,위도
0,위례자이더시티.플로리체위례,127.149133,37.473767
1,위례롯데캐슬.호반써밋에비뉴,127.151933,37.473883


#### 기준이 되는 dataframe : df2_ha

In [140]:
gdf_ha = df2_ha.copy()

In [141]:
from shapely.wkt import loads
import geopandas as gpd

# 문자열을 지리 데이터로 변환 후, GeoDataFrame으로 변환
gdf_ha['geometry'] = gdf_ha['geometry'].apply(loads)
gdf_ha = gpd.GeoDataFrame(gdf_ha, geometry='geometry')

In [142]:
def calculate_centroid_ha(geometry):
    return geometry.centroid

def calculate_distance_ha(coord1, coord2):
    return geodesic(coord1, coord2).meters

gdf_ha['centroid'] = gdf_ha['geometry'].apply(calculate_centroid_ha)
gdf_ha['centroid_lon'] = gdf_ha['centroid'].apply(lambda x: x.x)
gdf_ha['centroid_lat'] = gdf_ha['centroid'].apply(lambda x: x.y)
gdf_ha.head(2)

Unnamed: 0,gid,geometry,centroid,centroid_lon,centroid_lat
0,다사710491,"POLYGON ((127.17173 37.54077, 127.17173 37.541...",POINT (127.17229 37.54122),127.172295,37.541221
1,다사732511,"POLYGON ((127.19656 37.55886, 127.19656 37.559...",POINT (127.19712 37.55931),127.197125,37.559315


#### 지하철 계산

In [143]:
def calculate_min_distance_ha(df1_coords_ha, target_coords_ha):
    distances_ha = np.array([
        geodesic(target_coords_ha, station_coords_ha).meters
        for station_coords_ha in df1_coords_ha
    ])
    return distances_ha.min()

# df3_ha 지하철역 좌표 준비
df3_coords_ha = df3_ha[['위도', '경도']].values

# 각 격자 중심과 가장 가까운 지하철역의 거리 계산
def calculate_distances_ha(row_ha):
    target_coords_ha = (row_ha['centroid_lat'], row_ha['centroid_lon'])
    return calculate_min_distance_ha(df3_coords_ha, target_coords_ha)

gdf_ha['nearest_subway_distance'] = gdf_ha.apply(calculate_distances_ha, axis=1)

In [144]:
gdf_ha.head(2)

Unnamed: 0,gid,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance
0,다사710491,"POLYGON ((127.17173 37.54077, 127.17173 37.541...",POINT (127.17229 37.54122),127.172295,37.541221,3014.305191
1,다사732511,"POLYGON ((127.19656 37.55886, 127.19656 37.559...",POINT (127.19712 37.55931),127.197125,37.559315,570.356206


In [145]:
df7_ha_2.head(1)

Unnamed: 0,index,gid,nearest_subway_distance,nearest_bus_distance,nearest_fac_distance
0,0,다사737475,100.039167,440.146793,61.652351


#### 주차장 계산

In [146]:
df7_ha_2_1 = pd.merge(df7_ha_2,gdf_ha,on='gid',how='inner')
df7_ha_2_1 = df7_ha_2_1.drop(columns=['gid', 'nearest_subway_distance_x', 'nearest_bus_distance',
       'nearest_fac_distance', 'geometry', 'centroid','nearest_subway_distance_y'])

In [149]:
df7_ha_2_1.columns= ['이름','경도','위도']

In [150]:
df7_ha_1.columns= ['이름','경도','위도']

In [151]:
df7_ha = pd.concat([df7_ha_2_1, df7_ha_1])
df7_ha.shape
df7_ha['이름'] = df7_ha['이름'].fillna('New')

In [152]:
df7_ha.shape

(50, 3)

In [153]:
df7_ha

Unnamed: 0,이름,경도,위도
0,0,127.202915,37.526881
1,1,127.197347,37.504333
2,2,127.197249,37.528669
3,9,127.190529,37.511526
4,20,127.206285,37.533199
5,37,127.204032,37.530489
6,39,127.204022,37.533193
7,40,127.202911,37.527782
8,44,127.201769,37.530483
9,46,127.186041,37.5025


In [154]:
df7_ha.to_csv('../data/타겟/하남시_공영주차장_최종.csv')

In [155]:
# 지구의 반지름 (6371km -> 6371000m)
EARTH_RADIUS_METERS = 6371000

df7_coords_rad_ha = np.radians(df7_ha[['위도','경도']].values)

df_coords_rad_ha = np.radians(gdf_ha[['centroid_lat', 'centroid_lon']].values)

def calculate_nearest_park_distances_ha(df7_coords_rad_ha, df_coords_rad_ha):
    # haversine_distances로 모든 거리 계산
    distances_ha = haversine_distances(df_coords_rad_ha, df7_coords_rad_ha) * EARTH_RADIUS_METERS
    # 각 격자 중심별 가장 가까운 거리 선택
    nearest_distances_ha = distances_ha.min(axis=1)
    return nearest_distances_ha

# 결과 계산
gdf_ha['nearest_parking_distance'] = calculate_nearest_park_distances_ha(df7_coords_rad_ha, df_coords_rad_ha)

In [156]:
gdf_ha.head(2)

Unnamed: 0,gid,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance,nearest_parking_distance
0,다사710491,"POLYGON ((127.17173 37.54077, 127.17173 37.541...",POINT (127.17229 37.54122),127.172295,37.541221,3014.305191,387.107017
1,다사732511,"POLYGON ((127.19656 37.55886, 127.19656 37.559...",POINT (127.19712 37.55931),127.197125,37.559315,570.356206,914.268874


#### 세대수, 거리 계산

In [157]:
from shapely.wkt import loads
import geopandas as gpd

# 문자열을 지리 데이터로 변환 후, GeoDataFrame으로 변환
df1_ha['geometry'] = df1_ha['geometry'].apply(loads)
df1_ha = gpd.GeoDataFrame(df1_ha, geometry='geometry')

df1_ha['centroid'] = df1_ha['geometry'].apply(calculate_centroid_ha)
df1_ha['centroid_lon'] = df1_ha['centroid'].apply(lambda x: x.x)
df1_ha['centroid_lat'] = df1_ha['centroid'].apply(lambda x: x.y)
df1_ha.head(2)

Unnamed: 0,blockName,geometry,세대수,통합공임 포함,10평이상,20평이상,30평이상,area,centroid,centroid_lon,centroid_lat
0,A11,"POLYGON ((127.19541 37.51630, 127.19538 37.516...",715,1,0,1,0,3e-06,POINT (127.19406 37.51571),127.194059,37.515711
1,S1,"POLYGON ((127.22083 37.53749, 127.22135 37.536...",333,0,0,1,1,2e-06,POINT (127.22085 37.53654),127.220855,37.536542


In [158]:
def calculate_weighted_average_ha(row_ha, df1_ha):
    target_coords_ha = (row_ha['centroid_lat'], row_ha['centroid_lon'])
    df1_ha['distance'] = df1_ha.apply(
        lambda x_ha: geodesic(target_coords_ha, (x_ha['centroid_lat'], x_ha['centroid_lon'])).meters, axis=1
    )
    # 거리의 역수와 세대수를 곱해 가중치 계산
    df1_ha['weight_ha'] = df1_ha['세대수'] / df1_ha['distance']
    weighted_sum_ha = (df1_ha['weight_ha'] * df1_ha['세대수']).sum()
    total_weight_ha = df1_ha['weight_ha'].sum()
    return weighted_sum_ha / total_weight_ha if total_weight_ha > 0 else 0

# 격자별 계산
gdf_ha['weighted_avg_households'] = gdf_ha.apply(
    lambda row_ha: calculate_weighted_average_ha(row_ha, df1_ha), axis=1
)

In [159]:
gdf_ha.head(2)

Unnamed: 0,gid,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance,nearest_parking_distance,weighted_avg_households
0,다사710491,"POLYGON ((127.17173 37.54077, 127.17173 37.541...",POINT (127.17229 37.54122),127.172295,37.541221,3014.305191,387.107017,798.802334
1,다사732511,"POLYGON ((127.19656 37.55886, 127.19656 37.559...",POINT (127.19712 37.55931),127.197125,37.559315,570.356206,914.268874,806.334538


#### 버스정류장 계산

In [160]:
# 지구의 반지름 (6371km -> 6371000m)
EARTH_RADIUS_METERS = 6371000

# df6_ha 버스 정류장 좌표를 radians로 변환
df6_coords_rad_ha = np.radians(df6_ha[['위도','경도']].values)

# df6_ha 격자 중심 좌표를 radians로 변환
df_coords_rad_ha = np.radians(gdf_ha[['centroid_lat', 'centroid_lon']].values)

# haversine_distances를 이용한 거리 계산
def calculate_nearest_bus_distances_ha(df6_coords_rad_ha, df_coords_rad_ha):
    # haversine_distances로 모든 거리 계산
    distances_ha = haversine_distances(df_coords_rad_ha, df6_coords_rad_ha) * EARTH_RADIUS_METERS
    # 각 격자 중심별 가장 가까운 거리 선택
    nearest_distances_ha = distances_ha.min(axis=1)
    return nearest_distances_ha

# 결과 계산
gdf_ha['nearest_bus_distance'] = calculate_nearest_bus_distances_ha(df6_coords_rad_ha, df_coords_rad_ha)

In [161]:
gdf_ha.head(2)

Unnamed: 0,gid,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance,nearest_parking_distance,weighted_avg_households,nearest_bus_distance
0,다사710491,"POLYGON ((127.17173 37.54077, 127.17173 37.541...",POINT (127.17229 37.54122),127.172295,37.541221,3014.305191,387.107017,798.802334,179.354672
1,다사732511,"POLYGON ((127.19656 37.55886, 127.19656 37.559...",POINT (127.19712 37.55931),127.197125,37.559315,570.356206,914.268874,806.334538,188.655467


#### 제조업 공장 계산

In [162]:
# 지구의 반지름 (6371km -> 6371000m)
EARTH_RADIUS_METERS = 6371000

df5_coords_rad_ha = np.radians(df5_ha[['위도','경도']].values)

df_coords_rad_ha = np.radians(gdf_ha[['centroid_lat', 'centroid_lon']].values)

def calculate_nearest_fac_distances_ha(df5_coords_rad_ha, df_coords_rad_ha):
    distances_ha = haversine_distances(df_coords_rad_ha, df5_coords_rad_ha) * EARTH_RADIUS_METERS
    nearest_distances_ha = distances_ha.min(axis=1)
    return nearest_distances_ha

gdf_ha['nearest_fac_distance'] = calculate_nearest_fac_distances_ha(df5_coords_rad_ha, df_coords_rad_ha)

In [163]:
gdf_ha.head(2)

Unnamed: 0,gid,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance,nearest_parking_distance,weighted_avg_households,nearest_bus_distance,nearest_fac_distance
0,다사710491,"POLYGON ((127.17173 37.54077, 127.17173 37.541...",POINT (127.17229 37.54122),127.172295,37.541221,3014.305191,387.107017,798.802334,179.354672,111.434837
1,다사732511,"POLYGON ((127.19656 37.55886, 127.19656 37.559...",POINT (127.19712 37.55931),127.197125,37.559315,570.356206,914.268874,806.334538,188.655467,681.144494


### 하남_상권밀집도_예측

In [164]:
# 필요한 입력 데이터로 gdf_ha에서 예측 변수 추출
X_ha = gdf_ha[['nearest_subway_distance', 'nearest_parking_distance', 'weighted_avg_households']]

# 학습된 Random Forest 모델(best_rf_hwa)로 예측 수행
y_pred_ha = best_rf_hwa.predict(X_ha)

print("Predicted Target Values for gdf_ha:\n", y_pred_ha)

Predicted Target Values for gdf_ha:
 [3. 3. 1. ... 2. 0. 0.]


In [165]:
y_pred_ha

array([3., 3., 1., ..., 2., 0., 0.])

In [166]:
gdf_ha['상권밀집도_pred'] = y_pred_ha
gdf_ha.head(2)

Unnamed: 0,gid,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance,nearest_parking_distance,weighted_avg_households,nearest_bus_distance,nearest_fac_distance,상권밀집도_pred
0,다사710491,"POLYGON ((127.17173 37.54077, 127.17173 37.541...",POINT (127.17229 37.54122),127.172295,37.541221,3014.305191,387.107017,798.802334,179.354672,111.434837,3.0
1,다사732511,"POLYGON ((127.19656 37.55886, 127.19656 37.559...",POINT (127.19712 37.55931),127.197125,37.559315,570.356206,914.268874,806.334538,188.655467,681.144494,3.0


### 하남_상권유형_예측

In [167]:
# 필요한 입력 데이터로 gdf_ha에서 예측 변수 추출
X_ha = gdf_ha[['nearest_subway_distance', 'nearest_parking_distance', 'weighted_avg_households','상권밀집도_pred']]

# 학습된 Random Forest 모델(best_rf_hwa)로 예측 수행
y_type_ha = best_rf_hwa_type.predict(X_ha)

print("Predicted Target Values for gdf_ha:\n", y_type_ha)

Predicted Target Values for gdf_ha:
 [2 2 1 ... 2 0 0]


In [168]:
y_type_ha

array([2, 2, 1, ..., 2, 0, 0])

In [169]:
gdf_ha['상권유형_pred'] = y_type_ha
gdf_ha.head(2)

Unnamed: 0,gid,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance,nearest_parking_distance,weighted_avg_households,nearest_bus_distance,nearest_fac_distance,상권밀집도_pred,상권유형_pred
0,다사710491,"POLYGON ((127.17173 37.54077, 127.17173 37.541...",POINT (127.17229 37.54122),127.172295,37.541221,3014.305191,387.107017,798.802334,179.354672,111.434837,3.0,2
1,다사732511,"POLYGON ((127.19656 37.55886, 127.19656 37.559...",POINT (127.19712 37.55931),127.197125,37.559315,570.356206,914.268874,806.334538,188.655467,681.144494,3.0,2


In [170]:
gdf_ha['상권유형_pred'].value_counts()

0    4832
2    3087
1    1563
3     152
Name: 상권유형_pred, dtype: int64

### 하남_유동인구_예측

In [171]:
# gdf_ha로 예측 진행
X_gdf_ha_hwa_p = gdf_ha.drop(columns=['gid','geometry','centroid','상권유형_pred'])  # 예측 데이터의 특성
X_gdf_ha_hwa_p_scaled = scaler_hwa_p.transform(X_gdf_ha_hwa_p)  # gdf_ha 데이터 스케일링

# 예측
y_gdf_ha_pred_boxcox_hwa_p = best_rf_model_hwa_p.predict(X_gdf_ha_hwa_p_scaled)

# Box-Cox 예측값을 원래 스케일로 복원 (Inverse Box-Cox 변환)
y_gdf_ha_pred_boxcox_original_hwa_p = np.exp(y_gdf_ha_pred_boxcox_hwa_p) - 1  # Box-Cox 역변환

# 예측 결과 출력
print("gdf_ha 예측 결과:")
print(y_gdf_ha_pred_boxcox_original_hwa_p)

gdf_ha 예측 결과:
[3.13195059 1.79761927 3.13179804 ... 1.85282169 1.98655641 2.0796592 ]


In [172]:
gdf_ha['유동인구_pred'] = y_gdf_ha_pred_boxcox_original_hwa_p
gdf_ha.head(2)

Unnamed: 0,gid,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance,nearest_parking_distance,weighted_avg_households,nearest_bus_distance,nearest_fac_distance,상권밀집도_pred,상권유형_pred,유동인구_pred
0,다사710491,"POLYGON ((127.17173 37.54077, 127.17173 37.541...",POINT (127.17229 37.54122),127.172295,37.541221,3014.305191,387.107017,798.802334,179.354672,111.434837,3.0,2,3.131951
1,다사732511,"POLYGON ((127.19656 37.55886, 127.19656 37.559...",POINT (127.19712 37.55931),127.197125,37.559315,570.356206,914.268874,806.334538,188.655467,681.144494,3.0,2,1.797619


In [173]:
gdf_ha.to_csv('../data/타겟/하남시_상권밀집도_상권유형_유동인구_예측.csv')