In [2]:
import pandas as pd 
import geopandas as gpd
import numpy as np
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 상권 유형 예측 

활용 데이터
1. 지하철역 
2. 임대주택 정보
3. 주차장
4. 상권 밀집도 

In [3]:
# Load the four tabular inputs in one pass; the order of the paths matches the
# order of the names on the left-hand side.
df1, df2, df3, df4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/1-6.화성시_지하철역.csv',            # subway stations
        '../data/1-7.화성시_공영주차장.csv',          # public parking lots
        '../data/1-12.공공주택임대_정보(화성시).csv',  # public rental housing
        '../data/화성2022.csv',                       # holds the target: the commercial-district type to predict
    )
)

# Grid geometries, read separately because they come from a GeoJSON file.
gdf = gpd.read_file('../data/1-14.화성시_격자.geojson')

In [4]:
# Replace the column labels of each table positionally.
# NOTE(review): positional assignment assumes each file has exactly this many
# columns in exactly this order — confirm against the raw files.
df1.columns = [
    '역이름', '지하철노선이름', '경도', '위도',
]
# The original cell carried a bare "101" note here; presumably the row count of
# the parking table — TODO confirm.
df2.columns = [
    '이름', '종류', '주차면수', '경도', '위도',
]
df3.columns = [
    '블록코드', '단지명', '지원유형', '세대수', '주차면수', '경도', '위도',
]

# Only one column of df4 is renamed; every other column keeps its label.
df4 = df4.rename(columns={'fac_density': '상권밀집도'})

In [5]:
# Class balance of the target column (value_counts leaves out missing values by default).
df4['cluster'].value_counts()

일반상권    13391
복합상권     9824
주택상권     9401
Name: cluster, dtype: int64

In [14]:
from sklearn.preprocessing import LabelEncoder

# Give missing cluster labels an explicit placeholder category.
df4['cluster'] = df4['cluster'].fillna(value='미확인')

# Hand-written label -> integer code table for the three known district types.
mapping = dict(zip(['일반상권', '주택상권', '복합상권'], [1, 2, 3]))

# Translate labels to codes; anything not in the table (including the
# placeholder above) comes back as NaN from map() and is then coded as 0.
df4['cluster_encoded'] = (
    df4['cluster']
    .map(mapping)
    .fillna(0)
    .astype(int)
)


In [16]:
# Keep only the grid id, the density feature and the encoded label ...
target = df4[
    ['gid', '상권밀집도', 'cluster_encoded']
]
# ... and attach the grid geometries. No join key is given, so pandas joins on
# every column name the two frames share (presumably just 'gid' — TODO confirm).
gdf2 = pd.merge(target, gdf)
def calculate_centroid(geometry):
    """Return the ``centroid`` attribute of ``geometry``.

    Args:
        geometry: Any object exposing a ``centroid`` attribute.

    Returns:
        Whatever ``geometry.centroid`` evaluates to.
    """
    center = geometry.centroid
    return center

def calculate_distance(coord1, coord2):
    """Return the geodesic distance between two coordinates, in meters.

    Args:
        coord1: First point, passed straight to ``geodesic``
            (presumably a (latitude, longitude) pair — TODO confirm at call sites).
        coord2: Second point, same form as ``coord1``.

    Returns:
        The ``meters`` attribute of ``geodesic(coord1, coord2)``.
    """
    separation = geodesic(coord1, coord2)
    return separation.meters

# Centroid of every grid geometry, then its coordinates as plain columns:
# the point's x is stored as longitude and its y as latitude.
gdf2['centroid'] = gdf2['geometry'].apply(calculate_centroid)
gdf2['centroid_lon'] = gdf2['centroid'].apply(lambda point: point.x)
gdf2['centroid_lat'] = gdf2['centroid'].apply(lambda point: point.y)


In [17]:
# Distance helper
def calculate_min_distance(df1_coords, target_coords):
    """Return the smallest geodesic distance (meters) from one point to a set of points.

    Args:
        df1_coords: Iterable of coordinates, each passed to ``geodesic`` as the
            second argument.
        target_coords: The reference coordinate, passed to ``geodesic`` as the
            first argument.

    Returns:
        The minimum of the computed distances.
    """
    meters_to_each = []
    for station_coords in df1_coords:
        meters_to_each.append(geodesic(target_coords, station_coords).meters)
    return np.array(meters_to_each).min()

# Subway-station coordinates from df1 as an array of (latitude, longitude) rows
df1_coords = df1[['위도', '경도']].to_numpy()

# Distance from one grid-cell centroid to its closest subway station
def calculate_distances(row):
    """Return the minimum distance from ``row``'s centroid to the coordinates in ``df1_coords``.

    Args:
        row: Mapping-like record providing ``'centroid_lat'`` and ``'centroid_lon'``.

    Returns:
        The result of ``calculate_min_distance`` for the module-level
        ``df1_coords`` and the row's (latitude, longitude) pair.
    """
    centroid_lat_lon = (row['centroid_lat'], row['centroid_lon'])
    return calculate_min_distance(df1_coords, centroid_lat_lon)

# Row-wise pass over the grid; each row yields one nearest-station distance.
gdf2['nearest_subway_distance'] = gdf2.apply(calculate_distances, axis='columns')

In [18]:
from sklearn.metrics.pairwise import haversine_distances
from math import radians

# Degrees -> radians for both coordinate sets (parking lots and grid
# centroids), each laid out as (latitude, longitude) rows.
df2_coords_rad = np.deg2rad(df2[['위도', '경도']].to_numpy())
gdf2_coords_rad = np.deg2rad(gdf2[['centroid_lat', 'centroid_lon']].to_numpy())

def calculate_parking_stats_vectorized():
    """Return, for every grid centroid, the distance to its nearest parking lot.

    Reads the module-level ``gdf2_coords_rad`` and ``df2_coords_rad`` arrays.
    The pairwise result of ``haversine_distances`` is scaled by an Earth radius
    of 6,371,000 m, so the returned values are in meters.

    Returns:
        One minimum distance per row of ``gdf2_coords_rad``.
    """
    earth_radius_m = 6371000
    # Rows correspond to grid centroids, columns to parking lots.
    pairwise_m = haversine_distances(gdf2_coords_rad, df2_coords_rad) * earth_radius_m
    return pairwise_m.min(axis=1)

# Distance (in meters) from each grid centroid to its nearest parking lot
gdf2['nearest_parking_distance'] = calculate_parking_stats_vectorized()


In [19]:
# Weighted average of household counts, weighted by households / distance
def calculate_weighted_average(row, df3):
    """Return the weighted average of ``df3['세대수']`` as seen from one grid centroid.

    Each record in ``df3`` gets the weight ``세대수 / distance``, where distance
    is the geodesic distance in meters from the row's centroid to the record's
    (위도, 경도). The result is ``sum(weight * 세대수) / sum(weight)``.

    Unlike the previous version, this function does not modify ``df3``: the
    distances and weights are kept in local Series instead of being written
    into the caller's frame as 'distance' and 'weight' columns on every call.

    Args:
        row: Mapping-like record providing ``'centroid_lat'`` and ``'centroid_lon'``.
        df3: DataFrame with the columns ``'세대수'``, ``'위도'`` and ``'경도'``.

    Returns:
        The weighted average, or 0 when the total weight is not positive
        (including an empty ``df3``). If one or more records lie at distance 0,
        the limiting value is returned — the same weighted average restricted
        to those records — instead of the NaN that inf / inf would give.
    """
    target_coords = (row['centroid_lat'], row['centroid_lon'])
    households = df3['세대수']

    # Plain iteration over the two coordinate columns avoids a nested
    # DataFrame.apply for every grid row; the index is reused so the
    # arithmetic below lines up record by record.
    distances = pd.Series(
        [
            geodesic(target_coords, (lat, lon)).meters
            for lat, lon in zip(df3['위도'], df3['경도'])
        ],
        index=df3.index,
        dtype=float,
    )

    # A record exactly at the centroid would get an infinite weight; as the
    # distance tends to 0 those records dominate, so average over them only.
    at_centroid = distances.eq(0)
    if at_centroid.any():
        coincident = households[at_centroid]
        coincident_total = coincident.sum()
        return (coincident * coincident).sum() / coincident_total if coincident_total > 0 else 0

    # Weight = households * (1 / distance)
    weights = households / distances
    total_weight = weights.sum()
    return (weights * households).sum() / total_weight if total_weight > 0 else 0

# Evaluate the weighted average once per grid cell; df3 is forwarded to the
# function as its second positional argument.
gdf2['weighted_avg_households'] = gdf2.apply(calculate_weighted_average, axis=1, args=(df3,))


In [20]:
# Persist the engineered feature table so the slow distance cells need not be re-run.
# NOTE(review): the 'geometry' and 'centroid' columns are presumably written as
# text and will not come back as geometry objects on reload — confirm if needed.
gdf2.to_csv(path_or_buf='../data/상권유형예측.csv', index=False)

In [6]:
# Reload the saved feature table; from here on gdf2 is whatever the CSV contains.
gdf2 = pd.read_csv(filepath_or_buffer='../data/상권유형예측.csv')

In [25]:
# Feature matrix and target vector
X = gdf2[
    [
        'nearest_subway_distance',
        'nearest_parking_distance',
        'weighted_avg_households',
        '상권밀집도',
    ]
]
y = gdf2['cluster_encoded']
# NOTE(review): every distinct value of cluster_encoded is treated as a class,
# including any 0 codes produced upstream for unmapped labels — confirm intended.

# 70 / 30 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest hyper-parameter search space (1 x 2 x 2 x 2 = 8 candidates)
param_grid = dict(
    n_estimators=[100],
    max_depth=[10, 15],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 2],
)

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Score the best candidate on the held-out 30 %
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print("Random Forest Best Parameters:", grid_search.best_params_)
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=   2.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=   2.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=   2.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time=   3.9s
[CV] END max_depth=10, min_samp

## 결론: 현재 탐색한 파라미터 범위만으로도 충분히 좋은 성능이 나오므로, 트리 깊이(max_depth)를 더 깊게 할 필요는 없어 보임