In [11]:
import pandas as pd 
import geopandas as gpd
import numpy as np
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import haversine_distances
from math import radians

# 상권 밀집도 예측 

활용 데이터
1. 지하철역 
2. 임대주택 정보
3. 주차장

In [49]:
df1= pd.read_csv('../data/1-6.화성시_지하철역.csv')
df2= pd.read_csv('../data/1-7.화성시_공영주차장.csv')
df3=pd.read_csv('../data/1-12.공공주택임대_정보(화성시).csv')
df4=pd.read_csv('../data/화성2022.csv') # 타겟값: 상권밀집도를 예측해야 함
gdf = gpd.read_file('../data/1-14.화성시_격자.geojson')


In [50]:
df1.columns= ['역이름','지하철노선이름','경도','위도']
df2.columns = ['이름', '종류', '주차면수', '경도', '위도'] # 101
df3.columns= ['블록코드','단지명','지원유형','세대수','주차면수','경도','위도']

## 예측 
- 지하철역, 공영주차장, 공공주택임대 정보로 상권 밀집도 예측하기 
1. 각 격자별로 가장 가까운 지하철역 : 거리를 값으로 
2. 각 격자별로 가장 가까운 주차장 : 거리를 값으로 
3. 각 격자와 모든 단지와의 거리 및 세대수

## 1. 상권 강도 예측하기

In [51]:
target=df4[['gid','fac_density']]
target=target.rename({'fac_density':'상권밀집도'},axis=1)
gdf2=target.merge(gdf)
def calculate_centroid(geometry):
    return geometry.centroid

def calculate_distance(coord1, coord2):
    return geodesic(coord1, coord2).meters

gdf2['centroid'] = gdf2['geometry'].apply(calculate_centroid)
gdf2['centroid_lon'] = gdf2['centroid'].apply(lambda x: x.x)
gdf2['centroid_lat'] = gdf2['centroid'].apply(lambda x: x.y)


In [52]:
# 거리 계산 함수
def calculate_min_distance(df1_coords, target_coords):
    distances = np.array([
        geodesic(target_coords, station_coords).meters
        for station_coords in df1_coords
    ])
    return distances.min()

# df1의 지하철역 좌표 준비
df1_coords = df1[['위도', '경도']].values

# 각 격자 중심과 가장 가까운 지하철역의 거리 계산
def calculate_distances(row):
    target_coords = (row['centroid_lat'], row['centroid_lon'])
    return calculate_min_distance(df1_coords, target_coords)

gdf2['nearest_subway_distance'] = gdf2.apply(calculate_distances, axis=1)

In [None]:
from sklearn.metrics.pairwise import haversine_distances
from math import radians

# 주차장 좌표를 radians로 변환
df2_coords_rad = np.radians(df2[['위도', '경도']].values)
gdf2_coords_rad = np.radians(gdf2[['centroid_lat', 'centroid_lon']].values)

def calculate_parking_stats_vectorized():
    distances = haversine_distances(gdf2_coords_rad, df2_coords_rad) * 6371000  # 반지름 곱하기 지구 반경(미터)
    nearest_distances = distances.min(axis=1)  # 가장 가까운 거리
    return nearest_distances

# 가장 가까운 주차장과의 거리
gdf2['nearest_parking_distance'] = calculate_parking_stats_vectorized()


In [None]:
# 거리와 세대수를 반영한 가중 평균 계산 함수
def calculate_weighted_average(row, df3):
    target_coords = (row['centroid_lat'], row['centroid_lon'])
    df3['distance'] = df3.apply(
        lambda x: geodesic(target_coords, (x['위도'], x['경도'])).meters, axis=1
    )
    # 거리의 역수와 세대수를 곱해 가중치 계산
    df3['weight'] = df3['세대수'] / df3['distance']
    weighted_sum = (df3['weight'] * df3['세대수']).sum()
    total_weight = df3['weight'].sum()
    return weighted_sum / total_weight if total_weight > 0 else 0

# 격자별 계산
gdf2['weighted_avg_households'] = gdf2.apply(lambda row: calculate_weighted_average(row, df3), axis=1)


In [None]:
gdf2.to_csv('../data/상권정보예측.csv',index=False)

In [53]:
gdf2=pd.read_csv('../data/상권정보예측.csv')
gdf2.head()


Unnamed: 0,gid,상권밀집도,geometry,centroid,centroid_lon,centroid_lat,nearest_subway_distance,parking_1km,nearest_parking_distance,parking_500m,weighted_avg_households
0,다사385011,0.0,POLYGON ((126.80784063508459 37.10653637928979...,POINT (126.80839923101641 37.10699034270337),126.808399,37.10699,9203.670759,0,2616.915044,0,966.231874
1,다사312110,1.0,"POLYGON ((126.7247779576752 37.19526074709075,...",POINT (126.7253366940065 37.19571509699195),126.725337,37.195715,8042.302201,0,2206.415809,0,918.546292
2,다사473033,3.0,POLYGON ((126.90672137340184 37.12690343376624...,POINT (126.90728071427678 37.12735693644125),126.907281,37.127357,696.775397,393,334.022982,108,1021.022054
3,다사602117,3.0,POLYGON ((127.05149327624233 37.20325801374553...,POINT (127.05205404765378 37.20371083733202),127.052054,37.203711,911.437502,63,21.351497,39,999.497917
4,다사476205,0.0,POLYGON ((126.9088906157039 37.281954863308094...,POINT (126.90945110256466 37.28240835397451),126.909451,37.282408,3258.769915,0,5414.220356,0,892.15345


### 모델링 
- step1) 가장 좋은 모델이 무엇일지 확인 
- step2) 가장 좋은 모델로 파라미터 조정해보며 예측

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# 데이터 준비
X = gdf2[['nearest_subway_distance', 'weighted_avg_households','nearest_parking_distance']]
y = gdf2['상권밀집도']  # 상권밀집도를 분류형으로 변환

# 데이터 분할 (Train/Test Split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 스케일링 (특히 SVM, KNN 등에 필요)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 모델 정의
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
}

# 모델 성능 비교
results = []
for name, model in models.items():
    if name in ['Support Vector Classifier', 'K-Nearest Neighbors']:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    results.append((name, accuracy))

# 결과 정리 및 출력
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy']).sort_values(by='Accuracy', ascending=False)


In [25]:
results_df


Unnamed: 0,Model,Accuracy
1,Random Forest,0.872382
3,K-Nearest Neighbors,0.871612
4,Decision Tree,0.844785
2,Gradient Boosting,0.726693
0,Logistic Regression,0.634727
5,Naive Bayes,0.576171


### 모델 선택 
- randomforest를 활용
- knn도 좋은 성능을 가졌지만 knn은 모델 특성상 중소규모 데이터셋에 적합하기 때문에 randomforest 모델 선택함.
- 과적합을 주의하며 테스트 해봐야 함

In [42]:

# 데이터 준비
X = gdf2[['nearest_subway_distance', 'weighted_avg_households','nearest_parking_distance']]
y = gdf2['상권밀집도']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest 하이퍼파라미터 튜닝
param_grid = {
    'n_estimators': [100],
    'max_depth': [15,],
    'min_samples_split': [2,3,5],
    'min_samples_leaf': [1]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 모델로 예측 및 평가
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print("Random Forest Best Parameters:", grid_search.best_params_)
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.5s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.5s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.4s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.5s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   5.5s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=3, n_estimators=100; total time=   4.8s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=3, n_estimators=100; total time=   3.8s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=3, n_estimators=100; total time=   5.7s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=3, n_estimators=100; total time=   5.8s
[CV] END max_depth=15, min_samp

In [44]:
# 훈련 데이터와 테스트 데이터 정확도 계산
train_accuracy = best_rf.score(X_train, y_train)
test_accuracy = best_rf.score(X_test, y_test)

# 과적합 여부 출력
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Train Accuracy: 0.9529
Test Accuracy: 0.8511


### 결론: 트리의 깊이를 높이면 과적합 위험성이 매우 높아, 15로 지정하는 것이 가장 적합해 보인다.
- 10으로 하면 성능이 너무 안 나오고, 20 이상이 되면 훈련과 테스트 정확도가 0.1을 뛰어넘기 때문에 적합하지 않다.