# 지하철 피처 추가

In [None]:
import pandas as pd

# Load the subway station table and take a first look at it.
subway_df = pd.read_csv('./data/subway_feature.csv')
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
subway_df.info()
print(subway_df.head(3))

In [None]:
# Load the apartment table that the subway features will be attached to.
apartment_df = pd.read_csv('./data/newXY_for_test.csv')
print(apartment_df.shape)
# info() prints by itself and returns None; print(info()) would emit an extra "None".
apartment_df.info()
print(apartment_df.head(3))

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree

def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in metres between two points.

    All four coordinates are given in degrees. They are passed through NumPy
    ufuncs, so scalars and arrays (broadcast against each other) both work.
    """
    earth_radius_km = 6371
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle between the two points.
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

    return earth_radius_km * central_angle * 1000  # km -> m

def walking_time(distance, speed_kmh=4):
    """Return the time in minutes needed to walk ``distance`` metres.

    Args:
        distance: Distance in metres. Scalars, NumPy arrays and pandas Series
            are all accepted because only a division is applied.
        speed_kmh: Walking speed in km/h. Defaults to 4, the pace that was
            previously hard-coded, so existing callers get identical results.

    Returns:
        Walking time in minutes, with the same shape/type as ``distance``.

    Raises:
        ValueError: If ``speed_kmh`` is not positive.
    """
    if speed_kmh <= 0:
        raise ValueError(f"speed_kmh must be positive, got {speed_kmh!r}")
    metres_per_minute = speed_kmh * 1000 / 60
    return distance / metres_per_minute

def add_subway_features(apartment_df, subway_df):
    """Add nearest-subway-station features to ``apartment_df``.

    For every apartment row this writes the name, line, distance (metres) and
    walking time (minutes) of the three closest stations, plus the number of
    stations inside successive 5-minute walking bands (4 km/h pace).

    Args:
        apartment_df: DataFrame whose ``좌표Y_2`` / ``좌표X_2`` columns are
            used as latitude / longitude in degrees. Modified in place.
        subway_df: DataFrame with ``위도`` (latitude), ``경도`` (longitude),
            ``역사명`` and ``호선`` columns. Must hold at least three rows.

    Returns:
        The same ``apartment_df`` object with the new columns added.

    Raises:
        ValueError: If ``subway_df`` has fewer than three stations.
    """
    n_nearest = 3
    earth_radius_m = 6371 * 1000   # same Earth radius haversine_distance uses
    metres_per_minute = 4000 / 60  # 4 km/h walking pace

    if len(subway_df) < n_nearest:
        raise ValueError(
            f"subway_df must contain at least {n_nearest} stations, got {len(subway_df)}"
        )

    apartment_coords = apartment_df[['좌표Y_2', '좌표X_2']].to_numpy(dtype=float)
    station_coords = subway_df[['위도', '경도']].to_numpy(dtype=float)
    # Plain arrays so the positional KD-tree indices can be used directly;
    # .loc would look them up as index *labels* and break on a filtered frame.
    station_names = subway_df['역사명'].to_numpy()
    station_lines = subway_df['호선'].to_numpy()

    def to_unit_sphere(coords_deg):
        """Map (lat, lon) rows in degrees to 3-D points on the unit sphere."""
        lat = np.radians(coords_deg[:, 0])
        lon = np.radians(coords_deg[:, 1])
        cos_lat = np.cos(lat)
        return np.column_stack((cos_lat * np.cos(lon), cos_lat * np.sin(lon), np.sin(lat)))

    # Euclidean distance on raw degrees distorts east-west distances away from
    # the equator, so the tree is built on unit-sphere points instead: chord
    # length there grows monotonically with great-circle distance.
    apartment_xyz = to_unit_sphere(apartment_coords)
    tree = cKDTree(to_unit_sphere(station_coords))
    _, indices = tree.query(apartment_xyz, k=n_nearest)

    # Details of the three closest stations.
    for i in range(n_nearest):
        nearest = indices[:, i]
        apartment_df[f'{i+1}번째_가까운_역_이름'] = station_names[nearest]
        apartment_df[f'{i+1}번째_가까운_역_호선'] = station_lines[nearest]
        apartment_df[f'{i+1}번째_가까운_역_거리'] = haversine_distance(
            apartment_coords[:, 0], apartment_coords[:, 1],
            station_coords[nearest, 0], station_coords[nearest, 1],
        )
        apartment_df[f'{i+1}번째_가까운_역_도보시간'] = walking_time(apartment_df[f'{i+1}번째_가까운_역_거리'])

    def stations_within(minutes):
        """Count, per apartment, the stations at most ``minutes`` of walking away."""
        # Convert the walking distance to the matching chord length on the unit sphere.
        chord = 2 * np.sin(minutes * metres_per_minute / (2 * earth_radius_m))
        return tree.query_ball_point(apartment_xyz, r=chord, return_length=True)

    # Cumulative counts per threshold; a band (lo, hi] is the difference of two
    # of them, which keeps the original "lo < distance <= hi" semantics.
    within = {minutes: stations_within(minutes) for minutes in (0, 5, 10, 15, 20)}

    apartment_df['5분이하_역_개수'] = within[5] - within[0]
    apartment_df['5분초과_10분이하_역_개수'] = within[10] - within[5]
    apartment_df['10분초과_15분이하_역_개수'] = within[15] - within[10]
    apartment_df['15분초과_20분이하_역_개수'] = within[20] - within[15]

    return apartment_df

In [None]:
# add_subway_features writes the new columns onto apartment_df itself and
# returns it, so merged_apartment_df and apartment_df refer to the same object.
merged_apartment_df = add_subway_features(apartment_df, subway_df)

In [None]:
# Sanity-check the frame after the subway columns were added.
print(merged_apartment_df.shape)
# info() prints by itself and returns None; print(info()) would emit an extra "None".
merged_apartment_df.info()
print(merged_apartment_df.head(3))

In [None]:
from dataprep.eda import create_report

# Build a dataprep report for the feature-augmented frame and save it under
# the name 'Test Subway Dataset'.
# NOTE(review): presumably save() writes a report file into the working
# directory — confirm the output location/format against the dataprep docs.
report = create_report(merged_apartment_df)
report.save('Test Subway Dataset')

In [None]:
import os

# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs('preprocessed', exist_ok=True)
merged_subway_path = os.path.join('preprocessed', 'test_with_subway_infos.csv')
merged_apartment_df.to_csv(merged_subway_path, index=False)

In [None]:
# Read the file back to confirm it was written as expected.
confirm_df = pd.read_csv(merged_subway_path)
print(confirm_df.shape)
# info() prints by itself and returns None; print(info()) would emit an extra "None".
confirm_df.info()
print(confirm_df.head(3))

## 버스 피처 추가

In [None]:
import pandas as pd

# Load the bus stop table and take a first look at it.
bus_df = pd.read_csv('./data/bus_feature.csv')
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
bus_df.info()
print(bus_df.head(3))

In [None]:
# Number of distinct stop names; dropna=False keeps a missing name counted as
# one value, exactly like len(Series.unique()).
bus_df['정류소명'].nunique(dropna=False)

In [None]:
# Build the combined '정류소' key as "<stop number>_<stop name>", replacing
# missing values with empty strings first.
bus_df['정류소'] = bus_df['정류소번호'].fillna('').astype(str) + '_' + bus_df['정류소명'].fillna('')

# info() prints by itself and returns None; print(info()) would emit an extra "None".
bus_df.info()
print(bus_df.head(3))

len(bus_df['정류소'].unique())

In [None]:
# Load the apartment table that the bus stop features will be attached to.
apartment_df = pd.read_csv('./data/newXY_for_train.csv')
print(apartment_df.shape)
# info() prints by itself and returns None; print(info()) would emit an extra "None".
apartment_df.info()
print(apartment_df.head(3))

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in metres between two (lat, lon) points in degrees.

    Works element-wise on NumPy arrays as well as on plain scalars, since
    every step is a NumPy ufunc or basic arithmetic.
    """
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    sin_half_dlat = np.sin((lat2 - lat1) / 2)
    sin_half_dlon = np.sin((lon2 - lon1) / 2)

    # Haversine formula, evaluated with arctan2.
    a = sin_half_dlat ** 2 + np.cos(lat1) * np.cos(lat2) * sin_half_dlon ** 2
    angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    kilometres = 6371 * angle  # 6371 km: Earth radius
    return kilometres * 1000   # convert to metres

def walking_time(distance, speed_kmh=4):
    """Return the time in minutes needed to walk ``distance`` metres.

    Args:
        distance: Distance in metres. Scalars, NumPy arrays and pandas Series
            are all accepted because only a division is applied.
        speed_kmh: Walking speed in km/h. Defaults to 4, the pace that was
            previously hard-coded, so existing callers get identical results.

    Returns:
        Walking time in minutes, with the same shape/type as ``distance``.

    Raises:
        ValueError: If ``speed_kmh`` is not positive.
    """
    if speed_kmh <= 0:
        raise ValueError(f"speed_kmh must be positive, got {speed_kmh!r}")
    metres_per_minute = speed_kmh * 1000 / 60
    return distance / metres_per_minute

def add_bus_stop_features(apartment_df, bus_df):
    """Add nearest-bus-stop features to ``apartment_df``.

    For every apartment row this writes the name, distance (metres) and
    walking time (minutes) of the three closest bus stops, plus the number of
    stops inside successive 5-minute walking bands (4 km/h pace).

    Args:
        apartment_df: DataFrame whose ``좌표Y_2`` / ``좌표X_2`` columns are
            used as latitude / longitude in degrees. Modified in place.
        bus_df: DataFrame with ``Y좌표`` / ``X좌표`` columns (used as
            latitude / longitude in degrees) and a ``정류소`` name column.
            Must hold at least three rows.

    Returns:
        The same ``apartment_df`` object with the new columns added.

    Raises:
        ValueError: If ``bus_df`` has fewer than three stops.
    """
    n_nearest = 3
    earth_radius_m = 6371 * 1000   # same Earth radius haversine_distance uses
    metres_per_minute = 4000 / 60  # 4 km/h walking pace

    if len(bus_df) < n_nearest:
        raise ValueError(
            f"bus_df must contain at least {n_nearest} bus stops, got {len(bus_df)}"
        )

    apartment_coords = apartment_df[['좌표Y_2', '좌표X_2']].to_numpy(dtype=float)
    bus_stop_coords = bus_df[['Y좌표', 'X좌표']].to_numpy(dtype=float)
    # Plain array so the positional KD-tree indices can be used directly;
    # .loc would look them up as index *labels* and break on a filtered frame.
    bus_stop_names = bus_df['정류소'].to_numpy()

    def to_unit_sphere(coords_deg):
        """Map (lat, lon) rows in degrees to 3-D points on the unit sphere."""
        lat = np.radians(coords_deg[:, 0])
        lon = np.radians(coords_deg[:, 1])
        cos_lat = np.cos(lat)
        return np.column_stack((cos_lat * np.cos(lon), cos_lat * np.sin(lon), np.sin(lat)))

    # Euclidean distance on raw degrees distorts east-west distances away from
    # the equator, so the tree is built on unit-sphere points instead: chord
    # length there grows monotonically with great-circle distance.
    apartment_xyz = to_unit_sphere(apartment_coords)
    tree = cKDTree(to_unit_sphere(bus_stop_coords))
    _, indices = tree.query(apartment_xyz, k=n_nearest)

    # Details of the three closest bus stops.
    for i in range(n_nearest):
        nearest = indices[:, i]
        apartment_df[f'{i+1}번째_가까운_버스정류장_이름'] = bus_stop_names[nearest]
        apartment_df[f'{i+1}번째_가까운_버스정류장_거리'] = haversine_distance(
            apartment_coords[:, 0], apartment_coords[:, 1],
            bus_stop_coords[nearest, 0], bus_stop_coords[nearest, 1],
        )
        apartment_df[f'{i+1}번째_가까운_버스정류장_도보시간'] = walking_time(apartment_df[f'{i+1}번째_가까운_버스정류장_거리'])

    def bus_stops_within(minutes):
        """Count, per apartment, the stops at most ``minutes`` of walking away."""
        # Convert the walking distance to the matching chord length on the unit sphere.
        chord = 2 * np.sin(minutes * metres_per_minute / (2 * earth_radius_m))
        return tree.query_ball_point(apartment_xyz, r=chord, return_length=True)

    # Cumulative counts per threshold; a band (lo, hi] is the difference of two
    # of them, which keeps the original "lo < distance <= hi" semantics.
    within = {minutes: bus_stops_within(minutes) for minutes in (0, 5, 10, 15)}

    apartment_df['5분이하_버스정류장_개수'] = within[5] - within[0]
    apartment_df['5분초과_10분이하_버스정류장_개수'] = within[10] - within[5]
    apartment_df['10분초과_15분이하_버스정류장_개수'] = within[15] - within[10]

    return apartment_df

In [None]:
# add_bus_stop_features writes the new columns onto apartment_df itself and
# returns it, so merged_apartment_df and apartment_df refer to the same object.
merged_apartment_df = add_bus_stop_features(apartment_df, bus_df)

In [None]:
# Sanity-check the frame after the bus stop columns were added.
print(merged_apartment_df.shape)
# info() prints by itself and returns None; print(info()) would emit an extra "None".
merged_apartment_df.info()
print(merged_apartment_df.head(3))

In [None]:
from dataprep.eda import create_report

# Build a dataprep report for the feature-augmented frame and save it under
# the name 'Train Bus Dataset'.
# NOTE(review): presumably save() writes a report file into the working
# directory — confirm the output location/format against the dataprep docs.
report = create_report(merged_apartment_df)
report.save('Train Bus Dataset')

In [None]:
import os

# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs('preprocessed', exist_ok=True)
merged_bus_path = os.path.join('preprocessed', 'train_with_bus_infos.csv')
merged_apartment_df.to_csv(merged_bus_path, index=False)

In [None]:
# Read the file back to confirm it was written as expected.
confirm_df = pd.read_csv(merged_bus_path)
print(confirm_df.shape)
# info() prints by itself and returns None; print(info()) would emit an extra "None".
confirm_df.info()
print(confirm_df.head(3))