In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Base directory for all data files (Windows-style path, trailing separator included).
ABSOLUTE_PATH = "C:\\Users\\rudnf\\vscode\\Graduation\\Data_Preprocessing\\data\\"
# Zero-padded month numbers "01" .. "12".
MONTHS = [f"{month_number:02d}" for month_number in range(1, 13)]
# Season names; they appear in the cluster and trajectory file paths.
SEASONS = [
    "spring",
    "summer",
    "fall",
    "winter",
]
# Cluster counts (k) that appear in the POI cluster file names.
KS = [20, 35, 50]

In [3]:
def season_match(season):
    """Return the zero-padded month strings that belong to ``season``.

    Parameters
    ----------
    season : str
        One of "spring", "summer", "fall" or "winter".

    Returns
    -------
    list of str
        Three month strings for a known season (winter is "12", "01", "02"),
        or an empty list for any other value. A new list is returned on
        every call.
    """
    season_table = (
        ("spring", ("03", "04", "05")),
        ("summer", ("06", "07", "08")),
        ("fall", ("09", "10", "11")),
        ("winter", ("12", "01", "02")),
    )
    for season_name, season_months in season_table:
        if season == season_name:
            return list(season_months)
    return []

In [4]:
def load_POI_cluster_data(path):
    """Read a POI cluster CSV and return its coordinates labelled POI0, POI1, ...

    Parameters
    ----------
    path : str or file-like
        Anything ``pd.read_csv`` accepts; the file must contain ``lon`` and
        ``lat`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``lon`` and ``lat`` (in that order), indexed by the strings
        ``"POI<i>"`` where ``i`` is the row position in the file.
    """
    coords = pd.read_csv(path)[["lon", "lat"]]
    coords.index = ["POI" + str(position) for position in range(len(coords))]
    return coords

In [5]:
def load_GPS_data(path):
    """Read a GPS CSV and return it ordered by collection time.

    Parameters
    ----------
    path : str or file-like
        Anything ``pd.read_csv`` accepts; the file must contain a
        ``collection_dt`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``collection_dt`` converted to datetime and
        rows sorted by it in ascending order. The original row labels are
        kept, so the index is not necessarily monotonic.
    """
    df = pd.read_csv(path)

    # Timestamps are read as strings; later code relies on the .dt accessor.
    df['collection_dt'] = pd.to_datetime(df['collection_dt'])
    # sort_values is not in-place, so its result has to be assigned.
    # mergesort is stable: rows with equal timestamps keep their file order.
    df = df.sort_values(by='collection_dt', ascending=True, kind='mergesort')

    return df

## GPS 데이터로 Trajectory 만들기
    trajectory : (trajectory_id, start_point, end_point, path, time_period)

In [6]:
# Haversine 공식 : 위도, 경로 간 거리 구하기
def _haversine_distance(lon1, lat1, lon2, lat2):
    R = 6371  # 지구의 반지름 (단위: km)
    lon1_rad = math.radians(lon1)
    lat1_rad = math.radians(lat1)
    lon2_rad = math.radians(lon2)
    lat2_rad = math.radians(lat2)
    
    diff_lon = lon2_rad - lon1_rad
    diff_lat = lat2_rad - lat1_rad

    a = math.sin(diff_lon/2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(diff_lat/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    distance = R * c

    return distance

In [7]:
# dawn : 00:00 ~ 05:59
# morning : 06:00 ~ 11:59
# afternoon : 12:00 ~ 17:59
# night : 18:00 ~ 23:00
def _map_time_period(time):
    if time < pd.to_datetime('06:00:00').time():
        return 'dawn'
    elif time < pd.to_datetime('12:00:00').time():
        return 'morning'
    elif time < pd.to_datetime('18:00:00').time():
        return 'afternoon'
    else:
        return 'night'

In [8]:
def _map_group_time_period(rows):
    """Label every row of ``rows`` with its time-of-day bucket.

    A 'time_period' column is written onto ``rows`` itself, holding the
    result of ``_map_time_period`` applied to the time-of-day part of
    'collection_dt'. The same DataFrame object is returned.

    Parameters
    ----------
    rows : pandas.DataFrame
        Must contain a datetime 'collection_dt' column.

    Returns
    -------
    pandas.DataFrame
        ``rows``, now carrying the 'time_period' column.
    """
    clock_times = rows['collection_dt'].dt.time
    period_labels = clock_times.apply(_map_time_period)
    rows['time_period'] = period_labels
    return rows

In [9]:
# Build the POI path of a trajectory using the Haversine distance.
# By default a GPS point is matched to a POI only when it lies within 1 km of it.
def _calculate_path(rows, locations, max_distance_km=1):
    """Map each GPS row to its nearest POI and return the POI labels in row order.

    Parameters
    ----------
    rows : pandas.DataFrame
        GPS points with 'longitude' and 'latitude' columns (degrees).
    locations : pandas.DataFrame
        POIs with 'lon' and 'lat' columns; its index supplies the labels
        that end up in the returned path.
    max_distance_km : float or None, default 1
        A GPS row contributes its nearest POI only if that POI is at most
        this many km away. Pass ``None`` to always take the nearest POI, in
        which case the result has exactly one entry per row.

    Returns
    -------
    list
        POI labels in the order of ``rows``. With a numeric
        ``max_distance_km`` rows without a POI in range are skipped, so the
        list may be shorter than ``rows``. Empty when ``locations`` is empty.
    """
    # Pull the POI coordinates out once. They do not change between GPS rows,
    # and locations.iterrows() builds a new Series per POI on every pass.
    poi_labels = locations.index
    poi_coords = list(zip(locations['lon'], locations['lat']))
    if not poi_coords:
        # Nothing can be "nearest"; np.argmin would raise on an empty list.
        return []

    path = []
    for gps_lon, gps_lat in zip(rows['longitude'], rows['latitude']):
        distances = [_haversine_distance(gps_lon, gps_lat, poi_lon, poi_lat)
                     for poi_lon, poi_lat in poi_coords]

        nearest_idx = int(np.argmin(distances))
        if max_distance_km is None or distances[nearest_idx] <= max_distance_km:
            path.append(poi_labels[nearest_idx])

    return path

In [10]:
# v.0.5.0 : _remove_duplicated_path에서 시간대가 걸쳐있는 path는 중복된 POI를 제거 x
def _remove_duplicated_path(path, cal_time_rows):
    removed_path = []
    removed_time_period = []

    prev_location = path[0]
    start_idx = 0
    end_idx = len(path)-1

    for idx, location in enumerate(path):

        if (location != prev_location) or (idx == len(path)-1) :
            end_idx = idx

            same_POI_rows = cal_time_rows.iloc[start_idx:end_idx]
            time_period_unique = same_POI_rows['time_period'].unique()
            
            if len(time_period_unique) != 1:
                for each_time_period in time_period_unique:
                    removed_path.append(prev_location)
                    removed_time_period.append(each_time_period)
            else:
                removed_path.append(prev_location)
                removed_time_period.append(time_period_unique.item())
            
            prev_location = location
            start_idx = idx       

    return removed_path, removed_time_period

In [11]:
def make_trajectory(df, locations, max_trajectories=10):
    """Build one trajectory per (year, month, day, oid) group of GPS rows.

    Parameters
    ----------
    df : pandas.DataFrame
        GPS rows with a datetime 'collection_dt' column and an 'oid' column,
        plus whatever ``_calculate_path`` reads from each row.
    locations : pandas.DataFrame
        POI table handed to ``_calculate_path``.
    max_trajectories : int or None, default 10
        Stop once this many trajectories have been produced. Pass ``None``
        to process every group.

    Returns
    -------
    pandas.DataFrame
        Columns 'trajectory_id' (the group key tuple), 'start_point',
        'end_point', 'path' (list of POIs) and 'time_period' (list parallel
        to 'path'). Empty, with the same columns, if no group qualifies.

    Notes
    -----
    NOTE(review): ``_calculate_path`` may skip rows, yet the resulting path
    is passed to ``_remove_duplicated_path`` together with the unfiltered
    rows. Verify that the two stay aligned.
    """
    column_list = ['trajectory_id', 'start_point', 'end_point', 'path', 'time_period']
    # Groups with this many GPS rows or fewer are assumed to be too short to
    # yield a meaningful route.
    min_gps_points = 100

    # Group the frame by (year, month, day, oid).
    grouped = df.groupby([df['collection_dt'].dt.year,
                          df['collection_dt'].dt.month,
                          df['collection_dt'].dt.day,
                          df['oid']])

    # Collect plain records and build the frame once at the end. Concatenating
    # inside the loop is quadratic, and assigning list-valued cells through
    # .loc is fragile.
    records = []
    for group_key, rows in grouped:

        if max_trajectories is not None and len(records) >= max_trajectories:
            break

        if len(rows) <= min_gps_points:
            continue

        # A group can contain several rows with the same collection_dt.
        # Work on an independent copy instead of mutating the groupby slice.
        rows = rows.drop_duplicates(subset=['collection_dt']).copy()

        # Attach the time-of-day bucket to every row of the group.
        cal_time_rows = _map_group_time_period(rows)
        path = _calculate_path(cal_time_rows, locations)

        if len(path) == 0:
            continue

        removed_path, removed_time_period = _remove_duplicated_path(path, cal_time_rows)

        # A trajectory needs at least two points.
        if len(removed_path) <= 1:
            continue

        records.append({
            'trajectory_id': group_key,
            'start_point': removed_path[0],
            'end_point': removed_path[-1],
            'path': removed_path,
            'time_period': removed_time_period,
        })

    return pd.DataFrame(records, columns=column_list)
        

## 전체 코드 돌리기

In [14]:
# Build and save the trajectories for every (season, month, k) combination.
for season in SEASONS:
    months = season_match(season)
    for month in months:
        # The GPS file depends only on the month, so read it once per month
        # instead of once per k.
        df = load_GPS_data(ABSOLUTE_PATH + f"GPS_data\\month_{month}.csv")
        for k in KS:
            locations = load_POI_cluster_data(ABSOLUTE_PATH + f"frequency_of_stay_data\\cluster_data\\{season}\\{season}_cluster_{k}.csv")

            trajectories = make_trajectory(df, locations)

            trajectories.to_csv(ABSOLUTE_PATH + f"trajectory\\{season}\\trajectory_{month}_cluster_{k}.csv", index=False)
        