In [1]:
import os
from collections import Counter
from datetime import datetime, timedelta

import folium
import numpy as np
import pandas as pd
import psycopg2
from geopy.distance import geodesic
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances_argmin_min
from sqlalchemy import create_engine, text

In [2]:
# Bus lines this notebook knows about; the final cell walks this list and renders
# one stops map for every line that has results.
# Original note read "SELECT DISTINCT FROM" -- presumably the list came from a
# SELECT DISTINCT over the GPS tables; TODO record the exact query.
USED_BUS_LINES = [
    '100', '108', '232', '2336', '2803', '292', '298', '3',
    '309', '315', '324', '328', '343', '355', '371', '388',
    '397', '399', '415', '422', '457', '483', '497', '550',
    '553', '554', '557', '565', '606', '624', '629', '634',
    '638', '639', '665', '756', '759', '774', '779', '803',
    '838', '852', '864', '867', '878', '905', '917', '918',
]

In [37]:
# Days of GPS data to analyse. database_query interpolates each entry into a table
# name of the form dados_gps_<date>; the values look like YYYYMMDD.
# Ranges used in earlier runs, kept for reference:
#   '20240506' .. '20240511'  (6 consecutive days)
#   '20240425' .. '20240510'  (16 consecutive days)
CHOOSEN_DATES = [
    '20240426',
    '20240427',
    '20240429',
]

# Line used by the single-line experiment cells.
CHOOSEN_TEST_LINE = '3'

In [5]:
# Connection string for the GPS database. Credentials should not be committed with
# the notebook: export GPS_ONIBUS_DATABASE_URI to override the local-development
# default below (kept only so existing local setups keep working).
database_uri = os.environ.get(
    'GPS_ONIBUS_DATABASE_URI',
    'postgresql://postgres:admin@localhost:5432/gps_onibus_rj',
)
db_engine_alchemy = create_engine(database_uri)

In [24]:
def database_query(linha, dates: list, engine, start_hour: int = 9, end_hour: int = 18) -> pd.DataFrame:
    """Load the GPS pings of one bus line from several daily tables.

    One SELECT per entry of ``dates`` is issued against ``dados_gps_<date>`` and the
    per-day queries are combined with UNION ALL. A ping is kept only when:

    * no row of ``coords_garagem`` lies within ``200 / 111320.0`` of it (LEFT JOIN on
      ST_DWithin followed by ``cg.geom IS NULL``) -- presumably ~200 m expressed in
      degrees; TODO confirm the units of the geometry columns;
    * its ``linha`` equals ``linha``;
    * the hour of ``datahora_ts`` is between ``start_hour`` and ``end_hour``, inclusive.

    Args:
        linha: Bus line identifier to select.
        dates: Table-name suffixes; each must be exactly eight ASCII digits.
        engine: SQLAlchemy engine/connection handed to ``pd.read_sql``.
        start_hour: First hour of day to include (default 9).
        end_hour: Last hour of day to include (default 18).

    Returns:
        DataFrame with columns ordem, latitude, longitude, datahora_ts, velocidade.
        An empty frame with those columns is returned when ``dates`` is empty.

    Raises:
        ValueError: If an entry of ``dates`` is not an eight-digit string; table names
            cannot be bound as parameters, so they are validated instead.
    """
    columns = ['ordem', 'latitude', 'longitude', 'datahora_ts', 'velocidade']
    if not dates:
        # Joining zero sub-queries would send an empty statement to the database.
        return pd.DataFrame(columns=columns)

    table_suffixes = []
    for date in dates:
        suffix = str(date)
        if not (len(suffix) == 8 and suffix.isascii() and suffix.isdigit()):
            raise ValueError(f'Invalid date {date!r}: expected eight digits, e.g. 20240426')
        table_suffixes.append(suffix)

    # Values are passed as bind parameters (:linha, :start_hour, :end_hour) rather
    # than being spliced into the SQL text.
    queries = [
        (f'SELECT ordem, dg.latitude, dg.longitude, datahora_ts, velocidade '
         f'FROM dados_gps_{suffix} dg '
         f'LEFT JOIN coords_garagem cg '
         f'ON ST_DWithin(cg.geom, dg.geom, 200 / 111320.0) '
         f'WHERE cg.geom IS NULL '
         f'AND dg.linha = :linha '
         f'AND EXTRACT (HOUR FROM dg.datahora_ts) >= :start_hour '
         f'AND EXTRACT (HOUR FROM dg.datahora_ts) <= :end_hour ')
        for suffix in table_suffixes
    ]
    union_all_query = ' UNION ALL '.join(queries)

    bind_values = {
        'linha': str(linha),
        'start_hour': int(start_hour),
        'end_hour': int(end_hour),
    }
    return pd.read_sql(text(union_all_query), con=engine, params=bind_values)

In [38]:
# Fetch the raw GPS pings of the test line for the selected days, then preview them.
df = database_query(
    linha=CHOOSEN_TEST_LINE,
    dates=CHOOSEN_DATES,
    engine=db_engine_alchemy,
)
df

Unnamed: 0,ordem,latitude,longitude,datahora_ts,velocidade
0,D53625,-22.88351,-43.49067,2024-04-29 09:04:28,0
1,D53514,-22.90228,-43.55031,2024-04-26 09:00:20,24
2,D53514,-22.90046,-43.55028,2024-04-26 09:00:51,37
3,D53514,-22.90115,-43.55281,2024-04-26 09:01:22,3
4,D53514,-22.90150,-43.55283,2024-04-26 09:01:53,12
...,...,...,...,...,...
4551,D53592,-22.88547,-43.51980,2024-04-27 18:20:37,50
4552,D53502,-22.90010,-43.54250,2024-04-27 18:26:15,35
4553,D53521,-22.91115,-43.58865,2024-04-27 18:30:10,46
4554,D53521,-22.90206,-43.54637,2024-04-27 18:45:18,16


In [8]:
def get_garage_points(engine):
    """Read every row of ``coords_garagem`` and return its coordinates.

    Args:
        engine: Database connection/engine accepted by ``pd.read_sql``.

    Returns:
        List of ``(latitude, longitude)`` tuples, one per table row.
    """
    garages = pd.read_sql('SELECT latitude, longitude FROM coords_garagem', con=engine)
    coordinate_columns = garages[['latitude', 'longitude']]
    return list(coordinate_columns.itertuples(index=False, name=None))

In [9]:
# (latitude, longitude) tuples of every garage; drawn as red circles on the stop maps.
garage_points = get_garage_points(engine=db_engine_alchemy)

In [119]:
# Order pings per vehicle and chronologically. Rebinding (instead of inplace=True)
# keeps the cell idempotent and avoids mutating the frame an earlier cell displayed.
df = df.sort_values(by=['ordem', 'datahora_ts'])

In [None]:
def filter_df(df: pd.DataFrame, ordem_rank: int = 14, window_minutes: float = 10,
              max_distance_m: float = 10):
    """Flag stationary stretches for a single vehicle (exploratory version).

    Only pings with ``velocidade == 0`` are considered. Vehicles are ranked by their
    number of such pings and the one at position ``ordem_rank`` (0 = most frequent) is
    kept. Its pings are then scanned in time order: starting at a ping, every later
    ping within ``window_minutes`` forms a window; if all of them lie within
    ``max_distance_m`` metres (geodesic) of the starting ping the whole window is
    flagged ``is_stop`` and the scan jumps past it, otherwise it advances one ping.

    Args:
        df: Frame with columns ordem, latitude, longitude, datahora_ts, velocidade.
        ordem_rank: Position of the vehicle to analyse in the frequency ranking.
            Defaults to 14 (the previously hard-coded choice); clamped to the last
            available vehicle when fewer vehicles exist instead of raising IndexError.
        window_minutes: Length of the look-ahead window in minutes.
        max_distance_m: Maximum distance from the window's first ping, in metres.

    Returns:
        The selected vehicle's zero-speed pings, sorted by time, with the original row
        label in an ``index`` column and a boolean ``is_stop`` column. When no
        zero-speed ping exists an empty frame with the same columns is returned.
    """
    stopped = df[df['velocidade'] == 0]
    ordem_counts = stopped['ordem'].value_counts()

    if ordem_counts.empty:
        # Nothing to analyse: return the same column layout without touching geopy.
        no_rows = stopped.reset_index()
        no_rows['is_stop'] = pd.Series(False, index=no_rows.index, dtype=bool)
        return no_rows

    chosen_ordem = ordem_counts.index[min(ordem_rank, len(ordem_counts) - 1)]

    filtered_df = stopped[stopped['ordem'] == chosen_ordem].reset_index()
    filtered_df['datahora_ts'] = pd.to_datetime(filtered_df['datahora_ts'])
    # drop=True: the original labels are already preserved in the 'index' column.
    filtered_df = filtered_df.sort_values(by=['ordem', 'datahora_ts']).reset_index(drop=True)

    def calculate_distance(row1, row2):
        """Geodesic distance in metres between two rows' coordinates."""
        point1 = (row1['latitude'], row1['longitude'])
        point2 = (row2['latitude'], row2['longitude'])
        return geodesic(point1, point2).meters

    filtered_df['is_stop'] = pd.Series(False, index=filtered_df.index, dtype=bool)

    window_size = timedelta(minutes=window_minutes)
    total_rows = len(filtered_df)
    i = 0
    while i < total_rows:
        initial_point = filtered_df.iloc[i]
        end_time = initial_point['datahora_ts'] + window_size
        # Look only forward from row i so rows sharing its timestamp but located
        # earlier in the frame are not pulled into the window again.
        upcoming = filtered_df.iloc[i:]
        window = upcoming[upcoming['datahora_ts'] <= end_time]

        if all(calculate_distance(initial_point, row) <= max_distance_m for _, row in window.iterrows()):
            filtered_df.loc[window.index, 'is_stop'] = True
            # max(..., 1) guarantees progress even if the window came back empty
            # (e.g. a missing timestamp).
            i += max(len(window.index), 1)
        else:
            i += 1

    return filtered_df
filtered_df = filter_df(df)
# Number of pings that were NOT flagged as part of a stop. The previous expression
# compared the already-filtered frame with False, which counted the stop rows instead.
print(int(filtered_df['is_stop'].eq(False).sum()))
filtered_df

In [17]:
def filter_df_v2(df: pd.DataFrame, window_minutes: float = 5, min_stop_minutes: float = 3,
                 max_distance_m: float = 10):
    """Return the pings that belong to a stationary stretch, for every vehicle.

    Pings are sorted by vehicle (``ordem``) and time. For each vehicle the scan starts
    at a ping and takes every later ping within ``window_minutes`` as a window. The
    window is a stop when its first and last pings are at most ``max_distance_m``
    metres apart (geodesic) and at least ``min_stop_minutes`` apart in time; then all
    its pings are flagged and the scan continues after the window. Otherwise the scan
    moves forward by one ping. Intermediate pings of a window are not checked. The
    last ping of a vehicle is never used as a window start.

    The input frame is not modified.

    Args:
        df: Frame with columns ordem, latitude, longitude, datahora_ts.
        window_minutes: Look-ahead window length in minutes (default 5).
        min_stop_minutes: Minimum time between first and last ping of a window for it
            to count as a stop (default 3, i.e. the former "window minus 2 minutes").
        max_distance_m: Maximum distance between first and last ping, in metres.

    Returns:
        The flagged rows only, with the original row label in an ``index`` column and
        an ``is_stop`` column that is True for every returned row.
    """
    data = (
        df.assign(datahora_ts=pd.to_datetime(df['datahora_ts']))
        .sort_values(by=['ordem', 'datahora_ts'])
        .reset_index()
    )
    # Explicit bool dtype so the final boolean selection also works on an empty frame.
    data['is_stop'] = pd.Series(False, index=data.index, dtype=bool)

    window_size = timedelta(minutes=window_minutes)
    min_duration = timedelta(minutes=min_stop_minutes)

    for _, vehicle in data.groupby('ordem', sort=False):
        timestamps = vehicle['datahora_ts']
        latitudes = vehicle['latitude'].to_numpy()
        longitudes = vehicle['longitude'].to_numpy()
        row_labels = vehicle.index
        last_pos = len(vehicle) - 1

        pos = 0
        while pos < last_pos:
            start_time = timestamps.iloc[pos]
            # Timestamps are sorted, so a binary search finds the first ping past the
            # window; this replaces a full boolean scan of the vehicle on every step.
            window_end = int(timestamps.searchsorted(start_time + window_size, side='right'))
            window_end = max(window_end, pos + 1)  # the window always contains `pos`
            final_pos = window_end - 1

            drift_m = geodesic(
                (latitudes[pos], longitudes[pos]),
                (latitudes[final_pos], longitudes[final_pos]),
            ).meters
            duration = timestamps.iloc[final_pos] - start_time

            if drift_m <= max_distance_m and duration >= min_duration:
                data.loc[row_labels[pos:window_end], 'is_stop'] = True
                pos = window_end
            else:
                pos += 1

    return data[data['is_stop']]
# Keep only the pings that filter_df_v2 flags as part of a stop, then preview them.
filtered_df = filter_df_v2(df=df)
filtered_df

Unnamed: 0,index,ordem,latitude,longitude,datahora_ts,velocidade,is_stop
86,10799,D86002,-22.90289,-43.55536,2024-05-09 11:54:35,0,True
87,13283,D86002,-22.90287,-43.55530,2024-05-09 11:55:06,0,True
88,12624,D86002,-22.90288,-43.55529,2024-05-09 11:55:37,0,True
89,12628,D86002,-22.90290,-43.55528,2024-05-09 11:56:08,0,True
90,12630,D86002,-22.90290,-43.55528,2024-05-09 11:56:39,0,True
...,...,...,...,...,...,...,...
16591,16485,D86411,-22.90145,-43.55452,2024-05-10 18:54:03,0,True
16592,15558,D86411,-22.90145,-43.55452,2024-05-10 18:54:34,0,True
16593,15562,D86411,-22.90145,-43.55452,2024-05-10 18:55:05,0,True
16594,15565,D86411,-22.90145,-43.55452,2024-05-10 18:55:35,0,True


In [46]:
def plot_trajectories(df: pd.DataFrame):
    """Draw every GPS ping on a folium map and save it as HTML.

    Pings with ``velocidade > 0`` are drawn in blue, all others in red. Each marker's
    popup shows speed, time of day and coordinates. The map is centred on the mean
    latitude/longitude and written to ``maps/trajectory_<CHOOSEN_TEST_LINE>.html``
    (the module-level CHOOSEN_TEST_LINE is read for the file name).

    Args:
        df: Frame with columns ordem, latitude, longitude, datahora_ts, velocidade.

    Returns:
        None. Nothing is written when ``df`` has no rows.
    """
    if df.empty:
        # The mean of no coordinates is NaN, which is not a valid map centre.
        print('plot_trajectories: no points to plot, map not written.')
        return

    df = (
        df.assign(datahora_ts=pd.to_datetime(df['datahora_ts']))  # strftime needs datetimes
        .sort_values(by=['ordem', 'datahora_ts'])
        .reset_index(drop=True)
    )

    # Create a map centered around the average location
    map_center = [df['latitude'].mean(), df['longitude'].mean()]
    m = folium.Map(location=map_center, zoom_start=15)

    # Debug aid kept from the original: report the 14th vehicle id when there is one
    # (the per-vehicle filter that used it is disabled).
    orders = df['ordem'].unique()
    if len(orders) > 13:
        choosen_order = orders[13]
        print(choosen_order)

    plotted_columns = df[['latitude', 'longitude', 'velocidade', 'datahora_ts']]
    for ping in plotted_columns.itertuples(index=False):
        point = [ping.latitude, ping.longitude]
        popup_text = f"Velocidade: {ping.velocidade} km/h<br>Hora: {ping.datahora_ts.strftime('%H:%M:%S')}<br>Coords:{point}"
        color = 'blue' if ping.velocidade > 0 else 'red'
        folium.CircleMarker(location=point, radius=5, color=color, fill=True, fill_color=color, popup=popup_text).add_to(m)

    os.makedirs('maps', exist_ok=True)  # save() does not create missing directories
    m.save(f'maps/trajectory_{CHOOSEN_TEST_LINE}.html')
# Render the raw pings of the test line to maps/trajectory_<line>.html.
plot_trajectories(df=df)

D53528


In [36]:
def calculate_final_stops_v2(df: pd.DataFrame, radiusInMeters: int = 100, min_samples: int = 10,
                             top_n: int = 5):
    """Cluster stop pings with DBSCAN and return the medoids of the largest clusters.

    Coordinates are converted to radians and clustered with the haversine metric;
    ``eps`` is ``radiusInMeters / 6371000`` (radius divided by the Earth radius in
    metres). Noise points (label -1) are discarded before ranking, so they can no
    longer take one of the ``top_n`` slots. For each selected cluster the medoid is
    the member with the smallest summed distance to all other members; that distance
    is scipy's default ``cdist`` metric (Euclidean) applied to the raw degree values.

    The input frame is not modified (cluster labels are kept local).

    Args:
        df: Frame with ``latitude`` and ``longitude`` columns.
        radiusInMeters: DBSCAN neighbourhood radius in metres.
        min_samples: DBSCAN ``min_samples`` (default 10).
        top_n: Maximum number of clusters to return (default 5).

    Returns:
        List of ``(rank, (latitude, longitude), count)`` tuples ordered by descending
        cluster size, rank starting at 1. Empty list when ``df`` has no rows.
    """
    if df.empty:
        # DBSCAN cannot be fitted on zero samples.
        return []

    epsilon = radiusInMeters / 6371000  # Earth radius in meters

    coords = df[['latitude', 'longitude']].to_numpy()
    labels = DBSCAN(eps=epsilon, min_samples=min_samples, metric='haversine').fit(np.radians(coords)).labels_

    # Rank real clusters only; -1 is DBSCAN's noise label.
    cluster_counts = Counter(int(label) for label in labels if label != -1)

    print(f"Top {top_n} most selected areas:")
    medoids_result = []
    for rank, (cluster_id, count) in enumerate(cluster_counts.most_common(top_n), start=1):
        cluster_coords = coords[labels == cluster_id]
        dist_matrix = cdist(cluster_coords, cluster_coords)
        medoid_index = np.argmin(dist_matrix.sum(axis=1))
        medoid_lat = float(cluster_coords[medoid_index][0])
        medoid_lon = float(cluster_coords[medoid_index][1])
        print(f"Cluster {cluster_id}: Medoid ({medoid_lat}, {medoid_lon}), Count: {count}")
        medoids_result.append((rank, (medoid_lat, medoid_lon), count))
    return medoids_result
    

In [55]:
# Detected stops for the single test line, keyed by line id.
bus_stops_per_line = {
    CHOOSEN_TEST_LINE: calculate_final_stops_v2(filtered_df),
}

Top 3 most selected areas:
Cluster 1: Medoid (-22.90889, -43.17025), Count: 158
Cluster 0: Medoid (-22.90076, -43.29027), Count: 156
Cluster 2: Medoid (-22.91594, -43.22964), Count: 43


In [None]:
bus_stops_per_line = {}
STOP_RADIUS = 30  # clustering radius in metres; also read when drawing stop circles
MAX_CLUSTER_POINTS = 80000  # cap on the number of pings handed to the clustering step
SAMPLE_SEED = 42  # fixed seed so the down-sampling (and the clusters) are reproducible
ENABLED_LINES = ['3', '232', '774', '852']

for linha in ['774']:
    if linha not in ENABLED_LINES:
        continue
    print(f'[{linha}]Querying database...')
    df = database_query(linha, CHOOSEN_DATES , db_engine_alchemy)
    print(f'[{linha}]Raw dataframe size: {len(df.index)}')

    print(f'[{linha}]Filtering dataframe...')
    filtered_df = filter_df_v2(df)

    if len(filtered_df.index) > MAX_CLUSTER_POINTS:
        filtered_df = filtered_df.sample(n=MAX_CLUSTER_POINTS, random_state=SAMPLE_SEED)
    print(f'[{linha}]Filtered dataframe size: {len(filtered_df.index)}')
    print(f'[{linha}]Clustering points...')
    bus_stops_per_line[linha] = calculate_final_stops_v2(filtered_df, radiusInMeters=STOP_RADIUS)
    print('-'*50)

In [56]:
# Flatten {line: [(cluster_order, (lat, lon), count), ...]} into one row per stop.
result_rows = [
    (linha, info[0], info[1][0], info[1][1], info[2])
    for linha, info_list in bus_stops_per_line.items()
    for info in info_list
]
result_columns = ['linha', 'cluster_order', 'latitude', 'longitude', 'cluster_count']
df_results_dict = {
    column: [row[position] for row in result_rows]
    for position, column in enumerate(result_columns)
}
df_results = pd.DataFrame.from_dict(df_results_dict)
# df_results.to_csv('calculated_bus_stops.csv', index=False)
display(df_results)

Unnamed: 0,linha,cluster_order,latitude,longitude,cluster_count
0,232,1,-22.90889,-43.17025,158
1,232,2,-22.90076,-43.29027,156
2,232,3,-22.91594,-43.22964,43


In [39]:
def create_single_bus_stops_map(df: pd.DataFrame, garage_points, line_choosen, stop_radius=None):
    """Render the detected stops of one line, plus all garages, to an HTML map.

    Garages are drawn as red circles with a 100 m radius. Every row of ``df`` gets a
    blue marker with a popup (line, cluster order, cluster count, location) and a blue
    circle of ``stop_radius`` metres. The map is centred on the mean coordinates of
    ``df`` and saved once, after all layers were added, to
    ``maps/bus_stops/bus_stops_mapv2_<line_choosen>.html``.

    Args:
        df: Frame with columns linha, cluster_order, latitude, longitude, cluster_count.
        garage_points: Iterable of sequences whose first two items are latitude and
            longitude.
        line_choosen: Identifier used in the output file name.
        stop_radius: Radius in metres of the circle drawn around each stop. When
            omitted the module-level ``STOP_RADIUS`` is used, as before.

    Returns:
        None.
    """
    if stop_radius is None:
        stop_radius = STOP_RADIUS

    map_center = [df['latitude'].mean(), df['longitude'].mean()]
    folium_map = folium.Map(location=map_center, zoom_start=12)

    for item in garage_points:
        folium.Circle(
            location=(item[0], item[1]),
            radius=100,
            color='red',
            fill=True,
            fill_color='red',
            fill_opacity=0.2
        ).add_to(folium_map)

    for _, row in df.iterrows():
        stop_location = (row['latitude'], row['longitude'])
        popup_text = f"linha: {row['linha']}<br>cluster_order: {row['cluster_order']}<br>cluster_count: {row['cluster_count']}<br>Location:({row['latitude']}, {row['longitude']})"
        folium.Marker(
            location=stop_location,
            popup=folium.Popup(popup_text, max_width=300),
            icon=folium.Icon(color='blue')
        ).add_to(folium_map)
        # Circle showing the radius used for this stop
        folium.Circle(
            location=stop_location,
            radius=stop_radius,
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.2
        ).add_to(folium_map)

    # Save once, after the loop: previously the file was rewritten for every row and
    # never written at all when df had no rows.
    os.makedirs('maps/bus_stops', exist_ok=True)
    folium_map.save(f"maps/bus_stops/bus_stops_mapv2_{line_choosen}.html")


In [57]:
# Write one stops map for every configured line that has at least one detected stop.
for linha in USED_BUS_LINES:
    df_sliced = df_results.loc[df_results['linha'] == linha]
    if len(df_sliced.index) == 0:
        continue
    create_single_bus_stops_map(df_sliced, garage_points, linha)