In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
%matplotlib inline
import seaborn as sns; sns.set_theme(color_codes=True)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
import math    
import folium
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import sys
from haversine import haversine
from shapely.geometry import LineString
from shapely.ops import split, linemerge
import krippendorff

In [2]:
from src.cluster import *
from src.colors import *
from src.map import *
from src.compare import *

In [3]:
def split_line_at_intervals(line, interval=30):
    """Cut ``line`` into consecutive straight pieces of ``interval`` length.

    Points are sampled along ``line`` every ``interval`` units (the units of
    the line's coordinate system) and each pair of neighbouring sample points
    is joined by a two-point ``LineString``.  Each piece is therefore a
    straight chord between two sample points, not a sub-curve of ``line``.

    Parameters
    ----------
    line : object exposing ``length`` and ``interpolate(distance)``
        In this notebook, a shapely ``LineString``.
    interval : number, default 30
        Spacing between sample points.  Must be greater than zero.

    Returns
    -------
    list
        The pieces in order along ``line``; empty when ``line.length`` is 0.

    Raises
    ------
    ValueError
        If ``interval`` is not a positive number.  Without this check the
        loop below would never terminate, because ``distance`` would never
        move past ``line.length``.
    """
    # `not interval > 0` also rejects NaN, which would stall the loop as well.
    if not interval > 0:
        raise ValueError(f"interval must be a positive number, got {interval!r}")

    segments = []
    distance = 0
    while distance < line.length:
        next_distance = distance + interval
        start_point = line.interpolate(distance)
        # Per the shapely documentation, an out-of-range distance is clipped
        # to the line ending, so the final piece stops at the end of `line`
        # and may be shorter than `interval`.
        end_point = line.interpolate(next_distance)
        segments.append(LineString([start_point, end_point]))
        distance = next_distance

    return segments

def split_geodataframe(geodf, local_crs, interval=30):
    """Explode every street geometry in ``geodf`` into fixed-length pieces.

    ``geodf`` is first reprojected to ``local_crs`` so that ``interval`` is
    measured in that CRS's units.  Each input row is repeated once per piece
    produced by ``split_line_at_intervals``; the copy carries the piece as
    its geometry and the piece's position within the row as ``segment_id``.

    Returns a GeoDataFrame in ``local_crs`` with all original columns plus
    ``segment_id`` and ``street_edge_segment_id``
    (``"<street_edge_id>_<segment_id>"``).
    """
    projected = geodf.to_crs(local_crs)

    piece_records = []
    for _, street in projected.iterrows():
        pieces = split_line_at_intervals(street.geometry, interval)
        for piece_no, piece in enumerate(pieces):
            # One copy of the street's attributes per piece.
            record = street.copy()
            record.geometry = piece
            record['segment_id'] = piece_no
            piece_records.append(record)

    output_columns = projected.columns.append(pd.Index(['segment_id']))
    result = gpd.GeoDataFrame(piece_records, columns=output_columns)

    # Key that identifies a piece across the whole street network.
    edge_part = result['street_edge_id'].astype(str)
    piece_part = result['segment_id'].astype(str)
    result['street_edge_segment_id'] = edge_part + '_' + piece_part

    result.crs = local_crs
    return result

In [4]:
# Load the Seattle validation labels and drop the 'time_created' column.
seattle_labels = (
    gpd.read_file('data/round-1/seattle-validation-labels-round1-230611.geojson')
    .drop(columns=['time_created'])
)
seattle_labels.head()

Unnamed: 0.1,Unnamed: 0,username,label_id,street_edge_id,neighborhood,label_type,lat,lng,gsv_panorama_id,correct,severity,temporary,tag_list,description,geometry
0,3,Devon Snyder,3,480,University District,Obstacle,47.661041,-122.320709,-L8Sgb0ozIqqanCBfAZjYQ,t,3.0,f,pole,,POINT (-122.32071 47.66104)
1,4,Devon Snyder,4,480,University District,Obstacle,47.661026,-122.320702,-L8Sgb0ozIqqanCBfAZjYQ,t,2.0,f,pole,,POINT (-122.32070 47.66103)
2,5,Devon Snyder,5,480,University District,SurfaceProblem,47.66106,-122.320709,OH4yKYW98UY1aMK_b68Etw,t,2.0,f,bumpy,,POINT (-122.32071 47.66106)
3,8,Devon Snyder,8,480,University District,Occlusion,47.661106,-122.320724,OH4yKYW98UY1aMK_b68Etw,,,f,,,POINT (-122.32072 47.66111)
4,9,Devon Snyder,9,480,University District,Obstacle,47.661098,-122.320663,O0VZ-MF6Nrmjhkb2nCRhCw,t,3.0,f,sign,temporary due to construction,POINT (-122.32066 47.66110)


In [5]:
# Load the Chicago validation labels and drop the 'time_created' column.
chicago_labels = (
    gpd.read_file('data/round-1/chicago-validation-labels-round1-230611.geojson')
    .drop(columns=['time_created'])
)
chicago_labels.head()

Unnamed: 0.1,Unnamed: 0,username,label_id,street_edge_id,neighborhood,label_type,lat,lng,gsv_panorama_id,correct,severity,temporary,tag_list,description,geometry
0,94,Devon Snyder,94,447,Skokie,CurbRamp,42.022606,-87.728157,rcjtiNG8gkNtkifZDFj2NQ,t,4.0,f,"not enough landing space,narrow,not level with...",,POINT (-87.72816 42.02261)
1,96,Devon Snyder,96,447,Skokie,CurbRamp,42.02272,-87.728149,rcjtiNG8gkNtkifZDFj2NQ,t,3.0,f,"not enough landing space,surface problem",,POINT (-87.72815 42.02272)
2,97,Devon Snyder,97,437,Skokie,CurbRamp,42.022717,-87.728432,_KrmBjHlebAncoZc40tsSg,t,3.0,f,"not enough landing space,points into traffic",,POINT (-87.72843 42.02272)
3,98,Devon Snyder,98,437,Skokie,CurbRamp,42.022621,-87.728439,_KrmBjHlebAncoZc40tsSg,t,3.0,f,not enough landing space,,POINT (-87.72844 42.02262)
4,99,Devon Snyder,99,447,Skokie,SurfaceProblem,42.022999,-87.728149,2vMz67hXm2ZGNcKHbyoR8g,t,1.0,f,grass,,POINT (-87.72815 42.02300)


In [6]:
# Load the street geometries for both cities (Seattle first, then Chicago).
seattle_streets, chicago_streets = [
    gpd.read_file(street_file)
    for street_file in (
        "data/round-1/seattle-streets-230611.geojson",
        "data/round-1/chicago-streets-230611.geojson",
    )
]

In [17]:
# Display the Chicago street GeoDataFrame to inspect its columns and geometries.
chicago_streets 

Unnamed: 0,route_id,region_id,street_edge_id,way_type,geometry
0,,14,377,secondary,"LINESTRING (-87.87907 42.12500, -87.87904 42.1..."
1,,17,381,tertiary,"LINESTRING (-87.62409 41.64457, -87.62369 41.6..."
2,,17,382,tertiary,"LINESTRING (-87.62448 41.64457, -87.62409 41.6..."
3,,16,385,primary,"LINESTRING (-87.95971 42.03702, -87.95940 42.0..."
4,,16,398,primary,"LINESTRING (-87.95993 42.03701, -87.95971 42.0..."
5,,16,399,primary,"LINESTRING (-87.96043 42.03656, -87.96040 42.0..."
6,,11,402,secondary,"LINESTRING (-87.72852 42.01123, -87.72851 42.0..."
7,,13,404,secondary,"LINESTRING (-87.72813 42.02564, -87.72812 42.0..."
8,,13,406,secondary,"LINESTRING (-87.72939 42.02625, -87.72909 42.0..."
9,,13,413,secondary,"LINESTRING (-87.72875 42.02625, -87.72872 42.0..."


In [7]:
def cluster_to_street_segment_mapping(df, streets_seg):
    """Map each label cluster to the street segment nearest its centroid.

    Parameters
    ----------
    df : GeoDataFrame
        Label rows with a ``cluster_id`` column and point geometries.  The
        geometries are treated as EPSG:4326 (lon/lat) coordinates.
    streets_seg : GeoDataFrame
        Street pieces with a ``street_edge_segment_id`` column and a CRS set
        (as produced by ``split_geodataframe``).

    Returns
    -------
    DataFrame
        One row per ``cluster_id`` with the ``street_edge_segment_id`` of the
        nearest street piece.  When several pieces are equally near, the
        first one returned by the spatial join is kept.
    """
    # Merge each cluster's label points into a single geometry, then take
    # the centroid of that merged geometry.
    merged_per_cluster = df.groupby('cluster_id')['geometry'].apply(
        lambda geoms: geoms.unary_union
    )
    centroids = merged_per_cluster.centroid

    cluster_centroids = gpd.GeoDataFrame(geometry=centroids).reset_index()
    # Use the "EPSG:4326" authority string rather than the deprecated
    # {'init': 'epsg:4326'} dict.  allow_override keeps the original
    # behaviour of stamping the CRS regardless of any CRS already attached.
    cluster_centroids = cluster_centroids.set_crs("EPSG:4326", allow_override=True)

    # Reproject straight to the streets' CRS object.  Round-tripping through
    # crs.to_epsg() breaks when the CRS has no matching EPSG code (None).
    cluster_centroids_local = cluster_centroids.to_crs(streets_seg.crs)

    nearest = cluster_centroids_local.sjoin_nearest(streets_seg, distance_col='distance')
    # Selecting the two needed columns already discards the join's helper
    # columns (right-hand index, distance), so no separate drop is required.
    nearest = nearest[['cluster_id', 'street_edge_segment_id']]

    # Ties in the nearest join yield several rows per cluster; keep the first.
    return nearest.groupby('cluster_id').first().reset_index()

In [8]:
def irr_table(df, street_seg, raters=None):
    """Build the per-segment label-count table used for inter-rater agreement.

    For every distinct ``street_edge_segment_id`` in ``street_seg`` the table
    holds, per rater, how many rows of ``df`` that rater placed on the
    segment (0 when the rater has none there).

    Parameters
    ----------
    df : DataFrame
        Label rows with at least ``username`` and ``street_edge_segment_id``.
    street_seg : DataFrame
        Street pieces with a ``street_edge_segment_id`` column; it defines
        the full set of segments (units) that appear in the table.
    raters : mapping of username -> output column name, optional
        Defaults to ``{'mikey': 'mikey', 'Devon Snyder': 'devon'}``, the two
        raters this notebook compares.

    Returns
    -------
    DataFrame
        One integer column per rater and one row per distinct segment id,
        ordered by sorted segment id, with a fresh 0..n-1 index.  The
        segment id itself is not included.

    Notes
    -----
    Counts are taken directly from ``df`` instead of summing every column of
    a merged frame.  That avoids summing geometry/string columns (which fails
    on recent pandas), avoids replacing the rater name in unrelated columns,
    and keeps counts correct if ``street_seg`` repeats a segment id.
    """
    if raters is None:
        raters = {'mikey': 'mikey', 'Devon Snyder': 'devon'}

    # Sorted distinct ids: one row per segment, in a deterministic order.
    segment_ids = pd.Index(street_seg['street_edge_segment_id'].unique()).sort_values()

    table = pd.DataFrame(index=segment_ids)
    for username, column in raters.items():
        rater_segments = df.loc[df['username'] == username, 'street_edge_segment_id']
        counts = rater_segments.value_counts()
        # Segments without labels from this rater get an explicit 0.
        table[column] = counts.reindex(segment_ids, fill_value=0)

    return table.reset_index(drop=True)

In [12]:
def krippendorff_alpha(labels, label_type_id, streets, local_crs, interval=30):
    """Build the rater-agreement table for one label type in one city.

    Despite its name, this function does not compute alpha; it returns the
    table produced by ``irr_table``, which the caller passes on to
    ``krippendorff.alpha``.

    Parameters
    ----------
    labels : GeoDataFrame
        Validation labels for one city.
    label_type_id : int
        Index passed to ``cluster_label_type_at_index`` to select the label
        type to cluster.
    streets : GeoDataFrame
        Street geometries for the same city.
    local_crs
        CRS the streets are reprojected to before splitting.
    interval : number, default 30
        Length of each street piece, in ``local_crs`` units.  Previously
        hard-coded to 30.

    Returns
    -------
    DataFrame
        The per-segment count table returned by ``irr_table``.
    """
    # Element [2] of the helper's result is used as the clustered label
    # table; it is expected to carry 'cluster_id' and geometry columns.
    # NOTE(review): confirm the helper's return layout where it is defined.
    clustered = cluster_label_type_at_index(labels, label_type_id)[2]

    # Cut the streets into fixed-length pieces in the local CRS.
    street_seg = split_geodataframe(streets, local_crs, interval=interval)

    # Attach to every label the street piece nearest its cluster's centroid.
    mapping = cluster_to_street_segment_mapping(clustered, street_seg)
    clustered = clustered.merge(mapping, on='cluster_id', how='left')

    # Keep only the columns the agreement table needs.
    df = clustered[['username', 'cluster_id', 'street_edge_segment_id', 'label_id']]

    return irr_table(df, street_seg)

In [13]:
def calculating_overall_krippendorff_alpha(seattle_labels, chicago_labels, label_type_id):
    """Compute Krippendorff's alpha for one label type over both cities.

    Builds the per-segment agreement table for Seattle and for Chicago with
    ``krippendorff_alpha``, stacks them, and computes a single ratio-level
    alpha over the combined segments.

    Reads the notebook-level ``seattle_streets`` and ``chicago_streets``
    frames.  They are projected to EPSG:2285 and EPSG:3435 respectively
    (NOTE(review): both presumably foot-based CRSs — confirm).

    Returns the alpha value rounded to three decimals.
    """
    seattle = krippendorff_alpha(seattle_labels, label_type_id, seattle_streets, 2285)
    chicago = krippendorff_alpha(chicago_labels, label_type_id, chicago_streets, 3435)

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # the same way.
    overall = pd.concat([seattle, chicago])

    # krippendorff.alpha expects raters as rows and units as columns, hence
    # the transpose.
    data_matrix = overall.T.to_numpy()
    alpha = krippendorff.alpha(data_matrix, level_of_measurement='ratio')

    # np.round works for both numpy scalars and plain Python floats.
    return np.round(alpha, 3)


In [24]:
# Krippendorff's alpha per label type, keyed by the label type's name.
# Only the listed indices into `label_types` are evaluated.
alpha_overall = dict()
for i in (0, 1, 2, 4, 5, 8, 9):
    alpha = calculating_overall_krippendorff_alpha(seattle_labels, chicago_labels, i)
    alpha_overall.update({label_types[i]: alpha})


In [25]:
# Krippendorff's alpha per label type, with streets split into 30 ft segments
alpha_overall

{'CurbRamp': 0.888,
 'NoSidewalk': 0.69,
 'Problem': 0.523,
 'SurfaceProblem': 0.52,
 'Obstacle': 0.445,
 'Crosswalk': 0.91,
 'Signal': 0.697}

In [26]:
# Keep only SurfaceProblem and Obstacle labels, then exclude the ones rated
# severity 1 (labels with a missing severity are kept).
seattle_problems = seattle_labels.loc[
    seattle_labels['label_type'].isin(['SurfaceProblem', 'Obstacle'])
    & seattle_labels['severity'].ne(1)
]
seattle_problems

Unnamed: 0.1,Unnamed: 0,username,label_id,street_edge_id,neighborhood,label_type,lat,lng,gsv_panorama_id,correct,severity,temporary,tag_list,description,geometry
0,3,Devon Snyder,3,480,University District,Obstacle,47.661041,-122.320709,-L8Sgb0ozIqqanCBfAZjYQ,t,3.0,f,pole,,POINT (-122.32071 47.66104)
1,4,Devon Snyder,4,480,University District,Obstacle,47.661026,-122.320702,-L8Sgb0ozIqqanCBfAZjYQ,t,2.0,f,pole,,POINT (-122.32070 47.66103)
2,5,Devon Snyder,5,480,University District,SurfaceProblem,47.661060,-122.320709,OH4yKYW98UY1aMK_b68Etw,t,2.0,f,bumpy,,POINT (-122.32071 47.66106)
4,9,Devon Snyder,9,480,University District,Obstacle,47.661098,-122.320663,O0VZ-MF6Nrmjhkb2nCRhCw,t,3.0,f,sign,temporary due to construction,POINT (-122.32066 47.66110)
18,30,Devon Snyder,30,483,University District,Obstacle,47.661419,-122.320488,iI69DtsYap_Fjiz-RZ4i1A,f,2.0,f,pole,,POINT (-122.32049 47.66142)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1450,2485,mikey,2485,810,Windermere,SurfaceProblem,47.672047,-122.267578,hVSZIyYVmFvFvQ6Rizfo8g,,2.0,f,,pine needles,POINT (-122.26758 47.67205)
1451,2486,mikey,2486,810,Windermere,SurfaceProblem,47.672070,-122.267845,09WtcNVwH9KSZ8B4Hqm3Bg,,2.0,f,,pine needles,POINT (-122.26785 47.67207)
1452,2487,mikey,2487,810,Windermere,Obstacle,47.672207,-122.267700,09WtcNVwH9KSZ8B4Hqm3Bg,,4.0,f,pole,,POINT (-122.26770 47.67221)
1469,2534,Devon Snyder,2534,479,University District,SurfaceProblem,47.659721,-122.319923,SRZ-Py452sb8rEJfkHzHWw,,2.0,f,"cracks,uneven/slanted",,POINT (-122.31992 47.65972)


In [27]:
# Keep only SurfaceProblem and Obstacle labels, then exclude the ones rated
# severity 1 (labels with a missing severity are kept).
chicago_problems = chicago_labels.loc[
    chicago_labels['label_type'].isin(['SurfaceProblem', 'Obstacle'])
    & chicago_labels['severity'].ne(1)
]
chicago_problems

Unnamed: 0.1,Unnamed: 0,username,label_id,street_edge_id,neighborhood,label_type,lat,lng,gsv_panorama_id,correct,severity,temporary,tag_list,description,geometry
15,113,Devon Snyder,113,437,Skokie,Obstacle,42.024498,-87.728394,p_-iwPEGdz3zFZzojysq_Q,t,4.0,t,construction,,POINT (-87.72839 42.02450)
16,114,Devon Snyder,114,447,Skokie,Obstacle,42.024517,-87.728065,p_-iwPEGdz3zFZzojysq_Q,t,5.0,t,construction,,POINT (-87.72807 42.02452)
22,120,Devon Snyder,120,447,Skokie,SurfaceProblem,42.025555,-87.728050,6-jUvCDerpU1Mmdck_7brw,t,2.0,f,cracks,,POINT (-87.72805 42.02555)
23,121,Devon Snyder,121,447,Skokie,SurfaceProblem,42.025524,-87.728058,6-jUvCDerpU1Mmdck_7brw,t,3.0,f,"cracks,uneven/slanted",,POINT (-87.72806 42.02552)
24,122,Devon Snyder,122,447,Skokie,SurfaceProblem,42.025539,-87.728050,6-jUvCDerpU1Mmdck_7brw,t,3.0,f,uneven/slanted,,POINT (-87.72805 42.02554)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,2116,mikey,2116,417,Robbins,SurfaceProblem,41.641541,-87.699074,pyGMeg_ov_yzl9DHeCNKaQ,,3.0,f,grass,,POINT (-87.69907 41.64154)
462,2130,mikey,2130,445,Robbins,SurfaceProblem,41.644279,-87.699326,aezvzNgq2vdXD_QT4edWRw,,2.0,f,,,POINT (-87.69933 41.64428)
471,2139,mikey,2139,445,Robbins,SurfaceProblem,41.644253,-87.700073,LyeTlktgRkxWxkPeFoZ63w,,2.0,f,"grass,uneven/slanted",,POINT (-87.70007 41.64425)
476,2144,mikey,2144,445,Robbins,Obstacle,41.644253,-87.700371,Hdhuc9wypqqkvSIEWjpx3w,,4.0,f,trash/recycling can,,POINT (-87.70037 41.64425)


In [28]:
# Recompute alpha on the severity-filtered problem labels for the three
# problem-related entries of `label_types`.
alpha_overall_problems = dict()
for i in (2, 4, 5):
    alpha = calculating_overall_krippendorff_alpha(seattle_problems, chicago_problems, i)
    alpha_overall_problems.update({label_types[i]: alpha})

In [29]:
# Krippendorff's alpha per label type after removing severity-1 problem labels
alpha_overall_problems

{'Problem': 0.453, 'SurfaceProblem': 0.452, 'Obstacle': 0.508}