In [1]:
import geopandas as gpd
from geopandas.tools import sjoin
import pandas as pd
import numpy as np
import pyproj    
import shapely
import shapely.ops as ops
from functools import partial

In [42]:
predictions = "/Users/user/Downloads/CB_ISL_Full_Predict_05242021.geojson"
training = "ISL_Training_Labels.geojson"
osm_full_filepath = "/Users/user/Downloads/relevant_osm_info.json"
cb_boundary = "/Users/user/Downloads/Congo_Basin_Boundary_no_islands_v5.geojson"
osm_clipped = "/Users/user/Documents/GitHub/cb_feature_detection/analytics/osm_relevant_clipped.geojson"

In [43]:
predictions = gpd.read_file(predictions)
training = gpd.read_file(training)
osm_clipped = gpd.read_file(osm_clipped)

#### Clip to CB Boundary

In [44]:
# osm_clipped = gpd.clip(osm_full, cb_boundary)

In [45]:
# osm_clipped.to_file("osm_relevant_clipped.geojson", driver="GeoJSON")

#### % of Training Chips Detected by Prediction

In [5]:
training_lines_match = sjoin(training, predictions, how="inner")["geometry"]

In [6]:
training_lines_match.nunique() / training.shape[0]

0.929046037019459

### Chips in Training Set

In [13]:
intersecting_polygons = sjoin(training,predictions, how="right")

In [22]:
true_positive_predictions = gpd.GeoDataFrame(intersecting_polygons[pd.notna(intersecting_polygons["index_left"])]["geometry"])

In [26]:
new_predictions = gpd.GeoDataFrame(intersecting_polygons[pd.isnull(intersecting_polygons["index_left"])]["geometry"])

#### True Positive Analysis

In [47]:
tp_osm = sjoin(true_positive_predictions,osm_clipped, how="right")

In [52]:
tp_osm = tp_osm[pd.notna(tp_osm["index_left"])]

In [60]:
tp_osm["tags"].reset_index().loc[0,"tags"]

{'access': 'forestry',
 'source': 'CIRAD;Landsat',
 'highway': 'track',
 'start_date': '1986'}

In [71]:
kv_tags = []
for tag in tp_osm["tags"]:
    kv_tag_list = []
    for k in tag:
        kv_tag = k + "_" + tag[k]
        kv_tag_list.append(kv_tag)
    kv_tags.append(kv_tag_list)

In [73]:
tp_osm["tag_list"] = kv_tags

In [90]:
tag_list_count = {}

for tag_list in tp_osm["tag_list"]:
    for tag in tag_list:
        if tag not in tag_list_count.keys():
            tag_list_count[tag] = 1 
        else:
            tag_list_count[tag] += 1

In [91]:
tp_osm.shape

(4735, 5)

In [92]:
# tag_list_count

{k: v for k, v in sorted(tag_list_count.items(), key=lambda item: item[1],reverse=True)}

{'highway_track': 4063,
 'access_forestry': 3530,
 'source_CIRAD;Landsat': 812,
 'source_Landsat;WRI': 552,
 'start_date_2014': 498,
 'start_date_2017': 451,
 'start_date_2018': 430,
 'way_track': 336,
 'source:geometry_Landsat 8': 323,
 'source:geometry:date_2017': 323,
 'highway_unclassified': 307,
 'start_date_before 2000': 301,
 'start_date_2001': 283,
 'start_date_2015': 257,
 'tracktype_grade5': 210,
 'start_date_1986': 185,
 'source_Landsat;JRC': 184,
 'surface_ground': 157,
 'surface_unpaved': 155,
 'start_date_2016': 151,
 'source_WRI;DIAF;Africover': 145,
 'way_unclassified': 132,
 'source_Bing Hires Aerial Image': 132,
 'source_WRI;DIAF;Société Trans M': 132,
 'source_MINTP CTIN - Référentiel Routier du Cameroun 2014': 119,
 'seasonal_dry_season': 118,
 'start_date_2012': 106,
 'start_date_2006': 106,
 'source:geometry_INDEFOR-AP cartografía': 105,
 'source:date_2012': 98,
 'highway_secondary': 85,
 'way_secondary': 69,
 'ref_D90': 68,
 'operator_Région': 68,
 'start_date_20

# Function for Tag Count Intersect between OSM and Prediction Set

In [116]:
def gdf_osm_tag_count(predictions_gdf,osm_gdf,filter_out=.05):
    
    tp_osm = sjoin(predictions_gdf,osm_gdf, how="right")
    
    tp_osm = tp_osm[pd.notna(tp_osm["index_left"])]
    
    kv_tags = []
    for tag in tp_osm["tags"]:
        kv_tag_list = []
        for k in tag:
            kv_tag = k + "_" + tag[k]
            kv_tag_list.append(kv_tag)
        kv_tags.append(kv_tag_list)
        
    tp_osm["tag_list"] = kv_tags
    
    tag_list_count = {}

    for tag_list in tp_osm["tag_list"]:
        for tag in tag_list:
            if tag not in tag_list_count.keys():
                tag_list_count[tag] = 1 
            else:
                tag_list_count[tag] += 1
                
    counts_sorted = {k: v for k, v in sorted(tag_list_count.items(), key=lambda item: item[1],reverse=True)}
    
    counts_sorted_df = pd.DataFrame({"tag":counts_sorted.keys(),"count":counts_sorted.values(),"percent_containing":[i/ predictions_gdf.shape[0] for i in counts_sorted.values()]})
    
    counts_sorted_df_filtered = counts_sorted_df[counts_sorted_df["percent_containing"] < filter_out]
    
    return counts_sorted_df_filtered
    
    
    

In [131]:
new_df = gdf_osm_tag_count(new_predictions,osm_clipped,filter_out=1)

In [132]:
true_df = gdf_osm_tag_count(true_positive_predictions,osm_clipped,filter_out=1)

In [157]:
cross_df_outer = pd.merge(
    new_df,
    true_df,
    how="outer",
    on=None,
    left_on="tag",
    right_on="tag",
    left_index=False,
    right_index=False,
    sort=True,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
)

In [158]:
cross_df_outer.shape

(4108, 5)

#### Low Hanging Filter List - Represents of Occurring Tags

In [175]:
cross_df_outer["count_y"].isna().sum() / new_df.shape[0]

0.9690168333739937

In [173]:
new_df.shape[0]

4099

In [172]:
new_df.shape[0] - cross_df_outer["count_y"].isna().sum()

127

In [163]:
sum(filter_tags_misc["count_y"])

nan

#### Tags Appearing in Both Datasets

In [176]:
cross_df = cross_df_outer[cross_df_outer["count_y"].notna()]

In [177]:
cross_df

Unnamed: 0,tag,count_x,percent_containing_x,count_y,percent_containing_y
7,FIXME_check import,128.0,0.001009,3.0,0.000302
251,abandoned:date_before 2007,464.0,0.003657,15.0,0.001508
253,abandoned:date_before 2009,49.0,0.000386,25.0,0.002514
254,abandoned:date_before 2010,48.0,0.000378,3.0,0.000302
255,abandoned:date_before 2011,180.0,0.001419,6.0,0.000603
...,...,...,...,...,...
4056,way_secondary,812.0,0.006399,69.0,0.006937
4057,way_service,23.0,0.000181,15.0,0.001508
4058,way_tertiary,401.0,0.003160,20.0,0.002011
4059,way_track,1825.0,0.014382,336.0,0.033782


In [178]:
cross_df.sort_values(by="percent_containing_x",ascending=False)

Unnamed: 0,tag,count_x,percent_containing_x,count_y,percent_containing_y
620,highway_track,28562.0,0.225086,4063.0,0.408506
267,access_forestry,25293.0,0.199324,3530.0,0.354917
4049,waterway_river,17505.0,0.137950,34.0,0.003418
3793,source_Landsat;WRI,14837.0,0.116924,552.0,0.055500
1811,landuse_residential,14490.0,0.114190,19.0,0.001910
...,...,...,...,...,...
2984,name_Route Timberland,,,3.0,0.000302
3623,source:date_2007-05-10,,,8.0,0.000804
3779,source_Landast;JRC,,,6.0,0.000603
3949,start_date_2018,,,430.0,0.043233


#### New Chips Not in Training Set

In [4]:
intersecting_polygons = sjoin(predictions, training, how="left")

In [6]:
training

Unnamed: 0,width,color,note,geometry
0,1.0,25500255,,"LINESTRING (16.58639 2.19236, 16.58641 2.19236..."
1,1.0,25500255,,"LINESTRING (16.62630 2.19762, 16.62633 2.19762..."
2,1.0,25500255,,"LINESTRING (16.63438 2.18443, 16.63436 2.18443..."
3,1.0,25500255,,"LINESTRING (16.64634 2.19385, 16.64642 2.19385..."
4,1.0,25500255,,"LINESTRING (16.65006 2.19487, 16.65009 2.19487..."
...,...,...,...,...
4209,1.0,25500255,,"LINESTRING (17.35599 3.36944, 17.35602 3.36944..."
4210,1.0,25500255,,"LINESTRING (17.37302 3.36918, 17.37363 3.36918..."
4211,1.0,25500255,,"LINESTRING (17.38822 3.36912, 17.38880 3.36912..."
4212,1.0,25500255,,"LINESTRING (17.36942 3.36900, 17.36942 3.36897..."


In [8]:
values = {'index_right': "none"}
intersecting_polygons = intersecting_polygons.fillna(value=values)

In [9]:
intersecting_polygons = intersecting_polygons[intersecting_polygons["index_right"] == "none"]

#### 4% of Chips Comprised Training Chips  

In [10]:
intersecting_polygons.shape[0] / predictions.shape[0] 

0.9608159371237762

In [11]:
intersecting_polygons.shape

(126894, 5)

All Remaining Polygons Are Unique and not part of prediction set

In [15]:
intersecting_polygons["geometry"].nunique()

126894

In [13]:
predictions.shape[0] - intersecting_polygons.shape[0]

5175

In [14]:
predictions.shape[0]

132069

#### Area Ananlysis

In [None]:
area_km_sq = 0

for item in intersecting_polygons.itertuples():
    geom = item[1]
    geom_area = ops.transform(
    partial(
        pyproj.transform,
        pyproj.Proj(init='EPSG:4326'),
        pyproj.Proj(
            proj='aea',
            lat_1=geom.bounds[1],
            lat_2=geom.bounds[3])),
    geom)

#     Print the area in m^2
#     print(geom)
#     print(geom_area.area / 10**6)
    area_km_sq += geom_area.area / 10**6

  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(p