In [2]:
# Import the module
from IaaGeoDataCleaning.CleaningUtils.coordinates_validator import *

In [3]:
# Read shapefile
# Resolve the shapefile directory relative to the installed package instead of a
# user-specific absolute path, so the notebook can run on other machines.
# Assumes `resources/mapinfo` sits inside the IaaGeoDataCleaning package directory,
# as it did in the original hard-coded path -- adjust if your layout differs.
import os
import IaaGeoDataCleaning

package_dir = os.path.abspath(list(IaaGeoDataCleaning.__path__)[0])
shape_dir = os.path.join(package_dir, 'resources', 'mapinfo')

# The result maps each file extension found ('dbf', 'prj', 'shp', 'shx', 'zip') to its path
shape_dict = process_shapefile(shape_dir)
shape_dict

{'dbf': '/Users/thytnguyen/Desktop/geodata-2018/IaaGeoDataCleaning/resources/mapinfo/TM_WORLD_BORDERS-0.3.dbf',
 'prj': '/Users/thytnguyen/Desktop/geodata-2018/IaaGeoDataCleaning/resources/mapinfo/TM_WORLD_BORDERS-0.3.prj',
 'shp': '/Users/thytnguyen/Desktop/geodata-2018/IaaGeoDataCleaning/resources/mapinfo/TM_WORLD_BORDERS-0.3.shp',
 'shx': '/Users/thytnguyen/Desktop/geodata-2018/IaaGeoDataCleaning/resources/mapinfo/TM_WORLD_BORDERS-0.3.shx',
 'zip': '/Users/thytnguyen/Desktop/geodata-2018/IaaGeoDataCleaning/resources/mapinfo/TM_WORLD_BORDERS-0.3.zip'}

In [6]:
# Load the .shp file; the preview shows one row per country with a `geometry` column
shp_path = shape_dict['shp']
shape_gdf = get_shape(shp_path)
shape_gdf.head()

Unnamed: 0,FIPS,ISO2,ISO3,UN,NAME,AREA,POP2005,REGION,SUBREGION,LON,LAT,geometry
0,AC,AG,ATG,28,Antigua and Barbuda,44,83039,19,29,-61.783,17.078,"(POLYGON ((-61.686668 17.02444100000014, -61.7..."
1,AG,DZ,DZA,12,Algeria,238174,32854159,2,15,2.632,28.163,"POLYGON ((2.96361 36.802216, 2.981389 36.80693..."
2,AJ,AZ,AZE,31,Azerbaijan,8260,8352021,142,145,47.395,40.43,(POLYGON ((45.08332100000001 39.76804400000015...
3,AL,AL,ALB,8,Albania,2740,3153731,150,39,20.068,41.143,"POLYGON ((19.436214 41.021065, 19.450554 41.05..."
4,AM,AM,ARM,51,Armenia,2820,3017661,142,145,44.563,40.534,(POLYGON ((45.57305100000013 40.63248800000008...


In [5]:
# Read the projection of the shapefile from its .prj companion file
# (the value is reused below when building point geometries)
prj_path = shape_dict['prj']
crs = get_projection(prj_path)
crs

4326

In [7]:
# Example input: one row per city, deliberately including a (0, 0) placeholder,
# a row with missing coordinates, and several faulty coordinate pairs
columns = ['City', 'Country', 'Latitude', 'Longitude']
records = [
    ('Lagos', 'Nigeria', 3.78, 6.52),
    ('Istanbul', 'Turkey', 0, 0),
    ('Moscow', 'Russia', 55.76, 37.62),
    ('Tokyo', 'Japan', 35.69, 139.70),
    ('Cairo', 'Egypt', None, None),
    ('London', 'United Kingdom', 51.51, 13.13),
    ('Rio de Janeiro', 'Brazil', 22.91, 43.21),
    ('Johannesburg', 'South Africa', 26.20, 28.05),
]
# Transpose the rows back into per-column lists before building the frame
df = pd.DataFrame({name: list(values) for name, values in zip(columns, zip(*records))})
df

Unnamed: 0,City,Country,Latitude,Longitude
0,Lagos,Nigeria,3.78,6.52
1,Istanbul,Turkey,0.0,0.0
2,Moscow,Russia,55.76,37.62
3,Tokyo,Japan,35.69,139.7
4,Cairo,Egypt,,
5,London,United Kingdom,51.51,13.13
6,Rio de Janeiro,Brazil,22.91,43.21
7,Johannesburg,South Africa,26.2,28.05


In [8]:
# Attach country codes (the ISO2 and ISO3 columns) based on each entry's country name
country_column = 'Country'
cc_df = add_country_code(data=df, ctry_col=country_column)
cc_df

Unnamed: 0,City,Country,Latitude,Longitude,ISO2,ISO3
0,Lagos,Nigeria,3.78,6.52,NG,NGA
1,Istanbul,Turkey,0.0,0.0,TR,TUR
2,Moscow,Russia,55.76,37.62,RU,RUS
3,Tokyo,Japan,35.69,139.7,JP,JPN
4,Cairo,Egypt,,,EG,EGY
5,London,United Kingdom,51.51,13.13,GB,GBR
6,Rio de Janeiro,Brazil,22.91,43.21,BR,BRA
7,Johannesburg,South Africa,26.2,28.05,ZA,ZAF


In [9]:
# Split the entries by whether they carry coordinate inputs:
#   filtered_df[0] -> entries with coordinates
#   filtered_df[1] -> entries without coordinates
filtered_df = filter_data_without_coords(
    data=cc_df,
    lat_col='Latitude',
    lng_col='Longitude',
)
with_coords = filtered_df[0]
with_coords

Unnamed: 0,City,Country,Latitude,Longitude,ISO2,ISO3
0,Lagos,Nigeria,3.78,6.52,NG,NGA
2,Moscow,Russia,55.76,37.62,RU,RUS
3,Tokyo,Japan,35.69,139.7,JP,JPN
5,London,United Kingdom,51.51,13.13,GB,GBR
6,Rio de Janeiro,Brazil,22.91,43.21,BR,BRA
7,Johannesburg,South Africa,26.2,28.05,ZA,ZAF


In [10]:
# Entries treated as lacking coordinate inputs; in this example that covers both
# the row with null values and the row whose coordinates are the (0, 0) placeholder
filtered_df[1]

Unnamed: 0,City,Country,Latitude,Longitude,ISO2,ISO3
1,Istanbul,Turkey,0.0,0.0,TR,TUR
4,Cairo,Egypt,,,EG,EGY


In [11]:
# Build one GeoDataFrame per candidate coordinate arrangement, in case the
# latitude/longitude values were entered with the wrong sign or swapped.
# The combinations are:
#   (lat, lng), (lat, -lng), (-lat, lng), (-lat, -lng),
#   (lng, lat), (lng, -lat), (-lng, lat), (-lng, -lat)
# Note: POINT geometry is formatted as (lng, lat)
coords_gdf_list = flip_coords(
    data=cc_df,
    lat_col='Latitude',
    lng_col='Longitude',
    prj=crs,
)
len(coords_gdf_list)

8

In [12]:
# The first GeoDataFrame in the list carries the original (unmodified) coordinates.
# Note from the preview: rows with null coordinates show up as 0.0 / POINT (0 0).
orig_gdf = coords_gdf_list[0]
orig_gdf.head()

Unnamed: 0,City,Country,Latitude,Longitude,ISO2,ISO3,Flipped_Lat,Flipped_Lng,Type,geometry
0,Lagos,Nigeria,3.78,6.52,NG,NGA,3.78,6.52,Original,POINT (6.52 3.78)
1,Istanbul,Turkey,0.0,0.0,TR,TUR,0.0,0.0,Original,POINT (0 0)
2,Moscow,Russia,55.76,37.62,RU,RUS,55.76,37.62,Original,POINT (37.62 55.76)
3,Tokyo,Japan,35.69,139.7,JP,JPN,35.69,139.7,Original,POINT (139.7 35.69)
4,Cairo,Egypt,,,EG,EGY,0.0,0.0,Original,POINT (0 0)


In [13]:
# Inverted combination: latitude and longitude swapped (compare Flipped_Lat/Flipped_Lng
# against the original Latitude/Longitude columns in the preview)
coords_gdf_list[4].head()

Unnamed: 0,City,Country,Latitude,Longitude,ISO2,ISO3,Flipped_Lat,Flipped_Lng,Type,geometry
0,Lagos,Nigeria,3.78,6.52,NG,NGA,6.52,3.78,Flipped,POINT (3.78 6.52)
1,Istanbul,Turkey,0.0,0.0,TR,TUR,0.0,0.0,Flipped,POINT (0 0)
2,Moscow,Russia,55.76,37.62,RU,RUS,37.62,55.76,Flipped,POINT (55.76 37.62)
3,Tokyo,Japan,35.69,139.7,JP,JPN,139.7,35.69,Flipped,POINT (35.69 139.7)
4,Cairo,Egypt,,,EG,EGY,0.0,0.0,Flipped,POINT (0 0)


In [14]:
# A single GeoDataFrame can also be checked on its own: here only the original
# coordinates are compared against the country shapes
res = check_data_geom(
    eval_col='City',
    iso2_col='ISO2',
    all_geodata=orig_gdf,
    shapedata=shape_gdf,
    shape_geom_col='geometry',
    shape_iso2_col='ISO2',
)

# res[0] holds the correct entries
res[0]

Unnamed: 0,City,Country,Latitude,Longitude,ISO2,ISO3,Flipped_Lat,Flipped_Lng,Type,geometry
0,Tokyo,Japan,35.69,139.7,JP,JPN,35.69,139.7,Original,POINT (139.7 35.69)
1,Moscow,Russia,55.76,37.62,RU,RUS,55.76,37.62,Original,POINT (37.62 55.76)


In [15]:
# Incorrect entries: the second element of the check result
res[1]

Unnamed: 0,City,Country,Latitude,Longitude,ISO2,ISO3,Flipped_Lat,Flipped_Lng,Type,geometry
0,Lagos,Nigeria,3.78,6.52,NG,NGA,3.78,6.52,Original,POINT (6.52 3.78)
1,Istanbul,Turkey,0.0,0.0,TR,TUR,0.0,0.0,Original,POINT (0 0)
4,Cairo,Egypt,,,EG,EGY,0.0,0.0,Original,POINT (0 0)
5,London,United Kingdom,51.51,13.13,GB,GBR,51.51,13.13,Original,POINT (13.13 51.51)
6,Rio de Janeiro,Brazil,22.91,43.21,BR,BRA,22.91,43.21,Original,POINT (43.21 22.91)
7,Johannesburg,South Africa,26.2,28.05,ZA,ZAF,26.2,28.05,Original,POINT (28.05 26.2)


In [16]:
# To check every coordinate combination, pass the entire list of GeoDataFrames
# to check_data_geom.
# Note: the same entry can be returned more than once. The check is done at the
#       country level, so when several combinations fall within the country
#       borders the user decides which one to save.
res = check_data_geom(
    eval_col='City',
    iso2_col='ISO2',
    all_geodata=coords_gdf_list,
    shapedata=shape_gdf,
    shape_geom_col='geometry',
    shape_iso2_col='ISO2',
)
res[0]

Unnamed: 0,City,Country,Latitude,Longitude,ISO2,ISO3,Flipped_Lat,Flipped_Lng,Type,geometry
0,Tokyo,Japan,35.69,139.7,JP,JPN,35.69,139.7,Original,POINT (139.7 35.69)
1,Moscow,Russia,55.76,37.62,RU,RUS,55.76,37.62,Original,POINT (37.62 55.76)
2,Rio de Janeiro,Brazil,22.91,43.21,BR,BRA,-22.91,-43.21,Flipped,POINT (-43.21 -22.91)
3,Lagos,Nigeria,3.78,6.52,NG,NGA,6.52,3.78,Flipped,POINT (3.78 6.52)
4,Johannesburg,South Africa,26.2,28.05,ZA,ZAF,-28.05,26.2,Flipped,POINT (26.2 -28.05)
5,Johannesburg,South Africa,26.2,28.05,ZA,ZAF,-26.2,28.05,Flipped,POINT (28.05 -26.2)


In [17]:
# Entries still flagged as incorrect after the check over all combinations
# (shown with their original coordinates)
res[1]

Unnamed: 0,City,Country,Latitude,Longitude,ISO2,ISO3,Flipped_Lat,Flipped_Lng,Type,geometry
1,Istanbul,Turkey,0.0,0.0,TR,TUR,0.0,0.0,Original,POINT (0 0)
4,Cairo,Egypt,,,EG,EGY,0.0,0.0,Original,POINT (0 0)
5,London,United Kingdom,51.51,13.13,GB,GBR,51.51,13.13,Original,POINT (13.13 51.51)


In [19]:
# Geocode coordinates for the entries that are still incorrect or null
unresolved = res[1]
geocoded_res = geocode_coordinates(data=unresolved, loc_col='City', ctry_col='Country')

# geocoded_res[0]: entries that were able to be geocoded.
# Note from the output: the results are in Geocoded_Lat/Geocoded_Lng, while the
# `geometry` column still shows the earlier point.
geocoded_res[0]

Unnamed: 0,City,Country,Flipped_Lat,Flipped_Lng,Geocoded_Adr,Geocoded_Lat,Geocoded_Lng,ISO2,ISO3,Latitude,Longitude,Type,geometry
0,Istanbul,Turkey,0.0,0.0,"Istanbul, 34126, Istanbul, Turkey",41.009633,28.965165,TR,TUR,0.0,0.0,Geocoded,POINT (0 0)
1,Cairo,Egypt,0.0,0.0,"Cairo, Cairo Governorate, Egypt",30.048819,31.243666,EG,EGY,,,Geocoded,POINT (0 0)
2,London,United Kingdom,51.51,13.13,"London, SW1A 2DU, London, England, United Kingdom",51.507322,-0.127647,GB,GBR,51.51,13.13,Geocoded,POINT (13.13 51.51)
