In [2]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

rawData = pd.read_xml('raw_data_waste_atlas.xml')

In [3]:
rawData.head()

Unnamed: 0,nr,number,city_number,location_number,name,address,lat,lng,category
0,m0,1,0,0,SINGAPORE,<table class='table table-hover content-mid' c...,1.352083,103.819836,country
1,m1,2,0,0,"HONG KONG SAR, CHINA",<table class='table table-hover content-mid' c...,22.396428,114.109497,country
2,m2,40,0,0,"MACAO SAR, CHINA",<table class='table table-hover content-mid' c...,22.198745,113.543873,country
3,m3,3,0,0,KOREA REP.,<table class='table table-hover content-mid' c...,35.907757,127.766922,country
4,m4,4,0,0,BANGLADESH,<table class='table table-hover content-mid' c...,23.684994,90.356331,country


In [23]:
rawData['category'].value_counts()

city                    1799
Sanitary Landfills      1626
WtE                      716
country                  164
MBT                      130
Dumpsites                 93
Biological Treatment      78
Name: category, dtype: int64

In [104]:
# slice to only markers indicating landfills and dumpsites

slicedDf = rawData.loc[rawData['category'].isin(['Sanitary Landfills', 'Dumpsites'])]

slicedDf.head()

Unnamed: 0,nr,number,city_number,location_number,name,address,lat,lng,category
1963,m1963,13,772,3,BORDO PONIENTE,<table class='table table-hover content-mid' ...,19.452996,-99.016299,Dumpsites
1964,m1964,86,1943,10,CLUJ NAPOCA,<table class='table table-hover content-mid' ...,46.76510351,23.687274,Dumpsites
1965,m1965,16,56,4,JARDIM GRAMACHO,<table class='table table-hover content-mid' ...,-22.7489058,-43.258978,Dumpsites
1966,m1966,92,14,5,PAYATAS,<table class='table table-hover content-mid' ...,14.714258252,121.106758,Dumpsites
1967,m1967,110,13,1737,DANDORA MUNICIPAL DUMPING SITE,<table class='table table-hover content-mid' ...,-1.2492656896,36.895381,Dumpsites


In [113]:
# function for getting country from a lat/long coordinate (using geopy package)

def get_country_from_latLong(lat, long, geolocatorObj):
    location = geolocatorObj.reverse(str(lat) + ',' + str(long), language='en')
    
    try:
        countryStr = location.raw['address']['country']
    except Exception:
        countryStr = np.nan
        
    return countryStr

In [114]:
# get country of each data point
geolocatorObj = Nominatim(user_agent="geoapiExercises")

# note: .iloc[1592] (York County Landfill) seems to have an error in lat/long value (the coordinates plot to middle of the ocean)

slicedDf['country'] = slicedDf[['lat', 'lng']].apply(lambda x: get_country_from_latLong(x['lat'], x['lng'], geolocatorObj), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slicedDf['country'] = slicedDf[['lat', 'lng']].apply(lambda x: get_country_from_latLong(x['lat'], x['lng'], geolocatorObj), axis=1)


In [129]:
# which countries are most represented?

slicedDf['country'].value_counts()[0:20]

United States     1212
Greece              47
New Zealand         47
Algeria             38
Australia           35
United Kingdom      25
Argentina           24
Serbia              21
Mexico              19
South Africa        18
Portugal            18
Brazil              17
Tunisia             16
Latvia              12
India               10
Chile               10
Peru                10
Nigeria             10
Morocco              9
Ukraine              8
Name: country, dtype: int64

In [125]:
# remove landfills in the US

removedUSdf = slicedDf.loc[slicedDf['country'] != 'United States']
removedUSdf.shape

(507, 10)

In [127]:
# save processed data to csv file

exportDf = removedUSdf[['name', 'country', 'category', 'lat', 'lng']]
exportDf.index = np.arange(len(exportDf))
exportDf.head()

exportDf.to_csv('waste_atlas_latLongs.csv')