<img src="https://i.imgur.com/6U6q5jQ.png"/>


# Mining your GeoDataFrame

Let's bring the map we created last time:

In [None]:
from  fiona import listlayers

# remote GeoPackage that stores the Brazil layers used below
brazilMaps = (
    'https://github.com/SocialAnalytics-StrategicIntelligence/codes/raw/main/maps/brazilMaps_5641.gpkg'
)

# names of the layers available in that file
listlayers(brazilMaps)

In [None]:
# reading in the data:
import os
import geopandas as gpd

# one GeoDataFrame per layer, read in the same order as the names on the left
states, municipalities, airports, rivers, border = (
    gpd.read_file(brazilMaps, layer=layerName)
    for layerName in ('states', 'municipalities', 'airports', 'rivers', 'border')
)

Now, we are going to add more data. In this [link](https://msi.nga.mil/Publications/WPI) we find the World Port Index (Pub 150), which contains a variety of data on major ports and terminals worldwide. Download the **UpdatedPub150.csv** file into the **data** folder of your GitHub repository and read it from there:

In [None]:
import pandas as pd 


# World Port Index table, read straight from the course repository
infoseaports = pd.read_csv(
    "https://github.com/SocialAnalytics-StrategicIntelligence/codes/raw/main/data/UpdatedPub150.csv"
)

# every column name as a plain list (there are many)
infoseaports.columns.to_list()

Let's do some preprocessing:

In [None]:
# shorter name for the port-name column
infoseaports = infoseaports.rename(columns={'Main Port Name': 'portName'})
# keep only the identifying columns and the coordinates
infoseaports = infoseaports.loc[
    :, ['portName', 'Country Code', 'Latitude', 'Longitude']
]

# structure of the reduced table
infoseaports.info()

In [None]:
# first rows of the reduced ports table (name, country, latitude, longitude)
infoseaports.head()

It looks ready to become a spatial object (GDF of points):

In [None]:
# point GeoDataFrame from the coordinates (x=Longitude, y=Latitude);
# crs=4326 means the points are still unprojected
seaports = gpd.GeoDataFrame(
    data=infoseaports.copy(),
    geometry=gpd.points_from_xy(x=infoseaports.Longitude, y=infoseaports.Latitude),
    crs=4326,
)

# Brazilian ports only, as an independent copy with rows renumbered from zero
seaports_bra = (
    seaports.loc[seaports['Country Code'] == 'Brazil']
    .copy()
    .reset_index(drop=True)
)

# same ports in the projected CRS (EPSG:5641) used for the distance work below
seaports_bra_5641 = seaports_bra.to_crs(5641)

Let me plot seaports along with the airports (only large ones) we have:

In [None]:
# large airports only, renumbered from zero
# (bracket access is required: "airports.type" can't be used)
largeAirports = (
    airports.loc[airports['type'] == 'large_airport']
    .reset_index(drop=True)
)

# airports as red triangles, seaports drawn on the same axes
base = largeAirports.plot(color='red', marker="^")
seaports_bra_5641.plot(ax=base, alpha=0.5, markersize=3)

# Distance between spatial objects

## Distance between points

The easiest way to understand distance is to compute how far two coordinates are from each other.

You have the seaports:

In [None]:
# first rows of the reprojected Brazilian seaports
seaports_bra_5641.head()

..and the large airports:

In [None]:
# first rows of the large airports subset
largeAirports.head()

If both GDFs have the same projected CRS, we can use the **distance** function. In this case, just two selected points:

In [None]:
# distance between 'Guarulhos' and 'Dtse / Gegua Oil Terminal' in km
# (first row of each GDF; distance() is in CRS units, so /1000 assumes metres — confirm for EPSG:5641)
largeAirports.iloc[0].geometry.distance(seaports_bra_5641.iloc[0].geometry)/1000

What about computing all possible distances between those GDFs?

In [None]:
#try 1: default
# one row per seaport, one column per large airport (default integer labels)
seaports_bra_5641.geometry.apply(
    lambda portGeom: largeAirports.geometry.distance(portGeom) / 1000
)

In [None]:
# try 2: see names (change indexes)
# rows are labelled by port name, columns by airport name
seaports_bra_5641.set_index('portName').geometry.apply(
    lambda portGeom: largeAirports.set_index('name').geometry.distance(portGeom) / 1000
)

In [None]:
#try 3: reorder previous output
# same matrix, with rows and then columns sorted by label
(
    seaports_bra_5641.set_index('portName')
    .geometry
    .apply(lambda portGeom: largeAirports.set_index('name').geometry.distance(portGeom) / 1000)
    .sort_index(axis=0)
    .sort_index(axis=1)
)

Let's keep the last one:

In [None]:
# seaport (rows) x large airport (columns) distance matrix in km, labels sorted
distanceMatrixKM_sea_air = (
    seaports_bra_5641.set_index('portName')
    .geometry
    .apply(lambda portGeom: largeAirports.set_index('name').geometry.distance(portGeom) / 1000)
    .sort_index(axis=0)
    .sort_index(axis=1)
)

This is a data frame (pandas), and the names of the facilities are the row and column indexes. That makes it useful in this way:

In [None]:
# the mean distance from a seaport to all the large airports (sorted)
# averaging across the columns gives one value per seaport; what would axis=0 give?
distanceMatrixKM_sea_air.mean(axis="columns").sort_values()

Let's compute more stats:

In [None]:
# per-seaport summary of the distances to the large airports
SomeStats = pd.DataFrame({
    'mean': distanceMatrixKM_sea_air.mean(axis=1),
    'min': distanceMatrixKM_sea_air.min(axis=1),
    'max': distanceMatrixKM_sea_air.max(axis=1),
})

# see some
SomeStats.head(10)

We can also use **idxmax** to get the particular locations:

In [None]:
# farthest airport to each seaport (column label of the row-wise maximum)
distanceMatrixKM_sea_air.idxmax(axis="columns")

In [None]:
# farthest seaport to each airport (row label of the column-wise maximum)
distanceMatrixKM_sea_air.idxmax(axis="index")

In [None]:
# closest airport to each seaport (column label of the row-wise minimum)
distanceMatrixKM_sea_air.idxmin(axis="columns")

In [None]:
# closest seaport to each airport (row label of the column-wise minimum)
distanceMatrixKM_sea_air.idxmin(axis="index")

## Distance between line and point

Let's take a look at the rivers we have:

In [None]:
# the rivers layer read from the GeoPackage
rivers

In [None]:
#keep one:
# rows whose NAME contains the text 'Grande'

rivers[rivers.NAME.str.contains('Grande')]

You can see that distance works between these two elements:

In [None]:
# distance from each airport to Rio Grande
# (geometry of the first river whose NAME contains 'Grande', against every airport point)
rivers.loc[rivers.NAME.str.contains('Grande')].iloc[0].geometry.distance(
    largeAirports.set_index('name').geometry
) / 1000

Based on what we did previously, let's compute all the distances:

In [None]:
# river (rows) x large airport (columns) distance matrix in km, labels sorted
distanceMatrixKM_riv_air = (
    rivers.set_index('NAME')
    .geometry
    .apply(lambda riverGeom: largeAirports.set_index('name').geometry.distance(riverGeom) / 1000)
    .sort_index(axis=0)
    .sort_index(axis=1)
)
distanceMatrixKM_riv_air

Here, we see one row (river), that tells the distance to each column (large airport):

In [None]:
# one river's row of the matrix, airports ordered from nearest to farthest
distanceMatrixKM_riv_air.loc['Rio Grande, South America'].sort_values()

Let's try a simple plot of the river and the airports:

In [None]:
# interactive map: airports as red markers, then the 'Grande' river(s) on top
base = largeAirports.explore(color='red', marker_kwds={'radius': 10})
rivers.loc[rivers.NAME.str.contains('Grande')].explore(m=base)

Now, let's focus on the rivers that belong to a system:

In [None]:
# rivers that do have a SYSTEM value
rivers[rivers.SYSTEM.notna()]

Let's dissolve the ones that belong to a system into a multiline:

In [None]:
# merge the river geometries that share a SYSTEM value into one row per system
systems=rivers.dissolve(by='SYSTEM')
systems

Let's do some basic formatting:

In [None]:
# format the GDF:
# SYSTEM goes back to being a regular column, and NAME is dropped
systems = systems.reset_index(drop=False).drop(columns='NAME')

# we have
systems

Another distance matrix:

In [None]:
# river system (rows) x large airport (columns) distance matrix in km, labels sorted
distanceMatrixKM_sys_air = (
    systems.set_index('SYSTEM')
    .geometry
    .apply(lambda systemGeom: largeAirports.set_index('name').geometry.distance(systemGeom) / 1000)
    .sort_index(axis=0)
    .sort_index(axis=1)
)

distanceMatrixKM_sys_air

This time, let me get all the minimum distances:

In [None]:
# name of the closest large airport for every river system
mins = distanceMatrixKM_sys_air.idxmin(axis=1)  # same as axis="columns"
mins

In [None]:
# one of them: the closest airport of the second system (position 1)
mins.iloc[1]

Let's see now:

In [None]:
# river systems as the base layer
base = systems.explore()
# the closest
largeAirports[largeAirports.name.isin(mins)].explore(
    m=base, color='red', marker_kwds={'radius': 10}
)
# NOT the closest
largeAirports[~largeAirports.name.isin(mins)].explore(
    m=base, color='blue', marker_kwds={'radius': 5}
)


## Polygon to point

Let me create some **convex hull**s (polygons):

In [None]:
# polygon for each system: the convex hull of its (multi)line geometry
systems.convex_hull

In [None]:
# see them: quick static plot of the hull polygons
systems.convex_hull.plot()

Now, a GDF for the hulls:

In [None]:
# Build the GeoDataFrame directly from the hull GeoSeries: the geometry column
# is named 'geometry' and the CRS is taken from the hulls themselves, so there
# is no need to rename an integer column or to override an already-set CRS
# (assigning to .crs on data that has a CRS is deprecated in GeoPandas).
systems_hulls=gpd.GeoDataFrame(geometry=systems.convex_hull)
# labels are given in the row order of `systems`
# NOTE(review): assumes that order is Amazon, Parana — confirm against the dissolve output
systems_hulls['system']=['Amazon', 'Parana']
systems_hulls

Next, the distance matrix:

In [None]:

# hull polygon (rows) x large airport (columns) distance matrix in km, labels sorted
distanceMatrixKM_sysHull_air = (
    systems_hulls.set_index('system')
    .geometry
    .apply(lambda hullGeom: largeAirports.set_index('name').geometry.distance(hullGeom) / 1000)
    .sort_index(axis=0)
    .sort_index(axis=1)
)

distanceMatrixKM_sysHull_air

All the minimal distances:

In [None]:
# name of the closest large airport for every hull (reuses the name `mins`)
mins = distanceMatrixKM_sysHull_air.idxmin(axis=1)
mins

In [None]:
# plotting
# hulls as the base layer; closest airports in red, all the others in blue
base = systems_hulls.explore()
largeAirports[largeAirports.name.isin(mins)].explore(
    m=base, color='red', marker_kwds={'radius': 10}
)
largeAirports[~largeAirports.name.isin(mins)].explore(
    m=base, color='blue', marker_kwds={'radius': 5}
)

## Distances using _Buffers_

A very important case in distance analysis is the use of buffers:

In [None]:
# remember: the river x large-airport distance matrix (km) computed earlier
distanceMatrixKM_riv_air

In [None]:
# getting a value (it can be any value)
# here: the smallest distance (km) in the 'Amazon' row of the matrix
distanceMatrixKM_riv_air.loc['Amazon'].min()

We can use any value to create a buffer:

In [None]:
# the matrix is in km, and the buffer distance is given in metres
minMts = 1000 * distanceMatrixKM_riv_air.loc['Amazon'].min()

#the buffer is a polygon:
rivers.loc[rivers.NAME == 'Amazon'].buffer(distance=minMts)

In [None]:
# see buffer:
# buffer polygon in red as the base map, the river itself in blue on top
bufferAroundAmazon = rivers.loc[rivers.NAME == 'Amazon'].buffer(distance=minMts)
bufferAsBase = bufferAroundAmazon.explore(color='red')
rivers.loc[rivers.NAME == 'Amazon'].explore(
    m=bufferAsBase, color='blue', style_kwds={'weight': 0.5}
)

Above we used the buffer (red polygon), and the river (blue). Let me add a layer of airports (small ones):

In [None]:
# small airports only
small_airports = airports.loc[airports['type'] == 'small_airport']

# plotting
# river in blue and small airports in black, both added to the buffer map
rivers.loc[rivers.NAME == 'Amazon'].explore(
    m=bufferAsBase, color='blue', style_kwds={'weight': 0.5}
)
small_airports.explore(m=bufferAsBase, color='black')

Now, we can use the buffer (polygon) to keep the airports that are at that particular distance around the river:

In [None]:

# small airports that fall inside the Amazon buffer
# (despite its name, this variable holds airports, not rivers)
riversWithinBuffer=small_airports.clip(mask=bufferAroundAmazon)
#
riversWithinBuffer

In [None]:
# fresh base map: buffer in red, river in blue, clipped airports in black
bufferAsBase = bufferAroundAmazon.explore(color='red')
rivers.loc[rivers.NAME == 'Amazon'].explore(
    m=bufferAsBase, color='blue', style_kwds={'weight': 0.5}
)
riversWithinBuffer.explore(m=bufferAsBase, color='black')

In [None]:
# minimum of all the minimum by row
# i.e. the smallest river-to-airport distance (km) in the whole matrix
distanceMatrixKM_riv_air.min(axis="columns").min()

In [None]:
# using the previous value
# five times the smallest distance, converted from km to metres
minMinMts_5 = 5 * distanceMatrixKM_riv_air.min(axis="columns").min() * 1000

# buffers of every river in red as the base map, the rivers in blue on top
allMinBuffer = rivers.buffer(distance=minMinMts_5).explore(color='red')
rivers.explore(m=allMinBuffer, color='blue', style_kwds={'weight': 0.5})

In [None]:
# you see all the buffer polygons: one per river, using the distance minMinMts_5
rivers.buffer(distance = minMinMts_5)

Now keep small airports in buffer:

In [None]:
# The buffer objects used from here on were never created, so build them first.
# buffers around every river (GeoSeries), using the distance computed above
riversAll_buf=rivers.buffer(distance = minMinMts_5)
# the same buffers as a GeoDataFrame, used by the plotting cells that follow
riversAll_bufDF=gpd.GeoDataFrame(geometry=riversAll_buf)

# small airports that fall inside any of the river buffers
allRiversWithinBuffs=small_airports.clip(riversAll_buf)
allRiversWithinBuffs

In [None]:
# simple
# static plot: river buffers in yellow, the clipped small airports in green on the same axes
base=riversAll_bufDF.plot(color='yellow')
allRiversWithinBuffs.plot(ax=base, color='green', markersize=1)

In [None]:
# folium
# interactive version of the previous plot: buffers in yellow, clipped airports in green

base=riversAll_bufDF.explore(color='yellow')
allRiversWithinBuffs.explore(m=base, color='green')

# Using data from spatial objects

It is time to use local indicators (social, economic, physical, etc.) at each spatial location to produce some analytics. Let's follow an approach based on the number of variables involved.

Let's use GitHub to put this [file](https://drive.google.com/file/d/1EYacndGCRiF1ZHEnGa-avTXSEtqB2e7p/view?usp=sharing) in the maps folder, and this other [file](https://docs.google.com/spreadsheets/d/1xpsz9n-SBTwgtXsugmabpBJ-tCaEwg9_/edit?usp=sharing&ouid=106935788518947165917&rtpof=true&sd=true) in the data folder. 

Let's read the data in from GitHub:

In [None]:
# data table
import pandas as pd

# indicators table; 'Ubigeo' is read as object so it is not parsed as a number
datadis = pd.read_excel(
    "https://github.com/SocialAnalytics-StrategicIntelligence/codes/raw/main/data/dataPeru_indicadores.xlsx",
    dtype={'Ubigeo': object},
)
datadis.head()

In [None]:
# map
import geopandas as gpd
# district polygons, read from the zipped map in the course repository
datadismap = gpd.read_file(
    "https://github.com/SocialAnalytics-StrategicIntelligence/codes/raw/main/maps/DistritosMap.zip"
)

datadismap.head()

## Preprocessing

After observing both tables, it would be better if the columns with names have the same capitalization, and no extra blank spaces:

In [None]:
def capitalizeColumns(x):
    """Return the string column *x* in upper case, without surrounding blanks."""
    return x.str.upper().str.strip()


# same treatment for the name columns of the table and of the map
datadis[['Provincia', 'Distrito']] = datadis[['Provincia', 'Distrito']].apply(capitalizeColumns)
datadismap[['PROVINCIA', 'DISTRITO']] = datadismap[['PROVINCIA', 'DISTRITO']].apply(capitalizeColumns)

The names from non-english speaking countries may come with some symbols that may cause trouble (', ~). Let's get rid of those:

In [None]:
import unidecode


def byePunctuation(x):
    """Return *x* transliterated to plain ASCII (accents and tildes removed)."""
    return unidecode.unidecode(x)


# applied cell by cell to the name columns of the table and of the map
datadis[['Provincia', 'Distrito']] = datadis[['Provincia', 'Distrito']].map(byePunctuation)
datadismap[['PROVINCIA', 'DISTRITO']] = datadismap[['PROVINCIA', 'DISTRITO']].map(byePunctuation)

Let me see how many districts we have:

In [None]:
# number of rows in the table and in the map
len(datadis.Distrito),len(datadismap.DISTRITO)

Are the names of the districts unique?

In [None]:
# how many district names repeat an earlier one, in the table and in the map
datadis.Distrito.duplicated().sum(),datadismap.DISTRITO.duplicated().sum()

The presence of duplicates forces us to create a column of unique values:

In [None]:
# concatenating
# key = province + "+" + district (the loop variable is no longer called `pd`,
# which read like the pandas alias)
datadis['provDist'] = [
    "+".join(provAndDist)
    for provAndDist in zip(datadis.Provincia, datadis.Distrito)
]
datadismap['provDist'] = [
    "+".join(provAndDist)
    for provAndDist in zip(datadismap.PROVINCIA, datadismap.DISTRITO)
]

In [None]:
# the new column looks like this: PROVINCE+DISTRICT
datadis['provDist'].head()

It would be good to make sure no *ghost* characters (dashes, underscores, extra blanks) appear between words:

In [None]:
# replacing any run of dashes, underscores and/or whitespace by a single space
# - raw string: "\s" (like "\-" and "\_") is an invalid escape in a normal string literal
# - one character class with "+": a sequence such as " - " collapses to ONE space,
#   instead of every dash/underscore being turned into its own extra space
datadis['provDist']=datadis.provDist.str.replace(r"[\-_\s]+"," ",regex=True)
datadismap['provDist']=datadismap.provDist.str.replace(r"[\-_\s]+"," ",regex=True)

## Merging

Let's find out what is NOT matched between the  tables:

In [None]:
# keys present in the table but missing from the map
nomatch_df = set(datadis.provDist).difference(datadismap.provDist)
# keys present in the map but missing from the table
nomatch_gdf = set(datadismap.provDist).difference(datadis.provDist)

This is what could not be matched:

In [None]:
# number of unmatched keys on each side (table, map)
len(nomatch_df), len(nomatch_gdf)

Let's try renaming the districts using **fuzzy merging**:

In [None]:
# pick the closest match from nomatch_gdf for a value in nomatch_df
from thefuzz import process
# (unmatched table key, best candidate as returned by extractOne), in alphabetical order
[
    (unmatched, process.extractOne(unmatched, nomatch_gdf))
    for unmatched in sorted(nomatch_df)
]

If you are comfortable, you prepare a _dictionary_ of changes:

In [None]:
# is this OK?
# table key -> matched map key (first element of what extractOne returns)
{
    unmatched: process.extractOne(unmatched, nomatch_gdf)[0]
    for unmatched in sorted(nomatch_df)
}

In [None]:
# then:
# keep that mapping (table key -> map key) for the replacement step
changesDis_df = {
    unmatched: process.extractOne(unmatched, nomatch_gdf)[0]
    for unmatched in sorted(nomatch_df)
}

Now, make the replacements:

In [None]:
# Assign the result back to the column. Calling replace(inplace=True) on
# `datadis.provDist` acts on an intermediate object: pandas warns about this
# chained in-place pattern, and it does not update `datadis` under Copy-on-Write.
datadis['provDist']=datadis['provDist'].replace(changesDis_df)

Now the merge can happen:

In [None]:
# the map goes on the left so the result keeps the geometry;
# only keys present on both sides survive (inner join)
datadisMap = datadismap.merge(datadis, how='inner', on='provDist')
# check
datadisMap.info()

In [None]:
# columns that are no longer needed after the merge
bye = ['Departamento', 'Provincia', 'Distrito', 'INSTITUCIO', 'provDist']
datadisMap = datadisMap.drop(columns=bye)

# keeping
datadisMap.head()

We can save this gdf:

In [None]:
# make sure the output folder exists before writing into it
os.makedirs('maps', exist_ok=True)
# merged map + indicators, saved as layer 'distritos' of a GeoPackage
datadisMap.to_file(os.path.join('maps',"distMapDatPeru.gpkg"), layer='distritos', driver="GPKG")

## Neighborhood

We can compute the neighborhood in a map using different algorithms:

In [None]:
from libpysal.weights import Queen, Rook, KNN

# rook: contiguity weights built from the polygons of the merged map
w_rook = Rook.from_dataframe(datadisMap) 

In [None]:
# queen: contiguity weights built from the polygons of the merged map
w_queen = Queen.from_dataframe(datadisMap)

In [None]:
# k nearest neighbors: every district gets exactly k=4 neighbors
w_knn = KNN.from_dataframe(datadisMap, k=4)

Let's understand the differences:

In [None]:
# first district in the GDF: (position 0, the one inspected in the next cells)
datadisMap.head(1)

In [None]:
# amount of neighbors of that district (rook weights, key 0)
len(w_rook.neighbors[0])

In [None]:
# details
# rows of the rook neighbors of district 0, selected by position
datadisMap.iloc[w_rook.neighbors[0]]

In [None]:
# see the neighbor
# the rook neighbors of district 0, filled in yellow
datadisMap.iloc[w_rook.neighbors[0]].plot(facecolor="yellow")

In [None]:
# see whole area
# TACNA province as background, rook neighbors in yellow, district 0 in red
base = datadisMap.loc[datadisMap.PROVINCIA == "TACNA"].plot()
datadisMap.iloc[w_rook.neighbors[0]].plot(ax=base, facecolor="yellow", edgecolor='k')
datadisMap.head(1).plot(ax=base, facecolor="red")

Let's do the same with queen neighbors:

In [None]:
# how many neighbors district 0 has under the queen weights
len(w_queen.neighbors[0])

In [None]:
# details
# rows of the queen neighbors of district 0, selected by position
datadisMap.iloc[w_queen.neighbors[0]]

In [None]:
# see
# the queen neighbors of district 0, filled in yellow
datadisMap.iloc[w_queen.neighbors[0]].plot(facecolor="yellow")

In [None]:
# whole area
# TACNA province as background, queen neighbors in yellow, district 0 in red
base = datadisMap.loc[datadisMap.PROVINCIA == "TACNA"].plot()
datadisMap.iloc[w_queen.neighbors[0]].plot(ax=base, facecolor="yellow", edgecolor='k')
datadisMap.head(1).plot(ax=base, facecolor="red")

In [None]:
# the 4 nearest neighbors of district 0
w_knn.neighbors[0]

In [None]:
# TACNA province as background, the k nearest neighbors in yellow, district 0 in red
base = datadisMap.loc[datadisMap.PROVINCIA == "TACNA"].plot()
datadisMap.iloc[w_knn.neighbors[0]].plot(ax=base, facecolor="yellow")
datadisMap.head(1).plot(ax=base, facecolor="red")

## Spatial correlation

We need the neighborhood matrix (the weight matrix) to compute spatial correlation: whether the value of the variable in one location is correlated with the values of its neighbors - which would indicate a spatial effect.

In [None]:
# dense weight matrix as a data frame (w_knn.full() is unpacked into the constructor)
pd.DataFrame(*w_knn.full()) # 1 means both are neighbors

In [None]:
# needed for spatial correlation
# 'R' asks libpysal to row-standardise the weights
w_knn.transform = 'R'

In [None]:
# same matrix after the 'R' transform: neighbors no longer show as 1
# NOTE(review): with k=4 each neighbor weight should be 1/4 — confirm in the output
pd.DataFrame(*w_knn.full())

Spatial correlation is measured by the Moran's I statistic:

In [None]:
from esda.moran import Moran

# global Moran's I of IDH2019 under the kNN weights
moranIDH = Moran(y=datadisMap['IDH2019'], w=w_knn)
# the statistic and its simulation-based p-value
moranIDH.I, moranIDH.p_sim

A significant Moran's I suggests spatial correlation. Let's see the spatial scatter plot:

In [None]:
from splot.esda import moran_scatterplot

# Moran scatter plot for the global statistic
fig, ax = moran_scatterplot(moranIDH, aspect_equal=True)
ax.set_xlabel('IDH_std')
# the trailing semicolon keeps the notebook from echoing the returned label object
ax.set_ylabel('SpatialLag_IDH_std');

### Local Spatial Correlation

We can compute a LISA (local Moran) for each case. That will help us find spatial clusters (spots) and spatial outliers:

* A **hotSpot** is a polygon whose value in the variable is high AND is surrounded with polygons with also high values.

* A **coldSpot** is a polygon whose value in the variable is low AND is surrounded with polygons with also low values.

* A **coldOutlier** is a polygon whose value in the variable is low BUT is surrounded with polygons with  high values.

* A **hotOutlier** is a polygon whose value in the variable is high BUT is surrounded with polygons with  low values.

It is also possible that no significant correlation is detected. Let's see those values:

In [None]:
# The scatterplot with local info
from esda.moran import Moran_Local

# calculate Moran_Local and plot
# seed=2022 fixes the random permutations so the result can be reproduced
lisaIDH = Moran_Local(y=datadisMap['IDH2019'], w=w_knn,seed=2022)
# p=0.05 is the significance threshold passed to the scatter plot
fig, ax = moran_scatterplot(lisaIDH,p=0.05)
ax.set_xlabel('IDH_std')
ax.set_ylabel('SpatialLag_IDH_std');
#plt.show()

In [None]:
# the map with the spots and outliers
import matplotlib.pyplot as plt

from splot.esda import lisa_cluster
# one large figure for the cluster map
f, ax = plt.subplots(1, figsize=(12, 12))
plt.title('Spots and Outliers')
# LISA cluster map drawn on that axes, with the legend moved inside the figure
fig = lisa_cluster(
    lisaIDH,
    datadisMap,
    ax=ax,
    legend_kwds=dict(loc='center left', bbox_to_anchor=(0.7, 0.6)),
)
plt.show()

Let me add that data to my gdf:

In [None]:
# quadrant of the Moran scatter plot for every district
lisaIDH.q

In [None]:
# significance: simulation-based p-value for every district
lisaIDH.p_sim

In [None]:
# quadrant: 1 HH,  2 LH,  3 LL,  4 HL
# number of districts per quadrant, before looking at significance
pd.Series(lisaIDH.q).value_counts()

The info in **lisaIDH.q** can not be used right away, we need to add if the local spatial correlation is significant:

In [None]:
# keep the quadrant only where the local statistic is significant (p < 0.05); 0 otherwise
datadisMap['IDH_quadrant'] = [
    quadrant if pValue < 0.05 else 0
    for quadrant, pValue in zip(lisaIDH.q, lisaIDH.p_sim)
]
datadisMap['IDH_quadrant'].value_counts()

Now, we recode:

In [None]:
# position in this list = quadrant code stored in 'IDH_quadrant'
labels = ['0 no_sig', '1 hotSpot', '2 coldOutlier', '3 coldSpot', '4 hotOutlier']

datadisMap['IDH_quadrant_names'] = [
    labels[quadrantCode] for quadrantCode in datadisMap['IDH_quadrant']
]

datadisMap['IDH_quadrant_names'].value_counts()

In [None]:
from matplotlib import colors
# one colour per label
# NOTE(review): presumably matched to the labels in sorted order, so a label
# that never occurs would shift the colours — confirm
myColMap = colors.ListedColormap(['white', 'pink', 'cyan', 'azure', 'red'])

# Set up figure and ax
f, ax = plt.subplots(1, figsize=(12, 12))

plt.title('Spots and Outliers')

# Plot unique values choropleth including
# a legend and thin black boundary lines
datadisMap.plot(
    column='IDH_quadrant_names',
    categorical=True,
    cmap=myColMap,
    linewidth=0.1,
    edgecolor='k',
    legend=True,
    legend_kwds=dict(loc='center left', bbox_to_anchor=(0.7, 0.6)),
    ax=ax,
)
# Remove axis
ax.set_axis_off()
# Display the map
plt.show()

In [None]:
# final update
# writes the GDF, now with the quadrant columns, to the same file and layer as before
datadisMap.to_file(os.path.join('maps',"distMapDatPeru.gpkg"), layer='distritos', driver="GPKG")