<img src="https://i.imgur.com/6U6q5jQ.png"/>

_____

# Analytics on GeodataFrames
 

Let's read the data in:

In [None]:
# data table
import pandas as pd
# location of the pickled dengue table (read straight from GitHub)
linkData="https://github.com/SocialAnalytics-StrategicIntelligence/OrganizeExploreAndQuery/raw/main/dataFiles/dengue_ok.pkl"
# NOTE(review): unpickling can execute arbitrary code; only read pickles from a source you trust
dengue = pd.read_pickle(linkData)
# columns, dtypes and non-null counts
dengue.info()

In [None]:
# check: preview the first rows
dengue.head()

In [None]:
# number of records for each value of 'ano'
dengue['ano'].value_counts()

Let's subset:

In [None]:
# keep only the records from 2012 onwards
fromYear2012=dengue['ano']>=2012
dengue=dengue[fromYear2012]

We have dengue by level:

In [None]:
# number of records for each dengue level
dengue['enfermedad'].value_counts()

Keeping some:

In [None]:
# drop the 'GRAVE' level and keep the remaining ones
notGrave=dengue['enfermedad']!='GRAVE'
dengue_alarma=dengue[notGrave]

# preview
dengue_alarma.head()

## Reshaping to Long

People per level, by province and by year:

In [None]:
# count the records of every year / departamento / provincia / enfermedad combination
indexList=['ano','departamento','provincia','enfermedad']
aggregator={'enfermedad':[len]}
groupedCases=dengue_alarma.groupby(indexList,observed=True)
dengue_provYear=groupedCases.agg(aggregator)
dengue_provYear

Sending the counts to wide columns:

In [None]:
# move the innermost row index level ('enfermedad', level 3) to the columns;
# combinations without records get a count of 0
casesByLevel=dengue_provYear.unstack(level='enfermedad')
dengueDraft=casesByLevel.fillna(0)
dengueDraft

Computing share of dengue, level 'alarm':

In [None]:
# share = second count column / (first + second count columns), picked by POSITION
# NOTE(review): assumes column 1 holds the 'alarm' counts and column 0 the other level - TODO confirm the column order
dengueDraft['ALARMA_pct']=dengueDraft.iloc[:,1]/(dengueDraft.iloc[:,0] + dengueDraft.iloc[:,1])
# one column per year ('ano' goes to the columns); missing combinations are filled with 0
dengue_provYear_Alarm_w=dengueDraft['ALARMA_pct'].unstack('ano').fillna(0)
dengue_provYear_Alarm_w

Notice the data type:

In [None]:
# the column labels come from the values of 'ano'
dengue_provYear_Alarm_w.columns

We should have text not numbers:

In [None]:
# prefix every column label with 'year' so the names are text
yearLabels=[f'year{col}' for col in dengue_provYear_Alarm_w.columns]
dengue_provYear_Alarm_w.columns=yearLabels

In [None]:
# then: the table with the renamed columns
dengue_provYear_Alarm_w

In [None]:
# as usual: turn the row index levels into regular columns (in place)
dengue_provYear_Alarm_w.reset_index(inplace=True)
dengue_provYear_Alarm_w

Let's call a map:

In [None]:
import geopandas as gpd

# zipped map file, read straight from GitHub
mapLink='https://github.com/SocialAnalytics-StrategicIntelligence/GeoDF_Analytics/raw/main/maps/ProvsINEI2023.zip'
provmap=gpd.read_file(mapLink)

# columns, dtypes and non-null counts
provmap.info()

Let me create a column, concatenating two:

In [None]:
# key column: the values of columns 3 and 4 (by position) joined with '+'
namePairs=provmap.iloc[:,3:5].values
provmap['location']=['+'.join(pair) for pair in namePairs]
provmap.head(10)

I will do the same with the data frame:

In [None]:
# key column: the values of the first two columns (by position) joined with '+'
namePairs=dengue_provYear_Alarm_w.iloc[:,:2].values
dengue_provYear_Alarm_w['location']=['+'.join(pair) for pair in namePairs]
dengue_provYear_Alarm_w.head()

## Preprocessing



The names from non-english speaking countries may come with some symbols that may cause trouble (', ~). Let's get rid of those:

In [None]:
import unidecode

# pass every key through unidecode (plain-ASCII version of the text)
byePunctuation=unidecode.unidecode
for table in (dengue_provYear_Alarm_w, provmap):
    table['location']=table['location'].apply(byePunctuation)

It is also a good idea to make sure no *ghost* characters (dashes, underscores, extra spaces) remain between words:

In [None]:
# removing dashes, underscores and runs of whitespace from the key (replaced by nothing)
# raw string: "\-", "\_" and "\s" are invalid escape sequences in a regular string literal
noiseInKey=r"\-|\_|\s+"
dengue_provYear_Alarm_w['location']=dengue_provYear_Alarm_w.location.str.replace(noiseInKey,"",regex=True)
provmap['location']=provmap.location.str.replace(noiseInKey,"",regex=True)

## Merging

We need to merge both tables now. That can happen effectively if both tables have a **key** column: a column (or collection of them) whose values in one table are the same in the other one.

The two key columns need not hold exactly the same set of values, but only the values they have in common will be matched.

Let's find out what is NOT matched in each table:

In [None]:
# key values that appear in only one of the two tables
keys_df=set(dengue_provYear_Alarm_w.location)
keys_gdf=set(provmap.location)
nomatch_df=keys_df.difference(keys_gdf)    # in the data frame, not in the map
nomatch_gdf=keys_gdf.difference(keys_df)   # in the map, not in the data frame

This is what could not be matched:

In [None]:
# how many unmatched keys: (data frame, map)
len(nomatch_df), len(nomatch_gdf)

The right way to go is using **fuzzy merging** (remember we need the package _thefuzz_ installed):

In [None]:
# pick the closest match from nomatch_gdf for a value in nomatch_df
from thefuzz import process
# one (value, extractOne result) pair per unmatched key of the data frame, in alphabetical order
[(dis,process.extractOne(dis,nomatch_gdf)) for dis in sorted(nomatch_df)]

If you are comfortable, you prepare a _dictionary_ of changes:

In [None]:
# is this OK? preview: unmatched key -> first element of its extractOne result
{dis:process.extractOne(dis,nomatch_gdf)[0] for dis in sorted(nomatch_df)}

In [None]:
# then: keep the dictionary of replacements (old key -> new key)
# NOTE(review): indexing with [0] assumes extractOne always returns a result, i.e. nomatch_gdf is not empty - TODO confirm
changesinDF={dis:process.extractOne(dis,nomatch_gdf)[0] for dis in sorted(nomatch_df)}

Now, make the replacements:

In [None]:
# apply the replacements to the 'location' column only (in place)
dengue_provYear_Alarm_w.replace({'location': changesinDF}, inplace=True)

Is it over?

In [None]:
# recompute what is still unmatched on each side after the replacements
nomatch_df=set(dengue_provYear_Alarm_w.location)- set(provmap.location)
nomatch_gdf=set(provmap.location)-set(dengue_provYear_Alarm_w.location) 

# an empty list means every key of the data frame is now in the map
[(dis,process.extractOne(dis,nomatch_gdf)) for dis in sorted(nomatch_df)]

Now the merge can happen:

In [None]:
# left merge: every row of the map is kept; 'flag' records the merge result of each row
dengue_provYear_Alarm_map=provmap.merge(dengue_provYear_Alarm_w, on='location',how='left',indicator='flag')

In [None]:
# check: columns, dtypes and non-null counts after the merge
dengue_provYear_Alarm_map.info()

In [None]:
# avoid problems with fillna(): store the merge indicator as plain text
dengue_provYear_Alarm_map['flag']=dengue_provYear_Alarm_map.flag.astype(str)

We can get rid of some columns:

In [None]:
# columns to drop (in place)
bye=['departamento', 'provincia', 'CCPP','CCDD']
dengue_provYear_Alarm_map.drop(columns=bye,inplace=True)

# keeping: preview of what remains
dengue_provYear_Alarm_map.head()

In [None]:
# filling with zeroes: every missing value left in the table becomes 0 (in place)
dengue_provYear_Alarm_map.fillna(0,inplace=True)

We can save this geoDF:

In [None]:
import os

# make sure the output folder exists before writing into it
outputFolder='maps'
os.makedirs(outputFolder, exist_ok=True)

# save the geoDF as the layer 'provinciasDengue' of a GeoPackage file
dengue_provYear_Alarm_map.to_file(os.path.join(outputFolder,"provinciasPeru.gpkg"), layer='provinciasDengue', driver="GPKG")

## Exploring one variable

This time, we explore statistically one variable in the map:

In [None]:
# statistics: numeric summary of the 'year2022' column
dengue_provYear_Alarm_map.year2022.describe()

A visual look:

In [None]:
import seaborn as sea

# horizontal boxplot of the 'year2022' column
sea.boxplot(dengue_provYear_Alarm_map.year2022, color='yellow',orient='h')

In [None]:

from sklearn.preprocessing import QuantileTransformer
# quantile transformation of 'year2022' with a normal output distribution
qt = QuantileTransformer(n_quantiles=100, random_state=0,output_distribution='normal')
# double brackets: the transformer is given a one-column table
qt_result=qt.fit_transform(dengue_provYear_Alarm_map[['year2022']])
# boxplot of the transformed values, to compare with the previous one
sea.boxplot(qt_result, color='yellow',orient='h')

In [None]:
# keep the transformed values as a new column of the map
dengue_provYear_Alarm_map['year_2022_qt']=qt_result

## Spatial Correlation

### Neighborhood

We can compute the neighborhood in a map using different algorithms:

In [None]:
from libpysal.weights import Queen, Rook, KNN

# rook: weights built from the polygons of the map
# use_index=False: presumably neighbors are identified by row position (they are used with .iloc in this notebook)

w_rook = Rook.from_dataframe(dengue_provYear_Alarm_map,use_index=False) 

In [None]:
# queen
w_queen = Queen.from_dataframe(dengue_provYear_Alarm_map,use_index=False)

In [None]:
# k nearest neighbors (k=8 neighbors for every polygon)
w_knn = KNN.from_dataframe(dengue_provYear_Alarm_map, k=8)

Let's understand the differences:

In [None]:
# first one: the first row of the map (position 0)
dengue_provYear_Alarm_map.head(1)

In [None]:
# rook neighbors of that province (the row at position 0)
w_rook.neighbors[0] 

In [None]:
# see: CHACHAPOYAS as the base layer, its rook neighbors in yellow, the first row in red on top
focalProvince=dengue_provYear_Alarm_map[dengue_provYear_Alarm_map.PROVINCIA=="CHACHAPOYAS"]
rookNeighbors=dengue_provYear_Alarm_map.iloc[w_rook.neighbors[0] ,]
base=focalProvince.plot()
rookNeighbors.plot(ax=base,facecolor="yellow",edgecolor='k')
dengue_provYear_Alarm_map.head(1).plot(ax=base,facecolor="red")

Let's do the same:

In [None]:
# queen neighbors of the row at position 0
w_queen.neighbors[0]

In [None]:
# CHACHAPOYAS as the base layer, its queen neighbors in yellow, the first row in red on top
focalProvince=dengue_provYear_Alarm_map[dengue_provYear_Alarm_map.PROVINCIA=="CHACHAPOYAS"]
queenNeighbors=dengue_provYear_Alarm_map.iloc[w_queen.neighbors[0] ,]
base=focalProvince.plot()
queenNeighbors.plot(ax=base,facecolor="yellow",edgecolor='k')
dengue_provYear_Alarm_map.head(1).plot(ax=base,facecolor="red")

In [None]:
# k-nearest neighbors of the row at position 0
w_knn.neighbors[0]

In [None]:
# CHACHAPOYAS as the base layer, its k-nearest neighbors in yellow, the first row in red on top
focalProvince=dengue_provYear_Alarm_map[dengue_provYear_Alarm_map.PROVINCIA=="CHACHAPOYAS"]
knnNeighbors=dengue_provYear_Alarm_map.iloc[w_knn.neighbors[0] ,]
base=focalProvince.plot()
knnNeighbors.plot(ax=base,facecolor="yellow",edgecolor='k')
dengue_provYear_Alarm_map.head(1).plot(ax=base,facecolor="red")

Let me pay attention to the queen results:

In [None]:
# all the neighbors by row (one entry per polygon)
w_queen.neighbors

In [None]:
# the neighborhood matrix:
# full() returns two objects, unpacked here as the data and the index of the table

pd.DataFrame(*w_queen.full()).astype(int) # 1 means both are neighbors

In [None]:
# pct of non-zero cells in the neighborhood matrix (density)
w_queen.pct_nonzero

In [None]:
# a province with NO neighbor? (an empty list means none)
w_queen.islands

## Moran's correlation

We need the neighborhood matrix (the weight matrix) to compute spatial correlation, that is, whether the value of the variable in a polygon is correlated with the values of its neighbors - which would suggest a spatial effect.

In [None]:
# needed for spatial correlation: switch the weights to the 'R' transformation
# (presumably row-standardization: the row sums are inspected right after this step)
w_queen.transform = 'R'

In [None]:
# sum of the weights in every row of the matrix
pd.DataFrame(*w_queen.full()).sum(axis=1) # each row is expected to add up to 1 after the 'R' transformation

Spatial correlation is measured by the Moran's I statistic:

In [None]:
from esda.moran import Moran

# global Moran of the transformed 2022 variable, using the queen weights
moranDENGUE = Moran(dengue_provYear_Alarm_map['year_2022_qt'], w_queen)
# the statistic and its simulation-based p-value
moranDENGUE.I,moranDENGUE.p_sim

The Moran's I is significant. Let's see:

In [None]:
from splot.esda import moran_scatterplot
import matplotlib.pyplot as plt

# scatterplot of the global Moran result, with custom axis labels
fig, ax = moran_scatterplot(moranDENGUE)
ax.set_xlabel('Dengue_alarma_share')
ax.set_ylabel('SpatialLag_Dengue_alarma_share')


### Local Spatial Correlation

We can compute a LISA (local Moran) for each case. That will help us find spatial clusters (spots) and spatial outliers:

* A **hotSpot** is a polygon whose value in the variable is high AND is surrounded with polygons with also high values.

* A **coldSpot** is a polygon whose value in the variable is low AND is surrounded with polygons with also low values.

* A **coldOutlier** is a polygon whose value in the variable is low BUT is surrounded with polygons with  high values.

* A **hotOutlier** is a polygon whose value in the variable is high BUT is surrounded with polygons with  low values.

It is also possible that no significant correlation is detected. Let's see those values:

In [None]:
# The scatterplot with local info

from esda.moran import Moran_Local

# calculate Moran_Local and plot
# NOTE(review): the weights here are w_knn, while the global Moran in this notebook used w_queen - confirm this is intended
lisaDENGUE = Moran_Local(y=dengue_provYear_Alarm_map['year_2022_qt'], w=w_knn,seed=2022)
# p=0.05: significance threshold passed to the scatterplot
fig, ax = moran_scatterplot(lisaDENGUE,p=0.05)
ax.set_xlabel('Dengue_alarma_share')
ax.set_ylabel('SpatialLag_Dengue_alarma_share');


In [None]:
from splot.esda import plot_local_autocorrelation
# combined plot of the local Moran result for the column 'year_2022_qt'
plot_local_autocorrelation(lisaDENGUE, dengue_provYear_Alarm_map,'year_2022_qt')
plt.show()

In [None]:
# the map with the spots and outliers

from splot.esda import lisa_cluster
f, ax = plt.subplots(1, figsize=(12, 12))
plt.title('Spots and Outliers')
# draw the cluster map on the axes created above; legend_kwds positions the legend
fig = lisa_cluster(lisaDENGUE, 
                   dengue_provYear_Alarm_map,ax=ax,
                   legend_kwds={'loc': 'center left', 
                                'bbox_to_anchor': (0.7, 0.6)})

Let me add that data to my gdf:

In [None]:
# quadrant of every polygon
lisaDENGUE.q

In [None]:
# significance: simulation-based p-value of every polygon
lisaDENGUE.p_sim

In [None]:
# quadrant: 1 HH,  2 LH,  3 LL,  4 HL
# number of polygons in each quadrant (significance not considered yet)
pd.Series(lisaDENGUE.q).value_counts()

The info in **lisaDENGUE.q** cannot be used right away; we also need to take into account whether the local spatial correlation is significant:

In [None]:
# keep the quadrant only when the local correlation is significant (p < 0.05); use 0 otherwise
quadrantAndPvalue=zip(lisaDENGUE.q,lisaDENGUE.p_sim)
dengue_provYear_Alarm_map['DENGUE_quadrant']=[quadrant if pValue<0.05 else 0
                                              for quadrant,pValue in quadrantAndPvalue]
dengue_provYear_Alarm_map['DENGUE_quadrant'].value_counts()

Now, we recode:

In [None]:
# text label for every quadrant code (the position in the list is the code)
labels = [ '0 no_sig', '1 hotSpot', '2 coldOutlier', '3 coldSpot', '4 hotOutlier']

quadrantCodes=dengue_provYear_Alarm_map['DENGUE_quadrant']
dengue_provYear_Alarm_map['DENGUE_quadrant_names']=[labels[code] for code in quadrantCodes]

dengue_provYear_Alarm_map['DENGUE_quadrant_names'].value_counts()
                                  

Let's replot:

In [None]:
from matplotlib import colors
# one color per label, in the same order as the list 'labels'
myColMap = colors.ListedColormap([ 'ghostwhite', 'red', 'green', 'black','orange'])

f, ax = plt.subplots(1, figsize=(12,12))

plt.title('Spots and Outliers')

# categories=labels: always map the five labels to the five colors;
# otherwise the colors shift whenever a label is absent from the data
dengue_provYear_Alarm_map.plot(column='DENGUE_quadrant_names', 
                categorical=True,
                categories=labels,
                cmap=myColMap,
                linewidth=0.1, 
                edgecolor='white',
                legend=True,
                legend_kwds={'loc': 'center left', 
                             'bbox_to_anchor': (0.7, 0.6)},
                ax=ax)
# Remove axis
ax.set_axis_off()
# Display the map
plt.show()

In [None]:
# interactive version of the map
# categories=labels keeps every label tied to its color even if a label is absent from the data
dengue_provYear_Alarm_map.explore("DENGUE_quadrant_names", categorical=True,tooltip='location',cmap=myColMap,categories=labels)

In [None]:
import folium

# one subset of the map per significant quadrant
quadrantNames=dengue_provYear_Alarm_map.DENGUE_quadrant_names
map1=dengue_provYear_Alarm_map[quadrantNames=='1 hotSpot']
map2=dengue_provYear_Alarm_map[quadrantNames=='2 coldOutlier']
map3=dengue_provYear_Alarm_map[quadrantNames=='3 coldSpot']
map4=dengue_provYear_Alarm_map[quadrantNames=='4 hotOutlier']

# (subset, color, layer name) in drawing order
layers=[(map1,"red","hotSpot"),
        (map2,"green","coldOutlier"),
        (map3,"black","coldSpot"),
        (map4,"orange","hotOutlier")]

m=None  # the first explore() call creates the map; the next ones add to it
for layerData,layerColor,layerName in layers:
    m=layerData.explore(
        m=m,  # notice
        color=layerColor,
        tooltip=False,  # hide tooltip
        popup=["location"],  # (on-click)
        name=layerName  # name of the layer in the map
    )

folium.TileLayer("CartoDB positron", show=False).add_to(m)  # use folium to add alternative tiles
folium.LayerControl(collapsed=True).add_to(m)  # use folium to add layer control

m  # show map

## Bivariate LISA

In [None]:
#from esda.moran import Moran_BV, Moran_Local_BV
from esda.moran import Moran_BV

# global bivariate Moran between the 2021 and the 2022 shares, using the queen weights
# NOTE(review): year2021 is passed first and year2022 second, while the Moran_Local_BV call in this
# notebook uses x=year2022 and y=year2021 - confirm which variable is meant to be spatially lagged
mbi = Moran_BV(dengue_provYear_Alarm_map['year2021'],  dengue_provYear_Alarm_map['year2022'],  w_queen)
# the statistic and its simulation-based p-value
mbi.I,mbi.p_sim

In [None]:
# The scatterplot with local info
from esda.moran import Moran_Local_BV

# calculate the bivariate Moran_Local (x: 2022 share, y: 2021 share, queen weights) and plot
lisaDENGUE_bv = Moran_Local_BV(y=dengue_provYear_Alarm_map['year2021'],
                               x=dengue_provYear_Alarm_map['year2022'],
                               w=w_queen)

# p=0.05: significance threshold passed to the scatterplot
fig, ax = moran_scatterplot(lisaDENGUE_bv, p=0.05,aspect_equal=True)

ax.set_xlabel('Dengue_2022')
ax.set_ylabel('SpatialLag_Dengue_2021')
plt.show()

In [None]:
# keep the bivariate quadrant only when it is significant (p < 0.05); use 0 otherwise
quadrantAndPvalue=zip(lisaDENGUE_bv.q,lisaDENGUE_bv.p_sim)
dengue_provYear_Alarm_map['DENGUE_quadrant_21_22']=[quadrant if pValue<0.05 else 0
                                                    for quadrant,pValue in quadrantAndPvalue]

# text label for every quadrant code (the position in the list is the code)
labels = [ '0 no_sig', '1 hotSpot', '2 coldOutlier', '3 coldSpot', '4 hotOutlier']

quadrantCodes=dengue_provYear_Alarm_map['DENGUE_quadrant_21_22']
dengue_provYear_Alarm_map['DENGUE_quadrant_21_22_names']=[labels[code] for code in quadrantCodes]
                                 

In [None]:
# see new columns: the bivariate quadrant codes and their labels
dengue_provYear_Alarm_map

In [None]:
from matplotlib import colors
# one color per label, in the same order as the list 'labels'
myColMap = colors.ListedColormap([ 'ghostwhite', 'red', 'green', 'black','orange'])

f, ax = plt.subplots(1, figsize=(12,12))

plt.title('Spots and Outliers')

# categories=labels: always map the five labels to the five colors;
# otherwise the colors shift whenever a label is absent from the data
dengue_provYear_Alarm_map.plot(column='DENGUE_quadrant_21_22_names', 
                categorical=True,
                categories=labels,
                cmap=myColMap,
                linewidth=0.1, 
                edgecolor='white',
                legend=True,
                legend_kwds={'loc': 'center left', 
                             'bbox_to_anchor': (0.7, 0.6)},
                ax=ax)
# Remove axis
ax.set_axis_off()
# Display the map
plt.show()