### ✍️ Required libraries & settings

In [None]:
import sys
sys.path.insert(1, '/home/sbanik@quansight.com/demo-dashboards')

import warnings
warnings.filterwarnings('ignore')

from intake_utils import (
    catalog_init,
    list_catalog,
    view_catalog)

import numpy as np
import panel as pn
import geoviews as gv
import matplotlib.pyplot as plt
import holoviews as hv
from holoviews import opts,dim
from bokeh.models import HoverTool
hv.extension('bokeh')

### 👓 About dataset

- **Title :** Austin trees data
- **Brief description :** This dataset shows point locations of public trees inventoried by the City of Austin as of March 13th, 2020. Data is compiled from various sources: Development Services Department's Tree Division, AISD, Parks and Recreation Department, and Public Works Department's downtown tree inventory (2013). This is not a complete comprehensive inventory of all trees. Some errors and/or duplicate data may exist. For more information on Austin's urban forest, visit the U.S. Forest Service's Urban Forest Inventory and Analysis report: https://www.fs.usda.gov/treesearch/pubs/50393
- **Unique species count after cleaning:** 360


#### 🗒 Columns in this Dataset:
- **GEOMETRY** : Feature's geometry type.
- **SPECIES** : Common species name of the tree.
- **DIAMETER** : Diameter of the tree measured at breast height (4.5 feet above the ground), in inches.
- **LATITUDE** : Estimated latitude of tree location.
- **LONGTITUDE** : Estimated longitude of tree location.
- **New Georeferenced Column** : Similar to GEOMETRY

### 📖 Reading the data from the `intake` catalog


In [None]:
# Initialise the data catalog through the project helper imported from `intake_utils`.
catalog = catalog_init()

In [None]:
# Read the `austin_trees` catalog entry and preview its first row.
data_trees = catalog.austin_trees.read()
data_trees.head(1)

### Data cleaning: species column

In [None]:
# Normalise species names: cast every value to `str`, then lower-case it.
data_trees['SPECIES'] = data_trees['SPECIES'].astype(str).str.lower()

In [None]:
def _natural_species_name(name):
    """Return a "last, first"-style species name in natural word order.

    Names without a `,` are returned unchanged. Otherwise the name is split
    on `,`, every part is stripped, empty parts are dropped, and the parts
    are joined in reverse order with a single space. Stripping each part
    (rather than only the joined result) avoids double spaces inside names
    that contain more than one comma.
    """
    if ',' not in name:
        return name
    parts = [part.strip() for part in reversed(name.split(","))]
    return " ".join(part for part in parts if part)


# Bring all species names into the same word order so that spelling variants
# of one species collapse into a single label.
data_trees['SPECIES'] = [_natural_species_name(name) for name in data_trees['SPECIES']]

In [None]:
# Collapse the alternative spellings of southern live oak into one label.
occurance = ['southern live oak', 'live (southern) oak']
is_southern_live_oak = data_trees['SPECIES'].isin(occurance)
data_trees['SPECIES'] = data_trees['SPECIES'].mask(is_southern_live_oak, 'southern live oak')

In [None]:
# Number of distinct species labels left after cleaning
# (a missing value, if present, is counted as a label of its own).
unique_specie_count = data_trees['SPECIES'].nunique(dropna=False)
print(f"Unique specie count = {unique_specie_count}")

### ✍️ Random checks 

In [None]:
# (rows, columns) of the records whose diameter is exactly zero.
data_trees[data_trees['DIAMETER'].eq(0.00)].shape

In [None]:
# (rows, columns) of the records whose species is labelled 'unknown'.
data_trees[data_trees['SPECIES'].eq('unknown')].shape

> **Note**
- 3630 entries have a diameter of 0.00
- We could possibly infer that the trees were *cut/uprooted* due to *industrial/climatic* reasons
- 372 data points have species marked as unknown, possibly rare and not identified yet

### ✅ Data stats and additional info

In [None]:
# Print a concise structural summary of the cleaned table.
data_trees.info()

In [None]:
# Show descriptive statistics for the table's columns.
data_trees.describe()

### 🛠 Pre-processing

In [None]:
def process_raw(dataset):
    """Summarise a tree table per species.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Tree records with at least a ``SPECIES`` and a ``DIAMETER`` column.

    Returns
    -------
    pandas.DataFrame
        One row per species with the columns ``SPECIES``, ``specie_count``
        (number of records) and ``mean_diameter`` (average ``DIAMETER``),
        ordered from the most to the least abundant species; ties are
        broken by the larger mean diameter.
    """
    aggregations = {'SPECIES': 'count', 'DIAMETER': 'mean'}
    summary_names = {'SPECIES': 'specie_count', 'DIAMETER': 'mean_diameter'}

    per_species = dataset.groupby('SPECIES').agg(aggregations)
    # Rename before resetting the index: the aggregated count column is also
    # called SPECIES and would otherwise clash with the index level.
    per_species = per_species.rename(columns=summary_names).reset_index()
    return per_species.sort_values(by=['specie_count', 'mean_diameter'], ascending=False)

#### ⬇️ Function to select subset of dataset


In [None]:
def select_n_specie(count, data_trees):
    """Return the records and summary of the `count` most abundant species.

    Parameters
    ----------
    count : int
        Number of top species to keep; must be greater than 0.
    data_trees : pandas.DataFrame
        Tree records with ``SPECIES`` and ``DIAMETER`` columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The rows of `data_trees` that belong to the selected species (as an
        independent copy, so callers may add columns to it), and the
        per-species summary of those rows as produced by ``process_raw``.

    Raises
    ------
    ValueError
        If `count` is not greater than 0. Previously a message was printed
        and the function then failed with ``UnboundLocalError``.
    """
    if not count > 0:
        raise ValueError("Enter value greater than 0")

    df_macro = process_raw(data_trees)
    # `process_raw` orders species by abundance, so the head is the top N.
    specie_name_list = df_macro['SPECIES'].head(int(count)).to_list()
    df_raw_subset = data_trees[data_trees['SPECIES'].isin(specie_name_list)].copy()
    df_processed_subset = process_raw(df_raw_subset)
    return df_raw_subset, df_processed_subset

#### ⬇️ Usage of the function 

In [None]:
# Example: `a` holds the raw rows of the 3 most abundant species, `b` their per-species summary.
a,b = select_n_specie(3, data_trees)

## 📊 Insights

### 🌱  Violin plot, specie overview based on diameter distribution

**About violin plot**

 A violin plot depicts distributions of numeric data for one or more groups using density curves.  
 The width of each curve corresponds with the approximate frequency of data points in each region.   
 Densities are frequently accompanied by an overlaid chart type, such as box plot, to provide additional information.

In [None]:
# NOTE: these two names are swapped relative to how they are used below:
# SPECIES is passed to hv.Violin as the key dimension and DIAMETER as the
# value dimension. The names are kept for backward compatibility.
key_dimensions   = [('DIAMETER', 'Diameter (inches)')]
value_dimensions = [('SPECIES', 'Specie name')]


count_widget = pn.widgets.IntSlider(name = 'Specie count', value = 7, start = 1, end = 15)
hover = HoverTool(tooltips=[('DIAMETER','$y')])

# Private snapshots for the callback. Later cells rebind `key_dimensions`,
# `value_dimensions` and `hover`; without these snapshots, moving the slider
# after running those cells would redraw the violin plot with the wrong
# dimensions and tooltip.
_violin_kdims = list(value_dimensions)
_violin_vdims = list(key_dimensions)
_violin_hover = hover

@pn.depends(count_widget.param.value)
def plotting_specie(count_widget):
    """Draw one diameter violin per species for the N most abundant species.

    Parameters
    ----------
    count_widget : int
        Current slider value, i.e. the number of top species to show.
        (The parameter shadows the slider widget of the same name.)

    Returns
    -------
    hv.Violin
        Violin element keyed by species, filled with one colour per species.
    """
    df_top_raw, _ = select_n_specie(count_widget, data_trees)
    return hv.Violin(df_top_raw, _violin_kdims, _violin_vdims).opts(
        xrotation=45, width=800, height=400,
        violin_fill_color=dim('SPECIES').str(),
        cmap='Set1', tools=[_violin_hover])

widgets = pn.WidgetBox(count_widget)
pn.Column(widgets, plotting_specie)

### Insights

From the widest point of each violin we can infer where the diameter values are most concentrated (the mode of the distribution); for example, *Pecan* diameters are most concentrated around 18.4 inches.

### 📖 Distribution plot based on mean diameter and abundance [Top 10 species]

In [None]:
# Dimension specs as (column, label) pairs: the table is keyed by species and
# carries the mean diameter and the record count as values.
# NOTE(review): this rebinds the notebook-level names `value_dimensions`,
# `key_dimensions` and (below) `hover`, which the violin-plot cell also defines.
value_dimensions   = [('mean_diameter', 'Mean diameter (measure unit=inches)'), ('specie_count', 'Specie Count')]
key_dimensions = [('SPECIES', 'SPECIES')]
# Raw rows and per-species summary of the 10 most abundant species.
df_top_10_raw,df_top_10_processed = select_n_specie(10, data_trees)
macro = hv.Table(df_top_10_processed, key_dimensions, value_dimensions)

# NOTE(review): the first argument is a column name while the second is the
# *label* of `mean_diameter` — confirm HoloViews resolves both as intended.
plot_each_specie = macro.to.table('specie_count', 'Mean diameter (measure unit=inches)').opts(height=50, width=400)
# Tooltip labelled DIAMETER that shows the x coordinate under the cursor.
hover = HoverTool(tooltips=[('DIAMETER','$x')])
# Lay the summary table next to the diameter distribution, grouped by species.
plot_each_specie + hv.Distribution(
    data=df_top_10_raw,
    kdims=['DIAMETER'],
    vdims=['SPECIES'],
).groupby(
    'SPECIES'
).opts(tools=[hover])

### Insight
- The distributions are heavily right-skewed (positively skewed), so they are not normally distributed.
- For right-skewed distributions like these, the diameter statistics are typically ordered *Mean  >  Median  >  Mode*.

### Count of trees among different species based on diameter values

In [None]:
# For each of the 10 most abundant species, count how many trees fall on each
# whole-inch diameter value (diameters above 40 inches are left out).
raw_df_trees_subset, processed_df_trees_subset= select_n_specie(10, data_trees)
# Work on an explicit copy so that adding a column never writes into a
# filtered slice of `data_trees`.
raw_df_trees_subset = raw_df_trees_subset.copy()
raw_df_trees_subset['rounded_diameter'] = raw_df_trees_subset['DIAMETER'].round()

raw_df_trees_subset = raw_df_trees_subset.drop(['GEOMETRY', 'LATITUDE', 'LONGTITUDE', 'New Georeferenced Column'], axis=1)
# Group on both keys and name the counts column explicitly. The previous
# `value_counts('rounded_diameter')` + `rename(columns={0: ...})` relied on a
# bare-string subset and on the counts column being called `0`, both of which
# depend on the pandas version. Rows now come out ordered by key instead of
# by count; the bar chart sorts its data anyway.
plotable_df_subset = (
    raw_df_trees_subset.loc[raw_df_trees_subset['rounded_diameter'] <= 40]
    .groupby(['SPECIES', 'rounded_diameter'], observed=True)
    .size()
    .reset_index(name='count_diameter')
)

In [None]:
# Bars keyed by (rounded diameter, species); counts are summed per key pair
# and the result is sorted.
plot = hv.Bars(plotable_df_subset, kdims=['rounded_diameter', 'SPECIES'], vdims=['count_diameter']).aggregate(function=np.sum).sort()
# Tooltip fields refer to the data columns through the `@column` syntax.
# NOTE(review): this rebinds the notebook-level name `hover` used by earlier cells.
hover = HoverTool(tooltips=[('Specie name','@SPECIES'), 
                            ('Diamter value','@rounded_diameter'), 
                            ('Total number of trees', '@count_diameter')])
# Render as a stacked bar chart and attach the title via `relabel`.
plot.opts(width=1200, 
          height=525,
          stacked=True,
          tools=[hover]).relabel("Diversity of diameter values among top 10 species")

### Insight
- Diameters of 8-11 inches are the most abundant among these species.
- A diameter of zero occurs across all the species, which may indicate that those trees have been uprooted or lost to deforestation.

### 🧭 Plot of the top 10 species' locations, using holoviews and datashader with bokeh as the backend

In [None]:
import holoviews as hv
import datashader as ds
import colorcet as cc
import holoviews.operation.datashader as hd
from datashader.utils import lnglat_to_meters as webm
from datashader import transfer_functions as tf
from holoviews.element.tiles import StamenTerrainRetina, StamenTonerRetina

# Bounding box of the map as ((lon_min, lon_max), (lat_min, lat_max)).
Austin = ((-97.91,  -97.52), (30.17, 30.37))
plot_width  = int(950)
plot_height = int(plot_width//1.2)
tile_selection = StamenTerrainRetina()

# Rows of the 10 most abundant species, reduced to the columns the map needs.
# The explicit copy makes the frame independent of `data_trees`, so the column
# assignments below are not chained assignments on a slice.
raw_df_trees_geo, processed_df_trees_geo = select_n_specie(10, data_trees)
raw_df_trees_geo = raw_df_trees_geo[['LONGTITUDE','LATITUDE', 'SPECIES', 'DIAMETER']].copy()
# Project longitude/latitude with datashader's `lnglat_to_meters` so the
# points share a coordinate system with the map tiles.
projected_lon, projected_lat = webm(raw_df_trees_geo['LONGTITUDE'], raw_df_trees_geo['LATITUDE'])
raw_df_trees_geo['lon'] = projected_lon
raw_df_trees_geo['lat'] = projected_lat
# `ds.count_cat` in the rendering cell aggregates over a categorical column.
raw_df_trees_geo["SPECIES"] = raw_df_trees_geo["SPECIES"].astype("category")


In [None]:
# One colour per species, taken in order from a colorcet Glasbey palette and
# scaled to 0-255 integers.
# NOTE(review): assumes each palette entry is an (r, g, b) triple of floats in [0, 1] — confirm.
cats = list(raw_df_trees_geo.SPECIES.unique())
colors    = cc.glasbey_bw_minc_20_maxl_70
color_key = {cat: tuple(int(e*255.) for e in colors[i]) for i, cat in enumerate(cats)}
# Legend built from zero-sized dummy points, one per species, because the
# datashaded image itself carries no legend entries.
legend    = hv.NdOverlay({k: hv.Points([0,0], label=str(k)).opts(
                                         color=cc.rgb_to_hex(*v), size=0, apply_ranges=False) 
                          for k, v in color_key.items()}, 'SPECIES')


# Project the Austin bounding box and restrict the tile layer to it.
x_range_w ,y_range_w=webm(*Austin)    
tiles = tile_selection.redim.range(x=tuple(x_range_w), y=tuple(y_range_w))
# Rasterise the points, counting per species and colouring with `color_key`.
# NOTE(review): x_range/y_range/plot_height/plot_width are passed to hv.Points
# rather than to hd.datashade — confirm they have the intended effect.
shaded = hd.datashade(hv.Points(raw_df_trees_geo, ['lon', 'lat'], 
                                x_range=x_range_w, y_range=y_range_w, 
                                plot_height=plot_height, plot_width=plot_width), 
                                color_key=color_key,
                                aggregator=ds.count_cat('SPECIES'))
ropts = dict(tools=['hover'])
hover_data = shaded.opts(**ropts)

# Overlay: map tiles, spread (enlarged) shaded points, the hover-enabled layer and the legend.
tiles * hd.dynspread(shaded, threshold=1, max_px=5, shape='circle').opts(xaxis=None, yaxis=None, width=900, height=500) * (hover_data) * legend
    