# It's Tree Huggin' Time!

#### Before you begin:
* This notebook will establish a MySQL connection, create a database, its
associated tables, without the use of the MySQL UI. Please make sure to 
have mysqlclient installed by running 
```pip install mysqlclient``` 
in your python runtime environment.
* The data set for NYC Trees is rather large and is not stored in this repository, please go to [Kaggle](https://www.kaggle.com/new-york-city/ny-2015-street-tree-census-tree-data/version/12) and download to the resources folder within this repository.
* Remember to insert you password in mysql_scr.py (please do not alter the other variables)
* <b>Warning!</b> The create_db variable will drop and create a database on use!
* Let's get tree huggin'

In [1]:
# import dependencies
import pandas as pd
from sqlalchemy import create_engine
import mysql_scr
import numpy as np
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy import func
import os
from splinter import Browser
from bs4 import BeautifulSoup as bs

In [2]:
# set up mysql initial connections
init_string = (f"root:{mysql_scr.pw}@localhost")
pre_engine = create_engine(f"mysql://{init_string}")

In [3]:
# create mysql db
pre_engine.execute(f"{mysql_scr.create_db}")

<sqlalchemy.engine.result.ResultProxy at 0x285aec48438>

In [4]:
# initialize new mysql db connection
engine = create_engine(f"mysql://{init_string}/tree_db")
engine.execute(f"{mysql_scr.create_tbl}")

<sqlalchemy.engine.result.ResultProxy at 0x285aec60160>

In [5]:
# automap base and check if tables exist...
Base = automap_base()
Base.prepare(engine, reflect=True)
Base.classes.keys()

['nyc_tree', 'tree_species']

In [6]:
# set file
file = os.path.join("Resources","2015-street-tree-census-tree-data.csv")

In [7]:
tree_data = pd.read_csv(file)
tree_data.head()

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,boro_ct,state,latitude,longitude,x_sp,y_sp,council district,census tract,bin,bbl
0,180683,348711,2015-08-27T00:00:00,3,0,OnCurb,Alive,Fair,Acer rubrum,red maple,...,4073900,New York,40.723092,-73.844215,1027431.148,202756.7687,29.0,739.0,4052307.0,4022210000.0
1,200540,315986,2015-09-03T00:00:00,21,0,OnCurb,Alive,Fair,Quercus palustris,pin oak,...,4097300,New York,40.794111,-73.818679,1034455.701,228644.8374,19.0,973.0,4101931.0,4044750000.0
2,204026,218365,2015-09-05T00:00:00,3,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,3044900,New York,40.717581,-73.936608,1001822.831,200716.8913,34.0,449.0,3338310.0,3028870000.0
3,204337,217969,2015-09-05T00:00:00,10,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,3044900,New York,40.713537,-73.934456,1002420.358,199244.2531,34.0,449.0,3338342.0,3029250000.0
4,189565,223043,2015-08-30T00:00:00,21,0,OnCurb,Alive,Good,Tilia americana,American linden,...,3016500,New York,40.666778,-73.975979,990913.775,182202.426,39.0,165.0,3025654.0,3010850000.0


In [8]:
#obtain desired columns
tree_data = tree_data[['borough','spc_common','spc_latin','tree_dbh']]
#clean data
tree_data_drop = tree_data.dropna()
tree_data_clean = tree_data_drop.drop_duplicates()

tree_data_clean.head()

Unnamed: 0,borough,spc_common,spc_latin,tree_dbh
0,Queens,red maple,Acer rubrum,3
1,Queens,pin oak,Quercus palustris,21
2,Brooklyn,honeylocust,Gleditsia triacanthos var. inermis,3
3,Brooklyn,honeylocust,Gleditsia triacanthos var. inermis,10
4,Brooklyn,American linden,Tilia americana,21


In [9]:
#obtain average tree diameter
tree_info = tree_data_clean.groupby(['borough','spc_common'])
tree_diam = pd.DataFrame(tree_info['tree_dbh'].mean())

#obtain number of tree type in postcode

tree_quantity = pd.DataFrame(tree_data_clean.groupby(['borough','spc_common']).size())

tree_summary = tree_quantity.join(tree_diam).reset_index()
tree_summary = tree_summary.rename(columns={0:'tree_count'})
tree_summary.head(2)

Unnamed: 0,borough,spc_common,tree_count,tree_dbh
0,Bronx,'Schubert' chokecherry,17,9.176471
1,Bronx,American beech,11,12.272727


In [10]:
#rename columns to match database table
tree_transformed = tree_summary.rename(columns = {'borough':'borough',
                              'spc_common': 'species_nm',
                              'tree_count': 'count_tree',
                              'tree_dbh': 'avg_diameter'})
tree_transformed.head()

Unnamed: 0,borough,species_nm,count_tree,avg_diameter
0,Bronx,'Schubert' chokecherry,17,9.176471
1,Bronx,American beech,11,12.272727
2,Bronx,American elm,46,23.913043
3,Bronx,American hophornbeam,21,14.238095
4,Bronx,American hornbeam,14,11.428571


In [11]:
#load dataframe to database
tree_transformed.to_sql(name='nyc_tree',con=engine,if_exists='append',index=False)

In [12]:
# validate import
result = engine.execute('Select * from nyc_tree limit 10')
for r in result:
    print(r)

(1, 'Bronx', "'Schubert' chokecherry", 17, 9)
(2, 'Bronx', 'American beech', 11, 12)
(3, 'Bronx', 'American elm', 46, 24)
(4, 'Bronx', 'American hophornbeam', 21, 14)
(5, 'Bronx', 'American hornbeam', 14, 11)
(6, 'Bronx', 'American larch', 7, 9)
(7, 'Bronx', 'American linden', 42, 22)
(8, 'Bronx', 'Amur cork tree', 14, 10)
(9, 'Bronx', 'Amur maackia', 21, 11)
(10, 'Bronx', 'Amur maple', 21, 12)


In [13]:
tree_sp_df = tree_data_clean['spc_common']
tree_sp_unique = pd.DataFrame(tree_sp_df.drop_duplicates())
#test_df_unique.str.strip()

tree_sp_unique['spc_common'] = tree_sp_unique['spc_common'].str.replace(" ","")
tree_sp_ins = tree_sp_unique.rename(columns={'spc_common':'web_common_nm'})
tree_sp_ins['species_nm'] = tree_data_clean['spc_common']
tree_sp_ins.head()

Unnamed: 0,web_common_nm,species_nm
0,redmaple,red maple
1,pinoak,pin oak
2,honeylocust,honeylocust
4,Americanlinden,American linden
9,Londonplanetree,London planetree


In [14]:
tree_sp_ins.to_sql(name='tree_species',con=engine,if_exists='append',index=False)

In [15]:
# validate import
result = engine.execute('Select * from tree_species limit 20')
for r in result:
    print(r)

("'Schubert' chokecherry", None, "'Schubert'chokecherry", None, None, None, None)
('American beech', None, 'Americanbeech', None, None, None, None)
('American elm', None, 'Americanelm', None, None, None, None)
('American hophornbeam', None, 'Americanhophornbeam', None, None, None, None)
('American hornbeam', None, 'Americanhornbeam', None, None, None, None)
('American larch', None, 'Americanlarch', None, None, None, None)
('American linden', None, 'Americanlinden', None, None, None, None)
('Amur cork tree', None, 'Amurcorktree', None, None, None, None)
('Amur maackia', None, 'Amurmaackia', None, None, None, None)
('Amur maple', None, 'Amurmaple', None, None, None, None)
('arborvitae', None, 'arborvitae', None, None, None, None)
('ash', None, 'ash', None, None, None, None)
('Atlantic white cedar', None, 'Atlanticwhitecedar', None, None, None, None)
('Atlas cedar', None, 'Atlascedar', None, None, None, None)
('bald cypress', None, 'baldcypress', None, None, None, None)
('bigtooth aspen',

In [16]:
# initialize base
Tree = Base.classes.tree_species
session = Session(engine)
Tree_species = session.query(Tree.species_nm).all()

# declare tree_list variable and append from database table
tree_list = []

for tree in Tree_species:
    tree_list.append(tree.species_nm)
    
print(tree_list)

["'Schubert' chokecherry", 'American beech', 'American elm', 'American hophornbeam', 'American hornbeam', 'American larch', 'American linden', 'Amur cork tree', 'Amur maackia', 'Amur maple', 'arborvitae', 'ash', 'Atlantic white cedar', 'Atlas cedar', 'bald cypress', 'bigtooth aspen', 'black cherry', 'black locust', 'black maple', 'black oak', 'black pine', 'black walnut', 'blackgum', 'blue spruce', 'boxelder', 'bur oak', 'Callery pear', 'catalpa', 'cherry', 'Chinese chestnut', 'Chinese elm', 'Chinese fringetree', 'Chinese tree lilac', 'cockspur hawthorn', 'common hackberry', 'Cornelian cherry', 'crab apple', 'crepe myrtle', 'crimson king maple', 'cucumber magnolia', 'dawn redwood', 'Douglas-fir', 'eastern cottonwood', 'eastern hemlock', 'eastern redbud', 'eastern redcedar', 'empress tree', 'English oak', 'European alder', 'European beech', 'European hornbeam', 'false cypress', 'flowering dogwood', 'ginkgo', 'golden raintree', 'green ash', 'hardy rubber tree', 'hawthorn', 'hedge maple',

In [17]:
# Create webkey (will be replaced) for looking up to website
trees  = [x.replace(' ','').lower() for x in tree_list]

for i in trees:
    print (i)

'schubert'chokecherry
americanbeech
americanelm
americanhophornbeam
americanhornbeam
americanlarch
americanlinden
amurcorktree
amurmaackia
amurmaple
arborvitae
ash
atlanticwhitecedar
atlascedar
baldcypress
bigtoothaspen
blackcherry
blacklocust
blackmaple
blackoak
blackpine
blackwalnut
blackgum
bluespruce
boxelder
buroak
callerypear
catalpa
cherry
chinesechestnut
chineseelm
chinesefringetree
chinesetreelilac
cockspurhawthorn
commonhackberry
corneliancherry
crabapple
crepemyrtle
crimsonkingmaple
cucumbermagnolia
dawnredwood
douglas-fir
easterncottonwood
easternhemlock
easternredbud
easternredcedar
empresstree
englishoak
europeanalder
europeanbeech
europeanhornbeam
falsecypress
floweringdogwood
ginkgo
goldenraintree
greenash
hardyrubbertree
hawthorn
hedgemaple
himalayancedar
holly
honeylocust
horsechestnut
japanesehornbeam
japanesemaple
japanesesnowbell
japanesetreelilac
japanesezelkova
katsuratree
kentuckycoffeetree
kentuckyyellowwood
kousadogwood
littleleaflinden
londonplanetree
magnoli


### Please Change/ Un-comment the executable path based on your OS!

In [18]:
# Open browser connection

# For Mac Users:
# executable_path = {'executable_path': '/usr/local/bin/chromedriver'}

# For Windows Users:
executable_path = {'executable_path': 'chromedriver.exe'}

browser = Browser('chrome', **executable_path, headless=False)

In [19]:
# declare url and parser
url = "http://leafsnap.com/species/"
browser.visit(url)
html = browser.html
soup = bs(html, "html.parser")

### Get raking! Get it? Because its trees!
(if you don't like that joke, you can leaf)

In [20]:
# Get webscraping!
# Get div data related to the first species tab
ne_species = soup.find('div', id='species-1')

In [21]:
#find all tr rows to loop through the data
species = ne_species.table.tbody.find_all('tr')

for td in species:
    img = td.find_all('img')
    name = td.find_all('td')
    link = td.find_all('a')
    qry_name = name[3].text.replace(" ", "").lower()
    
    #check to see if web tree is in sql tree
    if qry_name in trees:
        image_link = img[0]['src']
        web_name = name[3].text
#         print(image_link)
        print(name[3].text)
#         print(link[0]['href'])
        
        ## update sql table
        session.query(Tree.img_loc)\
                .filter(func.lower(func.replace(Tree.species_nm,' ','')) == qry_name)\
                .update({"img_loc":image_link}, synchronize_session='fetch')
        session.commit()

        ## go to tree data page
        browser.click_link_by_partial_href(link[0]['href'])
        
        #get new webpage html and parse
        html_species = browser.html
        soup_species = bs(html_species, "html.parser")
        
        
        tree = soup_species.dl.find_all('div')
        habitat = 'N/A'
        growth= 'N/A'
        bloom= 'N/A'
        longevity= 'N/A'
        for div in tree:
            #print(div.dt.text,div.dd.text)
            if div.dt.text == 'Habitat:':
                habitat = div.dd.text
            if div.dt.text == 'Growth Habit:':
                growth = div.dd.text
            if div.dt.text == 'Bloom Time:':
                bloom = div.dd.text
            if div.dt.text == 'Longevity:':
                longevity = div.dd.text
#         print(habitat)
#         print(growth)
#         print(bloom)
#         print(longevity)
        
#         try:
#             habitat = tree[0].text
#         except IndexError:
#             habitat = 'N/A'
#         try:
#             growth = tree[1].text
#         except IndexError:
#             growth = 'N/A'
#         try:
#             bloom = tree[2].text
#         except IndexError:
#             bloom = "N/A"
#         try:
#             longevity = tree[3].text
#         except IndexError:
#             longevity = 'N/A'
        
        session.query(Tree.habitat, Tree.growth_habit, Tree.bloom_time, Tree.longevity )\
                .filter(func.lower(func.replace(Tree.species_nm,' ','')) == qry_name)\
                .update({"habitat":habitat, "growth_habit": growth, "bloom_time": bloom, "longevity": longevity}, synchronize_session='fetch')

        session.commit()
#         print(habitat, growth, bloom, longevity)
 
        browser.back()
print("Scrape Complete!")

American  Beech
American  Elm
American  Hornbeam
American  Linden
Amur  Corktree
Amur  Maple
Atlas  Cedar
Bald  Cypress
Bigtooth  Aspen
Black  Cherry
Black  Locust
Black  Maple
Black  Oak
Black  Walnut
Blue  Spruce
Box  Elder
Bur  Oak
Callery  Pear
Chinese  Elm
Chinese  Fringetree
Cockspur  Hawthorn
Cornelian  Cherry
Douglas-Fir
Eastern  Cottonwood
Eastern  Redbud
Eastern  Redcedar
Empress  Tree
English  Oak
European  Alder
European  Beech
European  Hornbeam
Flowering  Dogwood
Ginkgo
Green  Ash
Hardy  Rubbertree
Hedge  Maple
Himalayan  Cedar
Honeylocust
Japanese  Maple
Japanese  Snowbell
Japanese  Zelkova
Japanese Tree  Lilac
Katsura  Tree
Kentucky  Coffeetree
Kousa  Dogwood
Littleleaf  Linden
London Plane  Tree
Mimosa
Norway  Maple
Norway  Spruce
Ohio  Buckeye
Paper  Birch
Paperbark  Maple
Pignut  Hickory
Pin  Oak
Pitch  Pine
Quaking  Aspen
Red  Maple
Red  Pine
River  Birch
Sawtooth  Oak
Scarlet  Oak
Shingle  Oak
Siberian  Elm
Silver  Linden
Silver  Maple
Southern  Magnolia
Sugar  Map

In [22]:
spec = engine.execute('select * from tree_species limit 10')
for s in spec:
    print(s)

("'Schubert' chokecherry", None, "'Schubert'chokecherry", None, None, None, None)
('American beech', 'http://api.leafsnap.com/v1/species/Fagus%20grandifolia/images/LTV-RBD-00221_crop.jpg?crop=4,86,1067,1148&h=150&w=150', 'Americanbeech', 'Prefers damp woods with rich soil.', 'Deciduous tree, growing to 22.9 m tall with oval or pyramidal crown.', 'Mid-spring.', '300-400 years.')
('American elm', 'http://api.leafsnap.com/v1/species/Ulmus%20americana/images/LTV-RBD-01412_crop.jpg?crop=8,70,1059,1121&h=150&w=150', 'Americanelm', 'Rich woods, floodplains, stream banks.', 'Deciduous tree, growing up to 36 m tall.', 'Early spring.', 'Up to 300 years.')
('American hophornbeam', None, 'Americanhophornbeam', None, None, None, None)
('American hornbeam', 'http://api.leafsnap.com/v1/species/Carpinus%20caroliniana/images/LTV-RBD-06629_crop.jpg?crop=27,132,1047,1152&h=150&w=150', 'Americanhornbeam', 'Moist soils in woods and along streams.', 'Deciduous shrub or small tree, growing to 5-12 m tall.', 