# It's Tree Huggin' Time!

#### Before you begin:
* This notebook establishes a MySQL connection and creates a database and its
associated tables without using the MySQL UI. Please make sure 
mysqlclient is installed by running 
```pip install mysqlclient``` 
in your Python runtime environment.
* The data set for NYC Trees is rather large and is not stored in this repository. Please go to [Kaggle](https://www.kaggle.com/new-york-city/ny-2015-street-tree-census-tree-data/version/12) and download it to the Resources folder within this repository.
* Remember to insert your password in mysql_scr.py (please do not alter the other variables).
* <b>Warning!</b> The create_db variable will drop and create a database on use!
* Let's get tree huggin'

In [None]:
# import dependencies
import pandas as pd
from sqlalchemy import create_engine
import mysql_scr
import numpy as np
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy import func
import os
from splinter import Browser
from bs4 import BeautifulSoup as bs

In [None]:
# set up mysql initial connections
from urllib.parse import quote

# Percent-encode the password before placing it in the connection URL.
# Characters such as '@', ':', '/' or '%' would otherwise be read as URL
# delimiters and the URL would be parsed incorrectly. mysql_scr.pw must hold
# the raw (un-encoded) password.
init_string = f"root:{quote(str(mysql_scr.pw), safe='')}@localhost"

# Server-level engine: no database is selected in this URL.
pre_engine = create_engine(f"mysql://{init_string}")

In [None]:
# create mysql db
# The statement text comes from mysql_scr.create_db; per the warning at the
# top of this notebook it drops and re-creates the database.
create_db_sql = f"{mysql_scr.create_db}"
pre_engine.execute(create_db_sql)

In [None]:
# initialize new mysql db connection
# Same credentials as the initial connection, now pointing at tree_db.
tree_db_url = f"mysql://{init_string}/tree_db"
engine = create_engine(tree_db_url)

# run the table-creation statement supplied by mysql_scr
create_tbl_sql = f"{mysql_scr.create_tbl}"
engine.execute(create_tbl_sql)

In [None]:
# automap base and check if tables exist...
# Build an automap base and prepare it against the tree_db engine with
# reflection enabled (reflect=True).
Base = automap_base()
Base.prepare(engine, reflect=True)
# Last expression of the cell: displays the names registered on Base.classes,
# which is how this cell checks that the tables exist.
Base.classes.keys()

In [None]:
# set file
# CSV downloaded from Kaggle, expected inside the Resources folder
csv_name = "2015-street-tree-census-tree-data.csv"
file = os.path.join("Resources", csv_name)

In [None]:
# load the raw census CSV into a DataFrame and preview the first rows
tree_data = pd.read_csv(file)
tree_data.head(n=5)

In [None]:
#obtain desired columns
tree_columns = ['postcode', 'spc_common', 'spc_latin', 'tree_dbh']

#clean data
# De-duplicate while every column of the raw file is still present, so only
# genuinely repeated records are removed. Two different trees can share the
# same postcode, species and diameter; de-duplicating after narrowing to the
# four columns above would discard real trees, deflating the per-species
# counts and skewing the average diameters computed later in this notebook.
tree_data_unique = tree_data.drop_duplicates()

# keep only the columns of interest and drop rows with any missing value
tree_data_drop = tree_data_unique[tree_columns].dropna()

# tree_data is intentionally left untouched so this cell can be re-run safely
tree_data_clean = tree_data_drop

tree_data_clean.head()

In [None]:
# columns that define one (postcode, species) group
group_cols = ['postcode', 'spc_common']

#obtain average tree diameter
tree_info = tree_data_clean.groupby(group_cols)
tree_diam = tree_info['tree_dbh'].mean().to_frame()

#obtain number of tree type in postcode
# size() yields an unnamed Series, so the resulting single column is labelled 0
tree_quantity = tree_data_clean.groupby(group_cols).size().to_frame()

# combine count and mean, flatten the group index, and name the count column
tree_summary = (
    tree_quantity
    .join(tree_diam)
    .reset_index()
    .rename(columns={0: 'tree_count'})
)
tree_summary.head(2)

In [None]:
#rename columns to match database table
# DataFrame column -> nyc_tree table column
column_map = {
    'postcode': 'zip_code',
    'spc_common': 'species_nm',
    'tree_count': 'count_tree',
    'tree_dbh': 'avg_diameter',
}
tree_transformed = tree_summary.rename(columns=column_map)
tree_transformed.head()

In [None]:
#load dataframe to database
# Rows are appended to the nyc_tree table; the DataFrame index is not written.
tree_transformed.to_sql(
    name='nyc_tree',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# validate import
# print the first ten rows of nyc_tree to confirm the load worked
result = engine.execute('Select * from nyc_tree limit 10')
for row in result.fetchall():
    print(row)

In [None]:
# one row per distinct common species name
tree_sp_df = tree_data_clean['spc_common']
unique_names = tree_sp_df.drop_duplicates()

# web lookup key: the common name with spaces removed
tree_sp_unique = unique_names.str.replace(" ","").to_frame()

# web_common_nm holds the space-free key, species_nm the original name;
# assign() aligns on the shared index, so each key sits beside its own name
tree_sp_ins = (
    tree_sp_unique
    .rename(columns={'spc_common': 'web_common_nm'})
    .assign(species_nm=unique_names)
)
tree_sp_ins.head()

In [None]:
# append the species lookup rows to the tree_species table (index not written)
tree_sp_ins.to_sql(
    name='tree_species',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# validate import
# print the first twenty rows of tree_species to confirm the load worked
result = engine.execute('Select * from tree_species limit 20')
for row in result.fetchall():
    print(row)

In [None]:
# initialize base
# mapped class for the tree_species table and a session bound to the engine
Tree = Base.classes.tree_species
session = Session(bind=engine)

# fetch every species name stored in the table
Tree_species = session.query(Tree.species_nm).all()

# declare tree_list variable and fill it from the database rows
tree_list = [row.species_nm for row in Tree_species]

print(tree_list)

In [None]:
# Create webkey (will be replaced) for looking up to website
# key = species name without spaces, lower-cased
trees = []
for species_name in tree_list:
    trees.append(species_name.replace(' ','').lower())

for web_key in trees:
    print(web_key)


### Please change/un-comment the executable path based on your OS!

In [None]:
# Open browser connection
# Keep exactly one of the two executable_path assignments below active.

# For Mac Users:
executable_path = dict(executable_path='/usr/local/bin/chromedriver')

# For Windows Users:
# executable_path = dict(executable_path='chromedriver.exe')

# launch a visible (non-headless) Chrome window driven by chromedriver
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# declare url and parser
url = "http://leafsnap.com/species/"

# load the species listing, then parse the page's HTML
browser.visit(url)
html = browser.html
soup = bs(html, features="html.parser")

### Get raking! Get it? Because it's trees!
(if you don't like that joke, you can leaf)

In [None]:
# Get webscraping!
# Get div data related to the first species tab (the div with id "species-1")
ne_species = soup.find('div', attrs={'id': 'species-1'})

In [None]:
def _dd_text(entries, position, default='N/A'):
    """Return the text of entries[position], or default when that entry is missing."""
    try:
        return entries[position].text
    except IndexError:
        return default


#find all tr rows to loop through the data
species = ne_species.table.tbody.find_all('tr')

# same keys as the trees list, as a set, so the per-row membership test
# does not rescan the whole list
tree_keys = set(trees)

for row in species:
    img = row.find_all('img')
    name = row.find_all('td')
    link = row.find_all('a')

    # Skip rows that lack the name cell (4th td), an image or a link, so one
    # malformed row cannot abort the whole scrape with an IndexError.
    if len(name) < 4 or not img or not link:
        continue

    # website name normalised the same way as the keys in trees
    qry_name = name[3].text.replace(" ", "").lower()

    #check to see if web tree is in sql tree
    if qry_name not in tree_keys:
        continue

    image_link = img[0]['src']
    species_href = link[0]['href']
    print(image_link)
    print(name[3].text)
    print(species_href)

    # rows of tree_species whose space-free, lower-cased name equals the key
    name_matches = func.lower(func.replace(Tree.species_nm,' ','')) == qry_name

    ## update sql table
    # committed before navigating, so img_loc is saved even if the detail
    # page fails to load
    session.query(Tree.img_loc)\
            .filter(name_matches)\
            .update({"img_loc":image_link}, synchronize_session='fetch')
    session.commit()

    ## go to tree data page
    browser.click_link_by_partial_href(species_href)

    #get new webpage html and parse
    soup_species = bs(browser.html, "html.parser")

    # the <dd> entries are read by position: habitat, growth habit, bloom
    # time, longevity; any missing entry falls back to 'N/A'
    details = soup_species.find_all('dd')
    habitat = _dd_text(details, 0)
    growth = _dd_text(details, 1)
    bloom = _dd_text(details, 2)
    longevity = _dd_text(details, 3)

    session.query(Tree.habitat, Tree.growth_habit, Tree.bloom_time, Tree.longevity )\
            .filter(name_matches)\
            .update({"habitat":habitat, "growth_habit": growth, "bloom_time": bloom, "longevity": longevity}, synchronize_session='fetch')

    session.commit()
    print(habitat, growth, bloom, longevity)

    # return to the species listing before handling the next row
    browser.back()

In [None]:
# final check: print the first ten tree_species rows after the scrape
spec = engine.execute('select * from tree_species limit 10')
for row in spec.fetchall():
    print(row)