In [None]:
import math
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from pandas import DataFrame, read_csv

In [None]:
def check_n(row):
    """
    Flag a plot as having negative human intervention.

    Returns 1 when any of DSTRBCD1-3 equals 80.0 or any of TRTCD1-3 equals
    10.0, otherwise 0. Columns are inspected in that order and the scan stops
    at the first match.
    """
    watched = (
        ('DSTRBCD1', 80.0),
        ('DSTRBCD2', 80.0),
        ('DSTRBCD3', 80.0),
        ('TRTCD1', 10.0),
        ('TRTCD2', 10.0),
        ('TRTCD3', 10.0),
    )
    return int(any(row[column] == code for column, code in watched))

In [None]:
def check_p(row):
    """
    Flag a plot as having positive human intervention.

    Returns 1 when any of TRTCD1-3 equals 30.0 or 50.0, otherwise 0.
    Columns are inspected in order and the scan stops at the first match.
    """
    for column in ('TRTCD1', 'TRTCD2', 'TRTCD3'):
        for code in (30.0, 50.0):
            if row[column] == code:
                return 1
    return 0

In [None]:
def area(row):
    """
    Compute the circular cross-section area of a tree, scaled by TPA_UNADJ.

    The diameter is taken from DIA; when DIA is NaN it falls back to DIACALC,
    and when that is NaN too, to PREVDIA. The result is NaN if every
    candidate diameter (or TPA_UNADJ) is NaN.
    """
    dia = row['DIA']
    if np.isnan(dia):
        dia = row['DIACALC']
        if np.isnan(dia):
            dia = row['PREVDIA']
    radius = dia / 2
    return math.pi * (radius ** 2) * row['TPA_UNADJ']

In [None]:
def carbon(row):
    """
    Compute the carbon contributed by one tree record.

    Adds CARBON_AG and CARBON_BG and scales the sum by TPA_UNADJ.
    """
    per_tree = row['CARBON_AG'] + row['CARBON_BG']
    return per_tree * row['TPA_UNADJ']

In [None]:
class Plot(object):
    """
    All the trees measured for a particular year on a particular plot.

    Holds a dataframe with one row per tree (each tagged with its subplot)
    and computes plot-level totals (TPA, basal area, carbon) and per-species
    importance values. Totals are computed on first request and cached on
    the instance.
    """
    def __init__(self, trees, plot, dstrb, py):
        """
        trees: dataframe of tree rows for this plot and year
        plot:  row-like object indexable by 'LON' and 'LAT'
        dstrb: row-like object holding the DSTRBCD*/TRTCD* codes read by
               check_n and check_p
        py:    identifier for this plot/year, reported as 'py' in plot_stats
        """
        self.df = trees
        self.py = py
        # Cached totals; None means "not computed yet".
        self.na = None
        self.tpa = None
        self.ba = None
        self.carb = None
        self.lon = plot['LON']
        self.lat = plot['LAT']
        self.human_n = check_n(dstrb)
        self.human_p = check_p(dstrb)

    def calc_na(self):
        """
        Count the trees missing a TPA_UNADJ value.
        """
        if self.na is None:
            self.na = int(self.df['TPA_UNADJ'].isnull().sum())
        return self.na

    def calc_carb(self):
        """
        Total carbon for the plot; trees whose carbon is NaN are skipped.
        """
        if self.carb is None:
            total = 0.0
            for _, row in self.df.iterrows():
                value = carbon(row)
                if not np.isnan(value):
                    total += value
            self.carb = total
        return self.carb

    def calc_tpa(self):
        """
        Total TPA_UNADJ for the plot; missing values are skipped.
        """
        if self.tpa is None:
            total = 0.0
            for value in self.df['TPA_UNADJ']:
                if not np.isnan(value):
                    total += value
            self.tpa = total
        return self.tpa

    def calc_ba(self):
        """
        Total basal area for the plot; trees whose area is NaN are skipped.
        """
        if self.ba is None:
            total = 0.0
            for _, row in self.df.iterrows():
                value = area(row)
                if not np.isnan(value):
                    total += value
            self.ba = total
        return self.ba

    def calc_iv(self):
        """
        Calculates all importance values for species in this plot.

        A species' importance value is the mean of three shares: its share
        of the plot's TPA, its share of the plot's basal area, and its
        subplot frequency normalised over all species.

        NaN TPA/area values are left out of the per-species sums, exactly as
        calc_tpa and calc_ba leave them out of the plot totals, so one
        incomplete tree record cannot turn a species' value into NaN. When a
        plot total is zero the corresponding share is reported as 0.0.

        Returns:
        {
            'iv<spp1>': <impval1>,
            ...
        }
        """
        # Per-species accumulators:
        #   element 0 is the TPA sum
        #   element 1 is the basal-area sum
        #   element 2 is the set of subplots the species occurs on
        param_dd = defaultdict(lambda: [0.0, 0.0, set()])
        for _, row in self.df.iterrows():
            acc = param_dd[row['SPCD']]
            tpa = row['TPA_UNADJ']
            if not np.isnan(tpa):
                acc[0] += tpa
            basal = area(row)
            if not np.isnan(basal):
                acc[1] += basal
            acc[2].add(row['SUBP'])

        total_subp = set()
        for acc in param_dd.values():
            total_subp |= acc[2]
        n_subp = float(len(total_subp))

        # Fraction of the plot's subplots on which each species occurs.
        freqs = dict((spp, len(acc[2]) / n_subp) for spp, acc in param_dd.items())
        sum_freq = sum(freqs.values())

        total_tpa = self.calc_tpa()
        total_ba = self.calc_ba()
        iv_dd = dict()
        for spp, acc in param_dd.items():
            rel_tpa = acc[0] / total_tpa if total_tpa else 0.0
            rel_ba = acc[1] / total_ba if total_ba else 0.0
            rel_freq = freqs[spp] / sum_freq
            iv_dd['iv' + str(spp)] = (rel_tpa + rel_ba + rel_freq) / 3.0
        return iv_dd

    def plot_stats(self):
        """
        Returns a dictionary of all the plot stats.
        Can be used as a row in the dataframe used for clustering.
        Importance values are merged in under their 'iv<spp>' keys.
        """
        stats = {
            'py': self.py,
            'carb': self.calc_carb(),
            'samples': len(self.df.index),
            'na': self.calc_na(),
            'lon': self.lon,
            'lat': self.lat,
            'human_p': self.human_p,
            'human_n': self.human_n
        }
        stats.update(self.calc_iv())
        return stats

In [1]:
def _first_row_by_plot_year(df):
    """
    Map each (PLOT, INVYR) pair in df to the first row carrying that pair.
    """
    return dict(
        (key, rows.iloc[0]) for key, rows in df.groupby(['PLOT', 'INVYR'])
    )


def parse(state):
    """
    Download the raw FIA tables for a state and yield Plot objects.

    Reads the <state>_TREE, <state>_PLOT and <state>_COND CSV files, keeps
    the trees with STATUSCD == 1, and yields one Plot per (PLOT, INVYR)
    group of trees. This is a generator: nothing is downloaded until the
    first Plot is requested.

    Raises IndexError when a tree group has no matching PLOT or COND row.
    """
    TREES_WEB = "http://apps.fs.fed.us/fiadb-downloads/CSV/"+state+"_TREE.csv"
    PLOT_WEB = "http://apps.fs.fed.us/fiadb-downloads/CSV/"+state+"_PLOT.csv"
    DSTRB_WEB = "http://apps.fs.fed.us/fiadb-downloads/CSV/"+state+"_COND.csv"

    TREES_COLS = [
        'INVYR','PLOT','STATUSCD','CARBON_AG','CARBON_BG','TPA_UNADJ','DIA','PREVDIA','DIACALC','SPCD','SUBP'
    ]
    PLOT_COLS = ['INVYR', 'PLOT', 'LAT', 'LON']
    DSTRB_COLS = [
        'PLOT', 'INVYR', 'DSTRBCD1', 'DSTRBCD2', 'DSTRBCD3',
        'TRTCD1', 'TRTCD2', 'TRTCD3'
    ]

    trees_df = pd.read_csv(TREES_WEB, usecols=TREES_COLS, engine='c')
    plot_df = pd.read_csv(PLOT_WEB, usecols=PLOT_COLS, engine='c')
    dstrb_df = pd.read_csv(DSTRB_WEB, usecols=DSTRB_COLS, engine='c')

    trees_df = trees_df[trees_df.STATUSCD == 1]

    # Index the plot and condition tables once rather than re-scanning both
    # frames for every tree group. Only the first row per (PLOT, INVYR) is
    # used, so any additional rows for the same pair are ignored.
    plot_rows = _first_row_by_plot_year(plot_df)
    dstrb_rows = _first_row_by_plot_year(dstrb_df)

    for name, group in trees_df.groupby(['PLOT', 'INVYR']):
        if name not in plot_rows:
            raise IndexError(
                "no PLOT row for plot %s, inventory year %s" % (name[0], name[1])
            )
        if name not in dstrb_rows:
            raise IndexError(
                "no COND row for plot %s, inventory year %s" % (name[0], name[1])
            )
        yield Plot(
            group,
            plot_rows[name],
            dstrb_rows[name],
            name[0] * 10000 + name[1]
        )
            
def cluster_prep_file(plots, state):
    """
    Write the stats of the given Plot objects to a per-state CSV.

    plots: iterable of objects exposing plot_stats() (a generator is fine)
    state: state code, used for both the directory and the file name

    The file is written to data/<state>/<state>_1.csv; the directory is
    created when it does not exist yet. Missing values (for example species
    absent from a plot) are written as 0. Returns the output file name.
    """
    out_dir = 'data/' + state
    out_filename = out_dir + '/' + state + '_1.csv'
    # Create the target directory up front so a fresh checkout does not fail
    # at to_csv() after all the plots have already been processed.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    df = pd.DataFrame([p.plot_stats() for p in plots])
    df = df.fillna(0)
    df.to_csv(out_filename, index=False)
    return out_filename

In [None]:
# Build the clustering CSV for every listed state. parse() is a generator, so
# the Plot objects are only produced as cluster_prep_file() consumes them.
for STATE in ['ME']:
    plots = parse(STATE)
    cluster_prep_file(plots, STATE)