In [128]:
from pandas import DataFrame, read_csv
from collections import defaultdict
import pandas as pd
import numpy as np
import math

In [101]:
# Only the columns that later cells read are loaded from the tree table.
FIA_COLS = [
    'CN', 'PREV_TRE_CN',          # record id and the id of the same tree's earlier record
    'INVYR', 'PLOT', 'SUBP',      # year, plot and subplot used for grouping
    'SPGRPCD',                    # species key used when accumulating per-species totals
    'DIA', 'DIACALC', 'PREVDIA',  # diameter candidates, tried in this order by fetch_dia
    'TPA_UNADJ',                  # per-record weight added to the tree count
]

trees_csv = 'data/ME_TREE.csv'
trees_df = pd.read_csv(
    trees_csv,
    usecols=FIA_COLS,
)

In [102]:
# Fill each missing TPA_UNADJ with the value of the record its PREV_TRE_CN points to
# (0.0 when that record has not been seen). Rows are visited in frame order, so only
# records that appear earlier in the frame can be inherited from.
# This step takes a while (about 25 seconds on an Athena computer).
cn_tpa = defaultdict(float)
filled_tpa = []
for i, row in trees_df.iterrows():
    tpa = row['TPA_UNADJ']
    if np.isnan(tpa):
        # .get() keeps the 0.0 default without inserting a new entry on every miss
        # (cn_tpa[...] on a defaultdict would add one per unknown/NaN key).
        tpa = cn_tpa.get(row['PREV_TRE_CN'], 0.0)
    # Record the *filled* value: `row` is a snapshot taken before any write, so
    # storing row['TPA_UNADJ'] here would pass NaN down a chain of missing records.
    cn_tpa[row['CN']] = tpa
    filled_tpa.append(tpa)
# Write the column back in one step rather than mutating the frame while iterating
# over it (DataFrame.set_value no longer exists in current pandas).
trees_df['TPA_UNADJ'] = filled_tpa

In [103]:
# CN and PREV_TRE_CN were only needed to link records; remove them from the frame.
trees_df.drop(columns=['CN', 'PREV_TRE_CN'], inplace=True)

In [150]:
# Order rows by plot, then by year, so each plot-year forms one contiguous run of rows.
plot_year_keys = ['PLOT', 'INVYR']
trees_df.sort_values(plot_year_keys, inplace=True)

In [31]:
class pys(object):
    """The data of a species for a given plot and year.

    Attributes:
    * num: number of trees (sum of the ``tpa`` weights passed to ``update``)
    * ba: sum of basal areas, each added as pi * (dia / 2) ** 2 * tpa
    * subp: set of subplots in which the species appears
    """

    def __init__(self):
        self.num = 0
        self.ba = 0
        self.subp = set()

    def update(self, tpa, dia, subp):
        """Add one tree record to the running totals.

        tpa:  weight of the record; added to ``num`` and used to scale the basal area.
        dia:  diameter used for the basal-area term.
        subp: subplot identifier; added to the ``subp`` set.
        Returns None.
        """
        self.num += tpa
        # 2.0 keeps the radius a true division even for an integer diameter.
        self.ba += math.pi * ((dia / 2.0) ** 2) * tpa
        self.subp.add(subp)

    def iv(self, total):
        """Return the mean of this object's shares of ``total``.

        The three shares are num / total.num, ba / total.ba and the ratio of the
        subplot-set sizes. ``total`` is another ``pys`` holding the totals to
        compare against; a zero total is not handled here.
        """
        num_share = self.num / total.num
        ba_share = self.ba / total.ba
        # Both lengths are ints: float() prevents a truncating division on
        # interpreters where int / int floors (Python 2).
        subp_share = float(len(self.subp)) / len(total.subp)
        return (num_share + ba_share + subp_share) / 3.0

In [97]:
def fetch_dia(row):
    """Find the DIA parameter of a tree.

    Returns the first non-NaN value among row['DIA'] and row['DIACALC'];
    when both are NaN, row['PREVDIA'] is returned as is (even if it is NaN too).
    """
    for column in ('DIA', 'DIACALC'):
        candidate = row[column]
        if not np.isnan(candidate):
            return candidate
    return row['PREVDIA']

In [160]:
# Build one row per plot-year: total tree count (NUM), total basal area (BA) and one
# 'IV<species>' column per species key seen in that plot-year (plus 'IVTOTAL').
# Rows of trees_df must already be grouped by plot-year (the frame is sorted earlier).
def _plot_year_record(pys_by_species):
    """Turn the per-species accumulators of one plot-year into a flat dict."""
    total = pys_by_species['TOTAL']
    record = {'NUM': total.num, 'BA': total.ba}
    for spp, stats in pys_by_species.items():
        record['IV' + str(spp)] = stats.iv(total)
    return record


records = []      # one dict per finished plot-year
plot_years = []   # matching PLOT * 10000 + INVYR keys, used as the index
pys_dd = defaultdict(pys)
# Start with no current plot-year instead of reading label 0 of the frame: after
# sorting, label 0 is not necessarily the first row.
prev_py = None
# NOTE(review): iterrows is slow, but it is kept so that the values used in
# 'IV' + str(spp) keep the form they have today; switching to itertuples/groupby
# may change those column labels -- confirm before changing.
for i, row in trees_df.iterrows():
    cur_py = int(row['PLOT'] * 10000 + row['INVYR'])
    if prev_py is not None and cur_py != prev_py:
        # A new plot-year starts: emit the finished one and reset the accumulators.
        records.append(_plot_year_record(pys_dd))
        plot_years.append(prev_py)
        pys_dd = defaultdict(pys)
    prev_py = cur_py
    dia = fetch_dia(row)
    pys_dd[row['SPGRPCD']].update(row['TPA_UNADJ'], dia, row['SUBP'])
    pys_dd['TOTAL'].update(row['TPA_UNADJ'], dia, row['SUBP'])
if prev_py is not None:
    # Emit the last plot-year, which the loop never sees the end of.
    records.append(_plot_year_record(pys_dd))
    plot_years.append(prev_py)

# Build the frame once (appending frame by frame inside the loop copies all the
# data on every step) and keep the result of fillna: a species absent from a
# plot-year gets 0.
data_points = pd.DataFrame(records, index=plot_years).fillna(0)
data_points.head()

KeyboardInterrupt: 