In [None]:
from pandas import DataFrame, read_csv
from collections import defaultdict
import pandas as pd
import numpy as np

location = 'data/ME_TREE.csv'
#Read only the tree-table columns named below; every other CSV column is skipped
trees_df = pd.read_csv(
    location,
    usecols=['INVYR','PLOT','SUBP','SPGRPCD','DIA','DIACALC','PREVDIA'],
)

In [None]:
#All defaultdicts are indexed plot number->year->(species)->value
#plot_n stores the number of trees (density)
#plot_a stores the sum of the basal areas (area)
#plot_s stores the subplots where a species appears
#plot_st stores the subplots in which a plot is divided into
#The 'TOTAL' key of plot_n and plot_a holds the value summed over all species
plot_n=defaultdict(lambda:defaultdict(lambda:defaultdict(int)))
plot_a=defaultdict(lambda:defaultdict(lambda:defaultdict(float)))
plot_s=defaultdict(lambda:defaultdict(lambda:defaultdict(set)))
plot_st=defaultdict(lambda:defaultdict(set))

#Diameter of each tree: DIA when present, otherwise DIACALC, otherwise PREVDIA
diameters=trees_df['DIA'].fillna(trees_df['DIACALC']).fillna(trees_df['PREVDIA'])
#Area of a circle with that diameter, computed for every tree at once
basal_areas=np.pi*diameters*diameters/4
#NOTE(review): a tree with all three diameters missing still contributes NaN, which
#turns the sums of its plot/year into NaN - confirm whether such trees should be skipped

#Walking the columns in parallel avoids building one Series per row (iterrows).
#Keys keep each column's own dtype; int and float keys that compare equal hash alike.
for plot,year,species,subplot,area in zip(trees_df['PLOT'],trees_df['INVYR'],
                                          trees_df['SPGRPCD'],trees_df['SUBP'],
                                          basal_areas):
    plot_n[plot][year]['TOTAL']+=1
    plot_n[plot][year][species]+=1
    plot_s[plot][year][species].add(subplot)
    plot_st[plot][year].add(subplot)
    plot_a[plot][year]['TOTAL']+=area
    plot_a[plot][year][species]+=area

In [None]:
#plot_f stores the frequency of a species (subplots it appears in)/(number of subplots)
#plot_f[plot][year]['TOTAL'] accumulates the sum of those frequencies over all species
plot_f=defaultdict(lambda:defaultdict(lambda:defaultdict(float)))
#.items() (instead of the Python-2-only .iteritems()) runs on Python 2 and Python 3
for plot,by_year in plot_s.items():
    for year,by_species in by_year.items():
        #float() keeps the division below a true division on Python 2 as well
        n_subplots=float(len(plot_st[plot][year]))
        for species,subplots in by_species.items():
            frequency=len(subplots)/n_subplots
            plot_f[plot][year][species]=frequency
            plot_f[plot][year]['TOTAL']+=frequency

In [None]:
#plot_i stores the importance value of a species (average of relative frequency, relative area and relative density)
def _add_relative_share(importance,measure):
    """Add one third of each species' share of a measure to its importance value.

    measure is indexed plot->year->species->value and is expected to hold a
    'TOTAL' entry for every plot/year. importance is indexed the same way and is
    updated in place; the 'TOTAL' entry itself is never copied into it.
    """
    for plot,by_year in measure.items():
        for year,by_species in by_year.items():
            #float() keeps integer tree counts from truncating the division on Python 2
            denominator=float(3*by_species['TOTAL'])
            for species,value in by_species.items():
                if species!='TOTAL':
                    importance[plot][year][species]+=value/denominator

plot_i=defaultdict(lambda:defaultdict(lambda:defaultdict(float)))
#Relative density, relative area and relative frequency each contribute one third
for measure in (plot_n,plot_a,plot_f):
    _add_relative_share(plot_i,measure)

In [None]:
#s_spec stores the numbers being used as species codes
#n_rows stores plot*time, i.e. the number of rows the frame will need
s_spec=set()
n_rows=0
#.values() is enough here: only the nested dictionaries are needed, not their keys
for by_year in plot_f.values():
    #one row per (plot, year) pair
    n_rows+=len(by_year)
    for by_species in by_year.values():
        #'TOTAL' is the per-plot/year sum, not a species code
        s_spec.update(code for code in by_species if code!='TOTAL')

In [None]:
#l_data are the columns of the frame (number of trees, area and each of the importance values)
#Species codes are sorted so the IV columns come out in a stable, reproducible order
#rather than in whatever order the set happens to iterate
l_data=['PLOT','YEAR','TREES','AREA']
l_data.extend('IV'+str(int(num)) for num in sorted(s_spec))

In [None]:
#plots_df is the frame containing number of trees, area and importance values
#One record per plot/year is collected first and the frame is built once from the list
records=[]
for plot,by_year in plot_i.items():
    for year,by_species in by_year.items():
        #Every column starts at 0 so species absent from this plot/year keep IV 0
        record=dict.fromkeys(l_data,0)
        record['PLOT']=plot
        record['YEAR']=year
        record['TREES']=plot_n[plot][year]['TOTAL']
        record['AREA']=plot_a[plot][year]['TOTAL']
        #Importance values of this plot AND year. Indexing plot_i[plot] by a species
        #code instead would insert a new key while plot_i[plot] is being iterated.
        for species,iv in by_species.items():
            record['IV'+str(int(species))]=iv
        records.append(record)
#columns=l_data fixes the column order (and yields an empty frame when there are no records)
plots_df=DataFrame(records,columns=l_data)
plots_df