In [None]:
import pandas as pd
from pandas import DataFrame, read_csv
import numpy as np
import bisect

In [None]:
def check(row):
    """
    Checks for human intervention in a plot.

    A record is reported as human intervention when any disturbance code
    column (DSTRBCD1-3) equals 80.0, or any treatment code column
    (TRTCD1-3) equals 10.0, 30.0 or 50.0.

    row: record indexable by those column names (e.g. a pandas Series
         produced by DataFrame.iterrows, or a dict).
    Returns True as soon as one column matches, False if none do.
    """
    disturbance_cols = ('DSTRBCD1', 'DSTRBCD2', 'DSTRBCD3')
    treatment_cols = ('TRTCD1', 'TRTCD2', 'TRTCD3')
    treatment_codes = (10.0, 30.0, 50.0)

    # Disturbance columns are examined first, then each treatment column
    # against every treatment code, stopping at the first match.
    for col in disturbance_cols:
        if row[col] == 80.0:
            return True
    for col in treatment_cols:
        for code in treatment_codes:
            if row[col] == code:
                return True
    return False

In [None]:
def clean(state):
    """
    Cleans the data for usage in the analysis.

    Reads 'data/<state>/<state>_1.csv', filters it and writes two files:

    * 'data/<state>/<state>_2a.csv' - the filtered records.
    * 'data/<state>/<state>_2b.csv' - the same records, with the plot part
      of 'py' re-numbered so that a new plot number starts at every record
      flagged as human intervention.

    The input must contain the columns 'py' (plot number * 10000 + year),
    'samples' and 'na', plus the columns whose names start with 'iv'.
    Disturbance and treatment codes are downloaded from the
    '<state>_COND.csv' table, so network access is required.

    state: state code used to build the file names and the download URL
           (e.g. 'ME').
    Returns None; all results are written to disk.
    """
    in_file = 'data/'+state+'/'+state+'_1.csv'
    out_file_a = 'data/'+state+'/'+state+'_2a.csv'
    out_file_b = 'data/'+state+'/'+state+'_2b.csv'
    dstrb_web = "http://apps.fs.fed.us/fiadb-downloads/CSV/"+state+"_COND.csv"
    data_points = pd.read_csv(in_file)

    #Remove entries before the year 1999
    MIN_YR = 1999
    data_points = data_points[data_points['py'] % 10000 >= MIN_YR]
    #Remove entries with few trees
    MIN_TREES = 5
    data_points = data_points[data_points['samples'] - data_points['na'] >= MIN_TREES]
    #Remove entries with too many invalid trees
    NA_THRESHOLD = 5
    data_points = data_points[data_points['na'] < NA_THRESHOLD]

    #Keep only most important species
    MIN_IV = 0.7
    keep_cols = [col for col in list(data_points) if not col.startswith('iv')]
    col_iv = [col for col in list(data_points) if col.startswith('iv')]
    sorted_iv = data_points[col_iv].apply(sum, axis=0).sort_values(ascending=False)
    #bisect indexes its argument by position, so give it a plain array rather
    #than a Series whose index holds column labels
    cumulative_iv = np.cumsum(sorted_iv.to_numpy())
    #The threshold assumes the iv columns of each row sum to about 1 - TODO confirm
    cutoff = bisect.bisect_left(cumulative_iv, len(data_points.index) * MIN_IV) + 1
    #Add 1 to the cutoff so the total IV is guaranteed to be over MIN_IV,
    #but never ask for more iv columns than exist
    cutoff = min(cutoff, len(sorted_iv))
    keep_cols.extend(sorted_iv.index[:cutoff])
    data_points = data_points.loc[:, keep_cols]
    #Drop samples and na.  They're not necessary anymore
    data_points = data_points.drop(['samples','na'], axis = 1)
    data_points.to_csv(out_file_a, index=False)

    DSTRB_COLS = [
        'PLOT', 'INVYR', 'DSTRBCD1', 'DSTRBCD2', 'DSTRBCD3',
        'TRTCD1', 'TRTCD2', 'TRTCD3'
    ]
    disturbance = pd.read_csv(dstrb_web, usecols=DSTRB_COLS)

    #Mark plots where human intervention happened
    data_points = data_points.set_index('py')
    data_points['human'] = 0
    for _, row in disturbance.iterrows():
        #Same encoding as the 'py' column: plot number * 10000 + year
        py = int(row['INVYR'] + row['PLOT'] * 10000)
        if py in data_points.index and check(row):
            data_points.loc[py, 'human'] = 1
    #reset_index moves 'py' back to a regular (first) column
    data_points = data_points.reset_index()
    data_points['py'] = data_points['py'].apply(int)

    #Re-number the plots so that human interventions are not applied
    #Only the previous record is compared, so records of one plot are
    #assumed to be adjacent in the input - TODO confirm
    if len(data_points.index) > 0:
        py_values = data_points['py'].tolist()
        human_flags = data_points['human'].tolist()
        renumbered = []
        cur_np = 1
        prev_id = py_values[0] // 10000
        for py, human in zip(py_values, human_flags):
            plot_id = py // 10000
            #A new plot number starts when the plot changes or when the
            #record is flagged as human intervention
            if plot_id != prev_id or human == 1:
                cur_np += 1
            prev_id = plot_id
            renumbered.append(cur_np * 10000 + py % 10000)
        data_points['py'] = renumbered
    data_points = data_points.drop(['human'], axis = 1)
    data_points.to_csv(out_file_b, index=False)

In [None]:
# Run the cleaning step for every state selected for this run.
for STATE in ('ME',):
    clean(STATE)

In [None]:
# Bare tuple of two-letter codes; evaluating the cell only displays it.
# NOTE(review): presumably the candidate values for the STATE loop - confirm.
# AK, HI, IN and WV do not appear in this list.
'AL','AZ','AR','CA','CO','CT','DE','DC','FL','GA','ID','IL','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VT','VA','WA','WI','WY'