In [246]:
# import necessary python modules
import csv
import pandas as pd
import numpy as np

### Create new data frames and import raw data
---

In [247]:
# create a list of the states we'll be analyzing, will be used frequently
statelist = ["AZ", "CA", "CO", "ID", "MT", "NM", "NV", "OR", "UT", "WY"]

In [248]:
# Create a new empty DataFrame for each table
allotments = pd.DataFrame()
operators = pd.DataFrame()
permits = pd.DataFrame()
health = pd.DataFrame()

#create a variable storing the path for each set of raw files
allotments_path = 'data/rangeland-administration-system/allotment-info/'
operators_path = 'data/rangeland-administration-system/operator-info/'
permits_path = 'data/rangeland-administration-system/permit-schedule-info/'
health_path = 'data/rangeland-health/'

# Import those files for allotments ... (probably should be a function)
for s in statelist:
    csv_file = '{}{}.csv'.format(allotments_path, s)
    new_data = pd.read_csv(csv_file, dtype={'Allotment Number': 'object', 'Auth No': 'object'})
    allotments = allotments.append(new_data)

# ... and for operators
for s in statelist:
    csv_file = '{}{}.csv'.format(operators_path, s)
    new_data = pd.read_csv(csv_file, dtype={'Allotment Number': 'object', 'Auth No': 'object'})
    operators = operators.append(new_data)

# ... and for permits
for s in statelist:
    csv_file = '{}{}.csv'.format(permits_path, s)
    new_data = pd.read_csv(csv_file, dtype={'Allotment Number': 'object', 'Auth No': 'object'})
    permits = permits.append(new_data)

# ... and for land health standards
for s in statelist:
    csv_file = '{}{}.csv'.format(health_path, s)
    new_data = pd.read_csv(csv_file, dtype={'Allotment Number': 'object', 'Auth No': 'object'})
    health = health.append(new_data)

### Process Field Offices table
---

Creating a unique entry for each field office with an ID that can be referenced by other tables

In [249]:
# create a new df with field office info
field_offices = allotments[["Admin Office", "Field Office"]]

In [250]:
#  select only uniques from the DB
field_offices.drop_duplicates('Admin Office', inplace = True)

In [251]:
# save and re-read as a CSV for a janky but fast way of generating unique IDs starting at 1
field_offices.to_csv('data/processed/field_offices.csv')
field_offices = pd.read_csv('data/processed/field_offices.csv')

In [252]:
# Create a new column for field office state.
field_offices["State"] = field_offices["Admin Office"]
field_offices["State"] = field_offices["State"].str[2:4]

In [253]:
field_offices.loc[field_offices['State'] == 'AZ', 'StateCode'] = 1
field_offices.loc[field_offices['State'] == 'CA', 'StateCode'] = 2
field_offices.loc[field_offices['State'] == 'CO', 'StateCode'] = 3
field_offices.loc[field_offices['State'] == 'ID', 'StateCode'] = 4
field_offices.loc[field_offices['State'] == 'MT', 'StateCode'] = 5
field_offices.loc[field_offices['State'] == 'NM', 'StateCode'] = 6
field_offices.loc[field_offices['State'] == 'NV', 'StateCode'] = 7
field_offices.loc[field_offices['State'] == 'OR', 'StateCode'] = 8
field_offices.loc[field_offices['State'] == 'UT', 'StateCode'] = 9
field_offices.loc[field_offices['State'] == 'WY', 'StateCode'] = 10

field_offices['StateCode'] = field_offices['StateCode'].astype('int64')

In [254]:
field_offices['id'] = field_offices.index
field_offices['id'] = field_offices['id'] + 1

In [255]:
field_offices=field_offices.rename(columns = {'Admin Office':'office_code', 'Field Office': 'office_name', 'StateCode': 'state_id'})

In [256]:
field_offices = field_offices[['id', 'office_code', 'office_name', 'state_id']]
field_offices.to_csv("data/processed/field_offices.csv")

### Process operators table
---

Create a table of operators linked to field offices, and with a unique ID that can be referencecd by other tables

In [288]:
#rename operators columsn to remove spaces and capital
operators=operators.rename(columns = {'Off CD': 'office_code', 'Auth No':'auth_no', 'Operator Display Name': 'operator_display_name', 'Address1': 'address1', 'Address2': 'address2', 'City': 'city', 'Phone Number': 'phone_number', 'Release Text': 'release_text', 'Zipcode1 5': 'zipcode15', 'Zipcode6 9': 'zipcode69'})

In [259]:
#assign an id based on the index, but skip the 0
operators["id"] = operators.index + 1

In [260]:
#create a new dataframe that joins operators with field offices
new_ops = pd.merge(operators, field_offices, on='office_code', how='inner')

In [261]:
#that worked great now overwrite operators with that same data
operators = new_ops

In [269]:
#rename the two different id fields so we've got what we want.
operators['id'] = operators['id_x'].astype(int)
operators['field_office_id'] = operators['id_y'].astype(int)

In [273]:
# shed the data we don't want by reassigning the variable name 'operators' new a new dataframe with only these columns selected
operators = operators[['id', 'auth_no', 'operator_display_name', 'address1', 'address2', 'city', 'zipcode15', 'zipcode69', 'ST2', 'phone_number', 'release_text', 'field_office_id']]

In [287]:
#write that new datafame out to a CSV that can easily be uploaded into Django
operators.to_csv('data/processed/operators.csv')

### Process allotments table
---

Create a table of operators linked to field offices and operators, along witha unique ID that can be referencecd by other tables such as "health" and "boundary"

In [289]:
allotments=allotments.rename(columns = {'Admin Office': 'office_code', '' })

Unnamed: 0,Adm State,Admin Office,Field Office,Allotment Number,Allotment Name,Available For Grazing,Grazing Decision,Public Acres,Amp Text,Amp Implement Date,Management Stat Text,Auth No,Permitted Aums,Suspended Aums,Susp Use Temp
0,AZ,LLAZA01000,ARIZONA STRIP FO,00096,HERD HOUSE,Y,,2390,AMP IMPLEMENTED,11/01/1981,MAINTAIN CATEGORY,0201014,95,0,
1,AZ,LLAZA01000,ARIZONA STRIP FO,00097,TUCKUP,Y,,16276,,,MAINTAIN CATEGORY,0201965,1075,4,0
2,AZ,LLAZA01000,ARIZONA STRIP FO,00099,ROCK CANYON,Y,,1343,,,CUSTODIAL CATEGORY,0201955,126,67,0
3,AZ,LLAZA01000,ARIZONA STRIP FO,00114,HURRICANE RIM,Y,,33099,AMP IMPLEMENTED,08/30/1983,MAINTAIN CATEGORY,0201502,3424,347,
4,AZ,LLAZA01000,ARIZONA STRIP FO,00119,BIG WARREN,Y,,9647,AMP IMPLEMENTED,06/27/1991,IMPROVE CATEGORY,0200111,704,74,
5,AZ,LLAZA01000,ARIZONA STRIP FO,02012,LIME SPRING,Y,,3596,,,IMPROVE CATEGORY,,,,
6,AZ,LLAZA01000,ARIZONA STRIP FO,04804,WHITEROCK-SOAPSTONE,Y,,18284,AMP IMPLEMENTED,03/01/1971,MAINTAIN CATEGORY,0201013,660,,
7,AZ,LLAZA01000,ARIZONA STRIP FO,04804,WHITEROCK-SOAPSTONE,Y,,18284,AMP IMPLEMENTED,03/01/1971,MAINTAIN CATEGORY,0201959,660,0,0
8,AZ,LLAZA01000,ARIZONA STRIP FO,04805,COYOTE SPRING,Y,,20992,,,IMPROVE CATEGORY,0201014,1359,484,
9,AZ,LLAZA01000,ARIZONA STRIP FO,04808,MAINSTREET,Y,,156742,AMP IMPLEMENTED,07/01/1974,MAINTAIN CATEGORY,0201005,14535,3482,


In [290]:
#save a copy of all the states to a new csv just because
allotments.to_csv('data/rangeland-administration-system/allotment-info/all_states.csv')

In [295]:
#create a new column with unique id for allotments based on state and allotment number (allotment numbers are unique within states according to BLM documentation)
allotments['allotment_unique'] = allotments['Adm State'] + allotments['Allotment Number']

In [299]:
#check the first few lines to make sure we're all good
allotments["allotment_unique"][:5]

0    AZ00096
1    AZ00097
2    AZ00099
3    AZ00114
4    AZ00119
Name: allotment_unique, dtype: object

In [321]:
# create a new dataframe and drop the duplicate allotment numbers out of it
allotments_trimmed = allotments[['allotment_unique', 'Admin Office', 'Allotment Name', 'Allotment Number', 'Available For Grazing', 'Grazing Decision', 'Public Acres', 'Amp Text', 'Amp Implement Date', 'Management Stat Text']]

In [322]:
# drop the duplicates out of it
allotments_trimmed.drop_duplicates('allotment_unique', inplace = True)

In [344]:
# rename columns to get rid of messy spaces and capitals
allotments_trimmed = allotments_trimmed.rename(columns={'Admin Office': 'office_code', 'Allotment Number': 'allotment_number', 'Allotment Name': 'allotment_name', 'Available For Grazing': 'available_for_grazing', 'Grazing Decision': 'grazing_decision', 'Public Acres': 'public_acres', 'Amp Text': 'amp_text', 'Amp Implement Date': 'amp_implement_date', 'Management Stat Text': 'management_stat_text'})

In [345]:
# check it out to make sure we're still good ... 
allotments_trimmed[:5]

Unnamed: 0,allotment_unique,office_code,allotment_name,allotment_number,available_for_grazing,grazing_decision,public_acres,amp_text,amp_implement_date,management_stat_text
0,AZ00096,LLAZA01000,HERD HOUSE,96,Y,,2390,AMP IMPLEMENTED,11/01/1981,MAINTAIN CATEGORY
1,AZ00097,LLAZA01000,TUCKUP,97,Y,,16276,,,MAINTAIN CATEGORY
2,AZ00099,LLAZA01000,ROCK CANYON,99,Y,,1343,,,CUSTODIAL CATEGORY
3,AZ00114,LLAZA01000,HURRICANE RIM,114,Y,,33099,AMP IMPLEMENTED,08/30/1983,MAINTAIN CATEGORY
4,AZ00119,LLAZA01000,BIG WARREN,119,Y,,9647,AMP IMPLEMENTED,06/27/1991,IMPROVE CATEGORY


In [369]:
# join it with field office
allotments_with_field_office = pd.merge(allotments_trimmed, field_offices, on='office_code', how='inner')

In [370]:
#strip the comma out of "public acres" and convert it to a float field so we can calc on it. Then describe() just to make sure it worked
allotments_with_field_office['public_acres'] = allotments_with_field_office['public_acres'].str.replace(',', '')
allotments_with_field_office['public_acres'] = allotments_with_field_office['public_acres'].astype(float)
allotments_with_field_office['public_acres'].describe()

count      21205.000000
mean        7286.288847
std        30588.340811
min            0.000000
25%          240.000000
50%          960.000000
75%         4091.000000
max      1459993.000000
Name: public_acres, dtype: float64

In [371]:
allotments_with_field_office = allotments_with_field_office.rename(columns={'id': 'field_office_id'})

In [378]:
#reassign the variable name to a data frame selecting only the fields we want for our table
allotments_with_field_office = allotments_with_field_office[['allotment_unique', 'allotment_number', 'allotment_name', 'available_for_grazing', 'grazing_decision', 'public_acres', 'amp_text', 'management_stat_text', 'field_office_id']]

In [379]:
#convert 'amp_implement_date' into a Django-friendly format, then check it with the first five rows jsut to make sure
allotments_with_field_office['amp_implement_date'] = allotments_with_field_office['amp_implement_date'].str.replace('/', '-')
allotments_with_field_office['amp_implement_date'] = allotments_with_field_office['amp_implement_date'] + ' 00:00:00'
allotments_with_field_office['amp_implement_date'][:5]

KeyError: 'amp_implement_date'

In [380]:
# assign each allotment a unique id
allotments_with_field_office['id'] = allotments_with_field_office.index + 1
allotments_with_field_office['id'][:5]

0    1
1    2
2    3
3    4
4    5
Name: id, dtype: int64

In [381]:
allotments_with_field_office.to_csv('data/processed/allotments.csv')