In [1]:
import pandas as pd
import os 
import glob

In [2]:
# Change this year to match the excel file you are adding.
# It is kept as a string because it is concatenated into the workbook filename further down.
year = '2008'

In [3]:
# Load the running (cumulative) datasets that the selected year will be added to.
# When a CSV does not exist yet, start from an empty frame instead.
cumltowntax = (
    pd.read_csv("cumltowntaxincome.csv")
    if os.path.isfile('cumltowntaxincome.csv')
    else pd.DataFrame()
)

cumltowntotal = (
    pd.read_csv("cumltowntotal.csv")
    if os.path.isfile('cumltowntotal.csv')
    else pd.DataFrame()
)


In [4]:
# Report, for each running dataset, either that it is being started fresh
# or which years it already contains.
status_checks = (
    (cumltowntax, 'New Town Dataframe Created', "Years already stored for Towns:"),
    (cumltowntotal, 'New County Dataframe Created', "Years already stored for County:"),
)

for running, created_msg, stored_msg in status_checks:
    if running.empty:
        print(created_msg)
    else:
        print(stored_msg + str(pd.unique(running['Year'])))

Years already stored for Towns:[2005 2006 2007]
Years already stored for County:[2005 2006 2007]


In [5]:
# List the .xls workbooks in the current directory to see which yearly files are available
glob.glob('./*.xls')

['.\\townGEOIDs.xls',
 '.\\town_income_2005_detail_report.xls',
 '.\\town_income_2006_detail_report.xls',
 '.\\town_income_2007_detail_report.xls',
 '.\\town_income_2008_detail_report.xls',
 '.\\town_income_2009_detail_report.xls',
 '.\\town_income_2010_detail_report.xls',
 '.\\town_income_2011_detail_report.xls',
 '.\\town_income_2012_detail_report.xls',
 '.\\town_income_2013_detail_report.xls',
 '.\\town_income_2014_detail_report.xls',
 '.\\town_income_2015_detail_report.xls']

In [6]:
# Open the workbook for the selected year and print the names of its sheets
workbook_path = 'town_income_' + year + '_detail_report.xls'
data = pd.ExcelFile(workbook_path)
colList = data.sheet_names
print(colList)

['Sheet1', 'Sheet2', 'Sheet3']


In [7]:
# Parse 'Sheet1' into its own dataframe, skipping the first two rows of the sheet
towntax = data.parse('Sheet1',  skiprows=2)

## Rename Columns

Because shapefiles, the final format for this data, have a limit of 10 characters for headers, it makes sense to abbreviate the column names now, both to save on typing out the names and to make the data friendly when joining it later.

In [8]:
# Abbreviated headers (10 characters or fewer), listed in the same order as the sheet's columns
towntaxCol = [
    'AGIClss',
    'Return',
    'Exempt',
    'MarJnt',
    'Single',
    'MarSep',
    'HdHous',
    'AGI',
    'FedTI',
    'VTI',
    'NetVTax',
]

# Positional rename: the list length must match the number of parsed columns
towntax.columns = towntaxCol

In [9]:
# Inspect the last rows of the sheet. The excel file contains some footnotes; they are worth
# noting, but they are cut from the final file later on.
towntax.tail()

Unnamed: 0,AGIClss,Return,Exempt,MarJnt,Single,MarSep,HdHous,AGI,FedTI,VTI,NetVTax
4354,40000 - 49999,15.0,40.0,11.0,4.0,0.0,0.0,679107.0,391527.0,403787.0,12064.0
4355,50000 - 74999,19.0,47.0,13.0,5.0,0.0,1.0,1138297.0,731828.0,726040.0,23379.6456
4356,75000 +,58.0,149.0,49.0,9.0,0.0,0.0,10926915.0,8806367.0,7819530.0,412271.1453
4357,,,,,,,,,,,
4358,Grand Total,208.0,391.0,107.0,91.0,3.0,7.0,10472092.0,10366625.0,9387508.0,461630.9895


In [10]:
def removeNA(dfname):
    """Return the rows of ``dfname`` whose first column is not null.

    Used to drop the blank spacer rows of the spreadsheet. The original row
    index is preserved; the caller resets it afterwards.
    """
    has_label = dfname.iloc[:, 0].notna()
    return dfname.loc[has_label]
 
# Drop the blank rows, then renumber the remaining rows from 0
towntax = removeNA(towntax).reset_index(drop=True)

## Add Columns and Town Names

The original excel files did not place the town name on each line but rather at the top of each section of Adjusted Gross Income breakdowns. This means that the data cannot be easily sorted, or joined to a geospatial dataset, without some intense modification. First, a copy of the AGI Class column needs to be created so that it can be turned into the Town Names.

In [11]:
# Tag every row with the year being processed. It is stored as an integer so the column
# matches the numeric Year values read back from the running CSVs; assigning the raw string
# would leave a mix of numbers and text in the combined tables.
towntax['Year'] = int(year)
# Start the Town column as a copy of AGIClss; the town-header rows in it are used below to
# work out which town each income-bracket row belongs to.
towntax['Town'] = towntax['AGIClss']

In [12]:
# Labels that can appear in AGIClss but are NOT town names. This list is maintained by hand
# and might need to be updated if the income brackets change in the future.
delvalue = (
    # finer-grained income brackets
    [
        'Loss or None',
        '0.01 - 4999',
        '5000 - 9999',
        '10000 - 14999',
        '15000 - 19999',
        '20000 - 24999',
        '25000 - 29999',
        '30000 - 34999',
        '35000 - 39999',
        '40000 - 44999',
        '45000 - 49999',
        '50000 - 59999',
        '60000 - 74999',
        '75000 - 99999',
        '100000 - 149999',
        '150000 +',
    ]
    # coarser income brackets
    + [
        '0.01 - 9999',
        '10000 - 19999',
        '20000 - 29999',
        '30000 - 39999',
        '40000 - 49999',
        '50000 - 74999',
        '75000 +',
    ]
    # per-town summary row
    + ['Grand Total']
)


In [13]:
# Build the list of town names: every label in the Town column that is not in delvalue
townlist = []
for label in towntax['Town']:
    if label in delvalue:
        continue
    townlist.append(label)
townlist

['Addison',
 'Albany',
 'Alburgh',
 'Andover',
 'Arlington',
 'Athens',
 'Averill',
 "Avery's Gore",
 'Bakersfield',
 'Baltimore',
 'Barnard',
 'Barnet',
 'Barre City',
 'Barre Town',
 'Barton',
 'Belvidere',
 'Bennington',
 'Benson',
 'Berkshire',
 'Berlin',
 'Bethel',
 'Bloomfield',
 'Bolton',
 'Bradford',
 'Braintree',
 'Brandon',
 'Brattleboro',
 'Bridgewater',
 'Bridport',
 'Brighton',
 'Bristol',
 'Brookfield',
 'Brookline',
 'Brownington',
 'Brunswick',
 "Buel's Gore",
 'Burke',
 'Burlington',
 'Cabot',
 'Calais',
 'Cambridge',
 'Canaan',
 'Castleton',
 'Cavendish',
 'Charleston',
 'Charlotte',
 'Chelsea',
 'Chester',
 'Chittenden',
 'Clarendon',
 'Colchester',
 'Concord',
 'Corinth',
 'Cornwall',
 'Coventry',
 'Craftsbury',
 'Danby',
 'Danville',
 'Derby',
 'Dorset',
 'Dover',
 'Dummerston',
 'Duxbury',
 'East Haven',
 'East Montpelier',
 'Eden',
 'Elmore',
 'Enosburg',
 'Essex Junction',
 'Essex Town',
 'Fair Haven',
 'Fairfax',
 'Fairfield',
 'Fairlee',
 'Fayston',
 'Ferdinan

In [14]:
# Carry each town name down onto the rows beneath it: walk the Town column in order, and
# whenever a label is a town name remember it; every row (the town-header row itself and the
# bracket rows after it) receives the most recently seen town name.
#
# The labels are iterated directly rather than copied into a variable called `data`, so the
# ExcelFile handle stored in `data` is not overwritten and the parsing cell can be re-run.
town_names = set(townlist)  # set lookup avoids scanning the whole town list for every row

newname = []
current_town = None  # no town header seen yet
for label in towntax['Town']:
    if label in town_names:
        current_town = label
    elif current_town is None:
        # A bracket/total row appeared before any town header, so there is no name to carry down
        raise ValueError(
            "Row labelled %r appears before any town name; check the sheet layout" % (label,)
        )
    newname.append(current_town)
    
    

In [15]:
# Replace the Town column (until now a copy of AGIClss) with the town name worked out for each row
towntax['Town'] = newname

In [16]:
# Originally each town name sat on its own row with no figures. Keeping only the rows that
# have a 'Return' value removes those header rows and leaves valid information; it also
# removes the footnotes.
has_returns = towntax['Return'].notna()
towntax = towntax.loc[has_returns]
towntax.head(25)

Unnamed: 0,AGIClss,Return,Exempt,MarJnt,Single,MarSep,HdHous,AGI,FedTI,VTI,NetVTax,Year,Town
1,Loss or None,20,34.0,11.0,8.0,1.0,0.0,-599714.0,0.0,139.0,96.0,2008,Addison
2,0.01 - 4999,88,69.0,4.0,77.0,1.0,6.0,242664.0,6298.0,6232.0,221.0,2008,Addison
3,5000 - 9999,86,75.0,9.0,75.0,0.0,2.0,628333.0,54860.0,54563.0,1873.183,2008,Addison
4,10000 - 14999,79,114.0,14.0,59.0,2.0,4.0,980665.0,215668.0,215242.0,7409.432,2008,Addison
5,15000 - 19999,75,99.0,8.0,62.0,0.0,5.0,1315829.0,545329.0,543534.0,17942.71,2008,Addison
6,20000 - 24999,77,127.0,17.0,50.0,3.0,7.0,1753134.0,772268.0,758372.0,26439.86,2008,Addison
7,25000 - 29999,56,97.0,18.0,32.0,0.0,6.0,1552839.0,730763.0,727141.0,25297.08,2008,Addison
8,30000 - 34999,53,110.0,24.0,23.0,3.0,3.0,1736929.0,838562.0,823500.0,29182.97,2008,Addison
9,35000 - 39999,56,92.0,11.0,35.0,3.0,7.0,2094095.0,1297535.0,1293260.0,44939.0,2008,Addison
10,40000 - 44999,53,118.0,26.0,22.0,2.0,3.0,2265374.0,1218335.0,1206181.0,41905.62,2008,Addison


## Column Order

In order for this data to be intuitive, changing the column order is necessary. Putting the year first, then the town, and then the AGI Class means that this data can be cut numerous ways.

In [17]:
# Split the per-town "Grand Total" rows into their own table and keep only the
# income-bracket rows in towntax
is_grand_total = towntax['AGIClss'] == "Grand Total"
towntotal = towntax[is_grand_total]
towntax = towntax[~is_grand_total]

In [18]:
# Show the current column names of the totals table before its first column is renamed
towntotal.columns

Index(['AGIClss', 'Return', 'Exempt', 'MarJnt', 'Single', 'MarSep', 'HdHous',
       'AGI', 'FedTI', 'VTI', 'NetVTax', 'Year', 'Town'],
      dtype='object')

In [19]:
# In the totals table the first column holds "Grand Total" rather than an income bracket,
# so it is labelled 'Type' instead of 'AGIClss'; all other headers stay the same.
total_headers = [
    'Type',
    'Return', 'Exempt', 'MarJnt', 'Single', 'MarSep', 'HdHous',
    'AGI', 'FedTI', 'VTI', 'NetVTax',
    'Year', 'Town',
]
towntotal.columns = total_headers

In [20]:
# Reorder the columns so that Year and Town come first, followed by the row label
# (AGIClss or Type) and then the figures
value_columns = ['Return', 'Exempt', 'MarJnt', 'Single', 'MarSep', 'HdHous',
                 'AGI', 'FedTI', 'VTI', 'NetVTax']

towntax = towntax[['Year', 'Town', 'AGIClss'] + value_columns]
towntotal = towntotal[['Year', 'Town', 'Type'] + value_columns]

In [21]:
# Spot-check the end of the income-bracket table after the columns were reordered
towntax.tail(25)

Unnamed: 0,Year,Town,AGIClss,Return,Exempt,MarJnt,Single,MarSep,HdHous,AGI,FedTI,VTI,NetVTax
4068,2008,Woodstock,150000 +,129,316.0,105.0,20.0,2.0,2.0,45320905.0,35970438.0,31570637.0,1924407.0
4071,2008,Worcester,Loss or None,9,11.0,3.0,6.0,0.0,0.0,-146604.0,0.0,0.0,0.0
4072,2008,Worcester,0.01 - 4999,50,51.0,9.0,39.0,0.0,2.0,141237.0,-5282.0,5012.0,185.0
4073,2008,Worcester,5000 - 9999,55,49.0,6.0,45.0,2.0,2.0,426436.0,47120.0,52701.0,1920.0
4074,2008,Worcester,10000 - 14999,35,60.0,8.0,20.0,2.0,5.0,440937.0,70277.0,70424.0,2625.0
4075,2008,Worcester,15000 - 19999,26,43.0,7.0,13.0,1.0,5.0,450872.0,122315.0,125496.0,4519.0
4076,2008,Worcester,20000 - 24999,22,54.0,11.0,6.0,0.0,5.0,499791.0,118365.0,115739.0,4165.0
4077,2008,Worcester,25000 - 29999,27,53.0,9.0,13.0,0.0,5.0,742310.0,302888.0,326817.0,10041.93
4078,2008,Worcester,30000 - 34999,29,49.0,10.0,15.0,0.0,4.0,938645.0,513356.0,513309.0,17377.78
4079,2008,Worcester,35000 - 39999,27,69.0,14.0,10.0,0.0,3.0,1013736.0,489082.0,486320.0,17418.0


In [22]:
# Spot-check the end of the Grand Total table after the columns were reordered
towntotal.tail(25)

Unnamed: 0,Year,Town,Type,Return,Exempt,MarJnt,Single,MarSep,HdHous,AGI,FedTI,VTI,NetVTax
3729,2008,West Haven,Grand Total,136,251.0,66.0,62.0,0.0,8.0,5988478.0,3935268.0,3602176.0,156326.3
3747,2008,West Rutland,Grand Total,1308,2330.0,502.0,671.0,15.0,120.0,61961570.0,39632915.0,36187983.0,1757746.0
3765,2008,West Windsor,Grand Total,521,966.0,266.0,223.0,16.0,16.0,104989200.0,91488991.0,76466910.0,4243604.0
3775,2008,Westfield,Grand Total,245,477.0,128.0,100.0,3.0,14.0,9075569.0,5384737.0,5217771.0,201091.7
3793,2008,Westford,Grand Total,987,1995.0,483.0,421.0,17.0,66.0,64425880.0,45174904.0,44854020.0,2264053.0
3811,2008,Westminster,Grand Total,1481,2740.0,651.0,693.0,29.0,108.0,70542960.0,46188278.0,45726359.0,2074731.0
3821,2008,Westmore,Grand Total,171,321.0,93.0,67.0,2.0,9.0,8213056.0,5404869.0,5171945.0,249205.4
3831,2008,Weston,Grand Total,325,570.0,145.0,159.0,4.0,17.0,22206980.0,14948198.0,14387812.0,684444.3
3849,2008,Weybridge,Grand Total,409,845.0,218.0,161.0,6.0,24.0,35108430.0,26636370.0,25597517.0,1507293.0
3859,2008,Wheelock,Grand Total,313,628.0,158.0,133.0,4.0,18.0,12375200.0,7373770.0,7287961.0,298163.1


## Appending to Running Database

The point of all of this is to make adding multiple years of data together almost effortless. For the very first year, the running data, stored as a CSV, is created from scratch. After that, all you have to do is change the year at the top and rerun the kernel, and it should automatically add the selected year to the running database.

In [23]:
def _add_year(running, new_rows):
    """Return the running dataset with ``new_rows`` added at the end.

    Parameters
    ----------
    running : pandas.DataFrame
        The cumulative data loaded from CSV, or an empty frame on the first run.
    new_rows : pandas.DataFrame
        The rows for the year currently being processed; must have a 'Year' column.

    Returns
    -------
    pandas.DataFrame
        ``new_rows`` itself when there is nothing to keep from ``running``;
        otherwise the kept rows followed by ``new_rows`` with a fresh 0..n-1 index.

    Rows already stored for the incoming year(s) are dropped first, so running the
    notebook (or this cell) again for the same year replaces that year instead of
    duplicating it in the CSV.
    """
    if not running.empty:
        # Compare years as text so that a numeric Year read from the CSV and a text Year
        # on the new rows are still recognised as the same year
        incoming_years = set(new_rows['Year'].astype(str))
        running = running[~running['Year'].astype(str).isin(incoming_years)]
    if running.empty:
        return new_rows
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
    return pd.concat([running, new_rows], ignore_index=True)


cumltowntax = _add_year(cumltowntax, towntax)
cumltowntotal = _add_year(cumltowntotal, towntotal)

In [24]:
# Confirm that the selected year's rows are now at the end of the running income-bracket table
cumltowntax.tail(20)

Unnamed: 0,Year,Town,AGIClss,Return,Exempt,MarJnt,Single,MarSep,HdHous,AGI,FedTI,VTI,NetVTax
14038,2008,Worcester,15000 - 19999,26,43.0,7.0,13.0,1.0,5.0,450872.0,122315.0,125496.0,4519.0
14039,2008,Worcester,20000 - 24999,22,54.0,11.0,6.0,0.0,5.0,499791.0,118365.0,115739.0,4165.0
14040,2008,Worcester,25000 - 29999,27,53.0,9.0,13.0,0.0,5.0,742310.0,302888.0,326817.0,10041.932
14041,2008,Worcester,30000 - 34999,29,49.0,10.0,15.0,0.0,4.0,938645.0,513356.0,513309.0,17377.7792
14042,2008,Worcester,35000 - 39999,27,69.0,14.0,10.0,0.0,3.0,1013736.0,489082.0,486320.0,17418.0
14043,2008,Worcester,40000 - 44999,31,53.0,8.0,15.0,4.0,4.0,1311166.0,862297.0,836479.0,30675.0
14044,2008,Worcester,45000 - 49999,15,34.0,10.0,1.0,1.0,3.0,722072.0,399132.0,419983.0,14620.9197
14045,2008,Worcester,50000 - 59999,30,66.0,20.0,7.0,0.0,3.0,1637233.0,1049509.0,1048896.0,39708.0
14046,2008,Worcester,60000 - 74999,29,76.0,23.0,5.0,0.0,1.0,1927200.0,1288847.0,1289174.0,47521.1618
14047,2008,Worcester,75000 - 99999,56,145.0,51.0,4.0,0.0,1.0,4789814.0,3414724.0,3408386.0,139257.4185


In [25]:
# Confirm that the selected year's rows are now at the end of the running Grand Total table
cumltowntotal.tail(20)

Unnamed: 0,Year,Town,Type,Return,Exempt,MarJnt,Single,MarSep,HdHous,AGI,FedTI,VTI,NetVTax
996,2008,Westminster,Grand Total,1481,2740.0,651.0,693.0,29.0,108.0,70542960.0,46188278.0,45726359.0,2074731.0
997,2008,Westmore,Grand Total,171,321.0,93.0,67.0,2.0,9.0,8213056.0,5404869.0,5171945.0,249205.4
998,2008,Weston,Grand Total,325,570.0,145.0,159.0,4.0,17.0,22206980.0,14948198.0,14387812.0,684444.3
999,2008,Weybridge,Grand Total,409,845.0,218.0,161.0,6.0,24.0,35108430.0,26636370.0,25597517.0,1507293.0
1000,2008,Wheelock,Grand Total,313,628.0,158.0,133.0,4.0,18.0,12375200.0,7373770.0,7287961.0,298163.1
1001,2008,Whiting,Grand Total,244,441.0,96.0,123.0,8.0,17.0,9493149.0,5920465.0,5899941.0,260139.9
1002,2008,Whitingham,Grand Total,610,1175.0,293.0,271.0,6.0,40.0,26457020.0,15796910.0,16118040.0,600373.0
1003,2008,Williamstown,Grand Total,1719,3242.0,734.0,792.0,24.0,169.0,71479120.0,45440916.0,44868107.0,1900496.0
1004,2008,Williston,Grand Total,4489,8762.0,2076.0,2086.0,49.0,278.0,354286600.0,258076523.0,256557885.0,14534840.0
1005,2008,Wilmington,Grand Total,1101,1888.0,412.0,578.0,28.0,83.0,55918790.0,38428865.0,36967670.0,1961300.0


In [26]:
# Export both running tables to CSV. index=False is required: otherwise an unnamed column
# holding the original index is written and has to be deleted by hand later.
csv_exports = (
    (cumltowntax, "cumltowntaxincome.csv"),
    (cumltowntotal, "cumltowntotal.csv"),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)

## Adding to a Shapefile

Once all the years are added together, it is time to join them with the Town Boundaries Shapefile in whichever GIS program you prefer. In the CSV there is a set of data that is marked in the Town column as Surpressed/Not Stated, which cannot be joined and will be marked as ignored. It is important not to lose that data: while it is small, it might be important for a specific problem.