# MinneMUDAC Project 1 Part 1

In [56]:
import pandas as pd
import datetime
from toolz import first
from dfply import *

### Reading in one dataset to make sure it works

In [2]:
# Open the 2002 parcel file as a chunked reader (pipe-delimited, 500 rows per chunk)
# and show the column labels of its first chunk as a quick sanity check.
metro_02 = pd.read_csv(
    "../MinneMUDAC_raw_files/2002_metro_tax_parcels.txt",
    sep="|",
    chunksize=500,
)
# first() pulls one chunk off the reader, so metro_02 continues from the second chunk.
first(metro_02).columns

Index(['ACRES_DEED', 'ACRES_POLY', 'AGPRE_ENRD', 'AGPRE_EXPD', 'AG_PRESERV',
       'BASEMENT', 'BLDG_NUM', 'BLOCK', 'CITY', 'CITY_USPS', 'COOLING',
       'COUNTY_ID', 'DWELL_TYPE', 'EMV_BLDG', 'EMV_LAND', 'EMV_TOTAL',
       'FIN_SQ_FT', 'GARAGE', 'GARAGESQFT', 'GREEN_ACRE', 'HEATING',
       'HOMESTEAD', 'HOME_STYLE', 'LANDMARK', 'LOT', 'MULTI_USES', 'NUM_UNITS',
       'OPEN_SPACE', 'OWNER_MORE', 'OWNER_NAME', 'OWN_ADD_L1', 'OWN_ADD_L2',
       'OWN_ADD_L3', 'OWN_NAME', 'PARC_CODE', 'PIN', 'PIN_1', 'PLAT_NAME',
       'PREFIXTYPE', 'PREFIX_DIR', 'SALE_DATE', 'SALE_VALUE', 'SCHOOL_DST',
       'SPEC_ASSES', 'STREET', 'STREETNAME', 'STREETTYPE', 'STRUC_TYPE',
       'SUFFIX_DIR', 'Shape_Area', 'Shape_Leng', 'TAX_ADD_L1', 'TAX_ADD_L2',
       'TAX_ADD_L3', 'TAX_ADD_LI', 'TAX_CAPAC', 'TAX_EXEMPT', 'TAX_NAME',
       'TOTAL_TAX', 'UNIT_INFO', 'USE1_DESC', 'USE2_DESC', 'USE3_DESC',
       'USE4_DESC', 'WSHD_DIST', 'XUSE1_DESC', 'XUSE2_DESC', 'XUSE3_DESC',
       'XUSE4_DESC', 'YEAR_BUILT

### Creating a string pattern to build the list of file names

In [25]:
def output_address(dt):
    """Return the relative path of the metro tax parcel file for the year of ``dt``.

    Parameters
    ----------
    dt : datetime.date or datetime.datetime
        Any date in the year of interest; only its year ends up in the path.

    Returns
    -------
    str
        ``"../MinneMUDAC_raw_files/<YYYY>_metro_tax_parcels.txt"``.
    """
    # %Y is the full four-digit year. Prefixing a literal "20" to the
    # two-digit %y would produce a wrong name for any year outside 2000-2099.
    fmt = "../MinneMUDAC_raw_files/%Y_metro_tax_parcels.txt"
    return dt.strftime(fmt)

In [26]:
# One January-1 timestamp per year, 2004 through 2015 inclusive
# (range() excludes its upper bound, 2016).
dts = [
    datetime.datetime(year=yr, month=1, day=1)
    for yr in range(2004, 2016)
]
dts[:2]

[datetime.datetime(2004, 1, 1, 0, 0), datetime.datetime(2005, 1, 1, 0, 0)]

In [27]:
# Turn each timestamp in dts into the path of that year's parcel file.
file_names = list(map(output_address, dts))
file_names[:2]

['../MinneMUDAC_raw_files/2004_metro_tax_parcels.txt',
 '../MinneMUDAC_raw_files/2005_metro_tax_parcels.txt']

### Reading in all the datasets

In [28]:
# One chunked reader per yearly file (pipe-delimited, 500 rows per chunk).
# With chunksize set, read_csv returns an iterator of DataFrames rather than a DataFrame.
dfs = [
    pd.read_csv(path, sep='|', chunksize=500)
    for path in file_names
]

### Pulling off the first chunk of each file

In [29]:
# Take the first chunk from every reader; the column labels are all that is needed.
first_chunks = list(map(first, dfs))

### Creating a set of column names for each file

In [30]:
# Column labels of each first chunk, as a set, in the same order as file_names.
col_names = [set(chunk.columns) for chunk in first_chunks]

### Unioning all the sets of column names

In [31]:
# Every column name that appears in at least one file.
union_names = set().union(*col_names)
len(union_names)

77

### Intersecting all the sets of column names

In [32]:
# Column names present in the first chunk of every file.
# All per-file sets take part, including the one at index 0 (the earliest year);
# seeding from a later element would silently leave that file out of the check.
# An empty col_names yields an empty set instead of raising.
intersect_names = set.intersection(*col_names) if col_names else set()
len(intersect_names)

70

In [33]:
# Number of columns in each file, in file_names order.
list(map(len, col_names))

[71, 70, 70, 72, 72, 72, 71, 70, 70, 70, 74, 70]

### Differencing the sets of column names

#### Each file's set of column names minus the intersection of all column names (count of extra columns per file)

In [34]:
# For each file, count the columns that are not part of the common set.
[
    (path, len(columns - intersect_names))
    for path, columns in zip(file_names, col_names)
]

[('../MinneMUDAC_raw_files/2004_metro_tax_parcels.txt', 1),
 ('../MinneMUDAC_raw_files/2005_metro_tax_parcels.txt', 0),
 ('../MinneMUDAC_raw_files/2006_metro_tax_parcels.txt', 0),
 ('../MinneMUDAC_raw_files/2007_metro_tax_parcels.txt', 2),
 ('../MinneMUDAC_raw_files/2008_metro_tax_parcels.txt', 2),
 ('../MinneMUDAC_raw_files/2009_metro_tax_parcels.txt', 2),
 ('../MinneMUDAC_raw_files/2010_metro_tax_parcels.txt', 1),
 ('../MinneMUDAC_raw_files/2011_metro_tax_parcels.txt', 0),
 ('../MinneMUDAC_raw_files/2012_metro_tax_parcels.txt', 0),
 ('../MinneMUDAC_raw_files/2013_metro_tax_parcels.txt', 0),
 ('../MinneMUDAC_raw_files/2014_metro_tax_parcels.txt', 4),
 ('../MinneMUDAC_raw_files/2015_metro_tax_parcels.txt', 0)]

In [35]:
# Display the set of column names shared by the files.
intersect_names

{'ACRES_DEED',
 'ACRES_POLY',
 'AGPRE_ENRD',
 'AGPRE_EXPD',
 'AG_PRESERV',
 'BASEMENT',
 'BLDG_NUM',
 'BLOCK',
 'CITY',
 'CITY_USPS',
 'COOLING',
 'COUNTY_ID',
 'DWELL_TYPE',
 'EMV_BLDG',
 'EMV_LAND',
 'EMV_TOTAL',
 'FIN_SQ_FT',
 'GARAGE',
 'GARAGESQFT',
 'GREEN_ACRE',
 'HEATING',
 'HOMESTEAD',
 'HOME_STYLE',
 'LANDMARK',
 'LOT',
 'MULTI_USES',
 'NUM_UNITS',
 'OPEN_SPACE',
 'OWNER_MORE',
 'OWNER_NAME',
 'OWN_ADD_L1',
 'OWN_ADD_L2',
 'OWN_ADD_L3',
 'PARC_CODE',
 'PIN',
 'PLAT_NAME',
 'PREFIXTYPE',
 'PREFIX_DIR',
 'SALE_DATE',
 'SALE_VALUE',
 'SCHOOL_DST',
 'SPEC_ASSES',
 'STREETNAME',
 'STREETTYPE',
 'SUFFIX_DIR',
 'Shape_Area',
 'Shape_Leng',
 'TAX_ADD_L1',
 'TAX_ADD_L2',
 'TAX_ADD_L3',
 'TAX_CAPAC',
 'TAX_EXEMPT',
 'TAX_NAME',
 'TOTAL_TAX',
 'UNIT_INFO',
 'USE1_DESC',
 'USE2_DESC',
 'USE3_DESC',
 'USE4_DESC',
 'WSHD_DIST',
 'XUSE1_DESC',
 'XUSE2_DESC',
 'XUSE3_DESC',
 'XUSE4_DESC',
 'YEAR_BUILT',
 'Year',
 'ZIP',
 'ZIP4',
 'centroid_lat',
 'centroid_long'}

In [36]:
# Save the common column names, one per row, without the index column.
# Sorting first makes the CSV identical from run to run; iterating the set
# directly yields the strings in an order that can differ between sessions.
# NOTE(review): the file name says 2004_2014, but the years loaded earlier in
# this notebook run through 2015 — confirm the intended span before renaming.
pd.DataFrame(sorted(intersect_names)).to_csv(
    "./data/parcel_common_columns_2004_2014.csv", index=False
)