# Lab 11 - Pruning Parcels

## Tasks

#### Task 1 - Prototype on 2004 parcel data

Build a query that will filter the 2004 parcel file

1. Read in the 2004 parcel file
2. Create a column that contains True if the parcel is close to one of our 49 lakes (see lab 10) and False otherwise. (Hint: create `lat_long` -> map to `code` -> check `isin` for the lakes we identified in lab 10).
3. Create a query that selects the correct columns, filters on the new column, then adds lake_names, lake_codes, and lake distance.  Finally filter by distance to the lake.

How much have we reduced the size of the parcel file?  Can you safely union all the files after these filters?

#### Task 2 - Read all 11 files and apply the process defined in the last task


#### Task 3 - Union all 11 files then write out the result.

In [1]:
import pandas as pd
from dfply import *
import datetime as dt
from glob import glob
import re
from toolz import first
from more_dfply import recode
from functoolz import pipeable

In [2]:
from project_cols_to_keep_and_drop import cols_to_keep

In [3]:
from project_raw_pd_parcel_types import common_parcel_types

In [4]:
from project_data_Miertschin import common_columns, ll_dist_dict, ll_code_dict, code_name_dict, ll_idnamedist_dict

In [5]:
# glob() returns matches in arbitrary, OS-dependent order, so sort the paths
# before slicing. The slice drops the first two and the last matching file,
# and later cells rely on files[0] being the earliest remaining year.
files = sorted(glob('../MinneMUDAC_raw_files/20**_metro_tax_parcels.txt'))[2:-1]

In [6]:
# Per-lake, per-year water-quality summary table.
lake_stats_path = "./data/lakes_stats.csv"
lake_stats = pd.read_csv(lake_stats_path)

In [7]:
lake_stats.head()

Unnamed: 0.1,Unnamed: 0,DNR_ID_Site_Number,year,LAKE_NAME,mean_secchi,med_secchi,sd_secchi,mean_phos,med_phos,sd_phos
0,0,19002100-01,2004,Alimagnet Lake,0.445,0.5,0.204736,0.1645,0.107,0.137039
1,1,19002100-01,2005,Alimagnet Lake,0.528,0.5,0.219484,0.1234,0.1275,0.038945
2,2,19002100-01,2006,Alimagnet Lake,0.525,0.5,0.185164,0.154375,0.126,0.090448
3,3,19002100-01,2007,Alimagnet Lake,0.507,0.415,0.247792,0.124,0.1125,0.064014
4,4,19002100-01,2008,Alimagnet Lake,0.605,0.6,0.252533,0.106167,0.1025,0.04058


In [8]:
# Load the first parcel file in `files` (pipe-delimited) with the shared
# column dtypes. The variable name assumes files[0] is the 2004 file.
parcel_path_2004 = files[0]
lakes_2004 = pd.read_csv(
    parcel_path_2004,
    sep='|',
    dtype=common_parcel_types,
)
lakes_2004.head()

Unnamed: 0,ACRES_DEED,ACRES_POLY,AGPRE_ENRD,AGPRE_EXPD,AG_PRESERV,BASEMENT,BLDG_NUM,BLOCK,CITY,CITY_USPS,...,XUSE1_DESC,XUSE2_DESC,XUSE3_DESC,XUSE4_DESC,YEAR_BUILT,Year,ZIP,ZIP4,centroid_lat,centroid_long
0,0.0,8.03,,,N,,,,SAINT FRANCIS,,...,,,,,1980.0,2004,,,45.41332,-93.26739
1,0.0,0.93,,,N,,24457.0,,SAINT FRANCIS,BETHEL,...,,,,,1974.0,2004,55005.0,,45.41354,-93.2701
2,0.0,8.75,,,N,,24442.0,,SAINT FRANCIS,BETHEL,...,,,,,1969.0,2004,55005.0,,45.41318,-93.27344
3,0.0,11.17,,,N,,410.0,,SAINT FRANCIS,BETHEL,...,,,,,1989.0,2004,55005.0,,45.41167,-93.27684
4,0.0,14.46,,,N,,480.0,,SAINT FRANCIS,BETHEL,...,,,,,1995.0,2004,55070.0,,45.41169,-93.27849


In [9]:
# Lake codes present in the lake statistics table; used below as the
# membership set for deciding which parcels to keep.
codes_with_complete_data = lake_stats["DNR_ID_Site_Number"]

In [10]:
# Key each parcel by its (lat, long) centroid, translate the key to a lake
# code, then flag the parcels whose lake code appears in the lake statistics.
parcel_lat_long = pd.Series(zip(lakes_2004.centroid_lat, lakes_2004.centroid_long))
parcel_lake_code = parcel_lat_long.map(ll_code_dict)
rows_to_keep = parcel_lake_code.isin(codes_with_complete_data)
rows_to_keep.sample(20)

1959044    False
834864     False
1456802    False
660621      True
604830     False
890712     False
681595     False
677627     False
1284129    False
1132335    False
63401       True
83718      False
1076867    False
580259     False
1331999    False
1441226    False
1270169    False
845057     False
494093     False
486247     False
dtype: bool

In [11]:
rows_to_keep.value_counts()

False    1786339
True      196036
dtype: int64

In [12]:
# Keep only the parcels flagged True in rows_to_keep.
lakes_2004_condensed = lakes_2004 >> filter_by(rows_to_keep)
lakes_2004_condensed.head()

Unnamed: 0,ACRES_DEED,ACRES_POLY,AGPRE_ENRD,AGPRE_EXPD,AG_PRESERV,BASEMENT,BLDG_NUM,BLOCK,CITY,CITY_USPS,...,XUSE1_DESC,XUSE2_DESC,XUSE3_DESC,XUSE4_DESC,YEAR_BUILT,Year,ZIP,ZIP4,centroid_lat,centroid_long
8,0.0,4.28,,,N,,,,SAINT FRANCIS,,...,,,,,0.0,2004,,,45.4117,-93.28091
21,0.0,1.64,,,N,,,,SAINT FRANCIS,,...,,,,,0.0,2004,,,45.41325,-93.35273
30,0.0,64.56,,,N,,,,SAINT FRANCIS,,...,,,,,0.0,2004,,,45.4101,-93.38513
38,0.0,39.25,,,N,,,,SAINT FRANCIS,,...,,,,,0.0,2004,,,45.41164,-93.39853
46,0.0,5.05,,,N,,24436.0,1.0,SAINT FRANCIS,SAINT FRANCIS,...,,,,,1995.0,2004,55070.0,,45.41222,-93.32077


In [13]:
lakes_2004_condensed.shape

(196036, 71)

In [14]:
def clean_parcel_file(df, max_distance=1600):
    """Prune one parcel table down to the parcels near the lakes of interest.

    The pipeline keeps the columns listed in ``cols_to_keep``, keys every row
    by its ``(centroid_lat, centroid_long)`` pair, looks up a lake code for
    that key in ``ll_code_dict``, keeps rows whose code is in
    ``codes_with_complete_data``, attaches the distance (``ll_dist_dict``)
    and the lake name (``code_name_dict``), and finally keeps rows whose
    distance does not exceed ``max_distance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Parcel table with ``centroid_lat`` and ``centroid_long`` columns plus
        the columns named in ``cols_to_keep``.
    max_distance : float, default 1600
        Largest ``distance_to_lake`` to keep, in the units stored in
        ``ll_dist_dict`` (presumably meters -- TODO confirm).

    Returns
    -------
    pandas.DataFrame
        The selected columns plus ``lat_long``, ``lake_code``,
        ``distance_to_lake`` and ``lake_name`` for the surviving rows.
    """
    # Build the key column on df's own index. mutate() aligns a Series by
    # index label, so a default 0..n-1 index would silently misalign (or
    # produce NaN keys) whenever df has been filtered or re-indexed.
    lat_long = pd.Series(list(zip(df.centroid_lat, df.centroid_long)),
                         index=df.index)
    new_df = (df
              >> select(cols_to_keep)
              >> mutate(lat_long = lat_long)
              >> mutate(lake_code = recode(X.lat_long,ll_code_dict))
              >> filter_by(X.lake_code.isin(codes_with_complete_data))
              >> mutate(distance_to_lake = recode(X.lat_long,ll_dist_dict),
                        lake_name = recode(X.lake_code,code_name_dict))
              >> filter_by(X.distance_to_lake <= max_distance)
             )
    return new_df

In [15]:
# Run the full pruning pipeline on the 2004 parcels.
lakes_2004_complete = clean_parcel_file(lakes_2004)

In [16]:
lakes_2004_complete.head()

Unnamed: 0,ACRES_DEED,ACRES_POLY,AGPRE_ENRD,AG_PRESERV,BASEMENT,CITY,COOLING,DWELL_TYPE,EMV_BLDG,EMV_LAND,...,XUSE3_DESC,XUSE4_DESC,YEAR_BUILT,Year,centroid_lat,centroid_long,lat_long,lake_code,distance_to_lake,lake_name
8,0.0,4.28,,N,,SAINT FRANCIS,,,0.0,21800.0,...,,,0.0,2004,45.4117,-93.28091,"(45.4117, -93.28091)",82015900-01,187.280315,Forest Lake
21,0.0,1.64,,N,,SAINT FRANCIS,,,0.0,50400.0,...,,,0.0,2004,45.41325,-93.35273,"(45.41325, -93.35273)",19002900-01,742.058706,Lee Lake
30,0.0,64.56,,N,,SAINT FRANCIS,,,0.0,229646.0,...,,,0.0,2004,45.4101,-93.38513,"(45.4101, -93.38513)",82011602-01,1550.29402,Armstrong Lake
38,0.0,39.25,,N,,SAINT FRANCIS,,,0.0,140000.0,...,,,0.0,2004,45.41164,-93.39853,"(45.41164, -93.39853)",27062700-01,571.475034,Northwood Lake
56,0.0,331.82,,N,,SAINT FRANCIS,,,429784.0,1138600.0,...,,,1982.0,2004,45.40427,-93.49863,"(45.40427, -93.49863)",82009400-01,1474.663652,Colby Lake


In [17]:
lakes_2004_complete.shape

(122363, 43)

In [34]:
@dfpipe
def union_all(left_df, right_df, ignore_index=True):
    """Stack ``right_df`` underneath ``left_df`` and return the combined frame.

    Registered with ``@dfpipe`` so it can be used as ``left >> union_all(right)``.
    ``ignore_index`` is passed straight through to ``pd.concat``.
    """
    frames = [left_df, right_df]
    return pd.concat(frames, ignore_index=ignore_index)

In [35]:
# Column layout of the pruned parcel table. An empty frame with these columns
# is the starting point that the yearly files are unioned onto.
union_columns = [
    'ACRES_DEED', 'ACRES_POLY', 'AGPRE_ENRD', 'AG_PRESERV', 'BASEMENT',
    'CITY', 'COOLING', 'DWELL_TYPE', 'EMV_BLDG', 'EMV_LAND', 'FIN_SQ_FT',
    'GARAGE', 'GARAGESQFT', 'GREEN_ACRE', 'HOMESTEAD', 'LANDMARK',
    'OWN_ADD_L1', 'OWN_ADD_L2', 'OWN_ADD_L3', 'PARC_CODE', 'PIN',
    'SALE_VALUE', 'SPEC_ASSES', 'TAX_CAPAC', 'TAX_EXEMPT', 'TOTAL_TAX',
    'USE1_DESC', 'USE2_DESC', 'USE3_DESC', 'USE4_DESC', 'WSHD_DIST',
    'XUSE1_DESC', 'XUSE2_DESC', 'XUSE3_DESC', 'XUSE4_DESC', 'YEAR_BUILT',
    'Year', 'centroid_lat', 'centroid_long', 'lat_long', 'lake_code',
    'distance_to_lake', 'lake_name',
]
parcel_union = pd.DataFrame(columns=union_columns)
parcel_union

Index(['ACRES_DEED', 'ACRES_POLY', 'AGPRE_ENRD', 'AG_PRESERV', 'BASEMENT',
       'CITY', 'COOLING', 'DWELL_TYPE', 'EMV_BLDG', 'EMV_LAND', 'FIN_SQ_FT',
       'GARAGE', 'GARAGESQFT', 'GREEN_ACRE', 'HOMESTEAD', 'LANDMARK',
       'OWN_ADD_L1', 'OWN_ADD_L2', 'OWN_ADD_L3', 'PARC_CODE', 'PIN',
       'SALE_VALUE', 'SPEC_ASSES', 'TAX_CAPAC', 'TAX_EXEMPT', 'TOTAL_TAX',
       'USE1_DESC', 'USE2_DESC', 'USE3_DESC', 'USE4_DESC', 'WSHD_DIST',
       'XUSE1_DESC', 'XUSE2_DESC', 'XUSE3_DESC', 'XUSE4_DESC', 'YEAR_BUILT',
       'Year', 'centroid_lat', 'centroid_long', 'lat_long', 'lake_code',
       'distance_to_lake', 'lake_name'],
      dtype='object')

In [37]:
# Clean every yearly file, then union them in a single concat.
# Concatenating inside the loop re-copies all rows accumulated so far on every
# iteration, and appending onto the existing parcel_union means a re-run of
# this cell would add every year a second time.
cleaned_frames = []
for f in files:
    print("Beginning file {0}".format(f))
    df = pd.read_csv(f,sep='|',dtype = common_parcel_types)
    df_complete = clean_parcel_file(df)
    cleaned_frames.append(df_complete)

# parcel_union.iloc[:0] is the empty column template: it fixes the column
# order, keeps the cell idempotent, and leaves a valid (empty) result if
# `files` is empty.
parcel_union = pd.concat([parcel_union.iloc[:0]] + cleaned_frames,
                         ignore_index=True)

Beginning file ../MinneMUDAC_raw_files/2004_metro_tax_parcels.txt
Beginning file ../MinneMUDAC_raw_files/2005_metro_tax_parcels.txt
Beginning file ../MinneMUDAC_raw_files/2006_metro_tax_parcels.txt
Beginning file ../MinneMUDAC_raw_files/2007_metro_tax_parcels.txt


  interactivity=interactivity, compiler=compiler, result=result)


Beginning file ../MinneMUDAC_raw_files/2008_metro_tax_parcels.txt


  interactivity=interactivity, compiler=compiler, result=result)


Beginning file ../MinneMUDAC_raw_files/2009_metro_tax_parcels.txt


  interactivity=interactivity, compiler=compiler, result=result)


Beginning file ../MinneMUDAC_raw_files/2010_metro_tax_parcels.txt


  interactivity=interactivity, compiler=compiler, result=result)


Beginning file ../MinneMUDAC_raw_files/2011_metro_tax_parcels.txt
Beginning file ../MinneMUDAC_raw_files/2012_metro_tax_parcels.txt
Beginning file ../MinneMUDAC_raw_files/2013_metro_tax_parcels.txt
Beginning file ../MinneMUDAC_raw_files/2014_metro_tax_parcels.txt


  interactivity=interactivity, compiler=compiler, result=result)


In [38]:
parcel_union.sample(20)

Unnamed: 0,ACRES_DEED,ACRES_POLY,AGPRE_ENRD,AG_PRESERV,BASEMENT,CITY,COOLING,DWELL_TYPE,EMV_BLDG,EMV_LAND,...,XUSE3_DESC,XUSE4_DESC,YEAR_BUILT,Year,centroid_lat,centroid_long,lat_long,lake_code,distance_to_lake,lake_name
176615,0.15,0.17,,,Y,ST. PAUL,CNTRL,SINGLE FAMILY DWELLING,0.0,0.0,...,,,1923.0,2005,44.93496,-93.19643,"(44.93496, -93.19643)",82009002-01,1432.423625,Wilmes Lake
790576,0.0,0.23,,N,,BLOOMINGTON,,,48700.0,98800.0,...,,,1953.0,2010,44.85379,-93.26021,"(44.85379, -93.26021)",19034800-01,312.16051,Valley Lake
184850,0.49,0.0,,,,FOREST LAKE,,,0.0,50000.0,...,,,0.0,2005,45.2641,-92.96919,"(45.2641, -92.96919)",70002600-01,613.036341,Lower Prior Lake
172973,0.32,0.31,,,Y,ST. PAUL,A/CON,SINGLE FAMILY DWELLING,0.0,0.0,...,,,1854.0,2005,44.93981,-93.10647,"(44.93981, -93.10647)",82009700-01,1370.414567,La Lake
1330462,0.0,1.12,,N,,MINNEAPOLIS,,,3301900.0,611600.0,...,,,1967.0,2014,44.96234,-93.28359,"(44.96234, -93.28359)",82015900-01,383.384869,Forest Lake
922131,0.0,4.96,,N,,MARSHAN TWP,,S.FAM.RES,163200.0,82700.0,...,,,1973.0,2011,44.65763,-92.8552,"(44.65763, -92.8552)",27007000-01,778.39556,Mitchell Lake
821208,0.0,1.27,,N,,LAKE ELMO,,,0.0,100.0,...,,,0.0,2010,45.00648,-92.87566,"(45.00648, -92.87566)",19002100-01,1088.878362,Alimagnet Lake
396009,0.0,0.11,,,,MINNEAPOLIS,,,154900.0,70600.0,...,,,1925.0,2007,44.91837,-93.24769,"(44.91837, -93.24769)",19034800-01,1059.585788,Valley Lake
256773,0.0,0.31,,N,,INVER GROVE HEIGHTS,,S.FAM.RES,246200.0,84500.0,...,,,1998.0,2006,44.81511,-93.10257,"(44.81511, -93.10257)",19002500-01,314.80092,Keller Lake
601721,0.0,0.04,1899-12-30,,,BLOOMINGTON,,,127400.0,31200.0,...,,,1972.0,2008,44.81229,-93.29331,"(44.81229, -93.29331)",19002200-01,726.190586,Long Lake


In [39]:
parcel_union.shape

(1416470, 43)

In [40]:
# parcel_union was built with ignore_index=True, so its index is just a
# 0..n-1 counter. Writing it would add an unnamed first column that comes
# back as "Unnamed: 0" when the file is read again, so leave it out.
# NOTE(review): any reader that uses index_col=0 on this file must be updated.
parcel_union.to_csv("./data/union_of_parcel_data.csv", index=False)